@aws/ml-container-creator 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/bin/cli.js +31 -137
  2. package/config/parameter-schema-v2.json +2065 -0
  3. package/package.json +6 -3
  4. package/servers/lib/catalogs/jumpstart-public.json +101 -16
  5. package/servers/lib/catalogs/models.json +182 -26
  6. package/src/app.js +6 -389
  7. package/src/lib/bootstrap-command-handler.js +75 -1078
  8. package/src/lib/bootstrap-profile-manager.js +634 -0
  9. package/src/lib/bootstrap-provisioners.js +421 -0
  10. package/src/lib/config-loader.js +405 -0
  11. package/src/lib/config-manager.js +59 -1668
  12. package/src/lib/config-mcp-client.js +118 -0
  13. package/src/lib/config-validator.js +634 -0
  14. package/src/lib/cuda-resolver.js +140 -0
  15. package/src/lib/e2e-catalog-validator.js +251 -3
  16. package/src/lib/e2e-ci-recorder.js +103 -0
  17. package/src/lib/generated/cli-options.js +471 -0
  18. package/src/lib/generated/parameter-matrix.js +671 -0
  19. package/src/lib/generated/validation-rules.js +202 -0
  20. package/src/lib/marketplace-flow.js +276 -0
  21. package/src/lib/mcp-query-runner.js +768 -0
  22. package/src/lib/parameter-schema-validator.js +62 -18
  23. package/src/lib/prompt-runner.js +41 -1504
  24. package/src/lib/prompts/feature-prompts.js +172 -0
  25. package/src/lib/prompts/index.js +48 -0
  26. package/src/lib/prompts/infrastructure-prompts.js +690 -0
  27. package/src/lib/prompts/model-prompts.js +552 -0
  28. package/src/lib/prompts/project-prompts.js +70 -0
  29. package/src/lib/prompts.js +2 -1446
  30. package/src/lib/registry-command-handler.js +135 -3
  31. package/src/lib/secrets-prompt-runner.js +251 -0
  32. package/src/lib/template-variable-resolver.js +398 -0
  33. package/templates/code/serve +5 -134
  34. package/templates/code/serve.d/lmi.ejs +19 -0
  35. package/templates/code/serve.d/sglang.ejs +47 -0
  36. package/templates/code/serve.d/tensorrt-llm.ejs +53 -0
  37. package/templates/code/serve.d/vllm.ejs +48 -0
  38. package/templates/do/clean +1 -1387
  39. package/templates/do/clean.d/async-inference.ejs +508 -0
  40. package/templates/do/clean.d/batch-transform.ejs +512 -0
  41. package/templates/do/clean.d/hyperpod-eks.ejs +481 -0
  42. package/templates/do/clean.d/managed-inference.ejs +1043 -0
  43. package/templates/do/deploy +1 -1766
  44. package/templates/do/deploy.d/async-inference.ejs +501 -0
  45. package/templates/do/deploy.d/batch-transform.ejs +529 -0
  46. package/templates/do/deploy.d/hyperpod-eks.ejs +339 -0
  47. package/templates/do/deploy.d/managed-inference.ejs +726 -0
  48. package/config/parameter-schema.json +0 -88
@@ -1,1766 +1 @@
1
- #!/bin/bash
2
- # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- set -e
6
- set -u
7
- set -o pipefail
8
-
9
- # Parse flags
10
- FORCE_NEW=false
11
- FORCE_IC=false
12
- IC_TARGET=""
13
- while [ $# -gt 0 ]; do
14
- case "$1" in
15
- --force) FORCE_NEW=true; shift ;;
16
- --force-ic)
17
- FORCE_IC=true
18
- shift
19
- <% if (deploymentTarget === 'realtime-inference') { %>
20
- # Optional name argument: --force-ic <name>
21
- if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
22
- IC_TARGET="$1"
23
- shift
24
- fi
25
- <% } %>
26
- ;;
27
- <% if (deploymentTarget === 'realtime-inference') { %>
28
- --ic)
29
- if [ -z "${2:-}" ]; then
30
- echo "❌ --ic requires a name argument"
31
- echo " Usage: ./do/deploy --ic <name>"
32
- exit 1
33
- fi
34
- IC_TARGET="$2"
35
- shift 2
36
- ;;
37
- <% } %>
38
- --help|-h)
39
- <% if (deploymentTarget === 'realtime-inference') { %>
40
- echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>]"
41
- echo ""
42
- echo "Options:"
43
- echo " --force Create a new endpoint and IC, even if one already exists."
44
- echo " --force-ic Recreate ALL inference components on the existing endpoint."
45
- echo " --force-ic <name> Recreate only the named IC on the existing endpoint."
46
- echo " --ic <name> Deploy only the named IC (from do/ic/<name>.conf)."
47
- echo ""
48
- echo "Without flags, deploy resumes from the last run."
49
- <% } else { %>
50
- echo "Usage: ./do/deploy [--force] [--force-ic]"
51
- echo ""
52
- echo "Options:"
53
- echo " --force Create a new endpoint, even if one already exists."
54
- echo " --force-ic Recreate the inference component on the existing endpoint."
55
- echo ""
56
- echo "Without flags, deploy resumes from the last run."
57
- <% } %>
58
- exit 0
59
- ;;
60
- *)
61
- echo "❌ Unknown option: $1"
62
- echo " Run ./do/deploy --help for usage."
63
- exit 1
64
- ;;
65
- esac
66
- done
67
-
68
- # Source configuration
69
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
70
- source "${SCRIPT_DIR}/config"
71
-
72
- echo "🚀 Deploying to AWS"
73
- echo " Project: ${PROJECT_NAME}"
74
- echo " Deployment config: ${DEPLOYMENT_CONFIG}"
75
- echo " Region: ${AWS_REGION}"
76
- echo " Build target: ${BUILD_TARGET}"
77
- echo " Deployment target: ${DEPLOYMENT_TARGET}"
78
- <% if (deploymentTarget === 'realtime-inference') { %>
79
- if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
80
- echo " Endpoint: ${ENDPOINT_NAME} (external)"
81
- else
82
- echo " Instance type: ${INSTANCE_TYPE}"
83
- fi
84
- <% } else if (deploymentTarget === 'async-inference') { %>
85
- echo " Instance type: ${INSTANCE_TYPE}"
86
- echo " S3 output: ${ASYNC_S3_OUTPUT_PATH}"
87
- echo " SNS success: ${ASYNC_SNS_SUCCESS_TOPIC}"
88
- echo " SNS error: ${ASYNC_SNS_ERROR_TOPIC}"
89
- <% if (asyncMaxConcurrentInvocations) { %>
90
- echo " Max concurrent: ${ASYNC_MAX_CONCURRENT_INVOCATIONS}"
91
- <% } %>
92
- <% } else if (deploymentTarget === 'hyperpod-eks') { %>
93
- echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
94
- echo " Namespace: ${HYPERPOD_NAMESPACE}"
95
- echo " Replicas: ${HYPERPOD_REPLICAS}"
96
- <% } else if (deploymentTarget === 'batch-transform') { %>
97
- echo " Instance type: ${INSTANCE_TYPE}"
98
- echo " S3 input: ${BATCH_INPUT_PATH}"
99
- echo " S3 output: ${BATCH_OUTPUT_PATH}"
100
- echo " Instance count: ${BATCH_INSTANCE_COUNT}"
101
- echo " Split type: ${BATCH_SPLIT_TYPE}"
102
- echo " Strategy: ${BATCH_STRATEGY}"
103
- <% } %>
104
-
105
- # Check AWS credentials
106
- echo "🔍 Validating AWS credentials..."
107
- if ! aws sts get-caller-identity &> /dev/null; then
108
- echo "❌ AWS credentials not configured"
109
- echo " Run: aws configure"
110
- echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
111
- exit 4
112
- fi
113
-
114
- AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
115
- echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"
116
-
117
- # Construct ECR repository URL
118
- ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"
119
-
120
- # ============================================================
121
- # Shared: Verify ECR image exists
122
- # ============================================================
123
- echo "🔍 Verifying ECR image exists..."
124
- if ! aws ecr describe-images \
125
- --repository-name "${ECR_REPOSITORY_NAME}" \
126
- --image-ids imageTag="${PROJECT_NAME}-latest" \
127
- --region "${AWS_REGION}" &> /dev/null; then
128
-
129
- echo "❌ ECR image not found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
130
- echo ""
131
- echo "Please build and push your image first:"
132
- echo " ./do/submit"
133
- echo ""
134
- echo "After the build completes successfully, run this deploy script again."
135
- exit 4
136
- fi
137
-
138
- echo "✅ ECR image found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
139
- IMAGE_TAG="${PROJECT_NAME}-latest"
140
-
141
- # ============================================================
142
- # Shared: Resolve secrets for container environment
143
- # ============================================================
144
- CONTAINER_ENV_JSON=""
145
-
146
- if [ -n "${HF_TOKEN_ARN:-}" ]; then
147
- echo "🔐 Resolving HuggingFace token from Secrets Manager..."
148
- RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
149
- echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
150
- exit 3
151
- }
152
- CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${RESOLVED_HF_TOKEN}\""
153
- elif [ -n "${HF_TOKEN:-}" ]; then
154
- CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${HF_TOKEN}\""
155
- fi
156
-
157
- if [ -n "${NGC_API_KEY_ARN:-}" ]; then
158
- echo "🔐 Resolving NGC API key from Secrets Manager..."
159
- RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
160
- echo "❌ Failed to resolve NGC API key from Secrets Manager"
161
- exit 3
162
- }
163
- if [ -n "${CONTAINER_ENV_JSON}" ]; then
164
- CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
165
- else
166
- CONTAINER_ENV_JSON="\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
167
- fi
168
- elif [ -n "${NGC_API_KEY:-}" ]; then
169
- if [ -n "${CONTAINER_ENV_JSON}" ]; then
170
- CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},\"NGC_API_KEY\":\"${NGC_API_KEY}\""
171
- else
172
- CONTAINER_ENV_JSON="\"NGC_API_KEY\":\"${NGC_API_KEY}\""
173
- fi
174
- fi
175
-
176
- <% if (deploymentTarget === 'realtime-inference') { %>
177
- # ============================================================
178
- # SageMaker Real-Time Inference Deployment (Inference Components)
179
- # ============================================================
180
-
181
- # Source shared helpers
182
- source "${SCRIPT_DIR}/lib/secrets.sh"
183
- source "${SCRIPT_DIR}/lib/wait.sh"
184
- source "${SCRIPT_DIR}/lib/endpoint-config.sh"
185
- source "${SCRIPT_DIR}/lib/inference-component.sh"
186
-
187
- # Validate execution role ARN
188
- if [ -z "${ROLE_ARN:-}" ]; then
189
- echo "❌ Execution role ARN not provided"
190
- echo ""
191
- echo "Usage:"
192
- echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
193
- echo " ./do/deploy"
194
- echo ""
195
- echo "Or set ROLE_ARN in do/config"
196
- echo ""
197
- echo "The execution role must have permissions for:"
198
- echo " • SageMaker endpoint and inference component management"
199
- echo " • ECR image access"
200
- echo " • S3 access (if using model artifacts)"
201
- echo " • CloudWatch Logs"
202
- exit 3
203
- fi
204
-
205
- echo " Using execution role: ${ROLE_ARN}"
206
-
207
- # Validate --ic argument if specified (set by --ic <name> or --force-ic <name>)
208
- if [ -n "${IC_TARGET}" ]; then
209
- if [ ! -d "${SCRIPT_DIR}/ic" ]; then
210
- echo "❌ IC name specified but no do/ic/ directory found"
211
- echo " This project does not use multi-IC configuration."
212
- echo " Remove --ic/--force-ic <name> to deploy using the legacy single-IC path."
213
- exit 1
214
- fi
215
- if [ ! -f "${SCRIPT_DIR}/ic/${IC_TARGET}.conf" ]; then
216
- echo "❌ IC config not found: do/ic/${IC_TARGET}.conf"
217
- echo ""
218
- echo " Available ICs:"
219
- for conf in "${SCRIPT_DIR}"/ic/*.conf; do
220
- [ -f "${conf}" ] || continue
221
- echo " • $(basename "${conf}" .conf)"
222
- done
223
- echo ""
224
- echo " Usage: ./do/deploy --ic <name>"
225
- exit 1
226
- fi
227
- fi
228
-
229
- # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
230
- resolve_secrets
231
-
232
- # ============================================================
233
- # Idempotency: check for existing deployment from a previous run
234
- # ============================================================
235
- SKIP_TO=""
236
-
237
- if [ "${FORCE_NEW}" = true ]; then
238
- echo "🔄 --force: ignoring previous deployment, creating new resources."
239
- elif [ "${FORCE_IC}" = true ] && [ -n "${ENDPOINT_NAME:-}" ]; then
240
- EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
241
- if [ "${EP_STATUS}" = "InService" ]; then
242
- if [ -n "${IC_TARGET}" ]; then
243
- echo "🔄 --force-ic: recreating IC '${IC_TARGET}' on existing endpoint: ${ENDPOINT_NAME}"
244
- else
245
- echo "🔄 --force-ic: recreating ALL inference components on existing endpoint: ${ENDPOINT_NAME}"
246
- fi
247
- SKIP_TO="create_ic"
248
- else
249
- echo "⚠️ --force-ic requires an InService endpoint, but ${ENDPOINT_NAME} is: ${EP_STATUS:-not found}"
250
- echo " Use --force to create a new endpoint, or wait for the current one."
251
- exit 4
252
- fi
253
- elif [ -n "${ENDPOINT_NAME:-}" ]; then
254
- echo "🔍 Checking for existing deployment: ${ENDPOINT_NAME}"
255
-
256
- EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
257
-
258
- case "${EP_STATUS}" in
259
- InService)
260
- echo "✅ Endpoint already InService: ${ENDPOINT_NAME}"
261
-
262
- # Check inference component
263
- if [ -n "${INFERENCE_COMPONENT_NAME:-}" ]; then
264
- IC_STATUS=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}")
265
-
266
- case "${IC_STATUS}" in
267
- InService)
268
- echo "✅ Inference component already InService: ${INFERENCE_COMPONENT_NAME}"
269
- echo ""
270
- echo "📋 Deployment is already live. Nothing to do."
271
- echo " Endpoint: ${ENDPOINT_NAME}"
272
- echo " Inference Component: ${INFERENCE_COMPONENT_NAME}"
273
- echo ""
274
- echo "🧪 Test your endpoint:"
275
- echo " ./do/test"
276
- echo ""
277
- echo "🧹 Clean up when done:"
278
- echo " ./do/clean endpoint"
279
- exit 0
280
- ;;
281
- Creating)
282
- echo "⏳ Inference component still creating: ${INFERENCE_COMPONENT_NAME}"
283
- SKIP_TO="wait_ic"
284
- IC_DEPLOYED_NAME="${INFERENCE_COMPONENT_NAME}"
285
- ;;
286
- Failed)
287
- echo "⚠️ Inference component failed: ${INFERENCE_COMPONENT_NAME}"
288
- echo " Will create a new inference component on the existing endpoint."
289
- SKIP_TO="create_ic"
290
- ;;
291
- *)
292
- # Stored IC not found — check if a different IC is running on this endpoint
293
- if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
294
- # External endpoint: never adopt ICs we didn't create
295
- echo " Stored IC not found on external endpoint. Will create a new one."
296
- SKIP_TO="create_ic"
297
- else
298
- LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
299
- if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
300
- echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
301
- echo " (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
302
- _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
303
- echo ""
304
- echo "📋 Deployment is already live. Nothing to do."
305
- echo " Endpoint: ${ENDPOINT_NAME}"
306
- echo " Inference Component: ${LIVE_IC}"
307
- echo ""
308
- echo "🧪 Test your endpoint:"
309
- echo " ./do/test"
310
- echo ""
311
- echo "🧹 Clean up when done:"
312
- echo " ./do/clean endpoint"
313
- exit 0
314
- else
315
- echo " No existing inference component found on endpoint. Will create one."
316
- SKIP_TO="create_ic"
317
- fi
318
- fi
319
- ;;
320
- esac
321
- else
322
- # No IC name in config — check if one is already running on the endpoint
323
- if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
324
- # External endpoint: never adopt ICs we didn't create
325
- echo " No previous IC deployed by this project. Will create a new one."
326
- SKIP_TO="create_ic"
327
- else
328
- LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
329
- if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
330
- echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
331
- _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
332
- echo ""
333
- echo "📋 Deployment is already live. Nothing to do."
334
- echo " Endpoint: ${ENDPOINT_NAME}"
335
- echo " Inference Component: ${LIVE_IC}"
336
- echo ""
337
- echo "🧪 Test your endpoint:"
338
- echo " ./do/test"
339
- echo ""
340
- echo "🧹 Clean up when done:"
341
- echo " ./do/clean endpoint"
342
- exit 0
343
- else
344
- SKIP_TO="create_ic"
345
- fi
346
- fi
347
- fi
348
- ;;
349
- Creating|Updating)
350
- echo "⏳ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
351
- SKIP_TO="wait_endpoint"
352
- ;;
353
- Failed)
354
- echo "⚠️ Previous endpoint failed: ${ENDPOINT_NAME}"
355
- echo " Creating a new deployment. Clean up the failed endpoint with:"
356
- echo " ./do/clean endpoint"
357
- echo ""
358
- # Fall through to create new resources
359
- ;;
360
- "")
361
- echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
362
- ;;
363
- *)
364
- echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
365
- ;;
366
- esac
367
- fi
368
-
369
- # ============================================================
370
- # Step 1: Create endpoint configuration and endpoint (skip if resuming)
371
- # ============================================================
372
- if [ -z "${SKIP_TO}" ]; then
373
- if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
374
- # External endpoint: validate it still exists and is InService
375
- echo "🔗 Using external endpoint: ${ENDPOINT_NAME}"
376
- echo " Validating endpoint status..."
377
-
378
- EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
379
-
380
- if [ -z "${EP_STATUS}" ]; then
381
- echo "❌ External endpoint not found: ${ENDPOINT_NAME}"
382
- echo " The endpoint may have been deleted. Update ENDPOINT_NAME in do/config"
383
- echo " or remove ENDPOINT_EXTERNAL=true to create a new endpoint."
384
- exit 4
385
- fi
386
-
387
- if [ "${EP_STATUS}" != "InService" ]; then
388
- echo "❌ External endpoint not InService: ${ENDPOINT_NAME} (status: ${EP_STATUS})"
389
- echo " The endpoint must be InService before attaching inference components."
390
- echo " Wait for the endpoint to become InService, or update do/config."
391
- exit 4
392
- fi
393
-
394
- echo "✅ External endpoint is InService: ${ENDPOINT_NAME}"
395
- # Skip directly to IC creation — no endpoint config, no endpoint creation, no wait
396
- SKIP_TO="create_ic"
397
- else
398
- TIMESTAMP=$(date +%s)
399
- ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"
400
-
401
- _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
402
-
403
- # Create endpoint configuration via shared helper
404
- create_endpoint_config
405
-
406
- _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
407
-
408
- # Record endpoint config in manifest (non-blocking)
409
- ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
410
- ./do/manifest add \
411
- --type sagemaker-endpoint-config \
412
- --id "${ENDPOINT_CONFIG_ARN}" \
413
- --project "${PROJECT_NAME}" \
414
- --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
415
- 2>/dev/null || true
416
-
417
- # Step 2: Create endpoint
418
- echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
419
- if ! aws sagemaker create-endpoint \
420
- --endpoint-name "${ENDPOINT_NAME}" \
421
- --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
422
- --region "${AWS_REGION}"; then
423
-
424
- echo "❌ Failed to create endpoint"
425
- echo " Check that:"
426
- echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
427
- echo " • You have sufficient service quota in region: ${AWS_REGION}"
428
- exit 4
429
- fi
430
-
431
- echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"
432
-
433
- # Record endpoint in manifest (non-blocking)
434
- ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
435
- ./do/manifest add \
436
- --type sagemaker-endpoint \
437
- --id "${ENDPOINT_ARN}" \
438
- --project "${PROJECT_NAME}" \
439
- --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
440
- 2>/dev/null || true
441
- fi
442
- fi
443
-
444
- # ============================================================
445
- # Wait for endpoint (skip if already InService or external)
446
- # ============================================================
447
- if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
448
- echo "⏳ Waiting for endpoint to reach InService status..."
449
- echo " This may take a few minutes..."
450
- echo " If this times out, re-run ./do/deploy to resume."
451
-
452
- wait_endpoint "${ENDPOINT_NAME}"
453
-
454
- echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"
455
- fi
456
-
457
- # ============================================================
458
- # Step 3: Deploy inference components (skip if resuming from wait_ic)
459
- # ============================================================
460
- if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
461
-
462
- if [ -d "${SCRIPT_DIR}/ic" ]; then
463
# _check_gpu_capacity
# Best-effort GPU capacity guardrail for multi-IC deployments.
#
# Adds up IC_GPU_COUNT from every do/ic/*.conf file and compares the total
# with the GPU count of ${INSTANCE_TYPE}. Only base IC configs are counted;
# adapter ICs (do/adapters/*.conf) are intentionally left out because they
# share the base IC's GPU resources.
#
# Globals:  SCRIPT_DIR (read), INSTANCE_TYPE (read, may be unset)
# Outputs:  a warning on stdout when the requested total exceeds capacity
# Returns:  always 0 — this check warns, it never blocks a deploy
_check_gpu_capacity() {
  # No INSTANCE_TYPE (external endpoints): nothing to compare against.
  if [ -z "${INSTANCE_TYPE:-}" ]; then
    return 0
  fi

  # Hardcoded GPU counts for common SageMaker GPU instance types.
  local instance_gpus=""
  case "${INSTANCE_TYPE}" in
    ml.g4dn.xlarge | ml.p3.2xlarge | \
    ml.g5.xlarge | ml.g5.2xlarge | ml.g5.4xlarge | ml.g5.8xlarge | \
    ml.g6.xlarge | \
    ml.g6e.xlarge | ml.g6e.2xlarge | ml.g6e.4xlarge | ml.g6e.8xlarge | \
    ml.g7e.xlarge | ml.g7e.2xlarge | ml.g7e.4xlarge | ml.g7e.8xlarge)
      instance_gpus=1
      ;;
    ml.g4dn.12xlarge | ml.g5.12xlarge | ml.g6.12xlarge | \
    ml.g6e.12xlarge | ml.g7e.12xlarge | ml.p3.8xlarge)
      instance_gpus=4
      ;;
    ml.g5.48xlarge | ml.g6.48xlarge | ml.g6e.48xlarge | ml.g7e.48xlarge | \
    ml.p3.16xlarge | ml.p4d.24xlarge | ml.p4de.24xlarge | ml.p5.48xlarge)
      instance_gpus=8
      ;;
    *)
      instance_gpus=""
      ;;
  esac

  # Instance type not in the map: skip the check rather than guess.
  if [ -z "${instance_gpus}" ]; then
    return 0
  fi

  local total_gpu_requested=0
  local conf raw_line ic_gpus
  for conf in "${SCRIPT_DIR}"/ic/*.conf; do
    # An unmatched glob stays literal; -f filters it out.
    [ -f "${conf}" ] || continue

    # Use the last assignment in the file (the one that would win if the
    # file were sourced). A missing line yields an empty string either with
    # or without pipefail.
    raw_line=$(grep '^export IC_GPU_COUNT=' "${conf}" 2>/dev/null | tail -n 1) || raw_line=""

    ic_gpus="${raw_line#export IC_GPU_COUNT=}"
    ic_gpus="${ic_gpus//\"/}"
    # Keep only the leading digits so trailing comments or stray text cannot
    # reach the arithmetic expansion and abort a best-effort check.
    ic_gpus="${ic_gpus%%[!0-9]*}"
    if [ -z "${ic_gpus}" ]; then
      ic_gpus=1
    fi

    # 10# forces base 10 so values such as "08" are not parsed as octal.
    total_gpu_requested=$(( total_gpu_requested + 10#${ic_gpus} ))
  done

  if [ "${total_gpu_requested}" -gt "${instance_gpus}" ]; then
    echo ""
    echo "⚠️ GPU capacity warning: ICs request ${total_gpu_requested} GPUs total, but ${INSTANCE_TYPE} has ${instance_gpus} GPUs."
    echo " SageMaker will likely reject IC creation if capacity is exceeded."
    echo " Consider reducing IC_GPU_COUNT values or using a larger instance type."
    echo ""
  fi

  return 0
}
539
-
540
- # Run capacity guardrail before deploying ICs
541
- _check_gpu_capacity
542
-
543
# _delete_and_wait_ic <ic_name> [timeout_seconds] [poll_seconds]
# Deletes an inference component and polls until it is no longer reported,
# so that a subsequent create does not hit a name conflict.
#
# Arguments:
#   $1 - inference component name
#   $2 - optional: max seconds to wait for deletion (default 600)
#   $3 - optional: seconds between status polls (default 15)
# Globals:  AWS_REGION (read)
# Depends:  aws CLI; _get_ic_status (not defined in this function — it is
#           expected to print an empty string once the IC is gone)
# Returns:  0 in every handled case. A failed delete call and a timeout are
#           both reported as warnings, not errors (deliberate best-effort).
_delete_and_wait_ic() {
  local ic_name="$1"
  local delete_timeout="${2:-600}"
  local poll_interval="${3:-15}"
  local delete_start ic_status elapsed

  echo "🗑️ Deleting inference component: ${ic_name}"
  # stderr is silenced on purpose: the usual failure is "IC not found".
  if ! aws sagemaker delete-inference-component \
    --inference-component-name "${ic_name}" \
    --region "${AWS_REGION}" 2>/dev/null; then
    echo " ⚠️ Delete call failed (IC may already be gone). Continuing..."
    return 0
  fi

  echo " Waiting for deletion to complete..."
  delete_start=$(date +%s)

  while true; do
    ic_status=$(_get_ic_status "${ic_name}")

    # Empty status means the IC is no longer found.
    if [ -z "${ic_status}" ]; then
      echo " ✅ Inference component deleted: ${ic_name}"
      break
    fi

    elapsed=$(( $(date +%s) - delete_start ))
    if [ "${elapsed}" -ge "${delete_timeout}" ]; then
      echo " ⚠️ Deletion timed out after ${delete_timeout}s. IC status: ${ic_status}"
      echo " Proceeding anyway — SageMaker may reject the new IC if name conflicts."
      break
    fi

    echo " $(date +%H:%M:%S) Deleting... (${ic_status}, ${elapsed}s elapsed)"
    sleep "${poll_interval}"
  done

  return 0
}
582
-
583
# _deploy_single_ic <conf_file>
# Deploys one inference component with per-IC idempotency, driven by the
# IC_DEPLOYED_NAME recorded in the given config file:
#   - FORCE_IC=true and a name is recorded -> delete it, clear state, create fresh
#   - recorded IC is InService             -> skip
#   - recorded IC is Creating              -> wait for it
#   - recorded IC is Failed / unknown      -> create a new IC
#   - nothing recorded                     -> create a new IC
#
# Arguments: $1 - path to a do/ic/<name>.conf file
# Globals:   FORCE_IC, AWS_REGION, AWS_ACCOUNT_ID, PROJECT_NAME, ENDPOINT_NAME,
#            INSTANCE_TYPE (read); IC_DEPLOYED_NAME (written)
# Depends:   _get_ic_status, _update_config_var, _delete_and_wait_ic,
#            create_inference_component, wait_ic. create_inference_component
#            is expected to set IC_DEPLOYED_NAME — defined outside this
#            function, confirm against the sourced helper.
# Returns:   0 on success; the failing helper's status if IC creation or the
#            wait fails. The status is propagated explicitly because `set -e`
#            is ignored while this function runs as an `if !` condition, which
#            would otherwise let a failure fall through to the success message.
_deploy_single_ic() {
  local ic_conf="$1"
  local ic_basename
  ic_basename=$(basename "${ic_conf}" .conf)

  # Read the recorded name without sourcing the file, so its other variables
  # do not leak into this shell. The last assignment wins; surrounding double
  # quotes are optional.
  local raw_line existing_ic_name
  raw_line=$(grep '^export IC_DEPLOYED_NAME=' "${ic_conf}" 2>/dev/null | tail -n 1) || raw_line=""
  existing_ic_name="${raw_line#export IC_DEPLOYED_NAME=}"
  existing_ic_name="${existing_ic_name#\"}"
  existing_ic_name="${existing_ic_name%\"}"

  # --force-ic: delete the existing IC and clear its recorded state first.
  if [ "${FORCE_IC}" = true ] && [ -n "${existing_ic_name}" ]; then
    echo "🔄 --force-ic: recreating IC '${ic_basename}'"
    _delete_and_wait_ic "${existing_ic_name}"

    _update_config_var "IC_DEPLOYED_NAME" "" "${ic_conf}"
    _update_config_var "IC_DEPLOYED_AT" "" "${ic_conf}"
    existing_ic_name=""
  fi

  if [ -n "${existing_ic_name}" ]; then
    # IC was previously deployed — decide based on its current status.
    local ic_status
    ic_status=$(_get_ic_status "${existing_ic_name}")

    case "${ic_status}" in
      InService)
        echo "✅ IC '${ic_basename}' already InService: ${existing_ic_name} — skipping"
        IC_DEPLOYED_NAME="${existing_ic_name}"
        return 0
        ;;
      Creating)
        echo "⏳ IC '${ic_basename}' is still Creating: ${existing_ic_name} — waiting..."
        IC_DEPLOYED_NAME="${existing_ic_name}"
        wait_ic "${IC_DEPLOYED_NAME}" || return $?
        echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
        return 0
        ;;
      Failed)
        echo "⚠️ IC '${ic_basename}' previously Failed: ${existing_ic_name} — recreating..."
        ;;
      *)
        echo " IC '${ic_basename}' has unknown/missing status for ${existing_ic_name} — creating new..."
        ;;
    esac
  fi

  # Every remaining path (forced, failed, unknown, never deployed) creates a
  # new IC and waits for it.
  create_inference_component "${ic_conf}" || return $?

  echo "⏳ Waiting for inference component to reach InService status..."
  echo " This may take 5-10 minutes..."

  wait_ic "${IC_DEPLOYED_NAME}" || return $?

  echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"

  # Record inference component in manifest (non-blocking: errors are ignored).
  local ic_arn="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
  ./do/manifest add \
    --type sagemaker-inference-component \
    --id "${ic_arn}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
}
665
-
666
- if [ -n "${IC_TARGET}" ]; then
667
- # Single IC path: deploy only the named IC
668
- echo ""
669
- echo "── Deploying IC: ${IC_TARGET} ──"
670
- _deploy_single_ic "${SCRIPT_DIR}/ic/${IC_TARGET}.conf"
671
- else
672
- # Multi-IC path: iterate all IC config files (alphabetical order)
673
- IC_SUMMARY=""
674
- IC_DEPLOY_FAILED=false
675
-
676
- for conf in "${SCRIPT_DIR}"/ic/*.conf; do
677
- [ -f "${conf}" ] || continue
678
- local_ic_basename=$(basename "${conf}" .conf)
679
- echo ""
680
- echo "── Deploying IC: ${local_ic_basename} ──"
681
-
682
- if ! _deploy_single_ic "${conf}"; then
683
- echo "❌ IC '${local_ic_basename}' failed to deploy. Stopping."
684
- IC_SUMMARY="${IC_SUMMARY} ${local_ic_basename}: FAILED\n"
685
- IC_DEPLOY_FAILED=true
686
- break
687
- fi
688
-
689
- IC_SUMMARY="${IC_SUMMARY} ${local_ic_basename}: ${IC_DEPLOYED_NAME} [InService]\n"
690
- done
691
-
692
- # Print summary
693
- echo ""
694
- echo "📋 IC Deployment Summary:"
695
- echo -e "${IC_SUMMARY}"
696
-
697
- if [ "${IC_DEPLOY_FAILED}" = true ]; then
698
- echo "❌ Deployment stopped due to IC failure. Fix the issue and re-run ./do/deploy to resume."
699
- exit 4
700
- fi
701
- fi
702
- else
703
- # Legacy single-IC path: no do/ic/ directory
704
- create_inference_component_legacy
705
-
706
- echo "⏳ Waiting for inference component to reach InService status..."
707
- echo " This may take 5-10 minutes..."
708
- echo " If this times out, re-run ./do/deploy to resume."
709
-
710
- wait_ic "${IC_DEPLOYED_NAME}"
711
-
712
- echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
713
-
714
- # Record inference component in manifest (non-blocking)
715
- IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
716
- ./do/manifest add \
717
- --type sagemaker-inference-component \
718
- --id "${IC_ARN}" \
719
- --project "${PROJECT_NAME}" \
720
- --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
721
- 2>/dev/null || true
722
- fi
723
-
724
- elif [ "${SKIP_TO}" = "wait_ic" ]; then
725
- # Resuming: just wait for the IC that was already being created
726
- echo "⏳ Waiting for inference component to reach InService status..."
727
- echo " This may take 5-10 minutes..."
728
- echo " If this times out, re-run ./do/deploy to resume."
729
-
730
- wait_ic "${IC_DEPLOYED_NAME}"
731
-
732
- echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
733
- fi
734
-
735
- echo "✅ Deployment complete!"
736
- echo ""
737
- echo "📋 Deployment Details:"
738
- echo " Endpoint: ${ENDPOINT_NAME}"
739
- if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
740
- echo " Endpoint Config: (external — not managed by this project)"
741
- echo " Region: ${AWS_REGION}"
742
- else
743
- echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME:-N/A}"
744
- echo " Region: ${AWS_REGION}"
745
- echo " Instance Type: ${INSTANCE_TYPE}"
746
- fi
747
- echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
748
- echo ""
749
- echo "📋 What's next?"
750
- echo " • Test your endpoint: ./do/test"
751
- <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
752
- echo " • Benchmark performance: ./do/benchmark"
753
- <% } %>
754
- <% if (typeof enableLora !== 'undefined' && enableLora) { %>
755
- echo " • Add a LoRA adapter: ./do/adapter add <name> --weights s3://..."
756
- <% } %>
757
- echo " • View endpoint status: ./do/status"
758
- echo " • Register this deployment: ./do/register"
759
- echo " • View logs: ./do/logs"
760
- <% if (!(typeof existingEndpointName !== 'undefined' && existingEndpointName)) { %>
761
- echo " • Clean up when done: ./do/clean endpoint"
762
- <% } %>
763
-
764
- <% } else if (deploymentTarget === 'async-inference') { %>
765
- # ============================================================
766
- # SageMaker Async Inference Deployment (Model-Based)
767
- # SageMaker async inference does NOT support Inference Components.
768
- # Flow: create-model → create-endpoint-config (with AsyncInferenceConfig) → create-endpoint
769
- # ============================================================
770
-
771
- # Source shared helpers
772
- source "${SCRIPT_DIR}/lib/secrets.sh"
773
- source "${SCRIPT_DIR}/lib/wait.sh"
774
-
775
- # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
776
- resolve_secrets
777
-
778
- # Validate execution role ARN
779
- if [ -z "${ROLE_ARN:-}" ]; then
780
- echo "❌ Execution role ARN not provided"
781
- echo ""
782
- echo "Usage:"
783
- echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
784
- echo " ./do/deploy"
785
- echo ""
786
- echo "Or set ROLE_ARN in do/config"
787
- echo ""
788
- echo "The execution role must have permissions for:"
789
- echo " • SageMaker model and endpoint management"
790
- echo " • ECR image access"
791
- echo " • S3 write access for async output path: ${ASYNC_S3_OUTPUT_PATH}"
792
- echo " • SNS publish permissions (optional, for notifications)"
793
- echo " • CloudWatch Logs"
794
- exit 3
795
- fi
796
-
797
- echo " Using execution role: ${ROLE_ARN}"
798
-
799
- # ============================================================
800
- # Bootstrap async infrastructure (S3 bucket + SNS topics)
801
- # ============================================================
802
-
803
- # Extract bucket name from S3 output path
804
- ASYNC_S3_BUCKET=$(echo "${ASYNC_S3_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
805
-
806
- <% if (!asyncS3OutputPath) { %>
807
- # Bootstrap default S3 bucket (check-and-create)
808
- echo "🔍 Checking if S3 bucket exists: ${ASYNC_S3_BUCKET}"
809
- if ! aws s3api head-bucket --bucket "${ASYNC_S3_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
810
- echo "📦 Creating S3 bucket: ${ASYNC_S3_BUCKET}"
811
- if [ "${AWS_REGION}" = "us-east-1" ]; then
812
- if ! aws s3api create-bucket \
813
- --bucket "${ASYNC_S3_BUCKET}" \
814
- --region "${AWS_REGION}"; then
815
- echo "❌ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
816
- echo ""
817
- echo " Check that:"
818
- echo " • Your IAM credentials have s3:CreateBucket permission"
819
- echo " • The bucket name is not already taken globally"
820
- exit 4
821
- fi
822
- else
823
- if ! aws s3api create-bucket \
824
- --bucket "${ASYNC_S3_BUCKET}" \
825
- --region "${AWS_REGION}" \
826
- --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
827
- echo "❌ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
828
- echo ""
829
- echo " Check that:"
830
- echo " • Your IAM credentials have s3:CreateBucket permission"
831
- echo " • The bucket name is not already taken globally"
832
- exit 4
833
- fi
834
- fi
835
- echo "✅ S3 bucket created: ${ASYNC_S3_BUCKET}"
836
- else
837
- echo "✅ S3 bucket exists: ${ASYNC_S3_BUCKET}"
838
- fi
839
- <% } else { %>
840
- # Custom S3 output path provided — skip bucket creation
841
- echo "✅ Using custom S3 output path: ${ASYNC_S3_OUTPUT_PATH}"
842
- <% } %>
843
-
844
- # Extract topic name from SNS success topic ARN
845
- ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')
846
-
847
- <% if (!asyncSnsSuccessTopic) { %>
848
- # Bootstrap default SNS success topic (check-and-create)
849
- echo "🔍 Checking if SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
850
- if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_SUCCESS_TOPIC}" --region "${AWS_REGION}" 2>/dev/null; then
851
- echo "📦 Creating SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
852
- if ! aws sns create-topic \
853
- --name "${ASYNC_SNS_SUCCESS_TOPIC_NAME}" \
854
- --region "${AWS_REGION}" > /dev/null; then
855
- echo "❌ Failed to create SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
856
- echo ""
857
- echo " Check that:"
858
- echo " • Your IAM credentials have sns:CreateTopic permission"
859
- exit 4
860
- fi
861
- echo "✅ SNS success topic created: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
862
- else
863
- echo "✅ SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
864
- fi
865
-
866
- # Record SNS success topic in manifest (non-blocking)
867
- ./do/manifest add \
868
- --type sns-topic \
869
- --id "${ASYNC_SNS_SUCCESS_TOPIC}" \
870
- --project "${PROJECT_NAME}" \
871
- --meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
872
- 2>/dev/null || true
873
-
874
- <% } else { %>
875
- # Custom SNS success topic ARN provided — skip topic creation
876
- echo "✅ Using custom SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC}"
877
-
878
- # Record SNS success topic in manifest (non-blocking)
879
- ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')
880
- ./do/manifest add \
881
- --type sns-topic \
882
- --id "${ASYNC_SNS_SUCCESS_TOPIC}" \
883
- --project "${PROJECT_NAME}" \
884
- --meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
885
- 2>/dev/null || true
886
-
887
- <% } %>
888
-
889
- # Extract topic name from SNS error topic ARN
890
- ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')
891
-
892
- <% if (!asyncSnsErrorTopic) { %>
893
- # Bootstrap default SNS error topic (check-and-create)
894
- echo "🔍 Checking if SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
895
- if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_ERROR_TOPIC}" --region "${AWS_REGION}" 2>/dev/null; then
896
- echo "📦 Creating SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
897
- if ! aws sns create-topic \
898
- --name "${ASYNC_SNS_ERROR_TOPIC_NAME}" \
899
- --region "${AWS_REGION}" > /dev/null; then
900
- echo "❌ Failed to create SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
901
- echo ""
902
- echo " Check that:"
903
- echo " • Your IAM credentials have sns:CreateTopic permission"
904
- exit 4
905
- fi
906
- echo "✅ SNS error topic created: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
907
- else
908
- echo "✅ SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
909
- fi
910
-
911
- # Record SNS error topic in manifest (non-blocking)
912
- ./do/manifest add \
913
- --type sns-topic \
914
- --id "${ASYNC_SNS_ERROR_TOPIC}" \
915
- --project "${PROJECT_NAME}" \
916
- --meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
917
- 2>/dev/null || true
918
-
919
- <% } else { %>
920
- # Custom SNS error topic ARN provided — skip topic creation
921
- echo "✅ Using custom SNS error topic: ${ASYNC_SNS_ERROR_TOPIC}"
922
-
923
- # Record SNS error topic in manifest (non-blocking)
924
- ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')
925
- ./do/manifest add \
926
- --type sns-topic \
927
- --id "${ASYNC_SNS_ERROR_TOPIC}" \
928
- --project "${PROJECT_NAME}" \
929
- --meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
930
- 2>/dev/null || true
931
-
932
- <% } %>
933
-
934
- # ============================================================
935
- # Create async endpoint (classic model-based flow)
936
- # SageMaker async inference does NOT support Inference Components.
937
- # Flow: create-model → create-endpoint-config (with AsyncInferenceConfig) → create-endpoint
938
- # ============================================================
939
-
940
- # ============================================================
941
- # Idempotency: check for existing deployment from a previous run
942
- # ============================================================
943
- SKIP_TO=""
944
-
945
- if [ "${FORCE_NEW}" = true ]; then
946
- echo "🔄 --force: ignoring previous deployment, creating new resources."
947
- elif [ -n "${ENDPOINT_NAME:-}" ]; then
948
- echo "🔍 Checking for existing deployment: ${ENDPOINT_NAME}"
949
-
950
- EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
951
-
952
- case "${EP_STATUS}" in
953
- InService)
954
- echo "✅ Async endpoint already InService: ${ENDPOINT_NAME}"
955
- echo ""
956
- echo "📋 Deployment is already live. Nothing to do."
957
- echo " Endpoint: ${ENDPOINT_NAME}"
958
- echo ""
959
- echo "🧪 Test your async endpoint:"
960
- echo " ./do/test"
961
- echo ""
962
- echo "🧹 Clean up when done:"
963
- echo " ./do/clean endpoint"
964
- exit 0
965
- ;;
966
- Creating|Updating)
967
- echo "⏳ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
968
- SKIP_TO="wait_endpoint"
969
- ;;
970
- Failed)
971
- echo "⚠️ Previous endpoint failed: ${ENDPOINT_NAME}"
972
- echo " Creating a new deployment. Clean up the failed endpoint with:"
973
- echo " ./do/clean endpoint"
974
- echo ""
975
- ;;
976
- "")
977
- echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
978
- ;;
979
- *)
980
- echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
981
- ;;
982
- esac
983
- fi
984
-
985
- # ============================================================
986
- # Create async resources (skip if resuming from wait)
987
- # ============================================================
988
- if [ -z "${SKIP_TO}" ]; then
989
- TIMESTAMP=$(date +%s)
990
- MODEL_NAME_SM="${PROJECT_NAME}-async-model-${TIMESTAMP}"
991
- ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-async-epc-${TIMESTAMP}"
992
- ENDPOINT_NAME="${PROJECT_NAME}-async-ep-${TIMESTAMP}"
993
-
994
- _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
995
- _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
996
- _update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"
997
-
998
- # Step 1: Create SageMaker model
999
- # Build primary container spec
1000
- PRIMARY_CONTAINER="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
1001
- if [ -n "${CONTAINER_ENV_JSON}" ]; then
1002
- PRIMARY_CONTAINER="${PRIMARY_CONTAINER},\"Environment\":{${CONTAINER_ENV_JSON}}"
1003
- fi
1004
- PRIMARY_CONTAINER="${PRIMARY_CONTAINER}}"
1005
-
1006
- echo "📦 Creating SageMaker model: ${MODEL_NAME_SM}"
1007
- if ! aws sagemaker create-model \
1008
- --model-name "${MODEL_NAME_SM}" \
1009
- --primary-container "${PRIMARY_CONTAINER}" \
1010
- --execution-role-arn "${ROLE_ARN}" \
1011
- --region "${AWS_REGION}"; then
1012
-
1013
- echo "❌ Failed to create SageMaker model"
1014
- echo " Check that:"
1015
- echo " • The execution role ARN is valid"
1016
- echo " • The ECR image exists and is accessible"
1017
- echo " • The IAM role has ecr:GetDownloadUrlForLayer permission"
1018
- exit 4
1019
- fi
1020
-
1021
- echo "✅ SageMaker model created: ${MODEL_NAME_SM}"
1022
-
1023
- # Record model in manifest (non-blocking)
1024
- MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
1025
- ./do/manifest add \
1026
- --type sagemaker-model \
1027
- --id "${MODEL_ARN}" \
1028
- --project "${PROJECT_NAME}" \
1029
- --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
1030
- 2>/dev/null || true
1031
-
1032
- # Build production variant JSON (classic: includes ModelName, no execution-role-arn on endpoint config)
1033
- VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"ModelName\":\"${MODEL_NAME_SM}\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"
1034
-
1035
- # Append InferenceAmiVersion if configured
1036
- if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
1037
- VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
1038
- echo " AMI version: ${INFERENCE_AMI_VERSION}"
1039
- fi
1040
-
1041
- VARIANT_JSON="${VARIANT_JSON}}]"
1042
-
1043
- # Build AsyncInferenceConfig JSON
1044
- ASYNC_CONFIG="{\"OutputConfig\":{\"S3OutputPath\":\"${ASYNC_S3_OUTPUT_PATH}\",\"NotificationConfig\":{\"SuccessTopic\":\"${ASYNC_SNS_SUCCESS_TOPIC}\",\"ErrorTopic\":\"${ASYNC_SNS_ERROR_TOPIC}\"}}"
1045
-
1046
- if [ -n "${ASYNC_MAX_CONCURRENT_INVOCATIONS:-}" ]; then
1047
- ASYNC_CONFIG="${ASYNC_CONFIG},\"ClientConfig\":{\"MaxConcurrentInvocationsPerInstance\":${ASYNC_MAX_CONCURRENT_INVOCATIONS}}"
1048
- fi
1049
-
1050
- ASYNC_CONFIG="${ASYNC_CONFIG}}"
1051
-
1052
- # Step 2: Create endpoint configuration with AsyncInferenceConfig (no --execution-role-arn)
1053
- echo "⚙️ Creating async endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
1054
- if ! aws sagemaker create-endpoint-config \
1055
- --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
1056
- --production-variants "${VARIANT_JSON}" \
1057
- --async-inference-config "${ASYNC_CONFIG}" \
1058
- --region "${AWS_REGION}"; then
1059
-
1060
- echo "❌ Failed to create async endpoint configuration"
1061
- echo " Check that:"
1062
- echo " • The S3 output path is accessible: ${ASYNC_S3_OUTPUT_PATH}"
1063
- echo " • The IAM role has s3:PutObject permission on the output path"
1064
- echo " • The instance type is valid: ${INSTANCE_TYPE}"
1065
- echo " • The instance type is available in region: ${AWS_REGION}"
1066
- echo " • You have sufficient service quota for the instance type"
1067
- exit 4
1068
- fi
1069
-
1070
- echo "✅ Async endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"
1071
-
1072
- # Record endpoint config in manifest (non-blocking)
1073
- ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
1074
- ./do/manifest add \
1075
- --type sagemaker-endpoint-config \
1076
- --id "${ENDPOINT_CONFIG_ARN}" \
1077
- --project "${PROJECT_NAME}" \
1078
- --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
1079
- 2>/dev/null || true
1080
-
1081
- # Step 3: Create endpoint
1082
- echo "🚀 Creating async endpoint: ${ENDPOINT_NAME}"
1083
- if ! aws sagemaker create-endpoint \
1084
- --endpoint-name "${ENDPOINT_NAME}" \
1085
- --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
1086
- --region "${AWS_REGION}"; then
1087
-
1088
- echo "❌ Failed to create async endpoint"
1089
- echo " Check that:"
1090
- echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
1091
- echo " • You have sufficient service quota in region: ${AWS_REGION}"
1092
- exit 4
1093
- fi
1094
-
1095
- echo "✅ Async endpoint creation initiated: ${ENDPOINT_NAME}"
1096
-
1097
- # Record endpoint in manifest (non-blocking)
1098
- ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
1099
- ./do/manifest add \
1100
- --type sagemaker-endpoint \
1101
- --id "${ENDPOINT_ARN}" \
1102
- --project "${PROJECT_NAME}" \
1103
- --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
1104
- 2>/dev/null || true
1105
- fi
1106
-
1107
- # ============================================================
1108
- # Wait for endpoint (skip if already InService)
1109
- # ============================================================
1110
- if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
1111
- echo "⏳ Waiting for async endpoint to reach InService status..."
1112
- echo " This may take several minutes..."
1113
- echo " If this times out, re-run ./do/deploy to resume."
1114
-
1115
- wait_endpoint "${ENDPOINT_NAME}"
1116
- fi
1117
-
1118
- echo "✅ Async deployment complete!"
1119
- echo ""
1120
- echo "📋 Deployment Details:"
1121
- echo " Endpoint: ${ENDPOINT_NAME}"
1122
- echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME}"
1123
- echo " Model: ${MODEL_NAME_SM}"
1124
- echo " Region: ${AWS_REGION}"
1125
- echo " Instance Type: ${INSTANCE_TYPE}"
1126
- echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
1127
- echo " S3 Output: ${ASYNC_S3_OUTPUT_PATH}"
1128
- echo " SNS Success: ${ASYNC_SNS_SUCCESS_TOPIC}"
1129
- echo " SNS Error: ${ASYNC_SNS_ERROR_TOPIC}"
1130
- echo ""
1131
- echo "📋 What's next?"
1132
- echo " • Test your async endpoint: ./do/test"
1133
- echo " • Check async output: aws s3 ls ${ASYNC_S3_OUTPUT_PATH}"
1134
- <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
1135
- echo " • Benchmark performance: ./do/benchmark"
1136
- <% } %>
1137
- echo " • Register this deployment: ./do/register"
1138
- echo " • View logs: ./do/logs"
1139
- echo " • Clean up when done: ./do/clean endpoint"
1140
-
1141
- <% } else if (deploymentTarget === 'hyperpod-eks') { %>
1142
- # ============================================================
1143
- # HyperPod EKS Deployment
1144
- # ============================================================
1145
-
1146
- # Get kubeconfig for HyperPod cluster
1147
- echo "🔑 Configuring kubectl for HyperPod cluster..."
1148
- KUBECONFIG_PATH="${HOME}/.kube/hyperpod-${HYPERPOD_CLUSTER_NAME}"
1149
-
1150
- # Step 1: Describe the HyperPod cluster to get the underlying EKS cluster ARN
1151
- EKS_CLUSTER_ARN=$(aws sagemaker describe-cluster \
1152
- --cluster-name "${HYPERPOD_CLUSTER_NAME}" \
1153
- --region "${AWS_REGION}" \
1154
- --query "Orchestrator.Eks.ClusterArn" \
1155
- --output text 2>&1) || {
1156
- echo "❌ Failed to describe HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
1157
- echo ""
1158
- echo " Error details:"
1159
- echo " ${EKS_CLUSTER_ARN}"
1160
- echo ""
1161
- echo " Check that:"
1162
- echo " • The cluster name is correct"
1163
- echo " • The cluster exists in region: ${AWS_REGION}"
1164
- echo " • Your IAM user/role has permission to access the cluster"
1165
- echo ""
1166
- echo " Required IAM permissions:"
1167
- echo " • sagemaker:DescribeCluster"
1168
- echo " • eks:DescribeCluster"
1169
- exit 4
1170
- }
1171
-
1172
- # Step 2: Extract the EKS cluster name from the ARN
1173
- EKS_CLUSTER_NAME=$(echo "${EKS_CLUSTER_ARN}" | awk -F'/' '{print $NF}')
1174
- echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
1175
- echo " EKS cluster: ${EKS_CLUSTER_NAME}"
1176
-
1177
- # Step 3: Update kubeconfig using the EKS cluster
1178
- if ! aws eks update-kubeconfig \
1179
- --name "${EKS_CLUSTER_NAME}" \
1180
- --region "${AWS_REGION}" \
1181
- --kubeconfig "${KUBECONFIG_PATH}" 2>&1; then
1182
- echo "❌ Failed to configure kubectl for EKS cluster: ${EKS_CLUSTER_NAME}"
1183
- echo ""
1184
- echo " Required IAM permissions:"
1185
- echo " • eks:DescribeCluster"
1186
- echo " • eks:AccessKubernetesApi"
1187
- exit 4
1188
- fi
1189
-
1190
- export KUBECONFIG="${KUBECONFIG_PATH}"
1191
- echo "✅ Kubeconfig saved to: ${KUBECONFIG_PATH}"
1192
-
1193
- # Verify cluster connectivity
1194
- echo "🔍 Verifying cluster connectivity..."
1195
- if ! kubectl cluster-info &> /dev/null; then
1196
- echo "❌ Cannot connect to HyperPod cluster"
1197
- echo ""
1198
- echo " Check that:"
1199
- echo " • The cluster is in 'InService' status"
1200
- echo " • Your network can reach the cluster API server"
1201
- echo " • Your IAM credentials are valid"
1202
- exit 4
1203
- fi
1204
- echo "✅ Connected to HyperPod cluster"
1205
-
1206
- # Create namespace if it doesn't exist
1207
- echo "📁 Ensuring namespace exists: ${HYPERPOD_NAMESPACE}"
1208
- if ! kubectl create namespace "${HYPERPOD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>&1; then
1209
- echo "⚠️ Warning: Could not create/verify namespace"
1210
- fi
1211
-
1212
- # Apply Kubernetes manifests
1213
- echo "📄 Applying Kubernetes manifests from hyperpod/..."
1214
-
1215
- # Substitute shell variables (e.g. ${AWS_ACCOUNT_ID}) in manifests before applying
1216
- export AWS_ACCOUNT_ID
1217
- export ECR_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"
1218
-
1219
- APPLY_OUTPUT=""
1220
- APPLY_EXIT_CODE=0
1221
- for manifest in hyperpod/*.yaml; do
1222
- # Skip files that contain no Kubernetes objects (e.g. comment-only PVC stubs)
1223
- RENDERED=$(envsubst < "${manifest}")
1224
- if echo "${RENDERED}" | grep -q '^kind:'; then
1225
- FILE_OUTPUT=$(echo "${RENDERED}" | kubectl apply -n "${HYPERPOD_NAMESPACE}" -f - 2>&1) || {
1226
- APPLY_EXIT_CODE=$?
1227
- }
1228
- APPLY_OUTPUT="${APPLY_OUTPUT}${FILE_OUTPUT}\n"
1229
- fi
1230
- done
1231
-
1232
- if [ "${APPLY_EXIT_CODE}" -ne 0 ]; then
1233
- echo ""
1234
- echo "❌ Failed to apply Kubernetes manifests"
1235
- echo ""
1236
- echo " Error details:"
1237
- echo " ${APPLY_OUTPUT}"
1238
- echo ""
1239
- echo " Common issues:"
1240
- echo " • Insufficient node capacity - check available GPU nodes"
1241
- echo " • Resource requests exceed node capacity"
1242
- echo " • RBAC permissions - ensure you have permission to create resources in namespace '${HYPERPOD_NAMESPACE}'"
1243
- echo " • Invalid manifest syntax"
1244
- <% if (fsxVolumeHandle) { %>
1245
- echo " • PVC creation failure - verify the FSx CSI driver is installed on the cluster"
1246
- echo " kubectl get csidriver -o name | grep fsx"
1247
- <% } %>
1248
- echo ""
1249
- echo " Debug commands:"
1250
- echo " kubectl get nodes -o wide"
1251
- echo " kubectl describe nodes"
1252
- echo " kubectl get events -n ${HYPERPOD_NAMESPACE}"
1253
- exit ${APPLY_EXIT_CODE}
1254
- fi
1255
-
1256
- echo "✅ Kubernetes manifests applied"
1257
-
1258
- # Record k8s deployment and service in manifest (non-blocking)
1259
- ./do/manifest add \
1260
- --type k8s-deployment \
1261
- --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
1262
- --project "${PROJECT_NAME}" \
1263
- --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"deploymentName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
1264
- 2>/dev/null || true
1265
-
1266
- ./do/manifest add \
1267
- --type k8s-service \
1268
- --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
1269
- --project "${PROJECT_NAME}" \
1270
- --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"serviceName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
1271
- 2>/dev/null || true
1272
-
1273
- # Wait for deployment to be ready
1274
- DEPLOY_TIMEOUT=${DEPLOY_TIMEOUT:-1200}
1275
- echo "⏳ Waiting for deployment to be ready (timeout: ${DEPLOY_TIMEOUT}s)..."
1276
- echo " This may take several minutes for GPU workloads..."
1277
- echo ""
1278
-
1279
- # Poll pod status every 30s while rollout is in progress
1280
- (
1281
- while true; do
1282
- sleep 30
1283
- POD_STATUS=$(kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l app=${PROJECT_NAME} \
1284
- --no-headers 2>/dev/null | head -5)
1285
- if [ -n "${POD_STATUS}" ]; then
1286
- echo " 📊 $(date +%H:%M:%S) Pod status:"
1287
- echo "${POD_STATUS}" | while read -r line; do echo " ${line}"; done
1288
- fi
1289
- done
1290
- ) &
1291
- STATUS_PID=$!
1292
- trap "kill ${STATUS_PID} 2>/dev/null; wait ${STATUS_PID} 2>/dev/null" EXIT
1293
-
1294
- ROLLOUT_OUTPUT=$(kubectl rollout status deployment/${PROJECT_NAME} -n "${HYPERPOD_NAMESPACE}" --timeout=${DEPLOY_TIMEOUT}s 2>&1) || {
1295
- ROLLOUT_EXIT_CODE=$?
1296
- kill ${STATUS_PID} 2>/dev/null
1297
- echo ""
1298
- echo "❌ Deployment failed to become ready within timeout"
1299
- echo ""
1300
- echo " Error details:"
1301
- echo " ${ROLLOUT_OUTPUT}"
1302
- echo ""
1303
- echo " Current pod state:"
1304
- kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l app=${PROJECT_NAME} -o wide 2>/dev/null
1305
- echo ""
1306
- echo " Debug commands:"
1307
- echo " kubectl describe pods -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
1308
- echo " kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME} --tail=100"
1309
- echo ""
1310
- echo " Common issues:"
1311
- echo " • Image pull errors - check ECR permissions"
1312
- echo " • Resource scheduling - insufficient GPU nodes"
1313
- echo " • Container crash - check application logs"
1314
- <% if (fsxVolumeHandle) { %>
1315
- echo " • PVC binding errors - verify FSx CSI driver is installed on the cluster"
1316
- echo " kubectl get pvc -n ${HYPERPOD_NAMESPACE}"
1317
- echo " kubectl describe pvc -n ${HYPERPOD_NAMESPACE}"
1318
- echo " kubectl get csidriver -o name | grep fsx"
1319
- <% } %>
1320
- exit ${ROLLOUT_EXIT_CODE}
1321
- }
1322
-
1323
- kill ${STATUS_PID} 2>/dev/null
1324
- wait ${STATUS_PID} 2>/dev/null
1325
-
1326
- echo "✅ HyperPod EKS deployment complete!"
1327
- echo ""
1328
- echo "📋 Deployment Details:"
1329
- echo " Cluster: ${HYPERPOD_CLUSTER_NAME}"
1330
- echo " Namespace: ${HYPERPOD_NAMESPACE}"
1331
- echo " Deployment: ${PROJECT_NAME}"
1332
- echo " Replicas: ${HYPERPOD_REPLICAS}"
1333
- echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
1334
- echo ""
1335
- echo "📋 What's next?"
1336
- echo " • Test your deployment: ./do/test"
1337
- echo " • Check pod status: kubectl get pods -n ${HYPERPOD_NAMESPACE}"
1338
- echo " • View pod logs: kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
1339
- <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
1340
- echo " • Benchmark performance: ./do/benchmark"
1341
- <% } %>
1342
- echo " • Register this deployment: ./do/register"
1343
- echo " • View logs: ./do/logs"
1344
- echo " • Clean up when done: ./do/clean hyperpod"
1345
-
1346
- # Write kubeconfig path to config so other scripts can use it (idempotent)
1347
- _update_config_var() {
1348
- local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
1349
- if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
1350
- sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
1351
- rm -f "${config_file}.bak"
1352
- else
1353
- echo "" >> "${config_file}"
1354
- echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
1355
- fi
1356
- }
1357
-
1358
- _update_config_var "KUBECONFIG" "${KUBECONFIG_PATH}"
1359
-
1360
- <% } else if (deploymentTarget === 'batch-transform') { %>
1361
- # ============================================================
1362
- # SageMaker Batch Transform Deployment
1363
- # Flow: create-model → create-transform-job → poll until completion
1364
- # ============================================================
1365
-
1366
- # Source shared helpers
1367
- source "${SCRIPT_DIR}/lib/secrets.sh"
1368
- source "${SCRIPT_DIR}/lib/wait.sh"
1369
-
1370
- # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
1371
- resolve_secrets
1372
-
1373
- # Validate execution role ARN
1374
- if [ -z "${ROLE_ARN:-}" ]; then
1375
- echo "❌ Execution role ARN not provided"
1376
- echo ""
1377
- echo "Usage:"
1378
- echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
1379
- echo " ./do/deploy"
1380
- echo ""
1381
- echo "Or set ROLE_ARN in do/config"
1382
- echo ""
1383
- echo "The execution role must have permissions for:"
1384
- echo " • SageMaker model and transform job management"
1385
- echo " • ECR image access"
1386
- echo " • S3 read access for input path: ${BATCH_INPUT_PATH}"
1387
- echo " • S3 write access for output path: ${BATCH_OUTPUT_PATH}"
1388
- echo " • CloudWatch Logs"
1389
- exit 3
1390
- fi
1391
-
1392
- echo " Using execution role: ${ROLE_ARN}"
1393
-
1394
- # Validate S3 input path
1395
- if [ -z "${BATCH_INPUT_PATH:-}" ]; then
1396
- echo "❌ S3 input path not provided"
1397
- echo ""
1398
- echo "Set BATCH_INPUT_PATH in do/config or provide via CLI:"
1399
- echo " export BATCH_INPUT_PATH=s3://my-bucket/input/"
1400
- echo " ./do/deploy"
1401
- exit 3
1402
- fi
1403
-
1404
- if [[ "${BATCH_INPUT_PATH}" != s3://* ]]; then
1405
- echo "❌ S3 input path must start with s3://"
1406
- echo " Current value: ${BATCH_INPUT_PATH}"
1407
- echo " Example: s3://my-bucket/input/"
1408
- exit 3
1409
- fi
1410
-
1411
- # Validate S3 output path
1412
- if [ -z "${BATCH_OUTPUT_PATH:-}" ]; then
1413
- echo "❌ S3 output path not provided"
1414
- echo ""
1415
- echo "Set BATCH_OUTPUT_PATH in do/config or provide via CLI:"
1416
- echo " export BATCH_OUTPUT_PATH=s3://my-bucket/output/"
1417
- echo " ./do/deploy"
1418
- exit 3
1419
- fi
1420
-
1421
- if [[ "${BATCH_OUTPUT_PATH}" != s3://* ]]; then
1422
- echo "❌ S3 output path must start with s3://"
1423
- echo " Current value: ${BATCH_OUTPUT_PATH}"
1424
- echo " Example: s3://my-bucket/output/"
1425
- exit 3
1426
- fi
1427
-
1428
- # ============================================================
1429
- # Bootstrap S3 buckets for batch transform
1430
- # ============================================================
1431
-
1432
- # Extract bucket names from S3 paths
1433
- BATCH_INPUT_BUCKET=$(echo "${BATCH_INPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
1434
- BATCH_OUTPUT_BUCKET=$(echo "${BATCH_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
1435
-
1436
- <% if (!batchInputPath) { %>
1437
- # Bootstrap default S3 input bucket (check-and-create)
1438
- echo "🔍 Checking if S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
1439
- if ! aws s3api head-bucket --bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
1440
- echo "📦 Creating S3 input bucket: ${BATCH_INPUT_BUCKET}"
1441
- if [ "${AWS_REGION}" = "us-east-1" ]; then
1442
- if ! aws s3api create-bucket \
1443
- --bucket "${BATCH_INPUT_BUCKET}" \
1444
- --region "${AWS_REGION}"; then
1445
- echo "❌ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
1446
- echo ""
1447
- echo " Check that:"
1448
- echo " • Your IAM credentials have s3:CreateBucket permission"
1449
- echo " • The bucket name is not already taken globally"
1450
- exit 4
1451
- fi
1452
- else
1453
- if ! aws s3api create-bucket \
1454
- --bucket "${BATCH_INPUT_BUCKET}" \
1455
- --region "${AWS_REGION}" \
1456
- --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
1457
- echo "❌ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
1458
- echo ""
1459
- echo " Check that:"
1460
- echo " • Your IAM credentials have s3:CreateBucket permission"
1461
- echo " • The bucket name is not already taken globally"
1462
- exit 4
1463
- fi
1464
- fi
1465
- echo "✅ S3 input bucket created: ${BATCH_INPUT_BUCKET}"
1466
- else
1467
- echo "✅ S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
1468
- fi
1469
-
1470
- # Upload sample input file if the input prefix is empty
1471
- EXISTING_OBJECTS=$(aws s3 ls "${BATCH_INPUT_PATH}" --region "${AWS_REGION}" 2>/dev/null | head -1 || true)
1472
- if [ -z "${EXISTING_OBJECTS}" ]; then
1473
- echo "📄 Uploading sample input file to ${BATCH_INPUT_PATH}"
1474
- <% if (framework === 'transformers' && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
1475
- echo '{"model": "<%= modelName %>", "messages": [{"role": "user", "content": "What is machine learning?"}], "max_tokens": 50}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1476
- <% } else if (framework === 'transformers') { %>
1477
- echo '{"inputs": "What is machine learning?", "parameters": {"max_new_tokens": 50}}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1478
- <% } else if (framework === 'diffusors') { %>
1479
- echo '{"prompt": "A white cat", "n": 1, "size": "512x512"}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1480
- <% } else { %>
1481
- echo '{"instances": [[1.0, 2.0, 3.0, 4.0]]}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1482
- <% } %>
1483
- echo "✅ Sample input uploaded: ${BATCH_INPUT_PATH}sample.jsonl"
1484
- echo " ⚠️ Replace this with your actual input data before running production jobs"
1485
- fi
1486
- <% } else { %>
1487
- # Custom S3 input path provided — skip bucket creation
1488
- echo "✅ Using custom S3 input path: ${BATCH_INPUT_PATH}"
1489
- <% } %>
1490
-
1491
- <% if (!batchOutputPath) { %>
1492
- # Bootstrap default S3 output bucket (check-and-create, may be same as input)
1493
- if [ "${BATCH_OUTPUT_BUCKET}" != "${BATCH_INPUT_BUCKET}" ]; then
1494
- echo "🔍 Checking if S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
1495
- if ! aws s3api head-bucket --bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
1496
- echo "📦 Creating S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
1497
- if [ "${AWS_REGION}" = "us-east-1" ]; then
1498
- if ! aws s3api create-bucket \
1499
- --bucket "${BATCH_OUTPUT_BUCKET}" \
1500
- --region "${AWS_REGION}"; then
1501
- echo "❌ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
1502
- exit 4
1503
- fi
1504
- else
1505
- if ! aws s3api create-bucket \
1506
- --bucket "${BATCH_OUTPUT_BUCKET}" \
1507
- --region "${AWS_REGION}" \
1508
- --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
1509
- echo "❌ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
1510
- exit 4
1511
- fi
1512
- fi
1513
- echo "✅ S3 output bucket created: ${BATCH_OUTPUT_BUCKET}"
1514
- else
1515
- echo "✅ S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
1516
- fi
1517
- else
1518
- echo "✅ S3 output bucket same as input: ${BATCH_OUTPUT_BUCKET}"
1519
- fi
1520
- <% } else { %>
1521
- # Custom S3 output path provided — skip bucket creation
1522
- echo "✅ Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
1523
- <% } %>
1524
-
1525
- # ============================================================
1526
- # Check for previous transform job still running
1527
- # ============================================================
1528
- if [ "${FORCE_NEW}" != true ] && [ -n "${TRANSFORM_JOB_NAME:-}" ]; then
1529
- echo "🔍 Checking previous transform job: ${TRANSFORM_JOB_NAME}"
1530
- PREV_JOB_STATUS=$(aws sagemaker describe-transform-job \
1531
- --transform-job-name "${TRANSFORM_JOB_NAME}" \
1532
- --region "${AWS_REGION}" \
1533
- --query "TransformJobStatus" \
1534
- --output text 2>/dev/null || echo "")
1535
-
1536
- case "${PREV_JOB_STATUS}" in
1537
- InProgress)
1538
- echo "⚠️ Previous transform job is still running: ${TRANSFORM_JOB_NAME}"
1539
- echo " Wait for it to complete, or stop it with:"
1540
- echo " aws sagemaker stop-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
1541
- echo ""
1542
- echo " Use --force to create a new job anyway."
1543
- exit 4
1544
- ;;
1545
- Completed)
1546
- echo "✅ Previous transform job completed: ${TRANSFORM_JOB_NAME}"
1547
- echo " Creating a new job. Results from the previous job are in:"
1548
- echo " ${BATCH_OUTPUT_PATH}"
1549
- echo ""
1550
- ;;
1551
- *)
1552
- # Failed, Stopped, or not found — proceed with new job
1553
- ;;
1554
- esac
1555
- fi
1556
-
1557
- # Generate unique names with timestamp
1558
- TIMESTAMP=$(date +%s)
1559
- MODEL_NAME_SM="${PROJECT_NAME}-batch-model-${TIMESTAMP}"
1560
- TRANSFORM_JOB_NAME="${PROJECT_NAME}-batch-job-${TIMESTAMP}"
1561
-
1562
- _update_config_var "TRANSFORM_JOB_NAME" "${TRANSFORM_JOB_NAME}"
1563
- _update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"
1564
-
1565
- # Step 1: Create SageMaker model
1566
- echo "📦 Creating SageMaker model: ${MODEL_NAME_SM}"
1567
-
1568
- # Build primary container spec
1569
- BATCH_PRIMARY_CONTAINER="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
1570
- if [ -n "${CONTAINER_ENV_JSON}" ]; then
1571
- BATCH_PRIMARY_CONTAINER="${BATCH_PRIMARY_CONTAINER},\"Environment\":{${CONTAINER_ENV_JSON}}"
1572
- fi
1573
- BATCH_PRIMARY_CONTAINER="${BATCH_PRIMARY_CONTAINER}}"
1574
-
1575
- if ! aws sagemaker create-model \
1576
- --model-name "${MODEL_NAME_SM}" \
1577
- --primary-container "${BATCH_PRIMARY_CONTAINER}" \
1578
- --execution-role-arn "${ROLE_ARN}" \
1579
- --region "${AWS_REGION}"; then
1580
-
1581
- echo "❌ Failed to create SageMaker model"
1582
- echo " Check that:"
1583
- echo " • The execution role ARN is valid"
1584
- echo " • The ECR image exists and is accessible"
1585
- echo " • The IAM role has ecr:GetDownloadUrlForLayer permission"
1586
- exit 4
1587
- fi
1588
-
1589
- echo "✅ SageMaker model created: ${MODEL_NAME_SM}"
1590
-
1591
- # Record model in manifest (non-blocking)
1592
- MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
1593
- ./do/manifest add \
1594
- --type sagemaker-model \
1595
- --id "${MODEL_ARN}" \
1596
- --project "${PROJECT_NAME}" \
1597
- --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
1598
- 2>/dev/null || true
1599
-
1600
- # Step 2: Build transform job JSON
1601
- TRANSFORM_JOB_JSON="{
1602
- \"TransformJobName\": \"${TRANSFORM_JOB_NAME}\",
1603
- \"ModelName\": \"${MODEL_NAME_SM}\",
1604
- \"TransformInput\": {
1605
- \"DataSource\": {
1606
- \"S3DataSource\": {
1607
- \"S3DataType\": \"S3Prefix\",
1608
- \"S3Uri\": \"${BATCH_INPUT_PATH}\"
1609
- }
1610
- },
1611
- \"ContentType\": \"application/json\",
1612
- \"SplitType\": \"${BATCH_SPLIT_TYPE}\"
1613
- },
1614
- \"TransformOutput\": {
1615
- \"S3OutputPath\": \"${BATCH_OUTPUT_PATH}\"
1616
- $([ "${BATCH_JOIN_SOURCE:-None}" = "Input" ] && echo ",\"Accept\": \"application/json\", \"AssembleWith\": \"${BATCH_SPLIT_TYPE}\"")
1617
- },
1618
- \"TransformResources\": {
1619
- \"InstanceType\": \"${INSTANCE_TYPE}\",
1620
- \"InstanceCount\": ${BATCH_INSTANCE_COUNT}
1621
- },
1622
- \"MaxConcurrentTransforms\": ${BATCH_MAX_CONCURRENT_TRANSFORMS:-1},
1623
- \"MaxPayloadInMB\": ${BATCH_MAX_PAYLOAD_IN_MB:-6},
1624
- \"BatchStrategy\": \"${BATCH_STRATEGY}\"
1625
- $([ "${BATCH_JOIN_SOURCE:-None}" = "Input" ] && echo ",\"DataProcessing\": { \"JoinSource\": \"Input\" }")
1626
- }"
1627
-
1628
- # Step 3: Create transform job
1629
- echo "🚀 Creating transform job: ${TRANSFORM_JOB_NAME}"
1630
- if ! aws sagemaker create-transform-job \
1631
- --cli-input-json "${TRANSFORM_JOB_JSON}" \
1632
- --region "${AWS_REGION}"; then
1633
-
1634
- echo "❌ Failed to create transform job"
1635
- echo " Check that:"
1636
- echo " • The S3 input path exists and is accessible: ${BATCH_INPUT_PATH}"
1637
- echo " • The S3 output path is writable: ${BATCH_OUTPUT_PATH}"
1638
- echo " • The IAM role has s3:GetObject permission on the input path"
1639
- echo " • The IAM role has s3:PutObject permission on the output path"
1640
- echo " • The instance type is valid: ${INSTANCE_TYPE}"
1641
- echo " • The instance type is available in region: ${AWS_REGION}"
1642
- echo " • You have sufficient service quota for the instance type"
1643
- exit 4
1644
- fi
1645
-
1646
- echo "✅ Transform job created: ${TRANSFORM_JOB_NAME}"
1647
-
1648
- # Record transform job in manifest (non-blocking)
1649
- TRANSFORM_JOB_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:transform-job/${TRANSFORM_JOB_NAME}"
1650
- ./do/manifest add \
1651
- --type sagemaker-transform-job \
1652
- --id "${TRANSFORM_JOB_ARN}" \
1653
- --project "${PROJECT_NAME}" \
1654
- --meta "{\"transformJobName\":\"${TRANSFORM_JOB_NAME}\",\"modelName\":\"${MODEL_NAME_SM}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
1655
- 2>/dev/null || true
1656
-
1657
- # Step 4: Poll transform job status until completion or failure
1658
- echo "⏳ Waiting for transform job to complete..."
1659
- echo " This may take several minutes depending on dataset size..."
1660
- echo " If this times out, check status with:"
1661
- echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
1662
- echo ""
1663
-
1664
- while true; do
1665
- JOB_STATUS=$(aws sagemaker describe-transform-job \
1666
- --transform-job-name "${TRANSFORM_JOB_NAME}" \
1667
- --region "${AWS_REGION}" \
1668
- --query "TransformJobStatus" \
1669
- --output text 2>&1) || {
1670
- # Check if it was a credential expiration
1671
- if echo "${JOB_STATUS}" | grep -qi "expired\|token"; then
1672
- echo ""
1673
- echo "⚠️ Credentials expired, but the transform job is still running."
1674
- echo " Refresh your credentials and check status with:"
1675
- echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION} --query TransformJobStatus"
1676
- exit 4
1677
- fi
1678
- echo "❌ Failed to describe transform job: ${TRANSFORM_JOB_NAME}"
1679
- echo " Error: ${JOB_STATUS}"
1680
- exit 4
1681
- }
1682
-
1683
- case "${JOB_STATUS}" in
1684
- Completed)
1685
- echo "✅ Transform job completed successfully!"
1686
- break
1687
- ;;
1688
- Failed)
1689
- FAILURE_REASON=$(aws sagemaker describe-transform-job \
1690
- --transform-job-name "${TRANSFORM_JOB_NAME}" \
1691
- --region "${AWS_REGION}" \
1692
- --query "FailureReason" \
1693
- --output text 2>/dev/null || echo "Unknown")
1694
- echo "❌ Transform job failed"
1695
- echo " Reason: ${FAILURE_REASON}"
1696
- echo ""
1697
- echo " Check CloudWatch Logs for details:"
1698
- echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/TransformJobs"
1699
- echo ""
1700
- echo " Verify that:"
1701
- echo " • The S3 input path exists and contains data: ${BATCH_INPUT_PATH}"
1702
- echo " • The input data format matches the container's expected format"
1703
- echo " • The container's /ping and /invocations endpoints work correctly"
1704
- exit 4
1705
- ;;
1706
- Stopped)
1707
- echo "⚠️ Transform job was stopped"
1708
- exit 4
1709
- ;;
1710
- InProgress)
1711
- echo " $(date +%H:%M:%S) Job status: InProgress..."
1712
- sleep 30
1713
- ;;
1714
- *)
1715
- echo " $(date +%H:%M:%S) Job status: ${JOB_STATUS}..."
1716
- sleep 30
1717
- ;;
1718
- esac
1719
- done
1720
-
1721
- echo ""
1722
- echo "📋 Deployment Details:"
1723
- echo " Transform Job: ${TRANSFORM_JOB_NAME}"
1724
- echo " Model: ${MODEL_NAME_SM}"
1725
- echo " Region: ${AWS_REGION}"
1726
- echo " Instance Type: ${INSTANCE_TYPE}"
1727
- echo " Instance Count: ${BATCH_INSTANCE_COUNT}"
1728
- echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
1729
- echo " S3 Input: ${BATCH_INPUT_PATH}"
1730
- echo " S3 Output: ${BATCH_OUTPUT_PATH}"
1731
- echo " Split Type: ${BATCH_SPLIT_TYPE}"
1732
- echo " Strategy: ${BATCH_STRATEGY}"
1733
- echo ""
1734
-
1735
- # Download results locally
1736
- LOCAL_OUTPUT_DIR="${SCRIPT_DIR}/../batch-output"
1737
- mkdir -p "${LOCAL_OUTPUT_DIR}"
1738
- echo "📥 Downloading results to ${LOCAL_OUTPUT_DIR}/"
1739
- if aws s3 sync "${BATCH_OUTPUT_PATH}" "${LOCAL_OUTPUT_DIR}/" --region "${AWS_REGION}"; then
1740
- DOWNLOADED=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | wc -l | tr -d ' ')
1741
- echo "✅ Downloaded ${DOWNLOADED} file(s) to ${LOCAL_OUTPUT_DIR}/"
1742
- echo ""
1743
-
1744
- # Display first output file preview
1745
- FIRST_FILE=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | head -1)
1746
- if [ -n "${FIRST_FILE}" ]; then
1747
- echo "📄 Sample output (${FIRST_FILE}):"
1748
- head -5 "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}"
1749
- LINES=$(wc -l < "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" | tr -d ' ')
1750
- if [ "${LINES}" -gt 5 ]; then
1751
- echo " ... (${LINES} total lines)"
1752
- fi
1753
- fi
1754
- else
1755
- echo "⚠️ Could not download output files"
1756
- fi
1757
-
1758
- echo ""
1759
- echo "📋 What's next?"
1760
- echo " • View results: cat batch-output/"
1761
- echo " • Review results: ./do/test"
1762
- echo " • Register this deployment: ./do/register"
1763
- echo " • View logs: ./do/logs"
1764
- echo " • Clean up when done: ./do/clean"
1765
-
1766
- <% } %>
1
+ <%- include('deploy.d/' + (deploymentTarget === 'realtime-inference' ? 'managed-inference' : deploymentTarget)) %>