@aws/ml-container-creator 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/LICENSE +202 -0
  2. package/LICENSE-THIRD-PARTY +68620 -0
  3. package/NOTICE +2 -0
  4. package/README.md +106 -0
  5. package/bin/cli.js +365 -0
  6. package/config/defaults.json +32 -0
  7. package/config/presets/transformers-djl.json +26 -0
  8. package/config/presets/transformers-gpu.json +24 -0
  9. package/config/presets/transformers-lmi.json +27 -0
  10. package/package.json +129 -0
  11. package/servers/README.md +419 -0
  12. package/servers/base-image-picker/catalogs/model-servers.json +1191 -0
  13. package/servers/base-image-picker/catalogs/python-slim.json +38 -0
  14. package/servers/base-image-picker/catalogs/triton-backends.json +51 -0
  15. package/servers/base-image-picker/catalogs/triton.json +38 -0
  16. package/servers/base-image-picker/index.js +495 -0
  17. package/servers/base-image-picker/manifest.json +17 -0
  18. package/servers/base-image-picker/package.json +15 -0
  19. package/servers/hyperpod-cluster-picker/LICENSE +202 -0
  20. package/servers/hyperpod-cluster-picker/index.js +424 -0
  21. package/servers/hyperpod-cluster-picker/manifest.json +14 -0
  22. package/servers/hyperpod-cluster-picker/package.json +17 -0
  23. package/servers/instance-recommender/LICENSE +202 -0
  24. package/servers/instance-recommender/catalogs/instances.json +852 -0
  25. package/servers/instance-recommender/index.js +284 -0
  26. package/servers/instance-recommender/manifest.json +16 -0
  27. package/servers/instance-recommender/package.json +15 -0
  28. package/servers/lib/LICENSE +202 -0
  29. package/servers/lib/bedrock-client.js +160 -0
  30. package/servers/lib/custom-validators.js +46 -0
  31. package/servers/lib/dynamic-resolver.js +36 -0
  32. package/servers/lib/package.json +11 -0
  33. package/servers/lib/schemas/image-catalog.schema.json +185 -0
  34. package/servers/lib/schemas/instances.schema.json +124 -0
  35. package/servers/lib/schemas/manifest.schema.json +64 -0
  36. package/servers/lib/schemas/model-catalog.schema.json +91 -0
  37. package/servers/lib/schemas/regions.schema.json +26 -0
  38. package/servers/lib/schemas/triton-backends.schema.json +51 -0
  39. package/servers/model-picker/catalogs/jumpstart-public.json +66 -0
  40. package/servers/model-picker/catalogs/popular-diffusors.json +88 -0
  41. package/servers/model-picker/catalogs/popular-transformers.json +226 -0
  42. package/servers/model-picker/index.js +1693 -0
  43. package/servers/model-picker/manifest.json +18 -0
  44. package/servers/model-picker/package.json +20 -0
  45. package/servers/region-picker/LICENSE +202 -0
  46. package/servers/region-picker/catalogs/regions.json +263 -0
  47. package/servers/region-picker/index.js +230 -0
  48. package/servers/region-picker/manifest.json +16 -0
  49. package/servers/region-picker/package.json +15 -0
  50. package/src/app.js +1007 -0
  51. package/src/copy-tpl.js +77 -0
  52. package/src/lib/accelerator-validator.js +39 -0
  53. package/src/lib/asset-manager.js +385 -0
  54. package/src/lib/aws-profile-parser.js +181 -0
  55. package/src/lib/bootstrap-command-handler.js +1647 -0
  56. package/src/lib/bootstrap-config.js +238 -0
  57. package/src/lib/ci-register-helpers.js +124 -0
  58. package/src/lib/ci-report-helpers.js +158 -0
  59. package/src/lib/ci-stage-helpers.js +268 -0
  60. package/src/lib/cli-handler.js +529 -0
  61. package/src/lib/comment-generator.js +544 -0
  62. package/src/lib/community-reports-validator.js +91 -0
  63. package/src/lib/config-manager.js +2106 -0
  64. package/src/lib/configuration-exporter.js +204 -0
  65. package/src/lib/configuration-manager.js +695 -0
  66. package/src/lib/configuration-matcher.js +221 -0
  67. package/src/lib/cpu-validator.js +36 -0
  68. package/src/lib/cuda-validator.js +57 -0
  69. package/src/lib/deployment-config-resolver.js +103 -0
  70. package/src/lib/deployment-entry-schema.js +125 -0
  71. package/src/lib/deployment-registry.js +598 -0
  72. package/src/lib/docker-introspection-validator.js +51 -0
  73. package/src/lib/engine-prefix-resolver.js +60 -0
  74. package/src/lib/huggingface-client.js +172 -0
  75. package/src/lib/key-value-parser.js +37 -0
  76. package/src/lib/known-flags-validator.js +200 -0
  77. package/src/lib/manifest-cli.js +280 -0
  78. package/src/lib/mcp-client.js +303 -0
  79. package/src/lib/mcp-command-handler.js +532 -0
  80. package/src/lib/neuron-validator.js +80 -0
  81. package/src/lib/parameter-schema-validator.js +284 -0
  82. package/src/lib/prompt-runner.js +1349 -0
  83. package/src/lib/prompts.js +1138 -0
  84. package/src/lib/registry-command-handler.js +519 -0
  85. package/src/lib/registry-loader.js +198 -0
  86. package/src/lib/rocm-validator.js +80 -0
  87. package/src/lib/schema-validator.js +157 -0
  88. package/src/lib/sensitive-redactor.js +59 -0
  89. package/src/lib/template-engine.js +156 -0
  90. package/src/lib/template-manager.js +341 -0
  91. package/src/lib/validation-engine.js +314 -0
  92. package/src/prompt-adapter.js +63 -0
  93. package/templates/Dockerfile +300 -0
  94. package/templates/IAM_PERMISSIONS.md +84 -0
  95. package/templates/MIGRATION.md +488 -0
  96. package/templates/PROJECT_README.md +439 -0
  97. package/templates/TEMPLATE_SYSTEM.md +243 -0
  98. package/templates/buildspec.yml +64 -0
  99. package/templates/code/chat_template.jinja +1 -0
  100. package/templates/code/flask/gunicorn_config.py +35 -0
  101. package/templates/code/flask/wsgi.py +10 -0
  102. package/templates/code/model_handler.py +387 -0
  103. package/templates/code/serve +300 -0
  104. package/templates/code/serve.py +175 -0
  105. package/templates/code/serving.properties +105 -0
  106. package/templates/code/start_server.py +39 -0
  107. package/templates/code/start_server.sh +39 -0
  108. package/templates/diffusors/Dockerfile +72 -0
  109. package/templates/diffusors/patch_image_api.py +35 -0
  110. package/templates/diffusors/serve +115 -0
  111. package/templates/diffusors/start_server.sh +114 -0
  112. package/templates/do/.gitkeep +1 -0
  113. package/templates/do/README.md +541 -0
  114. package/templates/do/build +83 -0
  115. package/templates/do/ci +681 -0
  116. package/templates/do/clean +811 -0
  117. package/templates/do/config +260 -0
  118. package/templates/do/deploy +1560 -0
  119. package/templates/do/export +306 -0
  120. package/templates/do/logs +319 -0
  121. package/templates/do/manifest +12 -0
  122. package/templates/do/push +119 -0
  123. package/templates/do/register +580 -0
  124. package/templates/do/run +113 -0
  125. package/templates/do/submit +417 -0
  126. package/templates/do/test +1147 -0
  127. package/templates/hyperpod/configmap.yaml +24 -0
  128. package/templates/hyperpod/deployment.yaml +71 -0
  129. package/templates/hyperpod/pvc.yaml +42 -0
  130. package/templates/hyperpod/service.yaml +17 -0
  131. package/templates/nginx-diffusors.conf +74 -0
  132. package/templates/nginx-predictors.conf +47 -0
  133. package/templates/nginx-tensorrt.conf +74 -0
  134. package/templates/requirements.txt +61 -0
  135. package/templates/sample_model/test_inference.py +123 -0
  136. package/templates/sample_model/train_abalone.py +252 -0
  137. package/templates/test/test_endpoint.sh +79 -0
  138. package/templates/test/test_local_image.sh +80 -0
  139. package/templates/test/test_model_handler.py +180 -0
  140. package/templates/triton/Dockerfile +128 -0
  141. package/templates/triton/config.pbtxt +163 -0
  142. package/templates/triton/model.py +130 -0
  143. package/templates/triton/requirements.txt +11 -0
@@ -0,0 +1,1560 @@
1
+ #!/bin/bash
2
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ set -e
6
+ set -u
7
+ set -o pipefail
8
+
9
+ # Parse flags
10
+ FORCE_NEW=false
11
+ FORCE_IC=false
12
+ for arg in "$@"; do
13
+ case "$arg" in
14
+ --force) FORCE_NEW=true ;;
15
+ --force-ic) FORCE_IC=true ;;
16
+ --help|-h)
17
+ echo "Usage: ./do/deploy [--force] [--force-ic]"
18
+ echo ""
19
+ echo "Options:"
20
+ echo " --force Create a new endpoint and IC, even if one already exists."
21
+ echo " --force-ic Recreate just the IC on the existing endpoint."
22
+ echo ""
23
+ echo "Without flags, deploy resumes from the last run."
24
+ exit 0
25
+ ;;
26
+ esac
27
+ done
28
+
29
+ # Source configuration
30
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
31
+ source "${SCRIPT_DIR}/config"
32
+
33
+ echo "๐Ÿš€ Deploying to AWS"
34
+ echo " Project: ${PROJECT_NAME}"
35
+ echo " Deployment config: ${DEPLOYMENT_CONFIG}"
36
+ echo " Region: ${AWS_REGION}"
37
+ echo " Build target: ${BUILD_TARGET}"
38
+ echo " Deployment target: ${DEPLOYMENT_TARGET}"
39
+ <% if (deploymentTarget === 'managed-inference') { %>
40
+ echo " Instance type: ${INSTANCE_TYPE}"
41
+ <% } else if (deploymentTarget === 'async-inference') { %>
42
+ echo " Instance type: ${INSTANCE_TYPE}"
43
+ echo " S3 output: ${ASYNC_S3_OUTPUT_PATH}"
44
+ echo " SNS success: ${ASYNC_SNS_SUCCESS_TOPIC}"
45
+ echo " SNS error: ${ASYNC_SNS_ERROR_TOPIC}"
46
+ <% if (asyncMaxConcurrentInvocations) { %>
47
+ echo " Max concurrent: ${ASYNC_MAX_CONCURRENT_INVOCATIONS}"
48
+ <% } %>
49
+ <% } else if (deploymentTarget === 'hyperpod-eks') { %>
50
+ echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
51
+ echo " Namespace: ${HYPERPOD_NAMESPACE}"
52
+ echo " Replicas: ${HYPERPOD_REPLICAS}"
53
+ <% } else if (deploymentTarget === 'batch-transform') { %>
54
+ echo " Instance type: ${INSTANCE_TYPE}"
55
+ echo " S3 input: ${BATCH_INPUT_PATH}"
56
+ echo " S3 output: ${BATCH_OUTPUT_PATH}"
57
+ echo " Instance count: ${BATCH_INSTANCE_COUNT}"
58
+ echo " Split type: ${BATCH_SPLIT_TYPE}"
59
+ echo " Strategy: ${BATCH_STRATEGY}"
60
+ <% } %>
61
+
62
+ # Check AWS credentials
63
+ echo "๐Ÿ” Validating AWS credentials..."
64
+ if ! aws sts get-caller-identity &> /dev/null; then
65
+ echo "โŒ AWS credentials not configured"
66
+ echo " Run: aws configure"
67
+ echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
68
+ exit 4
69
+ fi
70
+
71
+ AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
72
+ echo "โœ… AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"
73
+
74
+ # Construct ECR repository URL
75
+ ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"
76
+
77
+ # ============================================================
78
+ # Shared: Verify ECR image exists
79
+ # ============================================================
80
+ echo "๐Ÿ” Verifying ECR image exists..."
81
+ if ! aws ecr describe-images \
82
+ --repository-name "${ECR_REPOSITORY_NAME}" \
83
+ --image-ids imageTag="${PROJECT_NAME}-latest" \
84
+ --region "${AWS_REGION}" &> /dev/null; then
85
+
86
+ echo "โŒ ECR image not found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
87
+ echo ""
88
+ echo "Please build and push your image first:"
89
+ echo " ./do/submit"
90
+ echo ""
91
+ echo "After the build completes successfully, run this deploy script again."
92
+ exit 4
93
+ fi
94
+
95
+ echo "โœ… ECR image found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
96
+ IMAGE_TAG="${PROJECT_NAME}-latest"
97
+
98
+ <% if (deploymentTarget === 'managed-inference') { %>
99
+ # ============================================================
100
+ # SageMaker Managed Inference Deployment (Inference Components)
101
+ # ============================================================
102
+
103
+ # Validate execution role ARN
104
+ if [ -z "${ROLE_ARN:-}" ]; then
105
+ echo "โŒ Execution role ARN not provided"
106
+ echo ""
107
+ echo "Usage:"
108
+ echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
109
+ echo " ./do/deploy"
110
+ echo ""
111
+ echo "Or set ROLE_ARN in do/config"
112
+ echo ""
113
+ echo "The execution role must have permissions for:"
114
+ echo " โ€ข SageMaker endpoint and inference component management"
115
+ echo " โ€ข ECR image access"
116
+ echo " โ€ข S3 access (if using model artifacts)"
117
+ echo " โ€ข CloudWatch Logs"
118
+ exit 3
119
+ fi
120
+
121
+ echo " Using execution role: ${ROLE_ARN}"
122
+
# Helper: persist a variable to do/config so other scripts can use it.
# $1 - variable name, $2 - value. Rewrites an existing "export NAME=..."
# line in place, or appends a new export line when the variable is not
# yet present in the config file.
_update_config_var() {
  local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
  # Escape characters that are special in the sed replacement text
  # (backslash, '&', and the '|' delimiter used below). Without this,
  # values containing them -- e.g. ARNs or S3 paths -- would corrupt the
  # config file: '&' re-inserts the whole matched line, '|' ends the
  # substitution early.
  local escaped_value
  escaped_value=$(printf '%s' "${var_value}" | sed -e 's/[\\|&]/\\&/g')
  if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
    # -i.bak keeps BSD/macOS sed compatible; drop the backup afterwards.
    sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${escaped_value}\"|" "${config_file}"
    rm -f "${config_file}.bak"
  else
    echo "" >> "${config_file}"
    echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
  fi
}
134
+
# Helper: look up a SageMaker endpoint's status.
# $1 - endpoint name. Prints the EndpointStatus value, or an empty
# string when the endpoint does not exist (or the describe call fails).
_get_endpoint_status() {
  local endpoint_name="$1"
  aws sagemaker describe-endpoint \
    --endpoint-name "${endpoint_name}" \
    --region "${AWS_REGION}" \
    --query EndpointStatus \
    --output text 2>/dev/null \
    || echo ""
}
143
+
# Helper: look up a SageMaker inference component's status.
# $1 - inference component name. Prints the InferenceComponentStatus
# value, or an empty string when the component does not exist (or the
# describe call fails).
_get_ic_status() {
  local ic_name="$1"
  aws sagemaker describe-inference-component \
    --inference-component-name "${ic_name}" \
    --region "${AWS_REGION}" \
    --query InferenceComponentStatus \
    --output text 2>/dev/null \
    || echo ""
}
151
+
# Helper: find an InService inference component on an endpoint.
# $1 - endpoint name. Prints the first matching component name, the
# literal string "None" when no InService IC exists (the CLI's text
# rendering of null -- callers check for it explicitly), or an empty
# string when the list call itself fails.
_find_active_ic_on_endpoint() {
  local endpoint_name="$1"
  aws sagemaker list-inference-components \
    --endpoint-name "${endpoint_name}" \
    --status-equals InService \
    --region "${AWS_REGION}" \
    --query 'InferenceComponents[0].InferenceComponentName' \
    --output text 2>/dev/null \
    || echo ""
}
161
+
162
+ # ============================================================
163
+ # Idempotency: check for existing deployment from a previous run
164
+ # ============================================================
165
+ SKIP_TO=""
166
+
167
+ if [ "${FORCE_NEW}" = true ]; then
168
+ echo "๐Ÿ”„ --force: ignoring previous deployment, creating new resources."
169
+ elif [ "${FORCE_IC}" = true ] && [ -n "${ENDPOINT_NAME:-}" ]; then
170
+ EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
171
+ if [ "${EP_STATUS}" = "InService" ]; then
172
+ echo "๐Ÿ”„ --force-ic: recreating inference component on existing endpoint: ${ENDPOINT_NAME}"
173
+ SKIP_TO="create_ic"
174
+ else
175
+ echo "โš ๏ธ --force-ic requires an InService endpoint, but ${ENDPOINT_NAME} is: ${EP_STATUS:-not found}"
176
+ echo " Use --force to create a new endpoint, or wait for the current one."
177
+ exit 4
178
+ fi
179
+ elif [ -n "${ENDPOINT_NAME:-}" ]; then
180
+ echo "๐Ÿ” Checking for existing deployment: ${ENDPOINT_NAME}"
181
+
182
+ EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
183
+
184
+ case "${EP_STATUS}" in
185
+ InService)
186
+ echo "โœ… Endpoint already InService: ${ENDPOINT_NAME}"
187
+
188
+ # Check inference component
189
+ if [ -n "${INFERENCE_COMPONENT_NAME:-}" ]; then
190
+ IC_STATUS=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}")
191
+
192
+ case "${IC_STATUS}" in
193
+ InService)
194
+ echo "โœ… Inference component already InService: ${INFERENCE_COMPONENT_NAME}"
195
+ echo ""
196
+ echo "๐Ÿ“‹ Deployment is already live. Nothing to do."
197
+ echo " Endpoint: ${ENDPOINT_NAME}"
198
+ echo " Inference Component: ${INFERENCE_COMPONENT_NAME}"
199
+ echo ""
200
+ echo "๐Ÿงช Test your endpoint:"
201
+ echo " ./do/test"
202
+ echo ""
203
+ echo "๐Ÿงน Clean up when done:"
204
+ echo " ./do/clean endpoint"
205
+ exit 0
206
+ ;;
207
+ Creating)
208
+ echo "โณ Inference component still creating: ${INFERENCE_COMPONENT_NAME}"
209
+ SKIP_TO="wait_ic"
210
+ IC_NAME="${INFERENCE_COMPONENT_NAME}"
211
+ ;;
212
+ Failed)
213
+ echo "โš ๏ธ Inference component failed: ${INFERENCE_COMPONENT_NAME}"
214
+ echo " Will create a new inference component on the existing endpoint."
215
+ SKIP_TO="create_ic"
216
+ ;;
217
+ *)
218
+ # Stored IC not found โ€” check if a different IC is running on this endpoint
219
+ LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
220
+ if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
221
+ echo "โœ… Found running inference component on endpoint: ${LIVE_IC}"
222
+ echo " (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
223
+ _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
224
+ echo ""
225
+ echo "๐Ÿ“‹ Deployment is already live. Nothing to do."
226
+ echo " Endpoint: ${ENDPOINT_NAME}"
227
+ echo " Inference Component: ${LIVE_IC}"
228
+ echo ""
229
+ echo "๐Ÿงช Test your endpoint:"
230
+ echo " ./do/test"
231
+ echo ""
232
+ echo "๐Ÿงน Clean up when done:"
233
+ echo " ./do/clean endpoint"
234
+ exit 0
235
+ else
236
+ echo " No existing inference component found on endpoint. Will create one."
237
+ SKIP_TO="create_ic"
238
+ fi
239
+ ;;
240
+ esac
241
+ else
242
+ # No IC name in config โ€” check if one is already running on the endpoint
243
+ LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
244
+ if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
245
+ echo "โœ… Found running inference component on endpoint: ${LIVE_IC}"
246
+ _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
247
+ echo ""
248
+ echo "๐Ÿ“‹ Deployment is already live. Nothing to do."
249
+ echo " Endpoint: ${ENDPOINT_NAME}"
250
+ echo " Inference Component: ${LIVE_IC}"
251
+ echo ""
252
+ echo "๐Ÿงช Test your endpoint:"
253
+ echo " ./do/test"
254
+ echo ""
255
+ echo "๐Ÿงน Clean up when done:"
256
+ echo " ./do/clean endpoint"
257
+ exit 0
258
+ else
259
+ SKIP_TO="create_ic"
260
+ fi
261
+ fi
262
+ ;;
263
+ Creating|Updating)
264
+ echo "โณ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
265
+ SKIP_TO="wait_endpoint"
266
+ ;;
267
+ Failed)
268
+ echo "โš ๏ธ Previous endpoint failed: ${ENDPOINT_NAME}"
269
+ echo " Creating a new deployment. Clean up the failed endpoint with:"
270
+ echo " ./do/clean endpoint"
271
+ echo ""
272
+ # Fall through to create new resources
273
+ ;;
274
+ "")
275
+ echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
276
+ ;;
277
+ *)
278
+ echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
279
+ ;;
280
+ esac
281
+ fi
282
+
283
+ # ============================================================
284
+ # Step 1: Create endpoint configuration (skip if resuming)
285
+ # ============================================================
286
+ if [ -z "${SKIP_TO}" ]; then
287
+ TIMESTAMP=$(date +%s)
288
+ ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-epc-${TIMESTAMP}"
289
+ ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"
290
+ IC_NAME="${PROJECT_NAME}-ic-${TIMESTAMP}"
291
+
292
+ _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
293
+ _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
294
+ _update_config_var "INFERENCE_COMPONENT_NAME" "${IC_NAME}"
295
+
296
+ # Build production variant JSON
297
+ VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"
298
+
299
+ if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
300
+ VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
301
+ echo " AMI version: ${INFERENCE_AMI_VERSION}"
302
+ fi
303
+
304
+ VARIANT_JSON="${VARIANT_JSON}}]"
305
+
306
+ echo "โš™๏ธ Creating endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
307
+ if ! aws sagemaker create-endpoint-config \
308
+ --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
309
+ --execution-role-arn "${ROLE_ARN}" \
310
+ --production-variants "${VARIANT_JSON}" \
311
+ --region "${AWS_REGION}"; then
312
+
313
+ echo "โŒ Failed to create endpoint configuration"
314
+ echo " Check that:"
315
+ echo " โ€ข The execution role ARN is valid"
316
+ echo " โ€ข The instance type is valid: ${INSTANCE_TYPE}"
317
+ echo " โ€ข The instance type is available in region: ${AWS_REGION}"
318
+ echo " โ€ข You have sufficient service quota for the instance type"
319
+ exit 4
320
+ fi
321
+
322
+ echo "โœ… Endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"
323
+
324
+ # Record endpoint config in manifest (non-blocking)
325
+ ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
326
+ ./do/manifest add \
327
+ --type sagemaker-endpoint-config \
328
+ --id "${ENDPOINT_CONFIG_ARN}" \
329
+ --project "${PROJECT_NAME}" \
330
+ --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
331
+ 2>/dev/null || true
332
+
333
+ # Step 2: Create endpoint
334
+ echo "๐Ÿš€ Creating endpoint: ${ENDPOINT_NAME}"
335
+ if ! aws sagemaker create-endpoint \
336
+ --endpoint-name "${ENDPOINT_NAME}" \
337
+ --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
338
+ --region "${AWS_REGION}"; then
339
+
340
+ echo "โŒ Failed to create endpoint"
341
+ echo " Check that:"
342
+ echo " โ€ข Your IAM credentials have sagemaker:CreateEndpoint permission"
343
+ echo " โ€ข You have sufficient service quota in region: ${AWS_REGION}"
344
+ exit 4
345
+ fi
346
+
347
+ echo "โœ… Endpoint creation initiated: ${ENDPOINT_NAME}"
348
+
349
+ # Record endpoint in manifest (non-blocking)
350
+ ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
351
+ ./do/manifest add \
352
+ --type sagemaker-endpoint \
353
+ --id "${ENDPOINT_ARN}" \
354
+ --project "${PROJECT_NAME}" \
355
+ --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
356
+ 2>/dev/null || true
357
+ fi
358
+
359
+ # ============================================================
360
+ # Wait for endpoint (skip if already InService)
361
+ # ============================================================
362
+ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
363
+ echo "โณ Waiting for endpoint to reach InService status..."
364
+ echo " This may take a few minutes..."
365
+ echo " If this times out, re-run ./do/deploy to resume."
366
+
367
+ if ! aws sagemaker wait endpoint-in-service \
368
+ --endpoint-name "${ENDPOINT_NAME}" \
369
+ --region "${AWS_REGION}"; then
370
+
371
+ # Check if it was a credential expiration vs actual failure
372
+ EP_CHECK=$(_get_endpoint_status "${ENDPOINT_NAME}" 2>/dev/null)
373
+ if [ "${EP_CHECK}" = "Creating" ]; then
374
+ echo ""
375
+ echo "โš ๏ธ Wait interrupted (credentials may have expired), but endpoint is still creating."
376
+ echo " Refresh your credentials and re-run ./do/deploy to resume."
377
+ echo ""
378
+ echo " Or check status manually:"
379
+ echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION} --query EndpointStatus"
380
+ exit 4
381
+ fi
382
+
383
+ echo "โŒ Endpoint failed to reach InService status"
384
+ echo " Check CloudWatch Logs for details:"
385
+ echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
386
+ exit 4
387
+ fi
388
+
389
+ echo "โœ… Endpoint is InService: ${ENDPOINT_NAME}"
390
+ fi
391
+
392
+ # ============================================================
393
+ # Step 3: Create inference component (skip if resuming from wait_ic)
394
+ # ============================================================
395
+ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
396
+ # Generate new IC name if resuming after endpoint wait or failed IC
397
+ if [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
398
+ TIMESTAMP=$(date +%s)
399
+ IC_NAME="${PROJECT_NAME}-ic-${TIMESTAMP}"
400
+ _update_config_var "INFERENCE_COMPONENT_NAME" "${IC_NAME}"
401
+ fi
402
+
403
+ echo "๐Ÿ“ฆ Creating inference component: ${IC_NAME}"
404
+ if ! aws sagemaker create-inference-component \
405
+ --inference-component-name "${IC_NAME}" \
406
+ --endpoint-name "${ENDPOINT_NAME}" \
407
+ --variant-name "AllTraffic" \
408
+ --specification "{
409
+ \"Container\": {
410
+ \"Image\": \"${ECR_REPOSITORY}:${IMAGE_TAG}\"
411
+ },
412
+ \"StartupParameters\": {
413
+ \"ContainerStartupHealthCheckTimeoutInSeconds\": 900
414
+ },
415
+ \"ComputeResourceRequirements\": {
416
+ \"NumberOfAcceleratorDevicesRequired\": 1,
417
+ \"MinMemoryRequiredInMb\": 1024
418
+ }
419
+ }" \
420
+ --runtime-config "{\"CopyCount\": 1}" \
421
+ --region "${AWS_REGION}"; then
422
+
423
+ echo "โŒ Failed to create inference component"
424
+ echo " Check that:"
425
+ echo " โ€ข The ECR image exists and is accessible"
426
+ echo " โ€ข The endpoint is in InService status"
427
+ echo " โ€ข The compute resource requirements fit the instance type: ${INSTANCE_TYPE}"
428
+ exit 4
429
+ fi
430
+
431
+ echo "โœ… Inference component creation initiated: ${IC_NAME}"
432
+
433
+ # Record inference component in manifest (non-blocking)
434
+ IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_NAME}"
435
+ ./do/manifest add \
436
+ --type sagemaker-inference-component \
437
+ --id "${IC_ARN}" \
438
+ --project "${PROJECT_NAME}" \
439
+ --meta "{\"inferenceComponentName\":\"${IC_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
440
+ 2>/dev/null || true
441
+ fi
442
+
443
+ # ============================================================
444
+ # Wait for inference component
445
+ # ============================================================
446
+ echo "โณ Waiting for inference component to reach InService status..."
447
+ echo " This may take 5-10 minutes..."
448
+ echo " If this times out, re-run ./do/deploy to resume."
449
+
450
+ # Poll loop โ€” replaces `aws sagemaker wait inference-component-in-service`
451
+ # which is only available in AWS CLI v2.15+
452
+ IC_WAIT_TIMEOUT=1800 # 30 minutes max
453
+ IC_WAIT_START=$(date +%s)
454
+
455
+ while true; do
456
+ IC_STATUS=$(_get_ic_status "${IC_NAME}" 2>/dev/null)
457
+
458
+ case "${IC_STATUS}" in
459
+ InService)
460
+ break
461
+ ;;
462
+ Failed)
463
+ echo "โŒ Inference component failed to reach InService status"
464
+ echo " Check CloudWatch Logs for details:"
465
+ echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
466
+ echo ""
467
+ echo " Debug:"
468
+ echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
469
+ exit 4
470
+ ;;
471
+ Creating)
472
+ # Check timeout
473
+ IC_ELAPSED=$(( $(date +%s) - IC_WAIT_START ))
474
+ if [ "${IC_ELAPSED}" -ge "${IC_WAIT_TIMEOUT}" ]; then
475
+ echo ""
476
+ echo "โš ๏ธ Inference component still creating after ${IC_WAIT_TIMEOUT}s."
477
+ echo " Re-run ./do/deploy to resume waiting."
478
+ echo ""
479
+ echo " Or check status manually:"
480
+ echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
481
+ exit 4
482
+ fi
483
+ echo " $(date +%H:%M:%S) Status: Creating (${IC_ELAPSED}s elapsed)..."
484
+ sleep 30
485
+ ;;
486
+ "")
487
+ echo "โš ๏ธ Could not determine inference component status (credentials may have expired)."
488
+ echo " Re-run ./do/deploy to resume."
489
+ exit 4
490
+ ;;
491
+ *)
492
+ echo " $(date +%H:%M:%S) Status: ${IC_STATUS}..."
493
+ sleep 30
494
+ ;;
495
+ esac
496
+ done
497
+
498
+ echo "โœ… Deployment complete!"
499
+ echo ""
500
+ echo "๐Ÿ“‹ Deployment Details:"
501
+ echo " Endpoint: ${ENDPOINT_NAME}"
502
+ echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME}"
503
+ echo " Inference Component: ${IC_NAME}"
504
+ echo " Region: ${AWS_REGION}"
505
+ echo " Instance Type: ${INSTANCE_TYPE}"
506
+ echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
507
+ echo ""
508
+ echo "๐Ÿงช Test your endpoint:"
509
+ echo " ./do/test"
510
+ echo ""
511
+ echo "๐Ÿ“ Register this deployment:"
512
+ echo " ./do/register"
513
+ echo ""
514
+ echo "๐Ÿ” Monitor your deployment:"
515
+ echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
516
+ echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}"
517
+ echo ""
518
+ echo "๐Ÿงน Clean up when done:"
519
+ echo " ./do/clean endpoint"
520
+
521
+ <% } else if (deploymentTarget === 'async-inference') { %>
522
+ # ============================================================
523
+ # SageMaker Managed Inference - Async Deployment (Model-Based)
524
+ # SageMaker async inference does NOT support Inference Components.
525
+ # Flow: create-model โ†’ create-endpoint-config (with AsyncInferenceConfig) โ†’ create-endpoint
526
+ # ============================================================
527
+
528
+ # Validate execution role ARN
529
+ if [ -z "${ROLE_ARN:-}" ]; then
530
+ echo "โŒ Execution role ARN not provided"
531
+ echo ""
532
+ echo "Usage:"
533
+ echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
534
+ echo " ./do/deploy"
535
+ echo ""
536
+ echo "Or set ROLE_ARN in do/config"
537
+ echo ""
538
+ echo "The execution role must have permissions for:"
539
+ echo " โ€ข SageMaker model and endpoint management"
540
+ echo " โ€ข ECR image access"
541
+ echo " โ€ข S3 write access for async output path: ${ASYNC_S3_OUTPUT_PATH}"
542
+ echo " โ€ข SNS publish permissions (optional, for notifications)"
543
+ echo " โ€ข CloudWatch Logs"
544
+ exit 3
545
+ fi
546
+
547
+ echo " Using execution role: ${ROLE_ARN}"
548
+
549
+ # ============================================================
550
+ # Bootstrap async infrastructure (S3 bucket + SNS topics)
551
+ # ============================================================
552
+
553
+ # Extract bucket name from S3 output path
554
+ ASYNC_S3_BUCKET=$(echo "${ASYNC_S3_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
555
+
556
+ <% if (!asyncS3OutputPath) { %>
557
+ # Bootstrap default S3 bucket (check-and-create)
558
+ echo "๐Ÿ” Checking if S3 bucket exists: ${ASYNC_S3_BUCKET}"
559
+ if ! aws s3api head-bucket --bucket "${ASYNC_S3_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
560
+ echo "๐Ÿ“ฆ Creating S3 bucket: ${ASYNC_S3_BUCKET}"
561
+ if [ "${AWS_REGION}" = "us-east-1" ]; then
562
+ if ! aws s3api create-bucket \
563
+ --bucket "${ASYNC_S3_BUCKET}" \
564
+ --region "${AWS_REGION}"; then
565
+ echo "โŒ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
566
+ echo ""
567
+ echo " Check that:"
568
+ echo " โ€ข Your IAM credentials have s3:CreateBucket permission"
569
+ echo " โ€ข The bucket name is not already taken globally"
570
+ exit 4
571
+ fi
572
+ else
573
+ if ! aws s3api create-bucket \
574
+ --bucket "${ASYNC_S3_BUCKET}" \
575
+ --region "${AWS_REGION}" \
576
+ --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
577
+ echo "โŒ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
578
+ echo ""
579
+ echo " Check that:"
580
+ echo " โ€ข Your IAM credentials have s3:CreateBucket permission"
581
+ echo " โ€ข The bucket name is not already taken globally"
582
+ exit 4
583
+ fi
584
+ fi
585
+ echo "โœ… S3 bucket created: ${ASYNC_S3_BUCKET}"
586
+ else
587
+ echo "โœ… S3 bucket exists: ${ASYNC_S3_BUCKET}"
588
+ fi
589
+ <% } else { %>
590
+ # Custom S3 output path provided โ€” skip bucket creation
591
+ echo "โœ… Using custom S3 output path: ${ASYNC_S3_OUTPUT_PATH}"
592
+ <% } %>
593
+
594
+ # Extract topic name from SNS success topic ARN
595
+ ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')
596
+
597
+ <% if (!asyncSnsSuccessTopic) { %>
598
+ # Bootstrap default SNS success topic (check-and-create)
599
+ echo "๐Ÿ” Checking if SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
600
+ if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_SUCCESS_TOPIC}" --region "${AWS_REGION}" 2>/dev/null; then
601
+ echo "๐Ÿ“ฆ Creating SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
602
+ if ! aws sns create-topic \
603
+ --name "${ASYNC_SNS_SUCCESS_TOPIC_NAME}" \
604
+ --region "${AWS_REGION}" > /dev/null; then
605
+ echo "โŒ Failed to create SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
606
+ echo ""
607
+ echo " Check that:"
608
+ echo " โ€ข Your IAM credentials have sns:CreateTopic permission"
609
+ exit 4
610
+ fi
611
+ echo "โœ… SNS success topic created: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
612
+ else
613
+ echo "โœ… SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
614
+ fi
615
+
616
+ # Record SNS success topic in manifest (non-blocking)
617
+ ./do/manifest add \
618
+ --type sns-topic \
619
+ --id "${ASYNC_SNS_SUCCESS_TOPIC}" \
620
+ --project "${PROJECT_NAME}" \
621
+ --meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
622
+ 2>/dev/null || true
623
+
624
+ <% } else { %>
625
+ # Custom SNS success topic ARN provided โ€” skip topic creation
626
+ echo "โœ… Using custom SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC}"
627
+
628
+ # Record SNS success topic in manifest (non-blocking)
629
+ ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')
630
+ ./do/manifest add \
631
+ --type sns-topic \
632
+ --id "${ASYNC_SNS_SUCCESS_TOPIC}" \
633
+ --project "${PROJECT_NAME}" \
634
+ --meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
635
+ 2>/dev/null || true
636
+
637
+ <% } %>
638
+
639
+ # Extract topic name from SNS error topic ARN
640
+ ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')
641
+
642
+ <% if (!asyncSnsErrorTopic) { %>
643
+ # Bootstrap default SNS error topic (check-and-create)
644
+ echo "๐Ÿ” Checking if SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
645
+ if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_ERROR_TOPIC}" --region "${AWS_REGION}" 2>/dev/null; then
646
+ echo "๐Ÿ“ฆ Creating SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
647
+ if ! aws sns create-topic \
648
+ --name "${ASYNC_SNS_ERROR_TOPIC_NAME}" \
649
+ --region "${AWS_REGION}" > /dev/null; then
650
+ echo "โŒ Failed to create SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
651
+ echo ""
652
+ echo " Check that:"
653
+ echo " โ€ข Your IAM credentials have sns:CreateTopic permission"
654
+ exit 4
655
+ fi
656
+ echo "โœ… SNS error topic created: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
657
+ else
658
+ echo "โœ… SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
659
+ fi
660
+
661
+ # Record SNS error topic in manifest (non-blocking)
662
+ ./do/manifest add \
663
+ --type sns-topic \
664
+ --id "${ASYNC_SNS_ERROR_TOPIC}" \
665
+ --project "${PROJECT_NAME}" \
666
+ --meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
667
+ 2>/dev/null || true
668
+
669
+ <% } else { %>
670
+ # Custom SNS error topic ARN provided โ€” skip topic creation
671
+ echo "โœ… Using custom SNS error topic: ${ASYNC_SNS_ERROR_TOPIC}"
672
+
673
+ # Record SNS error topic in manifest (non-blocking)
674
+ ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')
675
+ ./do/manifest add \
676
+ --type sns-topic \
677
+ --id "${ASYNC_SNS_ERROR_TOPIC}" \
678
+ --project "${PROJECT_NAME}" \
679
+ --meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
680
+ 2>/dev/null || true
681
+
682
+ <% } %>
683
+
684
+ # ============================================================
685
+ # Create async endpoint (classic model-based flow)
686
+ # SageMaker async inference does NOT support Inference Components.
687
+ # Flow: create-model โ†’ create-endpoint-config (with AsyncInferenceConfig) โ†’ create-endpoint
688
+ # ============================================================
689
+
690
+ # Helper: persist a variable to do/config so other scripts can use it
+ # Arguments: $1 - variable name (e.g. ENDPOINT_NAME), $2 - value to store
+ # Behavior: rewrites an existing "export NAME=..." line in place, otherwise
+ # appends a new export line to ${SCRIPT_DIR}/config. Idempotent per name.
+ # NOTE(review): the sed expression uses '|' as its delimiter and injects
+ # var_name/var_value unescaped — assumes values contain no '|', '"' or
+ # sed-special characters; confirm callers only pass generated names.
+ _update_config_var() {
692
+ local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
693
+ # grep -q: existing export line? 2>/dev/null tolerates a missing config file
+ if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
694
+ sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
695
+ # -i.bak keeps BSD/macOS sed compatibility; the backup is discarded
+ rm -f "${config_file}.bak"
696
+ else
697
+ # leading blank line keeps appended exports visually separated
+ echo "" >> "${config_file}"
698
+ echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
699
+ fi
700
+ }
701
+
702
+ # Helper: query a SageMaker resource status, returns empty string if not found
+ # Arguments: $1 - SageMaker endpoint name
+ # Outputs: the EndpointStatus value (e.g. InService, Creating, Failed) on
+ # stdout, or an empty string when describe-endpoint fails (endpoint absent,
+ # expired credentials, etc. — errors are deliberately suppressed so callers
+ # can branch on the empty-string case instead of aborting under set -e).
+ _get_endpoint_status() {
704
+ aws sagemaker describe-endpoint \
705
+ --endpoint-name "$1" \
706
+ --region "${AWS_REGION}" \
707
+ --query EndpointStatus \
708
+ --output text 2>/dev/null || echo ""
709
+ }
710
+
711
+ # ============================================================
712
+ # Idempotency: check for existing deployment from a previous run
713
+ # ============================================================
714
+ SKIP_TO=""
715
+
716
+ if [ "${FORCE_NEW}" = true ]; then
717
+ echo "๐Ÿ”„ --force: ignoring previous deployment, creating new resources."
718
+ elif [ -n "${ENDPOINT_NAME:-}" ]; then
719
+ echo "๐Ÿ” Checking for existing deployment: ${ENDPOINT_NAME}"
720
+
721
+ EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
722
+
723
+ case "${EP_STATUS}" in
724
+ InService)
725
+ echo "โœ… Async endpoint already InService: ${ENDPOINT_NAME}"
726
+ echo ""
727
+ echo "๐Ÿ“‹ Deployment is already live. Nothing to do."
728
+ echo " Endpoint: ${ENDPOINT_NAME}"
729
+ echo ""
730
+ echo "๐Ÿงช Test your async endpoint:"
731
+ echo " ./do/test"
732
+ echo ""
733
+ echo "๐Ÿงน Clean up when done:"
734
+ echo " ./do/clean endpoint"
735
+ exit 0
736
+ ;;
737
+ Creating|Updating)
738
+ echo "โณ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
739
+ SKIP_TO="wait_endpoint"
740
+ ;;
741
+ Failed)
742
+ echo "โš ๏ธ Previous endpoint failed: ${ENDPOINT_NAME}"
743
+ echo " Creating a new deployment. Clean up the failed endpoint with:"
744
+ echo " ./do/clean endpoint"
745
+ echo ""
746
+ ;;
747
+ "")
748
+ echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
749
+ ;;
750
+ *)
751
+ echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
752
+ ;;
753
+ esac
754
+ fi
755
+
756
+ # ============================================================
757
+ # Create async resources (skip if resuming from wait)
758
+ # ============================================================
759
+ if [ -z "${SKIP_TO}" ]; then
760
+ TIMESTAMP=$(date +%s)
761
+ MODEL_NAME_SM="${PROJECT_NAME}-async-model-${TIMESTAMP}"
762
+ ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-async-epc-${TIMESTAMP}"
763
+ ENDPOINT_NAME="${PROJECT_NAME}-async-ep-${TIMESTAMP}"
764
+
765
+ _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
766
+ _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
767
+ _update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"
768
+
769
+ # Step 1: Create SageMaker model
770
+ echo "๐Ÿ“ฆ Creating SageMaker model: ${MODEL_NAME_SM}"
771
+ if ! aws sagemaker create-model \
772
+ --model-name "${MODEL_NAME_SM}" \
773
+ --primary-container "{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\"}" \
774
+ --execution-role-arn "${ROLE_ARN}" \
775
+ --region "${AWS_REGION}"; then
776
+
777
+ echo "โŒ Failed to create SageMaker model"
778
+ echo " Check that:"
779
+ echo " โ€ข The execution role ARN is valid"
780
+ echo " โ€ข The ECR image exists and is accessible"
781
+ echo " โ€ข The IAM role has ecr:GetDownloadUrlForLayer permission"
782
+ exit 4
783
+ fi
784
+
785
+ echo "โœ… SageMaker model created: ${MODEL_NAME_SM}"
786
+
787
+ # Record model in manifest (non-blocking)
788
+ MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
789
+ ./do/manifest add \
790
+ --type sagemaker-model \
791
+ --id "${MODEL_ARN}" \
792
+ --project "${PROJECT_NAME}" \
793
+ --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
794
+ 2>/dev/null || true
795
+
796
+ # Build production variant JSON (classic: includes ModelName, no execution-role-arn on endpoint config)
797
+ VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"ModelName\":\"${MODEL_NAME_SM}\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"
798
+
799
+ # Append InferenceAmiVersion if configured
800
+ if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
801
+ VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
802
+ echo " AMI version: ${INFERENCE_AMI_VERSION}"
803
+ fi
804
+
805
+ VARIANT_JSON="${VARIANT_JSON}}]"
806
+
807
+ # Build AsyncInferenceConfig JSON
808
+ ASYNC_CONFIG="{\"OutputConfig\":{\"S3OutputPath\":\"${ASYNC_S3_OUTPUT_PATH}\",\"NotificationConfig\":{\"SuccessTopic\":\"${ASYNC_SNS_SUCCESS_TOPIC}\",\"ErrorTopic\":\"${ASYNC_SNS_ERROR_TOPIC}\"}}"
809
+
810
+ if [ -n "${ASYNC_MAX_CONCURRENT_INVOCATIONS:-}" ]; then
811
+ ASYNC_CONFIG="${ASYNC_CONFIG},\"ClientConfig\":{\"MaxConcurrentInvocationsPerInstance\":${ASYNC_MAX_CONCURRENT_INVOCATIONS}}"
812
+ fi
813
+
814
+ ASYNC_CONFIG="${ASYNC_CONFIG}}"
815
+
816
+ # Step 2: Create endpoint configuration with AsyncInferenceConfig (no --execution-role-arn)
817
+ echo "โš™๏ธ Creating async endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
818
+ if ! aws sagemaker create-endpoint-config \
819
+ --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
820
+ --production-variants "${VARIANT_JSON}" \
821
+ --async-inference-config "${ASYNC_CONFIG}" \
822
+ --region "${AWS_REGION}"; then
823
+
824
+ echo "โŒ Failed to create async endpoint configuration"
825
+ echo " Check that:"
826
+ echo " โ€ข The S3 output path is accessible: ${ASYNC_S3_OUTPUT_PATH}"
827
+ echo " โ€ข The IAM role has s3:PutObject permission on the output path"
828
+ echo " โ€ข The instance type is valid: ${INSTANCE_TYPE}"
829
+ echo " โ€ข The instance type is available in region: ${AWS_REGION}"
830
+ echo " โ€ข You have sufficient service quota for the instance type"
831
+ exit 4
832
+ fi
833
+
834
+ echo "โœ… Async endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"
835
+
836
+ # Record endpoint config in manifest (non-blocking)
837
+ ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
838
+ ./do/manifest add \
839
+ --type sagemaker-endpoint-config \
840
+ --id "${ENDPOINT_CONFIG_ARN}" \
841
+ --project "${PROJECT_NAME}" \
842
+ --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
843
+ 2>/dev/null || true
844
+
845
+ # Step 3: Create endpoint
846
+ echo "๐Ÿš€ Creating async endpoint: ${ENDPOINT_NAME}"
847
+ if ! aws sagemaker create-endpoint \
848
+ --endpoint-name "${ENDPOINT_NAME}" \
849
+ --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
850
+ --region "${AWS_REGION}"; then
851
+
852
+ echo "โŒ Failed to create async endpoint"
853
+ echo " Check that:"
854
+ echo " โ€ข Your IAM credentials have sagemaker:CreateEndpoint permission"
855
+ echo " โ€ข You have sufficient service quota in region: ${AWS_REGION}"
856
+ exit 4
857
+ fi
858
+
859
+ echo "โœ… Async endpoint creation initiated: ${ENDPOINT_NAME}"
860
+
861
+ # Record endpoint in manifest (non-blocking)
862
+ ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
863
+ ./do/manifest add \
864
+ --type sagemaker-endpoint \
865
+ --id "${ENDPOINT_ARN}" \
866
+ --project "${PROJECT_NAME}" \
867
+ --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
868
+ 2>/dev/null || true
869
+ fi
870
+
871
+ # ============================================================
872
+ # Wait for endpoint (skip if already InService)
873
+ # ============================================================
874
+ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
875
+ echo "โณ Waiting for async endpoint to reach InService status..."
876
+ echo " This may take several minutes..."
877
+ echo " If this times out, re-run ./do/deploy to resume."
878
+
879
+ if ! aws sagemaker wait endpoint-in-service \
880
+ --endpoint-name "${ENDPOINT_NAME}" \
881
+ --region "${AWS_REGION}"; then
882
+
883
+ # Check if it was a credential expiration vs actual failure
884
+ EP_CHECK=$(_get_endpoint_status "${ENDPOINT_NAME}" 2>/dev/null)
885
+ if [ "${EP_CHECK}" = "Creating" ]; then
886
+ echo ""
887
+ echo "โš ๏ธ Wait interrupted (credentials may have expired), but endpoint is still creating."
888
+ echo " Refresh your credentials and re-run ./do/deploy to resume."
889
+ echo ""
890
+ echo " Or check status manually:"
891
+ echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION} --query EndpointStatus"
892
+ exit 4
893
+ fi
894
+
895
+ echo "โŒ Async endpoint failed to reach InService status"
896
+ echo " Check CloudWatch Logs for details:"
897
+ echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
898
+ exit 4
899
+ fi
900
+ fi
901
+
902
+ echo "โœ… Async deployment complete!"
903
+ echo ""
904
+ echo "๐Ÿ“‹ Deployment Details:"
905
+ echo " Endpoint: ${ENDPOINT_NAME}"
906
+ echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME}"
907
+ echo " Model: ${MODEL_NAME_SM}"
908
+ echo " Region: ${AWS_REGION}"
909
+ echo " Instance Type: ${INSTANCE_TYPE}"
910
+ echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
911
+ echo " S3 Output: ${ASYNC_S3_OUTPUT_PATH}"
912
+ echo " SNS Success: ${ASYNC_SNS_SUCCESS_TOPIC}"
913
+ echo " SNS Error: ${ASYNC_SNS_ERROR_TOPIC}"
914
+ echo ""
915
+ echo "๐Ÿงช Test your async endpoint:"
916
+ echo " ./do/test"
917
+ echo ""
918
+ echo "๐Ÿ“ Register this deployment:"
919
+ echo " ./do/register"
920
+ echo ""
921
+ echo "๐Ÿ” Monitor your deployment:"
922
+ echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}"
923
+ echo ""
924
+ echo "๐Ÿงน Clean up when done:"
925
+ echo " ./do/clean endpoint"
926
+
927
+ <% } else if (deploymentTarget === 'hyperpod-eks') { %>
928
+ # ============================================================
929
+ # HyperPod EKS Deployment
930
+ # ============================================================
931
+
932
+ # Get kubeconfig for HyperPod cluster
933
+ echo "๐Ÿ”‘ Configuring kubectl for HyperPod cluster..."
934
+ KUBECONFIG_PATH="${HOME}/.kube/hyperpod-${HYPERPOD_CLUSTER_NAME}"
935
+
936
+ # Step 1: Describe the HyperPod cluster to get the underlying EKS cluster ARN
937
+ EKS_CLUSTER_ARN=$(aws sagemaker describe-cluster \
938
+ --cluster-name "${HYPERPOD_CLUSTER_NAME}" \
939
+ --region "${AWS_REGION}" \
940
+ --query "Orchestrator.Eks.ClusterArn" \
941
+ --output text 2>&1) || {
942
+ echo "โŒ Failed to describe HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
943
+ echo ""
944
+ echo " Error details:"
945
+ echo " ${EKS_CLUSTER_ARN}"
946
+ echo ""
947
+ echo " Check that:"
948
+ echo " โ€ข The cluster name is correct"
949
+ echo " โ€ข The cluster exists in region: ${AWS_REGION}"
950
+ echo " โ€ข Your IAM user/role has permission to access the cluster"
951
+ echo ""
952
+ echo " Required IAM permissions:"
953
+ echo " โ€ข sagemaker:DescribeCluster"
954
+ echo " โ€ข eks:DescribeCluster"
955
+ exit 4
956
+ }
957
+
958
+ # Step 2: Extract the EKS cluster name from the ARN
959
+ EKS_CLUSTER_NAME=$(echo "${EKS_CLUSTER_ARN}" | awk -F'/' '{print $NF}')
960
+ echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
961
+ echo " EKS cluster: ${EKS_CLUSTER_NAME}"
962
+
963
+ # Step 3: Update kubeconfig using the EKS cluster
964
+ if ! aws eks update-kubeconfig \
965
+ --name "${EKS_CLUSTER_NAME}" \
966
+ --region "${AWS_REGION}" \
967
+ --kubeconfig "${KUBECONFIG_PATH}" 2>&1; then
968
+ echo "โŒ Failed to configure kubectl for EKS cluster: ${EKS_CLUSTER_NAME}"
969
+ echo ""
970
+ echo " Required IAM permissions:"
971
+ echo " โ€ข eks:DescribeCluster"
972
+ echo " โ€ข eks:AccessKubernetesApi"
973
+ exit 4
974
+ fi
975
+
976
+ export KUBECONFIG="${KUBECONFIG_PATH}"
977
+ echo "โœ… Kubeconfig saved to: ${KUBECONFIG_PATH}"
978
+
979
+ # Verify cluster connectivity
980
+ echo "๐Ÿ” Verifying cluster connectivity..."
981
+ if ! kubectl cluster-info &> /dev/null; then
982
+ echo "โŒ Cannot connect to HyperPod cluster"
983
+ echo ""
984
+ echo " Check that:"
985
+ echo " โ€ข The cluster is in 'InService' status"
986
+ echo " โ€ข Your network can reach the cluster API server"
987
+ echo " โ€ข Your IAM credentials are valid"
988
+ exit 4
989
+ fi
990
+ echo "โœ… Connected to HyperPod cluster"
991
+
992
+ # Create namespace if it doesn't exist
993
+ echo "๐Ÿ“ Ensuring namespace exists: ${HYPERPOD_NAMESPACE}"
994
+ if ! kubectl create namespace "${HYPERPOD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>&1; then
995
+ echo "โš ๏ธ Warning: Could not create/verify namespace"
996
+ fi
997
+
998
+ # Apply Kubernetes manifests
999
+ echo "๐Ÿ“„ Applying Kubernetes manifests from hyperpod/..."
1000
+
1001
+ # Substitute shell variables (e.g. ${AWS_ACCOUNT_ID}) in manifests before applying
1002
+ export AWS_ACCOUNT_ID
1003
+ export ECR_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"
1004
+
1005
+ APPLY_OUTPUT=""
1006
+ APPLY_EXIT_CODE=0
1007
+ for manifest in hyperpod/*.yaml; do
1008
+ # Skip files that contain no Kubernetes objects (e.g. comment-only PVC stubs)
1009
+ RENDERED=$(envsubst < "${manifest}")
1010
+ if echo "${RENDERED}" | grep -q '^kind:'; then
1011
+ FILE_OUTPUT=$(echo "${RENDERED}" | kubectl apply -n "${HYPERPOD_NAMESPACE}" -f - 2>&1) || {
1012
+ APPLY_EXIT_CODE=$?
1013
+ }
1014
+ APPLY_OUTPUT="${APPLY_OUTPUT}${FILE_OUTPUT}\n"
1015
+ fi
1016
+ done
1017
+
1018
+ if [ "${APPLY_EXIT_CODE}" -ne 0 ]; then
1019
+ echo ""
1020
+ echo "โŒ Failed to apply Kubernetes manifests"
1021
+ echo ""
1022
+ echo " Error details:"
1023
+ echo " ${APPLY_OUTPUT}"
1024
+ echo ""
1025
+ echo " Common issues:"
1026
+ echo " โ€ข Insufficient node capacity - check available GPU nodes"
1027
+ echo " โ€ข Resource requests exceed node capacity"
1028
+ echo " โ€ข RBAC permissions - ensure you have permission to create resources in namespace '${HYPERPOD_NAMESPACE}'"
1029
+ echo " โ€ข Invalid manifest syntax"
1030
+ <% if (fsxVolumeHandle) { %>
1031
+ echo " โ€ข PVC creation failure - verify the FSx CSI driver is installed on the cluster"
1032
+ echo " kubectl get csidriver -o name | grep fsx"
1033
+ <% } %>
1034
+ echo ""
1035
+ echo " Debug commands:"
1036
+ echo " kubectl get nodes -o wide"
1037
+ echo " kubectl describe nodes"
1038
+ echo " kubectl get events -n ${HYPERPOD_NAMESPACE}"
1039
+ exit ${APPLY_EXIT_CODE}
1040
+ fi
1041
+
1042
+ echo "โœ… Kubernetes manifests applied"
1043
+
1044
+ # Record k8s deployment and service in manifest (non-blocking)
1045
+ ./do/manifest add \
1046
+ --type k8s-deployment \
1047
+ --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
1048
+ --project "${PROJECT_NAME}" \
1049
+ --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"deploymentName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
1050
+ 2>/dev/null || true
1051
+
1052
+ ./do/manifest add \
1053
+ --type k8s-service \
1054
+ --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
1055
+ --project "${PROJECT_NAME}" \
1056
+ --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"serviceName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
1057
+ 2>/dev/null || true
1058
+
1059
+ # Wait for deployment to be ready
1060
+ DEPLOY_TIMEOUT=${DEPLOY_TIMEOUT:-1200}
1061
+ echo "โณ Waiting for deployment to be ready (timeout: ${DEPLOY_TIMEOUT}s)..."
1062
+ echo " This may take several minutes for GPU workloads..."
1063
+ echo ""
1064
+
1065
+ # Poll pod status every 30s while rollout is in progress
1066
+ (
1067
+ while true; do
1068
+ sleep 30
1069
+ POD_STATUS=$(kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l app=${PROJECT_NAME} \
1070
+ --no-headers 2>/dev/null | head -5)
1071
+ if [ -n "${POD_STATUS}" ]; then
1072
+ echo " ๐Ÿ“Š $(date +%H:%M:%S) Pod status:"
1073
+ echo "${POD_STATUS}" | while read -r line; do echo " ${line}"; done
1074
+ fi
1075
+ done
1076
+ ) &
1077
+ STATUS_PID=$!
1078
+ trap "kill ${STATUS_PID} 2>/dev/null; wait ${STATUS_PID} 2>/dev/null" EXIT
1079
+
1080
+ ROLLOUT_OUTPUT=$(kubectl rollout status deployment/${PROJECT_NAME} -n "${HYPERPOD_NAMESPACE}" --timeout=${DEPLOY_TIMEOUT}s 2>&1) || {
1081
+ ROLLOUT_EXIT_CODE=$?
1082
+ kill ${STATUS_PID} 2>/dev/null
1083
+ echo ""
1084
+ echo "โŒ Deployment failed to become ready within timeout"
1085
+ echo ""
1086
+ echo " Error details:"
1087
+ echo " ${ROLLOUT_OUTPUT}"
1088
+ echo ""
1089
+ echo " Current pod state:"
1090
+ kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l app=${PROJECT_NAME} -o wide 2>/dev/null
1091
+ echo ""
1092
+ echo " Debug commands:"
1093
+ echo " kubectl describe pods -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
1094
+ echo " kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME} --tail=100"
1095
+ echo ""
1096
+ echo " Common issues:"
1097
+ echo " โ€ข Image pull errors - check ECR permissions"
1098
+ echo " โ€ข Resource scheduling - insufficient GPU nodes"
1099
+ echo " โ€ข Container crash - check application logs"
1100
+ <% if (fsxVolumeHandle) { %>
1101
+ echo " โ€ข PVC binding errors - verify FSx CSI driver is installed on the cluster"
1102
+ echo " kubectl get pvc -n ${HYPERPOD_NAMESPACE}"
1103
+ echo " kubectl describe pvc -n ${HYPERPOD_NAMESPACE}"
1104
+ echo " kubectl get csidriver -o name | grep fsx"
1105
+ <% } %>
1106
+ exit ${ROLLOUT_EXIT_CODE}
1107
+ }
1108
+
1109
+ kill ${STATUS_PID} 2>/dev/null
1110
+ wait ${STATUS_PID} 2>/dev/null
1111
+
1112
+ echo "โœ… HyperPod EKS deployment complete!"
1113
+ echo ""
1114
+ echo "๐Ÿ“‹ Deployment Details:"
1115
+ echo " Cluster: ${HYPERPOD_CLUSTER_NAME}"
1116
+ echo " Namespace: ${HYPERPOD_NAMESPACE}"
1117
+ echo " Deployment: ${PROJECT_NAME}"
1118
+ echo " Replicas: ${HYPERPOD_REPLICAS}"
1119
+ echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
1120
+ echo ""
1121
+ echo "๐Ÿ” Check deployment status:"
1122
+ echo " export KUBECONFIG=${KUBECONFIG_PATH}"
1123
+ echo " kubectl get pods -n ${HYPERPOD_NAMESPACE}"
1124
+ echo " kubectl get svc -n ${HYPERPOD_NAMESPACE}"
1125
+ echo ""
1126
+ echo "๐Ÿงช Test your deployment:"
1127
+ echo " ./do/test"
1128
+ echo ""
1129
+ echo "๐Ÿ“ Register this deployment:"
1130
+ echo " ./do/register"
1131
+ echo ""
1132
+ echo "๐Ÿ“‹ View logs:"
1133
+ echo " ./do/logs"
1134
+ echo ""
1135
+ echo "๐Ÿงน Clean up when done:"
1136
+ echo " ./do/clean hyperpod"
1137
+
1138
+ # Write kubeconfig path to config so other scripts can use it (idempotent)
+ # Arguments: $1 - variable name, $2 - value to store
+ # Behavior: rewrites an existing "export NAME=..." line in ${SCRIPT_DIR}/config
+ # in place, otherwise appends a new export line.
+ # NOTE(review): duplicate of the _update_config_var helper emitted by the
+ # other deployment-target template branches — only one branch is rendered
+ # per generated script, so the duplication is template-level, not runtime.
+ # NOTE(review): sed uses '|' as delimiter with unescaped interpolation;
+ # assumes values contain no '|' or sed-special characters.
+ _update_config_var() {
1140
+ local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
1141
+ # grep -q: does an export line already exist? missing file tolerated
+ if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
1142
+ sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
1143
+ # -i.bak keeps BSD/macOS sed compatibility; drop the backup file
+ rm -f "${config_file}.bak"
1144
+ else
1145
+ echo "" >> "${config_file}"
1146
+ echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
1147
+ fi
1148
+ }
1149
+
1150
+ _update_config_var "KUBECONFIG" "${KUBECONFIG_PATH}"
1151
+
1152
+ <% } else if (deploymentTarget === 'batch-transform') { %>
1153
+ # ============================================================
1154
+ # SageMaker Managed Inference - Batch Deployment
1155
+ # Flow: create-model โ†’ create-transform-job โ†’ poll until completion
1156
+ # ============================================================
1157
+
1158
+ # Validate execution role ARN
1159
+ if [ -z "${ROLE_ARN:-}" ]; then
1160
+ echo "โŒ Execution role ARN not provided"
1161
+ echo ""
1162
+ echo "Usage:"
1163
+ echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
1164
+ echo " ./do/deploy"
1165
+ echo ""
1166
+ echo "Or set ROLE_ARN in do/config"
1167
+ echo ""
1168
+ echo "The execution role must have permissions for:"
1169
+ echo " โ€ข SageMaker model and transform job management"
1170
+ echo " โ€ข ECR image access"
1171
+ echo " โ€ข S3 read access for input path: ${BATCH_INPUT_PATH}"
1172
+ echo " โ€ข S3 write access for output path: ${BATCH_OUTPUT_PATH}"
1173
+ echo " โ€ข CloudWatch Logs"
1174
+ exit 3
1175
+ fi
1176
+
1177
+ echo " Using execution role: ${ROLE_ARN}"
1178
+
1179
+ # Validate S3 input path
1180
+ if [ -z "${BATCH_INPUT_PATH:-}" ]; then
1181
+ echo "โŒ S3 input path not provided"
1182
+ echo ""
1183
+ echo "Set BATCH_INPUT_PATH in do/config or provide via CLI:"
1184
+ echo " export BATCH_INPUT_PATH=s3://my-bucket/input/"
1185
+ echo " ./do/deploy"
1186
+ exit 3
1187
+ fi
1188
+
1189
+ if [[ "${BATCH_INPUT_PATH}" != s3://* ]]; then
1190
+ echo "โŒ S3 input path must start with s3://"
1191
+ echo " Current value: ${BATCH_INPUT_PATH}"
1192
+ echo " Example: s3://my-bucket/input/"
1193
+ exit 3
1194
+ fi
1195
+
1196
+ # Validate S3 output path
1197
+ if [ -z "${BATCH_OUTPUT_PATH:-}" ]; then
1198
+ echo "โŒ S3 output path not provided"
1199
+ echo ""
1200
+ echo "Set BATCH_OUTPUT_PATH in do/config or provide via CLI:"
1201
+ echo " export BATCH_OUTPUT_PATH=s3://my-bucket/output/"
1202
+ echo " ./do/deploy"
1203
+ exit 3
1204
+ fi
1205
+
1206
+ if [[ "${BATCH_OUTPUT_PATH}" != s3://* ]]; then
1207
+ echo "โŒ S3 output path must start with s3://"
1208
+ echo " Current value: ${BATCH_OUTPUT_PATH}"
1209
+ echo " Example: s3://my-bucket/output/"
1210
+ exit 3
1211
+ fi
1212
+
1213
+ # ============================================================
1214
+ # Bootstrap S3 buckets for batch transform
1215
+ # ============================================================
1216
+
1217
+ # Extract bucket names from S3 paths
1218
+ BATCH_INPUT_BUCKET=$(echo "${BATCH_INPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
1219
+ BATCH_OUTPUT_BUCKET=$(echo "${BATCH_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
1220
+
1221
+ <% if (!batchInputPath) { %>
1222
+ # Bootstrap default S3 input bucket (check-and-create)
1223
+ echo "๐Ÿ” Checking if S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
1224
+ if ! aws s3api head-bucket --bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
1225
+ echo "๐Ÿ“ฆ Creating S3 input bucket: ${BATCH_INPUT_BUCKET}"
1226
+ if [ "${AWS_REGION}" = "us-east-1" ]; then
1227
+ if ! aws s3api create-bucket \
1228
+ --bucket "${BATCH_INPUT_BUCKET}" \
1229
+ --region "${AWS_REGION}"; then
1230
+ echo "โŒ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
1231
+ echo ""
1232
+ echo " Check that:"
1233
+ echo " โ€ข Your IAM credentials have s3:CreateBucket permission"
1234
+ echo " โ€ข The bucket name is not already taken globally"
1235
+ exit 4
1236
+ fi
1237
+ else
1238
+ if ! aws s3api create-bucket \
1239
+ --bucket "${BATCH_INPUT_BUCKET}" \
1240
+ --region "${AWS_REGION}" \
1241
+ --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
1242
+ echo "โŒ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
1243
+ echo ""
1244
+ echo " Check that:"
1245
+ echo " โ€ข Your IAM credentials have s3:CreateBucket permission"
1246
+ echo " โ€ข The bucket name is not already taken globally"
1247
+ exit 4
1248
+ fi
1249
+ fi
1250
+ echo "โœ… S3 input bucket created: ${BATCH_INPUT_BUCKET}"
1251
+ else
1252
+ echo "โœ… S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
1253
+ fi
1254
+
1255
+ # Upload sample input file if the input prefix is empty
1256
+ EXISTING_OBJECTS=$(aws s3 ls "${BATCH_INPUT_PATH}" --region "${AWS_REGION}" 2>/dev/null | head -1 || true)
1257
+ if [ -z "${EXISTING_OBJECTS}" ]; then
1258
+ echo "๐Ÿ“„ Uploading sample input file to ${BATCH_INPUT_PATH}"
1259
+ <% if (framework === 'transformers' && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
1260
+ echo '{"model": "<%= modelName %>", "messages": [{"role": "user", "content": "What is machine learning?"}], "max_tokens": 50}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1261
+ <% } else if (framework === 'transformers') { %>
1262
+ echo '{"inputs": "What is machine learning?", "parameters": {"max_new_tokens": 50}}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1263
+ <% } else if (framework === 'diffusors') { %>
1264
+ echo '{"prompt": "A white cat", "n": 1, "size": "512x512"}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1265
+ <% } else { %>
1266
+ echo '{"instances": [[1.0, 2.0, 3.0, 4.0]]}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
1267
+ <% } %>
1268
+ echo "โœ… Sample input uploaded: ${BATCH_INPUT_PATH}sample.jsonl"
1269
+ echo " โš ๏ธ Replace this with your actual input data before running production jobs"
1270
+ fi
1271
+ <% } else { %>
1272
+ # Custom S3 input path provided โ€” skip bucket creation
1273
+ echo "โœ… Using custom S3 input path: ${BATCH_INPUT_PATH}"
1274
+ <% } %>
1275
+
1276
+ <% if (!batchOutputPath) { %>
1277
+ # Bootstrap default S3 output bucket (check-and-create, may be same as input)
1278
+ if [ "${BATCH_OUTPUT_BUCKET}" != "${BATCH_INPUT_BUCKET}" ]; then
1279
+ echo "๐Ÿ” Checking if S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
1280
+ if ! aws s3api head-bucket --bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
1281
+ echo "๐Ÿ“ฆ Creating S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
1282
+ if [ "${AWS_REGION}" = "us-east-1" ]; then
1283
+ if ! aws s3api create-bucket \
1284
+ --bucket "${BATCH_OUTPUT_BUCKET}" \
1285
+ --region "${AWS_REGION}"; then
1286
+ echo "โŒ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
1287
+ exit 4
1288
+ fi
1289
+ else
1290
+ if ! aws s3api create-bucket \
1291
+ --bucket "${BATCH_OUTPUT_BUCKET}" \
1292
+ --region "${AWS_REGION}" \
1293
+ --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
1294
+ echo "โŒ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
1295
+ exit 4
1296
+ fi
1297
+ fi
1298
+ echo "โœ… S3 output bucket created: ${BATCH_OUTPUT_BUCKET}"
1299
+ else
1300
+ echo "โœ… S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
1301
+ fi
1302
+ else
1303
+ echo "โœ… S3 output bucket same as input: ${BATCH_OUTPUT_BUCKET}"
1304
+ fi
1305
+ <% } else { %>
1306
+ # Custom S3 output path provided โ€” skip bucket creation
1307
+ echo "โœ… Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
1308
+ <% } %>
1309
+
1310
+ # Helper: persist a variable to do/config so other scripts can use it
+ # Arguments: $1 - variable name (e.g. TRANSFORM_JOB_NAME), $2 - value
+ # Behavior: rewrites an existing "export NAME=..." line in ${SCRIPT_DIR}/config
+ # in place, otherwise appends a new export line. Idempotent per name.
+ # NOTE(review): duplicate of the helper in the other template branches —
+ # only one deployment-target branch is rendered into the final script.
+ # NOTE(review): sed uses '|' as delimiter with unescaped interpolation;
+ # assumes values contain no '|' or sed-special characters.
+ _update_config_var() {
1312
+ local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
1313
+ # grep -q: does an export line already exist? missing file tolerated
+ if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
1314
+ sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
1315
+ # -i.bak keeps BSD/macOS sed compatibility; drop the backup file
+ rm -f "${config_file}.bak"
1316
+ else
1317
+ echo "" >> "${config_file}"
1318
+ echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
1319
+ fi
1320
+ }
1321
+
1322
# ============================================================
# Check for previous transform job still running
# ============================================================
# Guard against launching a second job on top of an unfinished one.
# Skipped when --force was given (FORCE_NEW=true) or when no previous
# job name was persisted to the config by an earlier run.
if [ "${FORCE_NEW}" != true ] && [ -n "${TRANSFORM_JOB_NAME:-}" ]; then
    echo "๐Ÿ” Checking previous transform job: ${TRANSFORM_JOB_NAME}"
    # Empty string when describe fails (job deleted/expired or API error).
    PREV_JOB_STATUS=$(aws sagemaker describe-transform-job \
        --transform-job-name "${TRANSFORM_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query "TransformJobStatus" \
        --output text 2>/dev/null || echo "")

    case "${PREV_JOB_STATUS}" in
        InProgress)
            # Abort with exit 4 (the code used for deploy failures
            # throughout this script) rather than run two jobs at once.
            echo "โš ๏ธ Previous transform job is still running: ${TRANSFORM_JOB_NAME}"
            echo " Wait for it to complete, or stop it with:"
            echo " aws sagemaker stop-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
            echo ""
            echo " Use --force to create a new job anyway."
            exit 4
            ;;
        Completed)
            # Informational only - a fresh job is still created below.
            echo "โœ… Previous transform job completed: ${TRANSFORM_JOB_NAME}"
            echo " Creating a new job. Results from the previous job are in:"
            echo " ${BATCH_OUTPUT_PATH}"
            echo ""
            ;;
        *)
            # Failed, Stopped, or not found โ€” proceed with new job
            ;;
    esac
fi
1353
+
1354
# Generate unique names with timestamp
# Epoch-seconds suffix keeps the model/job names unique across re-runs.
TIMESTAMP=$(date +%s)
MODEL_NAME_SM="${PROJECT_NAME}-batch-model-${TIMESTAMP}"
TRANSFORM_JOB_NAME="${PROJECT_NAME}-batch-job-${TIMESTAMP}"

# Persist the new names so later runs (and sibling do/* scripts) can
# find this job and model.
_update_config_var "TRANSFORM_JOB_NAME" "${TRANSFORM_JOB_NAME}"
_update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"

# Step 1: Create SageMaker model
# Registers the container image + execution role as a SageMaker model;
# the transform job created below references it by name.
echo "๐Ÿ“ฆ Creating SageMaker model: ${MODEL_NAME_SM}"
if ! aws sagemaker create-model \
    --model-name "${MODEL_NAME_SM}" \
    --primary-container "{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\"}" \
    --execution-role-arn "${ROLE_ARN}" \
    --region "${AWS_REGION}"; then

    echo "โŒ Failed to create SageMaker model"
    echo " Check that:"
    echo " โ€ข The execution role ARN is valid"
    echo " โ€ข The ECR image exists and is accessible"
    echo " โ€ข The IAM role has ecr:GetDownloadUrlForLayer permission"
    exit 4
fi

echo "โœ… SageMaker model created: ${MODEL_NAME_SM}"

# Record model in manifest (non-blocking)
# The ARN is constructed locally rather than queried; manifest failures
# are deliberately swallowed (|| true) so bookkeeping never blocks deploy.
MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
./do/manifest add \
    --type sagemaker-model \
    --id "${MODEL_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
1388
+
1389
# Step 2: Build transform job JSON
# The request body is assembled by a helper so the optional join-source
# fields are added with ordinary conditionals instead of the previous
# inline "$([ ... ] && echo ...)" substitutions: those leave the whole
# assignment with a non-zero exit status whenever BATCH_JOIN_SOURCE is
# not "Input", which would abort the script if it runs under `set -e`.
#######################################
# Emit the CreateTransformJob request JSON on stdout.
# Globals (read): TRANSFORM_JOB_NAME, MODEL_NAME_SM, BATCH_INPUT_PATH,
#   BATCH_OUTPUT_PATH, BATCH_SPLIT_TYPE, INSTANCE_TYPE,
#   BATCH_INSTANCE_COUNT, BATCH_STRATEGY, BATCH_JOIN_SOURCE,
#   BATCH_MAX_CONCURRENT_TRANSFORMS (default 1),
#   BATCH_MAX_PAYLOAD_IN_MB (default 6)
#######################################
_build_transform_job_json() {
    local accept_fields="" join_fields=""
    # When joining output with input, the assembled output must match the
    # split type and DataProcessing.JoinSource must be set.
    if [ "${BATCH_JOIN_SOURCE:-None}" = "Input" ]; then
        accept_fields=",\"Accept\": \"application/json\", \"AssembleWith\": \"${BATCH_SPLIT_TYPE}\""
        join_fields=",\"DataProcessing\": { \"JoinSource\": \"Input\" }"
    fi
    cat <<EOF
{
  "TransformJobName": "${TRANSFORM_JOB_NAME}",
  "ModelName": "${MODEL_NAME_SM}",
  "TransformInput": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "${BATCH_INPUT_PATH}"
      }
    },
    "ContentType": "application/json",
    "SplitType": "${BATCH_SPLIT_TYPE}"
  },
  "TransformOutput": {
    "S3OutputPath": "${BATCH_OUTPUT_PATH}"
    ${accept_fields}
  },
  "TransformResources": {
    "InstanceType": "${INSTANCE_TYPE}",
    "InstanceCount": ${BATCH_INSTANCE_COUNT}
  },
  "MaxConcurrentTransforms": ${BATCH_MAX_CONCURRENT_TRANSFORMS:-1},
  "MaxPayloadInMB": ${BATCH_MAX_PAYLOAD_IN_MB:-6},
  "BatchStrategy": "${BATCH_STRATEGY}"
  ${join_fields}
}
EOF
}

TRANSFORM_JOB_JSON="$(_build_transform_job_json)"
1416
+
1417
# Step 3: Create transform job
# The full request body built in Step 2 is passed via --cli-input-json.
echo "๐Ÿš€ Creating transform job: ${TRANSFORM_JOB_NAME}"
if ! aws sagemaker create-transform-job \
    --cli-input-json "${TRANSFORM_JOB_JSON}" \
    --region "${AWS_REGION}"; then

    echo "โŒ Failed to create transform job"
    echo " Check that:"
    echo " โ€ข The S3 input path exists and is accessible: ${BATCH_INPUT_PATH}"
    echo " โ€ข The S3 output path is writable: ${BATCH_OUTPUT_PATH}"
    echo " โ€ข The IAM role has s3:GetObject permission on the input path"
    echo " โ€ข The IAM role has s3:PutObject permission on the output path"
    echo " โ€ข The instance type is valid: ${INSTANCE_TYPE}"
    echo " โ€ข The instance type is available in region: ${AWS_REGION}"
    echo " โ€ข You have sufficient service quota for the instance type"
    exit 4
fi

echo "โœ… Transform job created: ${TRANSFORM_JOB_NAME}"

# Record transform job in manifest (non-blocking)
# ARN is built locally; manifest errors are swallowed (|| true) so
# bookkeeping never fails the deploy.
TRANSFORM_JOB_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:transform-job/${TRANSFORM_JOB_NAME}"
./do/manifest add \
    --type sagemaker-transform-job \
    --id "${TRANSFORM_JOB_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"transformJobName\":\"${TRANSFORM_JOB_NAME}\",\"modelName\":\"${MODEL_NAME_SM}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
1445
+
1446
# Step 4: Poll transform job status until completion or failure
echo "โณ Waiting for transform job to complete..."
echo " This may take several minutes depending on dataset size..."
echo " If this times out, check status with:"
echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
echo ""

# Poll every 30s indefinitely; the loop exits only via break (Completed)
# or exit 4 (Failed / Stopped / describe error).
while true; do
    # stderr is folded into JOB_STATUS (2>&1) so the failure handler below
    # can inspect the error text. The assignment still sets JOB_STATUS even
    # when the command fails, which is what the handler relies on.
    # NOTE(review): on a *successful* call, any CLI warning on stderr would
    # also land in JOB_STATUS and fall into the catch-all case arm - confirm.
    JOB_STATUS=$(aws sagemaker describe-transform-job \
        --transform-job-name "${TRANSFORM_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query "TransformJobStatus" \
        --output text 2>&1) || {
        # Check if it was a credential expiration
        if echo "${JOB_STATUS}" | grep -qi "expired\|token"; then
            echo ""
            echo "โš ๏ธ Credentials expired, but the transform job is still running."
            echo " Refresh your credentials and check status with:"
            echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION} --query TransformJobStatus"
            exit 4
        fi
        echo "โŒ Failed to describe transform job: ${TRANSFORM_JOB_NAME}"
        echo " Error: ${JOB_STATUS}"
        exit 4
    }

    case "${JOB_STATUS}" in
        Completed)
            echo "โœ… Transform job completed successfully!"
            break
            ;;
        Failed)
            # Surface the service-reported reason plus debugging pointers;
            # "Unknown" if the second describe call itself fails.
            FAILURE_REASON=$(aws sagemaker describe-transform-job \
                --transform-job-name "${TRANSFORM_JOB_NAME}" \
                --region "${AWS_REGION}" \
                --query "FailureReason" \
                --output text 2>/dev/null || echo "Unknown")
            echo "โŒ Transform job failed"
            echo " Reason: ${FAILURE_REASON}"
            echo ""
            echo " Check CloudWatch Logs for details:"
            echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/TransformJobs"
            echo ""
            echo " Verify that:"
            echo " โ€ข The S3 input path exists and contains data: ${BATCH_INPUT_PATH}"
            echo " โ€ข The input data format matches the container's expected format"
            echo " โ€ข The container's /ping and /invocations endpoints work correctly"
            exit 4
            ;;
        Stopped)
            echo "โš ๏ธ Transform job was stopped"
            exit 4
            ;;
        InProgress)
            echo " $(date +%H:%M:%S) Job status: InProgress..."
            sleep 30
            ;;
        *)
            # Any other status (or warning text captured above): keep polling.
            echo " $(date +%H:%M:%S) Job status: ${JOB_STATUS}..."
            sleep 30
            ;;
    esac
done
1509
+
1510
# Print a human-readable summary of what was deployed.
echo ""
echo "๐Ÿ“‹ Deployment Details:"
echo " Transform Job: ${TRANSFORM_JOB_NAME}"
echo " Model: ${MODEL_NAME_SM}"
echo " Region: ${AWS_REGION}"
echo " Instance Type: ${INSTANCE_TYPE}"
echo " Instance Count: ${BATCH_INSTANCE_COUNT}"
echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
echo " S3 Input: ${BATCH_INPUT_PATH}"
echo " S3 Output: ${BATCH_OUTPUT_PATH}"
echo " Split Type: ${BATCH_SPLIT_TYPE}"
echo " Strategy: ${BATCH_STRATEGY}"
echo ""

# Download results locally
# Results land beside the script dir in ../batch-output; a download
# failure is non-fatal since the results remain in S3.
LOCAL_OUTPUT_DIR="${SCRIPT_DIR}/../batch-output"
mkdir -p "${LOCAL_OUTPUT_DIR}"
echo "๐Ÿ“ฅ Downloading results to ${LOCAL_OUTPUT_DIR}/"
if aws s3 sync "${BATCH_OUTPUT_PATH}" "${LOCAL_OUTPUT_DIR}/" --region "${AWS_REGION}"; then
    # Count of top-level entries only (subdirectories counted as one).
    DOWNLOADED=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | wc -l | tr -d ' ')
    echo "โœ… Downloaded ${DOWNLOADED} file(s) to ${LOCAL_OUTPUT_DIR}/"
    echo ""

    # Display first output file preview
    # NOTE(review): assumes the first directory entry is a regular file;
    # if s3 sync created subdirectories, head/wc would fail - confirm
    # the transform output layout.
    FIRST_FILE=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | head -1)
    if [ -n "${FIRST_FILE}" ]; then
        echo "๐Ÿ“„ Sample output (${FIRST_FILE}):"
        head -5 "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}"
        LINES=$(wc -l < "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" | tr -d ' ')
        if [ "${LINES}" -gt 5 ]; then
            echo " ... (${LINES} total lines)"
        fi
    fi
else
    echo "โš ๏ธ Could not download output files"
fi

# Next-step hints for the user.
echo ""
echo "๐Ÿงช Review results:"
echo " ./do/test"
echo ""
echo "๐Ÿ“ Register this deployment:"
echo " ./do/register"
echo ""
echo "๐Ÿ“‹ View logs:"
echo " ./do/logs"
echo ""
echo "๐Ÿงน Clean up when done:"
echo " ./do/clean"
1559
+
1560
+ <% } %>