@aws/ml-container-creator 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,20 +9,59 @@ set -o pipefail
9
9
  # Parse flags
10
10
  FORCE_NEW=false
11
11
  FORCE_IC=false
12
- for arg in "$@"; do
13
- case "$arg" in
14
- --force) FORCE_NEW=true ;;
15
- --force-ic) FORCE_IC=true ;;
12
+ IC_TARGET=""
13
+ while [ $# -gt 0 ]; do
14
+ case "$1" in
15
+ --force) FORCE_NEW=true; shift ;;
16
+ --force-ic)
17
+ FORCE_IC=true
18
+ shift
19
+ <% if (deploymentTarget === 'realtime-inference') { %>
20
+ # Optional name argument: --force-ic <name>
21
+ if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
22
+ IC_TARGET="$1"
23
+ shift
24
+ fi
25
+ <% } %>
26
+ ;;
27
+ <% if (deploymentTarget === 'realtime-inference') { %>
28
+ --ic)
29
+ if [ -z "${2:-}" ]; then
30
+ echo "❌ --ic requires a name argument"
31
+ echo " Usage: ./do/deploy --ic <name>"
32
+ exit 1
33
+ fi
34
+ IC_TARGET="$2"
35
+ shift 2
36
+ ;;
37
+ <% } %>
16
38
  --help|-h)
39
+ <% if (deploymentTarget === 'realtime-inference') { %>
40
+ echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>]"
41
+ echo ""
42
+ echo "Options:"
43
+ echo " --force Create a new endpoint and IC, even if one already exists."
44
+ echo " --force-ic Recreate ALL inference components on the existing endpoint."
45
+ echo " --force-ic <name> Recreate only the named IC on the existing endpoint."
46
+ echo " --ic <name> Deploy only the named IC (from do/ic/<name>.conf)."
47
+ echo ""
48
+ echo "Without flags, deploy resumes from the last run."
49
+ <% } else { %>
17
50
  echo "Usage: ./do/deploy [--force] [--force-ic]"
18
51
  echo ""
19
52
  echo "Options:"
20
- echo " --force Create a new endpoint and IC, even if one already exists."
21
- echo " --force-ic Recreate just the IC on the existing endpoint."
53
+ echo " --force Create a new endpoint, even if one already exists."
54
+ echo " --force-ic Recreate the inference component on the existing endpoint."
22
55
  echo ""
23
56
  echo "Without flags, deploy resumes from the last run."
57
+ <% } %>
24
58
  exit 0
25
59
  ;;
60
+ *)
61
+ echo "❌ Unknown option: $1"
62
+ echo " Run ./do/deploy --help for usage."
63
+ exit 1
64
+ ;;
26
65
  esac
27
66
  done
28
67
 
@@ -37,7 +76,11 @@ echo " Region: ${AWS_REGION}"
37
76
  echo " Build target: ${BUILD_TARGET}"
38
77
  echo " Deployment target: ${DEPLOYMENT_TARGET}"
39
78
  <% if (deploymentTarget === 'realtime-inference') { %>
40
- echo " Instance type: ${INSTANCE_TYPE}"
79
+ if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
80
+ echo " Endpoint: ${ENDPOINT_NAME} (external)"
81
+ else
82
+ echo " Instance type: ${INSTANCE_TYPE}"
83
+ fi
41
84
  <% } else if (deploymentTarget === 'async-inference') { %>
42
85
  echo " Instance type: ${INSTANCE_TYPE}"
43
86
  echo " S3 output: ${ASYNC_S3_OUTPUT_PATH}"
@@ -135,6 +178,12 @@ fi
135
178
  # SageMaker Real-Time Inference Deployment (Inference Components)
136
179
  # ============================================================
137
180
 
181
+ # Source shared helpers
182
+ source "${SCRIPT_DIR}/lib/secrets.sh"
183
+ source "${SCRIPT_DIR}/lib/wait.sh"
184
+ source "${SCRIPT_DIR}/lib/endpoint-config.sh"
185
+ source "${SCRIPT_DIR}/lib/inference-component.sh"
186
+
138
187
  # Validate execution role ARN
139
188
  if [ -z "${ROLE_ARN:-}" ]; then
140
189
  echo "❌ Execution role ARN not provided"
@@ -155,44 +204,30 @@ fi
155
204
 
156
205
  echo " Using execution role: ${ROLE_ARN}"
157
206
 
158
- # Helper: persist a variable to do/config so other scripts can use it
159
- _update_config_var() {
160
- local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
161
- if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
162
- sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
163
- rm -f "${config_file}.bak"
164
- else
165
- echo "" >> "${config_file}"
166
- echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
207
+ # Validate --ic argument if specified (set by --ic <name> or --force-ic <name>)
208
+ if [ -n "${IC_TARGET}" ]; then
209
+ if [ ! -d "${SCRIPT_DIR}/ic" ]; then
210
+ echo "❌ IC name specified but no do/ic/ directory found"
211
+ echo " This project does not use multi-IC configuration."
212
+ echo " Remove --ic/--force-ic <name> to deploy using the legacy single-IC path."
213
+ exit 1
167
214
  fi
168
- }
169
-
170
- # Helper: query a SageMaker resource status, returns empty string if not found
171
- _get_endpoint_status() {
172
- aws sagemaker describe-endpoint \
173
- --endpoint-name "$1" \
174
- --region "${AWS_REGION}" \
175
- --query EndpointStatus \
176
- --output text 2>/dev/null || echo ""
177
- }
178
-
179
- _get_ic_status() {
180
- aws sagemaker describe-inference-component \
181
- --inference-component-name "$1" \
182
- --region "${AWS_REGION}" \
183
- --query InferenceComponentStatus \
184
- --output text 2>/dev/null || echo ""
185
- }
215
+ if [ ! -f "${SCRIPT_DIR}/ic/${IC_TARGET}.conf" ]; then
216
+ echo "❌ IC config not found: do/ic/${IC_TARGET}.conf"
217
+ echo ""
218
+ echo " Available ICs:"
219
+ for conf in "${SCRIPT_DIR}"/ic/*.conf; do
220
+ [ -f "${conf}" ] || continue
221
+ echo "$(basename "${conf}" .conf)"
222
+ done
223
+ echo ""
224
+ echo " Usage: ./do/deploy --ic <name>"
225
+ exit 1
226
+ fi
227
+ fi
186
228
 
187
- # Helper: find an InService IC on an endpoint (returns first match or empty)
188
- _find_active_ic_on_endpoint() {
189
- aws sagemaker list-inference-components \
190
- --endpoint-name "$1" \
191
- --status-equals InService \
192
- --region "${AWS_REGION}" \
193
- --query 'InferenceComponents[0].InferenceComponentName' \
194
- --output text 2>/dev/null || echo ""
195
- }
229
+ # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
230
+ resolve_secrets
196
231
 
197
232
  # ============================================================
198
233
  # Idempotency: check for existing deployment from a previous run
@@ -204,7 +239,11 @@ if [ "${FORCE_NEW}" = true ]; then
204
239
  elif [ "${FORCE_IC}" = true ] && [ -n "${ENDPOINT_NAME:-}" ]; then
205
240
  EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
206
241
  if [ "${EP_STATUS}" = "InService" ]; then
207
- echo "🔄 --force-ic: recreating inference component on existing endpoint: ${ENDPOINT_NAME}"
242
+ if [ -n "${IC_TARGET}" ]; then
243
+ echo "🔄 --force-ic: recreating IC '${IC_TARGET}' on existing endpoint: ${ENDPOINT_NAME}"
244
+ else
245
+ echo "🔄 --force-ic: recreating ALL inference components on existing endpoint: ${ENDPOINT_NAME}"
246
+ fi
208
247
  SKIP_TO="create_ic"
209
248
  else
210
249
  echo "⚠️ --force-ic requires an InService endpoint, but ${ENDPOINT_NAME} is: ${EP_STATUS:-not found}"
@@ -242,7 +281,7 @@ elif [ -n "${ENDPOINT_NAME:-}" ]; then
242
281
  Creating)
243
282
  echo "⏳ Inference component still creating: ${INFERENCE_COMPONENT_NAME}"
244
283
  SKIP_TO="wait_ic"
245
- IC_NAME="${INFERENCE_COMPONENT_NAME}"
284
+ IC_DEPLOYED_NAME="${INFERENCE_COMPONENT_NAME}"
246
285
  ;;
247
286
  Failed)
248
287
  echo "⚠️ Inference component failed: ${INFERENCE_COMPONENT_NAME}"
@@ -251,47 +290,59 @@ elif [ -n "${ENDPOINT_NAME:-}" ]; then
251
290
  ;;
252
291
  *)
253
292
  # Stored IC not found — check if a different IC is running on this endpoint
254
- LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
255
- if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
256
- echo " Found running inference component on endpoint: ${LIVE_IC}"
257
- echo " (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
258
- _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
259
- echo ""
260
- echo "📋 Deployment is already live. Nothing to do."
261
- echo " Endpoint: ${ENDPOINT_NAME}"
262
- echo " Inference Component: ${LIVE_IC}"
263
- echo ""
264
- echo "🧪 Test your endpoint:"
265
- echo " ./do/test"
266
- echo ""
267
- echo "🧹 Clean up when done:"
268
- echo " ./do/clean endpoint"
269
- exit 0
270
- else
271
- echo " No existing inference component found on endpoint. Will create one."
293
+ if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
294
+ # External endpoint: never adopt ICs we didn't create
295
+ echo " Stored IC not found on external endpoint. Will create a new one."
272
296
  SKIP_TO="create_ic"
297
+ else
298
+ LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
299
+ if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
300
+ echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
301
+ echo " (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
302
+ _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
303
+ echo ""
304
+ echo "📋 Deployment is already live. Nothing to do."
305
+ echo " Endpoint: ${ENDPOINT_NAME}"
306
+ echo " Inference Component: ${LIVE_IC}"
307
+ echo ""
308
+ echo "🧪 Test your endpoint:"
309
+ echo " ./do/test"
310
+ echo ""
311
+ echo "🧹 Clean up when done:"
312
+ echo " ./do/clean endpoint"
313
+ exit 0
314
+ else
315
+ echo " No existing inference component found on endpoint. Will create one."
316
+ SKIP_TO="create_ic"
317
+ fi
273
318
  fi
274
319
  ;;
275
320
  esac
276
321
  else
277
322
  # No IC name in config — check if one is already running on the endpoint
278
- LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
279
- if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
280
- echo " Found running inference component on endpoint: ${LIVE_IC}"
281
- _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
282
- echo ""
283
- echo "📋 Deployment is already live. Nothing to do."
284
- echo " Endpoint: ${ENDPOINT_NAME}"
285
- echo " Inference Component: ${LIVE_IC}"
286
- echo ""
287
- echo "🧪 Test your endpoint:"
288
- echo " ./do/test"
289
- echo ""
290
- echo "🧹 Clean up when done:"
291
- echo " ./do/clean endpoint"
292
- exit 0
293
- else
323
+ if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
324
+ # External endpoint: never adopt ICs we didn't create
325
+ echo " No previous IC deployed by this project. Will create a new one."
294
326
  SKIP_TO="create_ic"
327
+ else
328
+ LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
329
+ if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
330
+ echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
331
+ _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
332
+ echo ""
333
+ echo "📋 Deployment is already live. Nothing to do."
334
+ echo " Endpoint: ${ENDPOINT_NAME}"
335
+ echo " Inference Component: ${LIVE_IC}"
336
+ echo ""
337
+ echo "🧪 Test your endpoint:"
338
+ echo " ./do/test"
339
+ echo ""
340
+ echo "🧹 Clean up when done:"
341
+ echo " ./do/clean endpoint"
342
+ exit 0
343
+ else
344
+ SKIP_TO="create_ic"
345
+ fi
295
346
  fi
296
347
  fi
297
348
  ;;
@@ -316,252 +367,399 @@ elif [ -n "${ENDPOINT_NAME:-}" ]; then
316
367
  fi
317
368
 
318
369
  # ============================================================
319
- # Step 1: Create endpoint configuration (skip if resuming)
370
+ # Step 1: Create endpoint configuration and endpoint (skip if resuming)
320
371
  # ============================================================
321
372
  if [ -z "${SKIP_TO}" ]; then
322
- TIMESTAMP=$(date +%s)
323
- ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-epc-${TIMESTAMP}"
324
- ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"
325
- IC_NAME="${PROJECT_NAME}-ic-${TIMESTAMP}"
326
-
327
- _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
328
- _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
329
- _update_config_var "INFERENCE_COMPONENT_NAME" "${IC_NAME}"
373
+ if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
374
+ # External endpoint: validate it still exists and is InService
375
+ echo "🔗 Using external endpoint: ${ENDPOINT_NAME}"
376
+ echo " Validating endpoint status..."
330
377
 
331
- # Build production variant JSON
332
- VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"
378
+ EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
333
379
 
334
- if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
335
- VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
336
- echo " AMI version: ${INFERENCE_AMI_VERSION}"
337
- fi
380
+ if [ -z "${EP_STATUS}" ]; then
381
+ echo "❌ External endpoint not found: ${ENDPOINT_NAME}"
382
+ echo " The endpoint may have been deleted. Update ENDPOINT_NAME in do/config"
383
+ echo " or remove ENDPOINT_EXTERNAL=true to create a new endpoint."
384
+ exit 4
385
+ fi
338
386
 
339
- if [ -n "${CAPACITY_RESERVATION_ARN:-}" ]; then
340
- VARIANT_JSON="${VARIANT_JSON},\"CapacityReservationConfig\":{\"CapacityReservationPreference\":\"capacity-reservations-only\",\"MlReservationArn\":\"${CAPACITY_RESERVATION_ARN}\"}"
341
- echo " ⚠️ Capacity reservation (experimental): ${CAPACITY_RESERVATION_ARN}"
342
- fi
387
+ if [ "${EP_STATUS}" != "InService" ]; then
388
+ echo "❌ External endpoint not InService: ${ENDPOINT_NAME} (status: ${EP_STATUS})"
389
+ echo " The endpoint must be InService before attaching inference components."
390
+ echo " Wait for the endpoint to become InService, or update do/config."
391
+ exit 4
392
+ fi
343
393
 
344
- VARIANT_JSON="${VARIANT_JSON}}]"
394
+ echo "✅ External endpoint is InService: ${ENDPOINT_NAME}"
395
+ # Skip directly to IC creation — no endpoint config, no endpoint creation, no wait
396
+ SKIP_TO="create_ic"
397
+ else
398
+ TIMESTAMP=$(date +%s)
399
+ ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"
345
400
 
346
- echo "⚙️ Creating endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
347
- if ! aws sagemaker create-endpoint-config \
348
- --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
349
- --execution-role-arn "${ROLE_ARN}" \
350
- --production-variants "${VARIANT_JSON}" \
351
- --region "${AWS_REGION}"; then
401
+ _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
352
402
 
353
- echo "❌ Failed to create endpoint configuration"
354
- echo " Check that:"
355
- echo " • The execution role ARN is valid"
356
- echo " • The instance type is valid: ${INSTANCE_TYPE}"
357
- echo " • The instance type is available in region: ${AWS_REGION}"
358
- echo " • You have sufficient service quota for the instance type"
359
- exit 4
360
- fi
403
+ # Create endpoint configuration via shared helper
404
+ create_endpoint_config
361
405
 
362
- echo " Endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"
406
+ _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
363
407
 
364
- # Record endpoint config in manifest (non-blocking)
365
- ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
366
- ./do/manifest add \
367
- --type sagemaker-endpoint-config \
368
- --id "${ENDPOINT_CONFIG_ARN}" \
369
- --project "${PROJECT_NAME}" \
370
- --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
371
- 2>/dev/null || true
408
+ # Record endpoint config in manifest (non-blocking)
409
+ ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
410
+ ./do/manifest add \
411
+ --type sagemaker-endpoint-config \
412
+ --id "${ENDPOINT_CONFIG_ARN}" \
413
+ --project "${PROJECT_NAME}" \
414
+ --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
415
+ 2>/dev/null || true
372
416
 
373
- # Step 2: Create endpoint
374
- echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
375
- if ! aws sagemaker create-endpoint \
376
- --endpoint-name "${ENDPOINT_NAME}" \
377
- --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
378
- --region "${AWS_REGION}"; then
417
+ # Step 2: Create endpoint
418
+ echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
419
+ if ! aws sagemaker create-endpoint \
420
+ --endpoint-name "${ENDPOINT_NAME}" \
421
+ --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
422
+ --region "${AWS_REGION}"; then
379
423
 
380
- echo "❌ Failed to create endpoint"
381
- echo " Check that:"
382
- echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
383
- echo " • You have sufficient service quota in region: ${AWS_REGION}"
384
- exit 4
385
- fi
424
+ echo "❌ Failed to create endpoint"
425
+ echo " Check that:"
426
+ echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
427
+ echo " • You have sufficient service quota in region: ${AWS_REGION}"
428
+ exit 4
429
+ fi
386
430
 
387
- echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"
431
+ echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"
388
432
 
389
- # Record endpoint in manifest (non-blocking)
390
- ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
391
- ./do/manifest add \
392
- --type sagemaker-endpoint \
393
- --id "${ENDPOINT_ARN}" \
394
- --project "${PROJECT_NAME}" \
395
- --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
396
- 2>/dev/null || true
433
+ # Record endpoint in manifest (non-blocking)
434
+ ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
435
+ ./do/manifest add \
436
+ --type sagemaker-endpoint \
437
+ --id "${ENDPOINT_ARN}" \
438
+ --project "${PROJECT_NAME}" \
439
+ --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
440
+ 2>/dev/null || true
441
+ fi
397
442
  fi
398
443
 
399
444
  # ============================================================
400
- # Wait for endpoint (skip if already InService)
445
+ # Wait for endpoint (skip if already InService or external)
401
446
  # ============================================================
402
447
  if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
403
448
  echo "⏳ Waiting for endpoint to reach InService status..."
404
449
  echo " This may take a few minutes..."
405
450
  echo " If this times out, re-run ./do/deploy to resume."
406
451
 
407
- if ! aws sagemaker wait endpoint-in-service \
408
- --endpoint-name "${ENDPOINT_NAME}" \
409
- --region "${AWS_REGION}"; then
410
-
411
- # Check if it was a credential expiration vs actual failure
412
- EP_CHECK=$(_get_endpoint_status "${ENDPOINT_NAME}" 2>/dev/null)
413
- if [ "${EP_CHECK}" = "Creating" ]; then
414
- echo ""
415
- echo "⚠️ Wait interrupted (credentials may have expired), but endpoint is still creating."
416
- echo " Refresh your credentials and re-run ./do/deploy to resume."
417
- echo ""
418
- echo " Or check status manually:"
419
- echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION} --query EndpointStatus"
420
- exit 4
421
- fi
422
-
423
- echo "❌ Endpoint failed to reach InService status"
424
- echo " Check CloudWatch Logs for details:"
425
- echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
426
- exit 4
427
- fi
452
+ wait_endpoint "${ENDPOINT_NAME}"
428
453
 
429
454
  echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"
430
455
  fi
431
456
 
432
457
  # ============================================================
433
- # Step 3: Create inference component (skip if resuming from wait_ic)
458
+ # Step 3: Deploy inference components (skip if resuming from wait_ic)
434
459
  # ============================================================
435
460
  if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
436
- # Generate new IC name if resuming after endpoint wait or failed IC
437
- if [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
438
- TIMESTAMP=$(date +%s)
439
- IC_NAME="${PROJECT_NAME}-ic-${TIMESTAMP}"
440
- _update_config_var "INFERENCE_COMPONENT_NAME" "${IC_NAME}"
441
- fi
442
461
 
443
- # Build container spec JSON
444
- CONTAINER_SPEC="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
445
- if [ -n "${CONTAINER_ENV_JSON}" ]; then
446
- CONTAINER_SPEC="${CONTAINER_SPEC},\"Environment\":{${CONTAINER_ENV_JSON}}"
447
- fi
448
- CONTAINER_SPEC="${CONTAINER_SPEC}}"
462
+ if [ -d "${SCRIPT_DIR}/ic" ]; then
463
# _check_gpu_capacity
# Best-effort capacity guardrail for multi-IC deployments.
#
# Sums IC_GPU_COUNT across every do/ic/*.conf and compares the total with the
# GPU count hardcoded below for INSTANCE_TYPE. Prints a warning when the total
# exceeds the instance's GPUs; it never fails the deployment.
#
# Globals:  INSTANCE_TYPE (read, may be unset), SCRIPT_DIR (read)
# Outputs:  warning text on stdout
# Returns:  0 always
_check_gpu_capacity() {
  # Skip check if no INSTANCE_TYPE (external endpoints)
  if [ -z "${INSTANCE_TYPE:-}" ]; then
    return 0
  fi

  # NOTE: Only do/ic/*.conf files are counted. Adapter ICs (do/adapters/*.conf)
  # share the base IC's GPU resources and have no ComputeResourceRequirements,
  # so they are intentionally excluded from this capacity check.
  #
  # Hardcoded GPU counts for common SageMaker GPU instance types
  local instance_gpus=""
  case "${INSTANCE_TYPE}" in
    ml.g4dn.xlarge) instance_gpus=1 ;;
    ml.g4dn.12xlarge) instance_gpus=4 ;;
    ml.g5.xlarge) instance_gpus=1 ;;
    ml.g5.2xlarge) instance_gpus=1 ;;
    ml.g5.4xlarge) instance_gpus=1 ;;
    ml.g5.8xlarge) instance_gpus=1 ;;
    ml.g5.12xlarge) instance_gpus=4 ;;
    ml.g5.48xlarge) instance_gpus=8 ;;
    ml.g6.xlarge) instance_gpus=1 ;;
    ml.g6.12xlarge) instance_gpus=4 ;;
    ml.g6.48xlarge) instance_gpus=8 ;;
    ml.g6e.xlarge) instance_gpus=1 ;;
    ml.g6e.2xlarge) instance_gpus=1 ;;
    ml.g6e.4xlarge) instance_gpus=1 ;;
    ml.g6e.8xlarge) instance_gpus=1 ;;
    ml.g6e.12xlarge) instance_gpus=4 ;;
    ml.g6e.48xlarge) instance_gpus=8 ;;
    ml.g7e.xlarge) instance_gpus=1 ;;
    ml.g7e.2xlarge) instance_gpus=1 ;;
    ml.g7e.4xlarge) instance_gpus=1 ;;
    ml.g7e.8xlarge) instance_gpus=1 ;;
    ml.g7e.12xlarge) instance_gpus=4 ;;
    ml.g7e.48xlarge) instance_gpus=8 ;;
    ml.p3.2xlarge) instance_gpus=1 ;;
    ml.p3.8xlarge) instance_gpus=4 ;;
    ml.p3.16xlarge) instance_gpus=8 ;;
    ml.p4d.24xlarge) instance_gpus=8 ;;
    ml.p4de.24xlarge) instance_gpus=8 ;;
    ml.p5.48xlarge) instance_gpus=8 ;;
    *) instance_gpus="" ;;
  esac

  # Skip check if instance type not in map
  if [ -z "${instance_gpus}" ]; then
    return 0
  fi

  # Sum IC_GPU_COUNT across all IC config files
  local total_gpu_requested=0
  local conf ic_gpus
  for conf in "${SCRIPT_DIR}"/ic/*.conf; do
    [ -f "${conf}" ] || continue

    # Take the last assignment (the one that would win if the file were
    # sourced), drop a trailing "# comment", then strip quotes and whitespace
    # (including a CR from CRLF files).
    ic_gpus=$(sed -n 's/^export IC_GPU_COUNT=//p' "${conf}" 2>/dev/null \
      | tail -n 1 \
      | sed 's/[[:space:]]#.*$//' \
      | tr -d "\"'[:space:]") || ic_gpus=""

    case "${ic_gpus}" in
      '')
        # Not set in this file: an IC defaults to one GPU.
        ic_gpus=1
        ;;
      *[!0-9]*)
        # Never feed a non-numeric string to $(( )): it would abort this
        # best-effort check (and shell arithmetic evaluates it recursively).
        echo "⚠️ Ignoring invalid IC_GPU_COUNT in do/ic/$(basename "${conf}") — assuming 1 GPU."
        ic_gpus=1
        ;;
    esac

    # 10# forces base 10 so a value like "08" is not parsed as invalid octal.
    total_gpu_requested=$(( total_gpu_requested + 10#${ic_gpus} ))
  done

  if [ "${total_gpu_requested}" -gt "${instance_gpus}" ]; then
    echo ""
    echo "⚠️ GPU capacity warning: ICs request ${total_gpu_requested} GPUs total, but ${INSTANCE_TYPE} has ${instance_gpus} GPUs."
    echo " SageMaker will likely reject IC creation if capacity is exceeded."
    echo " Consider reducing IC_GPU_COUNT values or using a larger instance type."
    echo ""
  fi

  return 0
}
477
539
 
478
- # Record inference component in manifest (non-blocking)
479
- IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_NAME}"
480
- ./do/manifest add \
481
- --type sagemaker-inference-component \
482
- --id "${IC_ARN}" \
483
- --project "${PROJECT_NAME}" \
484
- --meta "{\"inferenceComponentName\":\"${IC_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
485
- 2>/dev/null || true
486
- fi
540
+ # Run capacity guardrail before deploying ICs
541
+ _check_gpu_capacity
542
+
543
# _delete_and_wait_ic <ic_name>
# Deletes an inference component and polls until it is no longer found
# (avoids name conflicts on recreate).
#
# Arguments: $1 - name of the inference component to delete
# Globals:   AWS_REGION (read)
# Calls:     _get_ic_status (defined outside this block; an empty result is
#            treated here as "IC not found")
# Returns:   0 when the IC is gone, was already gone, or the wait timed out
#            (the caller proceeds in all three cases);
#            1 when the delete call failed and the IC verifiably still exists
_delete_and_wait_ic() {
  local ic_name="$1"
  local delete_timeout=600 # 10 minutes max wait for deletion
  local delete_err ic_status delete_start elapsed

  echo "🗑️ Deleting inference component: ${ic_name}"
  # Capture stderr instead of discarding it so a real failure can be shown.
  if ! delete_err=$(aws sagemaker delete-inference-component \
    --inference-component-name "${ic_name}" \
    --region "${AWS_REGION}" 2>&1 >/dev/null); then

    # A failed delete is only harmless if the IC is actually gone.
    ic_status=$(_get_ic_status "${ic_name}")
    case "${ic_status}" in
      "")
        echo " ⚠️ Delete call failed (IC may already be gone). Continuing..."
        return 0
        ;;
      Deleting)
        # A deletion is already in progress — fall through and wait for it.
        ;;
      *)
        # The IC still exists: recreating now would leave it orphaned.
        echo " ❌ Failed to delete inference component: ${ic_name} (status: ${ic_status})"
        if [ -n "${delete_err}" ]; then
          echo " ${delete_err}"
        fi
        return 1
        ;;
    esac
  fi

  echo " Waiting for deletion to complete..."
  delete_start=$(date +%s)

  while true; do
    ic_status=$(_get_ic_status "${ic_name}")

    if [ -z "${ic_status}" ]; then
      echo " ✅ Inference component deleted: ${ic_name}"
      break
    fi

    elapsed=$(( $(date +%s) - delete_start ))
    if [ "${elapsed}" -ge "${delete_timeout}" ]; then
      echo " ⚠️ Deletion timed out after ${delete_timeout}s. IC status: ${ic_status}"
      echo " Proceeding anyway — SageMaker may reject the new IC if name conflicts."
      break
    fi

    echo " $(date +%H:%M:%S) Deleting... (${ic_status}, ${elapsed}s elapsed)"
    sleep 15
  done

  return 0
}
582
+
583
# _deploy_single_ic <conf_file>
# Deploys a single IC with per-IC idempotency:
#   - FORCE_IC is true               → delete the recorded IC (if any), clear
#                                      its state in the conf file, create fresh
#   - recorded IC is InService       → skip
#   - recorded IC is Creating        → wait for it, then record it
#   - recorded IC is Failed/unknown  → create a new IC
#   - no recorded IC                 → create a new IC
#
# Arguments: $1 - path to the IC config file (do/ic/<name>.conf)
# Globals:   FORCE_IC, AWS_REGION, AWS_ACCOUNT_ID, PROJECT_NAME, ENDPOINT_NAME,
#            INSTANCE_TYPE (read); IC_DEPLOYED_NAME (written)
# Calls:     _delete_and_wait_ic, _update_config_var, _get_ic_status,
#            create_inference_component, wait_ic — defined outside this block.
#            NOTE(review): create_inference_component is assumed to set
#            IC_DEPLOYED_NAME; confirm against its definition.
# Returns:   0 on success, 1 when a helper reports failure. Helper results are
#            checked explicitly because errexit is ignored whenever this
#            function runs as an `if !` condition.
_deploy_single_ic() {
  local ic_conf="$1"
  local ic_basename
  ic_basename=$(basename "${ic_conf}" .conf)

  # Read IC_DEPLOYED_NAME from the conf file without sourcing it (keeps the
  # file's other exports out of this shell). The last assignment wins; quotes
  # and a CR from CRLF files are stripped, so unquoted values work too.
  local existing_ic_name=""
  existing_ic_name=$(sed -n 's/^export IC_DEPLOYED_NAME=//p' "${ic_conf}" 2>/dev/null \
    | tail -n 1 \
    | tr -d "\"'\r") || existing_ic_name=""

  local need_create=false

  if [ "${FORCE_IC}" = true ]; then
    # --force-ic: delete existing IC before recreating
    if [ -n "${existing_ic_name}" ]; then
      echo "🔄 --force-ic: recreating IC '${ic_basename}'"
      _delete_and_wait_ic "${existing_ic_name}" || return 1

      # Clear deployed state from config before recreating
      _update_config_var "IC_DEPLOYED_NAME" "" "${ic_conf}"
      _update_config_var "IC_DEPLOYED_AT" "" "${ic_conf}"
    fi
    need_create=true
  elif [ -n "${existing_ic_name}" ]; then
    # IC was previously deployed — check its current status
    local ic_status
    ic_status=$(_get_ic_status "${existing_ic_name}")

    case "${ic_status}" in
      InService)
        echo "✅ IC '${ic_basename}' already InService: ${existing_ic_name} — skipping"
        IC_DEPLOYED_NAME="${existing_ic_name}"
        return 0
        ;;
      Creating)
        echo "⏳ IC '${ic_basename}' is still Creating: ${existing_ic_name} — waiting..."
        # Fall through to the shared wait + manifest step: the run that
        # created this IC most likely stopped before recording it.
        IC_DEPLOYED_NAME="${existing_ic_name}"
        ;;
      Failed)
        echo "⚠️ IC '${ic_basename}' previously Failed: ${existing_ic_name} — recreating..."
        need_create=true
        ;;
      *)
        echo " IC '${ic_basename}' has unknown/missing status for ${existing_ic_name} — creating new..."
        need_create=true
        ;;
    esac
  else
    # No previous deployment — create new IC
    need_create=true
  fi

  if [ "${need_create}" = true ]; then
    create_inference_component "${ic_conf}" || return 1

    echo "⏳ Waiting for inference component to reach InService status..."
    echo " This may take 5-10 minutes..."
  fi

  wait_ic "${IC_DEPLOYED_NAME}" || return 1

  echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"

  # Record inference component in manifest (non-blocking)
  local ic_arn="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
  ./do/manifest add \
    --type sagemaker-inference-component \
    --id "${ic_arn}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
}
665
+
666
+ if [ -n "${IC_TARGET}" ]; then
667
+ # Single IC path: deploy only the named IC
511
668
  echo ""
512
- echo " Debug:"
513
- echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
514
- exit 4
515
- ;;
516
- Creating)
517
- # Check timeout
518
- IC_ELAPSED=$(( $(date +%s) - IC_WAIT_START ))
519
- if [ "${IC_ELAPSED}" -ge "${IC_WAIT_TIMEOUT}" ]; then
520
- echo ""
521
- echo "⚠️ Inference component still creating after ${IC_WAIT_TIMEOUT}s."
522
- echo " Re-run ./do/deploy to resume waiting."
669
+ echo "── Deploying IC: ${IC_TARGET} ──"
670
+ _deploy_single_ic "${SCRIPT_DIR}/ic/${IC_TARGET}.conf"
671
+ else
672
+ # Multi-IC path: iterate all IC config files (alphabetical order)
673
+ IC_SUMMARY=""
674
+ IC_DEPLOY_FAILED=false
675
+
676
+ for conf in "${SCRIPT_DIR}"/ic/*.conf; do
677
+ [ -f "${conf}" ] || continue
678
+ local_ic_basename=$(basename "${conf}" .conf)
523
679
  echo ""
524
- echo " Or check status manually:"
525
- echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
680
+ echo "── Deploying IC: ${local_ic_basename} ──"
681
+
682
+ if ! _deploy_single_ic "${conf}"; then
683
+ echo "❌ IC '${local_ic_basename}' failed to deploy. Stopping."
684
+ IC_SUMMARY="${IC_SUMMARY} ${local_ic_basename}: FAILED\n"
685
+ IC_DEPLOY_FAILED=true
686
+ break
687
+ fi
688
+
689
+ IC_SUMMARY="${IC_SUMMARY} ${local_ic_basename}: ${IC_DEPLOYED_NAME} [InService]\n"
690
+ done
691
+
692
+ # Print summary
693
+ echo ""
694
+ echo "📋 IC Deployment Summary:"
695
+ echo -e "${IC_SUMMARY}"
696
+
697
+ if [ "${IC_DEPLOY_FAILED}" = true ]; then
698
+ echo "❌ Deployment stopped due to IC failure. Fix the issue and re-run ./do/deploy to resume."
526
699
  exit 4
527
700
  fi
528
- echo " $(date +%H:%M:%S) Status: Creating (${IC_ELAPSED}s elapsed)..."
529
- sleep 30
530
- ;;
531
- "")
532
- echo "⚠️ Could not determine inference component status (credentials may have expired)."
533
- echo " Re-run ./do/deploy to resume."
534
- exit 4
535
- ;;
536
- *)
537
- echo " $(date +%H:%M:%S) Status: ${IC_STATUS}..."
538
- sleep 30
539
- ;;
540
- esac
541
- done
701
+ fi
702
+ else
703
+ # Legacy single-IC path: no do/ic/ directory
704
+ create_inference_component_legacy
705
+
706
+ echo "⏳ Waiting for inference component to reach InService status..."
707
+ echo " This may take 5-10 minutes..."
708
+ echo " If this times out, re-run ./do/deploy to resume."
709
+
710
+ wait_ic "${IC_DEPLOYED_NAME}"
711
+
712
+ echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
713
+
714
+ # Record inference component in manifest (non-blocking)
715
+ IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
716
+ ./do/manifest add \
717
+ --type sagemaker-inference-component \
718
+ --id "${IC_ARN}" \
719
+ --project "${PROJECT_NAME}" \
720
+ --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
721
+ 2>/dev/null || true
722
+ fi
723
+
724
+ elif [ "${SKIP_TO}" = "wait_ic" ]; then
725
+ # Resuming: just wait for the IC that was already being created
726
+ echo "⏳ Waiting for inference component to reach InService status..."
727
+ echo " This may take 5-10 minutes..."
728
+ echo " If this times out, re-run ./do/deploy to resume."
729
+
730
+ wait_ic "${IC_DEPLOYED_NAME}"
731
+
732
+ echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
733
+ fi
542
734
 
543
735
  echo "✅ Deployment complete!"
544
736
  echo ""
545
737
  echo "📋 Deployment Details:"
546
738
  echo " Endpoint: ${ENDPOINT_NAME}"
547
- echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME}"
548
- echo " Inference Component: ${IC_NAME}"
549
- echo " Region: ${AWS_REGION}"
550
- echo " Instance Type: ${INSTANCE_TYPE}"
739
+ if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
740
+ echo " Endpoint Config: (external — not managed by this project)"
741
+ echo " Region: ${AWS_REGION}"
742
+ else
743
+ echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME:-N/A}"
744
+ echo " Region: ${AWS_REGION}"
745
+ echo " Instance Type: ${INSTANCE_TYPE}"
746
+ fi
551
747
  echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
552
748
  echo ""
553
- echo "🧪 Test your endpoint:"
554
- echo " ./do/test"
555
- echo ""
556
- echo "📝 Register this deployment:"
557
- echo " ./do/register"
558
- echo ""
559
- echo "🔍 Monitor your deployment:"
560
- echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
561
- echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}"
562
- echo ""
563
- echo "🧹 Clean up when done:"
564
- echo " ./do/clean endpoint"
749
+ echo "📋 What's next?"
750
+ echo " • Test your endpoint: ./do/test"
751
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
752
+ echo " • Benchmark performance: ./do/benchmark"
753
+ <% } %>
754
+ <% if (typeof enableLora !== 'undefined' && enableLora) { %>
755
+ echo " • Add a LoRA adapter: ./do/adapter add <name> --weights s3://..."
756
+ <% } %>
757
+ echo " • View endpoint status: ./do/status"
758
+ echo " • Register this deployment: ./do/register"
759
+ echo " • View logs: ./do/logs"
760
+ <% if (!(typeof existingEndpointName !== 'undefined' && existingEndpointName)) { %>
761
+ echo " • Clean up when done: ./do/clean endpoint"
762
+ <% } %>
565
763
 
566
764
  <% } else if (deploymentTarget === 'async-inference') { %>
567
765
  # ============================================================
@@ -570,6 +768,13 @@ echo " ./do/clean endpoint"
570
768
  # Flow: create-model → create-endpoint-config (with AsyncInferenceConfig) → create-endpoint
571
769
  # ============================================================
572
770
 
771
+ # Source shared helpers
772
+ source "${SCRIPT_DIR}/lib/secrets.sh"
773
+ source "${SCRIPT_DIR}/lib/wait.sh"
774
+
775
+ # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
776
+ resolve_secrets
777
+
573
778
  # Validate execution role ARN
574
779
  if [ -z "${ROLE_ARN:-}" ]; then
575
780
  echo "❌ Execution role ARN not provided"
@@ -732,27 +937,6 @@ ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $
732
937
  # Flow: create-model → create-endpoint-config (with AsyncInferenceConfig) → create-endpoint
733
938
  # ============================================================
734
939
 
735
- # Helper: persist a variable to do/config so other scripts can use it
736
- _update_config_var() {
737
- local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
738
- if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
739
- sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
740
- rm -f "${config_file}.bak"
741
- else
742
- echo "" >> "${config_file}"
743
- echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
744
- fi
745
- }
746
-
747
- # Helper: query a SageMaker resource status, returns empty string if not found
748
- _get_endpoint_status() {
749
- aws sagemaker describe-endpoint \
750
- --endpoint-name "$1" \
751
- --region "${AWS_REGION}" \
752
- --query EndpointStatus \
753
- --output text 2>/dev/null || echo ""
754
- }
755
-
756
940
  # ============================================================
757
941
  # Idempotency: check for existing deployment from a previous run
758
942
  # ============================================================
@@ -928,27 +1112,7 @@ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
928
1112
  echo " This may take several minutes..."
929
1113
  echo " If this times out, re-run ./do/deploy to resume."
930
1114
 
931
- if ! aws sagemaker wait endpoint-in-service \
932
- --endpoint-name "${ENDPOINT_NAME}" \
933
- --region "${AWS_REGION}"; then
934
-
935
- # Check if it was a credential expiration vs actual failure
936
- EP_CHECK=$(_get_endpoint_status "${ENDPOINT_NAME}" 2>/dev/null)
937
- if [ "${EP_CHECK}" = "Creating" ]; then
938
- echo ""
939
- echo "⚠️ Wait interrupted (credentials may have expired), but endpoint is still creating."
940
- echo " Refresh your credentials and re-run ./do/deploy to resume."
941
- echo ""
942
- echo " Or check status manually:"
943
- echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION} --query EndpointStatus"
944
- exit 4
945
- fi
946
-
947
- echo "❌ Async endpoint failed to reach InService status"
948
- echo " Check CloudWatch Logs for details:"
949
- echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
950
- exit 4
951
- fi
1115
+ wait_endpoint "${ENDPOINT_NAME}"
952
1116
  fi
953
1117
 
954
1118
  echo "✅ Async deployment complete!"
@@ -964,17 +1128,15 @@ echo " S3 Output: ${ASYNC_S3_OUTPUT_PATH}"
964
1128
  echo " SNS Success: ${ASYNC_SNS_SUCCESS_TOPIC}"
965
1129
  echo " SNS Error: ${ASYNC_SNS_ERROR_TOPIC}"
966
1130
  echo ""
967
- echo "🧪 Test your async endpoint:"
968
- echo " ./do/test"
969
- echo ""
970
- echo "📝 Register this deployment:"
971
- echo " ./do/register"
972
- echo ""
973
- echo "🔍 Monitor your deployment:"
974
- echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}"
975
- echo ""
976
- echo "🧹 Clean up when done:"
977
- echo " ./do/clean endpoint"
1131
+ echo "📋 What's next?"
1132
+ echo " • Test your async endpoint: ./do/test"
1133
+ echo " • Check async output: aws s3 ls ${ASYNC_S3_OUTPUT_PATH}"
1134
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
1135
+ echo " • Benchmark performance: ./do/benchmark"
1136
+ <% } %>
1137
+ echo " • Register this deployment: ./do/register"
1138
+ echo " • View logs: ./do/logs"
1139
+ echo " • Clean up when done: ./do/clean endpoint"
978
1140
 
979
1141
  <% } else if (deploymentTarget === 'hyperpod-eks') { %>
980
1142
  # ============================================================
@@ -1170,22 +1332,16 @@ echo " Deployment: ${PROJECT_NAME}"
1170
1332
  echo " Replicas: ${HYPERPOD_REPLICAS}"
1171
1333
  echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
1172
1334
  echo ""
1173
- echo "🔍 Check deployment status:"
1174
- echo " export KUBECONFIG=${KUBECONFIG_PATH}"
1175
- echo " kubectl get pods -n ${HYPERPOD_NAMESPACE}"
1176
- echo " kubectl get svc -n ${HYPERPOD_NAMESPACE}"
1177
- echo ""
1178
- echo "🧪 Test your deployment:"
1179
- echo " ./do/test"
1180
- echo ""
1181
- echo "📝 Register this deployment:"
1182
- echo " ./do/register"
1183
- echo ""
1184
- echo "📋 View logs:"
1185
- echo " ./do/logs"
1186
- echo ""
1187
- echo "🧹 Clean up when done:"
1188
- echo " ./do/clean hyperpod"
1335
+ echo "📋 What's next?"
1336
+ echo " • Test your deployment: ./do/test"
1337
+ echo " • Check pod status: kubectl get pods -n ${HYPERPOD_NAMESPACE}"
1338
+ echo " • View pod logs: kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
1339
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
1340
+ echo " • Benchmark performance: ./do/benchmark"
1341
+ <% } %>
1342
+ echo " • Register this deployment: ./do/register"
1343
+ echo " • View logs: ./do/logs"
1344
+ echo " • Clean up when done: ./do/clean hyperpod"
1189
1345
 
1190
1346
  # Write kubeconfig path to config so other scripts can use it (idempotent)
1191
1347
  _update_config_var() {
@@ -1207,6 +1363,13 @@ _update_config_var "KUBECONFIG" "${KUBECONFIG_PATH}"
1207
1363
  # Flow: create-model → create-transform-job → poll until completion
1208
1364
  # ============================================================
1209
1365
 
1366
+ # Source shared helpers
1367
+ source "${SCRIPT_DIR}/lib/secrets.sh"
1368
+ source "${SCRIPT_DIR}/lib/wait.sh"
1369
+
1370
+ # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
1371
+ resolve_secrets
1372
+
1210
1373
  # Validate execution role ARN
1211
1374
  if [ -z "${ROLE_ARN:-}" ]; then
1212
1375
  echo "❌ Execution role ARN not provided"
@@ -1359,18 +1522,6 @@ fi
1359
1522
  echo "✅ Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
1360
1523
  <% } %>
1361
1524
 
1362
- # Helper: persist a variable to do/config so other scripts can use it
1363
- _update_config_var() {
1364
- local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
1365
- if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
1366
- sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${var_value}\"|" "${config_file}"
1367
- rm -f "${config_file}.bak"
1368
- else
1369
- echo "" >> "${config_file}"
1370
- echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
1371
- fi
1372
- }
1373
-
1374
1525
  # ============================================================
1375
1526
  # Check for previous transform job still running
1376
1527
  # ============================================================
@@ -1605,16 +1756,11 @@ else
1605
1756
  fi
1606
1757
 
1607
1758
  echo ""
1608
- echo "🧪 Review results:"
1609
- echo " ./do/test"
1610
- echo ""
1611
- echo "📝 Register this deployment:"
1612
- echo " ./do/register"
1613
- echo ""
1614
- echo "📋 View logs:"
1615
- echo " ./do/logs"
1616
- echo ""
1617
- echo "🧹 Clean up when done:"
1618
- echo " ./do/clean"
1759
+ echo "📋 What's next?"
1760
+ echo " • View results: cat batch-output/"
1761
+ echo " • Review results: ./do/test"
1762
+ echo " • Register this deployment: ./do/register"
1763
+ echo " • View logs: ./do/logs"
1764
+ echo " • Clean up when done: ./do/clean"
1619
1765
 
1620
1766
  <% } %>