@aws/ml-container-creator 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/LICENSE-THIRD-PARTY +50760 -16218
  2. package/bin/cli.js +31 -137
  3. package/package.json +7 -2
  4. package/servers/lib/catalogs/instances.json +52 -1275
  5. package/servers/lib/catalogs/models.json +0 -132
  6. package/servers/lib/catalogs/popular-diffusors.json +1 -110
  7. package/src/app.js +29 -2
  8. package/src/lib/config-manager.js +17 -0
  9. package/src/lib/generated/cli-options.js +467 -0
  10. package/src/lib/generated/validation-rules.js +202 -0
  11. package/src/lib/mcp-client.js +16 -1
  12. package/src/lib/mcp-command-handler.js +10 -2
  13. package/src/lib/prompt-runner.js +16 -2
  14. package/src/lib/train-config-parser.js +136 -0
  15. package/src/lib/train-config-persistence.js +143 -0
  16. package/src/lib/train-config-validator.js +112 -0
  17. package/src/lib/train-feedback.js +46 -0
  18. package/src/lib/train-idempotency.js +97 -0
  19. package/src/lib/train-request-builder.js +120 -0
  20. package/templates/code/serve +5 -134
  21. package/templates/code/serve.d/lmi.ejs +19 -0
  22. package/templates/code/serve.d/sglang.ejs +47 -0
  23. package/templates/code/serve.d/tensorrt-llm.ejs +53 -0
  24. package/templates/code/serve.d/vllm.ejs +48 -0
  25. package/templates/do/.train_build_request.py +141 -0
  26. package/templates/do/.train_poll_parser.py +135 -0
  27. package/templates/do/.train_status_parser.py +187 -0
  28. package/templates/do/clean +1 -1387
  29. package/templates/do/clean.d/async-inference.ejs +508 -0
  30. package/templates/do/clean.d/batch-transform.ejs +512 -0
  31. package/templates/do/clean.d/hyperpod-eks.ejs +481 -0
  32. package/templates/do/clean.d/managed-inference.ejs +1043 -0
  33. package/templates/do/deploy +1 -1766
  34. package/templates/do/deploy.d/async-inference.ejs +501 -0
  35. package/templates/do/deploy.d/batch-transform.ejs +529 -0
  36. package/templates/do/deploy.d/hyperpod-eks.ejs +339 -0
  37. package/templates/do/deploy.d/managed-inference.ejs +726 -0
  38. package/templates/do/lib/feedback.sh +41 -0
  39. package/templates/do/train +786 -0
  40. package/templates/do/training/config.yaml +140 -0
  41. package/templates/do/training/train.py +463 -0
@@ -0,0 +1,726 @@
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

set -e
set -u
set -o pipefail

# Flag state read by the rest of the script.
FORCE_NEW=false # set by --force
FORCE_IC=false  # set by --force-ic
IC_TARGET=""    # name passed to --ic <name> or --force-ic <name>

# _print_usage
# Writes the help text to stdout.
_print_usage() {
  echo "Usage: ./do/deploy [--force] [--force-ic [<name>]] [--ic <name>]"
  echo ""
  echo "Options:"
  echo " --force Create a new endpoint and IC, even if one already exists."
  echo " --force-ic Recreate ALL inference components on the existing endpoint."
  echo " --force-ic <name> Recreate only the named IC on the existing endpoint."
  echo " --ic <name> Deploy only the named IC (from do/ic/<name>.conf)."
  echo ""
  echo "Without flags, deploy resumes from the last run."
}

# _parse_flags "$@"
# Parses the command-line flags into FORCE_NEW, FORCE_IC and IC_TARGET.
# Exits 0 after printing help; exits 1 on an unknown option or when --ic is
# not followed by a name.
_parse_flags() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --force)
        FORCE_NEW=true
        shift
        ;;
      --force-ic)
        FORCE_IC=true
        shift
        # Optional name argument: --force-ic <name>.
        # Anything that looks like another flag is left for the next iteration.
        if [ $# -gt 0 ] && [[ "$1" != -* ]]; then
          IC_TARGET="$1"
          shift
        fi
        ;;
      --ic)
        # A following flag (e.g. "--ic --force") is not a name; without this
        # check it would be consumed silently as the IC name.
        if [ -z "${2:-}" ] || [[ "$2" == -* ]]; then
          echo "❌ --ic requires a name argument"
          echo " Usage: ./do/deploy --ic <name>"
          exit 1
        fi
        IC_TARGET="$2"
        shift 2
        ;;
      --help|-h)
        _print_usage
        exit 0
        ;;
      *)
        echo "❌ Unknown option: $1"
        echo " Run ./do/deploy --help for usage."
        exit 1
        ;;
    esac
  done
}

_parse_flags "$@"
53
+
# Resolve the directory this script lives in and load the project config from it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/config"

# Deployment banner: values come from the sourced config.
echo "🚀 Deploying to AWS"
echo " Project: ${PROJECT_NAME}"
echo " Deployment config: ${DEPLOYMENT_CONFIG}"
echo " Region: ${AWS_REGION}"
echo " Build target: ${BUILD_TARGET}"
echo " Deployment target: ${DEPLOYMENT_TARGET}"
case "${ENDPOINT_EXTERNAL:-false}" in
  true) echo " Endpoint: ${ENDPOINT_NAME} (external)" ;;
  *) echo " Instance type: ${INSTANCE_TYPE}" ;;
esac

# Fail early (exit 4) when the AWS CLI cannot identify the caller.
echo "🔍 Validating AWS credentials..."
aws sts get-caller-identity > /dev/null 2>&1 || {
  echo "❌ AWS credentials not configured"
  echo " Run: aws configure"
  echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
  exit 4
}

AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"

# Full ECR repository URL for this account and region.
ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"

# ============================================================
# Shared: Verify ECR image exists
# ============================================================
# The image tagged "<project>-latest" must already be in ECR; otherwise
# point the user at ./do/submit and exit 4.
echo "🔍 Verifying ECR image exists..."
aws ecr describe-images \
  --repository-name "${ECR_REPOSITORY_NAME}" \
  --image-ids imageTag="${PROJECT_NAME}-latest" \
  --region "${AWS_REGION}" > /dev/null 2>&1 || {
  echo "❌ ECR image not found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
  echo ""
  echo "Please build and push your image first:"
  echo " ./do/submit"
  echo ""
  echo "After the build completes successfully, run this deploy script again."
  exit 4
}

echo "✅ ECR image found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
IMAGE_TAG="${PROJECT_NAME}-latest"
105
+
# ============================================================
# Shared: Resolve secrets for container environment
# ============================================================

# _deploy_json_escape <string>
# Prints <string> escaped for embedding inside a double-quoted JSON string:
# backslash, double quote, newline, carriage return and tab are escaped.
_deploy_json_escape() {
  local value="$1"
  local newline=$'\n' cr=$'\r' tab=$'\t'
  value=${value//\\/\\\\} # must run first so later escapes are not doubled
  value=${value//\"/\\\"}
  value=${value//${newline}/\\n}
  value=${value//${cr}/\\r}
  value=${value//${tab}/\\t}
  printf '%s' "${value}"
}

# _deploy_add_container_env <name> <value>
# Appends a "name":"value" member to CONTAINER_ENV_JSON, comma-separating it
# from any members already present. The value is JSON-escaped.
_deploy_add_container_env() {
  local member
  member="\"$1\":\"$(_deploy_json_escape "$2")\""
  if [ -n "${CONTAINER_ENV_JSON}" ]; then
    CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},${member}"
  else
    CONTAINER_ENV_JSON="${member}"
  fi
}

# _deploy_resolve_container_env
# Rebuilds CONTAINER_ENV_JSON (a comma-separated list of JSON members, without
# surrounding braces) from the HuggingFace and NGC credentials:
#   - HF_TOKEN_ARN / NGC_API_KEY_ARN: fetched from Secrets Manager, stored in
#     RESOLVED_HF_TOKEN / RESOLVED_NGC_KEY; a failed fetch exits with status 3.
#   - otherwise HF_TOKEN / NGC_API_KEY are used as-is when set.
# Values are escaped so that quotes or backslashes in a secret cannot break
# the resulting JSON.
_deploy_resolve_container_env() {
  CONTAINER_ENV_JSON=""

  if [ -n "${HF_TOKEN_ARN:-}" ]; then
    echo "🔐 Resolving HuggingFace token from Secrets Manager..."
    RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
      echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
      exit 3
    }
    _deploy_add_container_env "HF_TOKEN" "${RESOLVED_HF_TOKEN}"
  elif [ -n "${HF_TOKEN:-}" ]; then
    _deploy_add_container_env "HF_TOKEN" "${HF_TOKEN}"
  fi

  if [ -n "${NGC_API_KEY_ARN:-}" ]; then
    echo "🔐 Resolving NGC API key from Secrets Manager..."
    RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
      echo "❌ Failed to resolve NGC API key from Secrets Manager"
      exit 3
    }
    _deploy_add_container_env "NGC_API_KEY" "${RESOLVED_NGC_KEY}"
  elif [ -n "${NGC_API_KEY:-}" ]; then
    _deploy_add_container_env "NGC_API_KEY" "${NGC_API_KEY}"
  fi
}

_deploy_resolve_container_env
140
+
# ============================================================
# SageMaker Real-Time Inference Deployment (Inference Components)
# ============================================================

# Shared helper libraries; the functions called further down that are not
# defined in this file are presumably provided by these (not visible here).
source "${SCRIPT_DIR}/lib/secrets.sh"
source "${SCRIPT_DIR}/lib/wait.sh"
source "${SCRIPT_DIR}/lib/endpoint-config.sh"
source "${SCRIPT_DIR}/lib/inference-component.sh"

# An execution role ARN is mandatory: print usage and exit 3 without it.
if [[ -z "${ROLE_ARN:-}" ]]; then
  printf '%s\n' \
    "❌ Execution role ARN not provided" \
    "" \
    "Usage:" \
    " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE" \
    " ./do/deploy" \
    "" \
    "Or set ROLE_ARN in do/config" \
    "" \
    "The execution role must have permissions for:" \
    " • SageMaker endpoint and inference component management" \
    " • ECR image access" \
    " • S3 access (if using model artifacts)" \
    " • CloudWatch Logs"
  exit 3
fi

echo " Using execution role: ${ROLE_ARN}"

# A name given via --ic <name> or --force-ic <name> must map to an existing
# do/ic/<name>.conf; otherwise list what is available and exit 1.
if [[ -n "${IC_TARGET}" ]]; then
  if [[ ! -d "${SCRIPT_DIR}/ic" ]]; then
    echo "❌ IC name specified but no do/ic/ directory found"
    echo " This project does not use multi-IC configuration."
    echo " Remove --ic/--force-ic <name> to deploy using the legacy single-IC path."
    exit 1
  fi

  if [[ ! -f "${SCRIPT_DIR}/ic/${IC_TARGET}.conf" ]]; then
    echo "❌ IC config not found: do/ic/${IC_TARGET}.conf"
    echo ""
    echo " Available ICs:"
    for conf in "${SCRIPT_DIR}"/ic/*.conf; do
      # An unmatched glob stays literal; skip it.
      [[ -f "${conf}" ]] || continue
      echo " • $(basename "${conf}" .conf)"
    done
    echo ""
    echo " Usage: ./do/deploy --ic <name>"
    exit 1
  fi
fi

# Resolve container secrets (HF_TOKEN, NGC_API_KEY)
resolve_secrets
195
+
# ============================================================
# Idempotency: check for existing deployment from a previous run
# ============================================================

# _report_live_deployment_and_exit <ic_name>
# Prints the "already live" summary for ENDPOINT_NAME and the given inference
# component, then exits the script with status 0.
_report_live_deployment_and_exit() {
  echo ""
  echo "📋 Deployment is already live. Nothing to do."
  echo " Endpoint: ${ENDPOINT_NAME}"
  echo " Inference Component: $1"
  echo ""
  echo "🧪 Test your endpoint:"
  echo " ./do/test"
  echo ""
  echo "🧹 Clean up when done:"
  echo " ./do/clean endpoint"
  exit 0
}

# Step to resume from: empty (start from scratch), "wait_endpoint",
# "create_ic" or "wait_ic".
SKIP_TO=""

if [[ "${FORCE_NEW}" == true ]]; then
  echo "🔄 --force: ignoring previous deployment, creating new resources."
elif [[ "${FORCE_IC}" == true && -n "${ENDPOINT_NAME:-}" ]]; then
  # --force-ic only makes sense against an endpoint that is already serving.
  EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
  if [[ "${EP_STATUS}" != "InService" ]]; then
    echo "⚠️ --force-ic requires an InService endpoint, but ${ENDPOINT_NAME} is: ${EP_STATUS:-not found}"
    echo " Use --force to create a new endpoint, or wait for the current one."
    exit 4
  fi
  if [[ -n "${IC_TARGET}" ]]; then
    echo "🔄 --force-ic: recreating IC '${IC_TARGET}' on existing endpoint: ${ENDPOINT_NAME}"
  else
    echo "🔄 --force-ic: recreating ALL inference components on existing endpoint: ${ENDPOINT_NAME}"
  fi
  SKIP_TO="create_ic"
elif [[ -n "${ENDPOINT_NAME:-}" ]]; then
  echo "🔍 Checking for existing deployment: ${ENDPOINT_NAME}"

  EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")

  case "${EP_STATUS}" in
    InService)
      echo "✅ Endpoint already InService: ${ENDPOINT_NAME}"

      if [[ -z "${INFERENCE_COMPONENT_NAME:-}" ]]; then
        # No IC name in config — check if one is already running on the endpoint
        if [[ "${ENDPOINT_EXTERNAL:-false}" == "true" ]]; then
          # External endpoint: never adopt ICs we didn't create
          echo " No previous IC deployed by this project. Will create a new one."
          SKIP_TO="create_ic"
        else
          LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
          if [[ -n "${LIVE_IC}" && "${LIVE_IC}" != "None" ]]; then
            echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
            _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
            _report_live_deployment_and_exit "${LIVE_IC}"
          fi
          # Only reached when no live IC was adopted (the helper above exits).
          SKIP_TO="create_ic"
        fi
      else
        # Check the inference component recorded in config
        IC_STATUS=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}")

        case "${IC_STATUS}" in
          InService)
            echo "✅ Inference component already InService: ${INFERENCE_COMPONENT_NAME}"
            _report_live_deployment_and_exit "${INFERENCE_COMPONENT_NAME}"
            ;;
          Creating)
            echo "⏳ Inference component still creating: ${INFERENCE_COMPONENT_NAME}"
            SKIP_TO="wait_ic"
            IC_DEPLOYED_NAME="${INFERENCE_COMPONENT_NAME}"
            ;;
          Failed)
            echo "⚠️ Inference component failed: ${INFERENCE_COMPONENT_NAME}"
            echo " Will create a new inference component on the existing endpoint."
            SKIP_TO="create_ic"
            ;;
          *)
            # Stored IC not found — check if a different IC is running on this endpoint
            if [[ "${ENDPOINT_EXTERNAL:-false}" == "true" ]]; then
              # External endpoint: never adopt ICs we didn't create
              echo " Stored IC not found on external endpoint. Will create a new one."
              SKIP_TO="create_ic"
            else
              LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
              if [[ -n "${LIVE_IC}" && "${LIVE_IC}" != "None" ]]; then
                echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
                echo " (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
                _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
                _report_live_deployment_and_exit "${LIVE_IC}"
              fi
              # Only reached when no live IC was adopted (the helper above exits).
              echo " No existing inference component found on endpoint. Will create one."
              SKIP_TO="create_ic"
            fi
            ;;
        esac
      fi
      ;;
    Creating|Updating)
      echo "⏳ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
      SKIP_TO="wait_endpoint"
      ;;
    Failed)
      echo "⚠️ Previous endpoint failed: ${ENDPOINT_NAME}"
      echo " Creating a new deployment. Clean up the failed endpoint with:"
      echo " ./do/clean endpoint"
      echo ""
      # SKIP_TO stays empty, so new resources are created below
      ;;
    "")
      echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
      ;;
    *)
      echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
      ;;
  esac
fi
332
+
# ============================================================
# Step 1: Create endpoint configuration and endpoint (skip if resuming)
# ============================================================
if [ -z "${SKIP_TO}" ]; then
  if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
    # External endpoint: validate it still exists and is InService
    echo "🔗 Using external endpoint: ${ENDPOINT_NAME}"
    echo " Validating endpoint status..."

    EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")

    if [ -z "${EP_STATUS}" ]; then
      echo "❌ External endpoint not found: ${ENDPOINT_NAME}"
      echo " The endpoint may have been deleted. Update ENDPOINT_NAME in do/config"
      echo " or remove ENDPOINT_EXTERNAL=true to create a new endpoint."
      exit 4
    fi

    if [ "${EP_STATUS}" != "InService" ]; then
      echo "❌ External endpoint not InService: ${ENDPOINT_NAME} (status: ${EP_STATUS})"
      echo " The endpoint must be InService before attaching inference components."
      echo " Wait for the endpoint to become InService, or update do/config."
      exit 4
    fi

    echo "✅ External endpoint is InService: ${ENDPOINT_NAME}"
    # Skip directly to IC creation — no endpoint config, no endpoint creation, no wait
    SKIP_TO="create_ic"
  else
    # New endpoint name is made unique with the current epoch timestamp and
    # persisted to config right away so a re-run can resume from it.
    TIMESTAMP=$(date +%s)
    ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"

    _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"

    # Create endpoint configuration via shared helper
    # (ENDPOINT_CONFIG_NAME is read right after this call — presumably the
    # helper sets it; its definition is not visible here)
    create_endpoint_config

    _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"

    # Record endpoint config in manifest (non-blocking).
    # The manifest tool is addressed via SCRIPT_DIR, like every other project
    # file, so recording also works when the script is not started from the
    # project root.
    ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
    "${SCRIPT_DIR}/manifest" add \
      --type sagemaker-endpoint-config \
      --id "${ENDPOINT_CONFIG_ARN}" \
      --project "${PROJECT_NAME}" \
      --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
      2>/dev/null || true

    # Step 2: Create endpoint
    echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
    if ! aws sagemaker create-endpoint \
      --endpoint-name "${ENDPOINT_NAME}" \
      --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
      --region "${AWS_REGION}"; then

      echo "❌ Failed to create endpoint"
      echo " Check that:"
      echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
      echo " • You have sufficient service quota in region: ${AWS_REGION}"
      exit 4
    fi

    echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"

    # Record endpoint in manifest (non-blocking)
    ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
    "${SCRIPT_DIR}/manifest" add \
      --type sagemaker-endpoint \
      --id "${ENDPOINT_ARN}" \
      --project "${PROJECT_NAME}" \
      --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
      2>/dev/null || true
  fi
fi

# ============================================================
# Wait for endpoint (skip if already InService or external)
# ============================================================
if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
  echo "⏳ Waiting for endpoint to reach InService status..."
  echo " This may take a few minutes..."
  echo " If this times out, re-run ./do/deploy to resume."

  wait_endpoint "${ENDPOINT_NAME}"

  echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"
fi
420
+
421
+ # ============================================================
422
+ # Step 3: Deploy inference components (skip if resuming from wait_ic)
423
+ # ============================================================
424
+ if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
425
+
426
+ if [ -d "${SCRIPT_DIR}/ic" ]; then
# _check_gpu_capacity
# Best-effort capacity guardrail: sums IC_GPU_COUNT across all do/ic/*.conf
# and compares the total against the known GPU count of INSTANCE_TYPE.
# Warns on stdout (never fails) if the total exceeds the instance capacity.
# Skips the check if INSTANCE_TYPE is unset/empty (external endpoints) or is
# not in the known map.
#
# Only do/ic/*.conf files are counted. Adapter ICs (do/adapters/*.conf)
# share the base IC's GPU resources and have no ComputeResourceRequirements,
# so they are intentionally excluded from this capacity check.
#
# Globals:  INSTANCE_TYPE (read, optional), SCRIPT_DIR (read)
# Returns:  0 always
_check_gpu_capacity() {
  if [ -z "${INSTANCE_TYPE:-}" ]; then
    return 0
  fi

  # Hardcoded GPU counts for common SageMaker GPU instance types
  local instance_gpus=""
  case "${INSTANCE_TYPE}" in
    ml.g4dn.xlarge) instance_gpus=1 ;;
    ml.g4dn.12xlarge) instance_gpus=4 ;;
    ml.g5.xlarge|ml.g5.2xlarge|ml.g5.4xlarge|ml.g5.8xlarge) instance_gpus=1 ;;
    ml.g5.12xlarge) instance_gpus=4 ;;
    ml.g5.48xlarge) instance_gpus=8 ;;
    ml.g6.xlarge) instance_gpus=1 ;;
    ml.g6.12xlarge) instance_gpus=4 ;;
    ml.g6.48xlarge) instance_gpus=8 ;;
    ml.g6e.xlarge|ml.g6e.2xlarge|ml.g6e.4xlarge|ml.g6e.8xlarge) instance_gpus=1 ;;
    ml.g6e.12xlarge) instance_gpus=4 ;;
    ml.g6e.48xlarge) instance_gpus=8 ;;
    ml.g7e.xlarge|ml.g7e.2xlarge|ml.g7e.4xlarge|ml.g7e.8xlarge) instance_gpus=1 ;;
    ml.g7e.12xlarge) instance_gpus=4 ;;
    ml.g7e.48xlarge) instance_gpus=8 ;;
    ml.p3.2xlarge) instance_gpus=1 ;;
    ml.p3.8xlarge) instance_gpus=4 ;;
    ml.p3.16xlarge) instance_gpus=8 ;;
    ml.p4d.24xlarge|ml.p4de.24xlarge|ml.p5.48xlarge) instance_gpus=8 ;;
    *) instance_gpus="" ;;
  esac

  # Skip check if instance type not in map
  if [ -z "${instance_gpus}" ]; then
    return 0
  fi

  # Sum IC_GPU_COUNT across all IC config files
  local conf_file ic_gpus
  local total_gpu_requested=0
  for conf_file in "${SCRIPT_DIR}"/ic/*.conf; do
    [ -f "${conf_file}" ] || continue

    # Take the last "export IC_GPU_COUNT=" line and strip quotes/whitespace.
    # A read failure must not abort the deploy, hence the fallback.
    ic_gpus=$(sed -n 's/^export IC_GPU_COUNT=//p' "${conf_file}" 2>/dev/null \
      | tail -n 1 \
      | tr -d "\"' \t\r") || ic_gpus=""

    # Missing or non-numeric values count as 1 GPU instead of triggering an
    # arithmetic error, which would kill the script under `set -e`.
    if ! [[ "${ic_gpus}" =~ ^[0-9]+$ ]]; then
      ic_gpus=1
    fi

    # 10# forces base 10 so zero-padded values such as "08" are not parsed
    # as (invalid) octal.
    total_gpu_requested=$(( total_gpu_requested + 10#${ic_gpus} ))
  done

  if [ "${total_gpu_requested}" -gt "${instance_gpus}" ]; then
    echo ""
    echo "⚠️ GPU capacity warning: ICs request ${total_gpu_requested} GPUs total, but ${INSTANCE_TYPE} has ${instance_gpus} GPUs."
    echo " SageMaker will likely reject IC creation if capacity is exceeded."
    echo " Consider reducing IC_GPU_COUNT values or using a larger instance type."
    echo ""
  fi
}
503
+
504
+ # Run capacity guardrail before deploying ICs
505
+ _check_gpu_capacity
506
+
# _delete_and_wait_ic <ic_name> [timeout_seconds] [poll_seconds]
# Deletes an inference component and waits for the deletion to complete.
# Polls _get_ic_status until it reports an empty status (IC no longer found),
# which avoids name conflicts on recreate.
#
# Arguments:
#   $1 - inference component name
#   $2 - maximum wait in seconds (default 600, i.e. 10 minutes)
#   $3 - seconds to sleep between polls (default 15)
# Globals:  AWS_REGION (read)
# Returns:  0 in all cases: a failed delete call and a timeout are reported
#           on stdout but are deliberately non-fatal.
_delete_and_wait_ic() {
  local ic_name="$1"
  local delete_timeout="${2:-600}"
  local poll_interval="${3:-15}"

  echo "🗑️ Deleting inference component: ${ic_name}"
  if ! aws sagemaker delete-inference-component \
    --inference-component-name "${ic_name}" \
    --region "${AWS_REGION}" 2>/dev/null; then
    echo " ⚠️ Delete call failed (IC may already be gone). Continuing..."
    return 0
  fi

  echo " Waiting for deletion to complete..."
  local delete_start ic_status elapsed
  delete_start=$(date +%s)

  while true; do
    ic_status=$(_get_ic_status "${ic_name}")

    if [ -z "${ic_status}" ]; then
      echo " ✅ Inference component deleted: ${ic_name}"
      break
    fi

    elapsed=$(( $(date +%s) - delete_start ))
    if [ "${elapsed}" -ge "${delete_timeout}" ]; then
      echo " ⚠️ Deletion timed out after ${delete_timeout}s. IC status: ${ic_status}"
      echo " Proceeding anyway — SageMaker may reject the new IC if name conflicts."
      break
    fi

    echo " $(date +%H:%M:%S) Deleting... (${ic_status}, ${elapsed}s elapsed)"
    sleep "${poll_interval}"
  done
}
546
+
# _deploy_single_ic <conf_file>
# Deploys a single IC with per-IC idempotency:
#   - If FORCE_IC is true: delete existing IC, clear state, create fresh
#   - If IC_DEPLOYED_NAME is set and InService → skip
#   - If IC_DEPLOYED_NAME is set and Creating → wait for it
#   - If IC_DEPLOYED_NAME is set and Failed/unknown → recreate
#   - If IC_DEPLOYED_NAME is not set → create new IC
#
# Arguments: $1 - path to the IC config file (do/ic/<name>.conf)
# Globals:   FORCE_IC, AWS_REGION, AWS_ACCOUNT_ID, PROJECT_NAME, ENDPOINT_NAME,
#            SCRIPT_DIR, INSTANCE_TYPE (read); IC_DEPLOYED_NAME (written here
#            for the skip/wait cases; read after create_inference_component —
#            presumably that helper sets it, its definition is not visible here)
# Returns:   0 on success; the failing helper's status otherwise.
#
# Every helper call is checked explicitly. The caller invokes this function
# as `if ! _deploy_single_ic ...`, and in that context `set -e` is ignored
# inside the function, so without these checks a failed create/wait would be
# skipped over and the final `|| true` would make the function report success.
_deploy_single_ic() {
  local ic_conf="$1"
  local ic_basename
  ic_basename=$(basename "${ic_conf}" .conf)

  # Read IC_DEPLOYED_NAME from the config without sourcing it (keeps the
  # current scope clean). The last assignment wins; surrounding quotes are
  # optional.
  local existing_ic_name
  existing_ic_name=$(sed -n 's/^export IC_DEPLOYED_NAME=//p' "${ic_conf}" 2>/dev/null \
    | tail -n 1 \
    | tr -d "\"'\r") || existing_ic_name=""

  # --force-ic: delete existing IC and clear its recorded state first
  if [ "${FORCE_IC}" = true ] && [ -n "${existing_ic_name}" ]; then
    echo "🔄 --force-ic: recreating IC '${ic_basename}'"
    _delete_and_wait_ic "${existing_ic_name}" || return

    _update_config_var "IC_DEPLOYED_NAME" "" "${ic_conf}" || return
    _update_config_var "IC_DEPLOYED_AT" "" "${ic_conf}" || return
    existing_ic_name=""
  fi

  # A recorded IC (only possible here when not forcing) may still be usable.
  if [ -n "${existing_ic_name}" ]; then
    local ic_status
    ic_status=$(_get_ic_status "${existing_ic_name}") || return

    case "${ic_status}" in
      InService)
        echo "✅ IC '${ic_basename}' already InService: ${existing_ic_name} — skipping"
        IC_DEPLOYED_NAME="${existing_ic_name}"
        return 0
        ;;
      Creating)
        echo "⏳ IC '${ic_basename}' is still Creating: ${existing_ic_name} — waiting..."
        IC_DEPLOYED_NAME="${existing_ic_name}"
        wait_ic "${IC_DEPLOYED_NAME}" || return
        echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
        return 0
        ;;
      Failed)
        echo "⚠️ IC '${ic_basename}' previously Failed: ${existing_ic_name} — recreating..."
        ;;
      *)
        echo " IC '${ic_basename}' has unknown/missing status for ${existing_ic_name} — creating new..."
        ;;
    esac
  fi

  # Every remaining path creates a fresh IC and waits for it.
  create_inference_component "${ic_conf}" || return

  echo "⏳ Waiting for inference component to reach InService status..."
  echo " This may take 5-10 minutes..."

  wait_ic "${IC_DEPLOYED_NAME}" || return

  echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"

  # Record inference component in manifest (non-blocking). The tool is
  # addressed via SCRIPT_DIR so this also works outside the project root.
  local ic_arn="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
  "${SCRIPT_DIR}/manifest" add \
    --type sagemaker-inference-component \
    --id "${ic_arn}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
}
629
+
if [[ -z "${IC_TARGET}" ]]; then
  # Multi-IC path: walk every do/ic/*.conf in glob (alphabetical) order and
  # stop at the first IC that fails to deploy.
  IC_SUMMARY=""
  IC_DEPLOY_FAILED=false

  for conf in "${SCRIPT_DIR}"/ic/*.conf; do
    # An unmatched glob stays literal; skip it.
    [[ -f "${conf}" ]] || continue
    local_ic_basename=$(basename "${conf}" .conf)
    echo ""
    echo "── Deploying IC: ${local_ic_basename} ──"

    if _deploy_single_ic "${conf}"; then
      IC_SUMMARY="${IC_SUMMARY} ${local_ic_basename}: ${IC_DEPLOYED_NAME} [InService]\n"
    else
      echo "❌ IC '${local_ic_basename}' failed to deploy. Stopping."
      IC_SUMMARY="${IC_SUMMARY} ${local_ic_basename}: FAILED\n"
      IC_DEPLOY_FAILED=true
      break
    fi
  done

  # Summary lines were accumulated with literal "\n" separators, hence echo -e.
  echo ""
  echo "📋 IC Deployment Summary:"
  echo -e "${IC_SUMMARY}"

  if [[ "${IC_DEPLOY_FAILED}" == true ]]; then
    echo "❌ Deployment stopped due to IC failure. Fix the issue and re-run ./do/deploy to resume."
    exit 4
  fi
else
  # Single IC path: deploy only the IC named via --ic / --force-ic
  echo ""
  echo "── Deploying IC: ${IC_TARGET} ──"
  _deploy_single_ic "${SCRIPT_DIR}/ic/${IC_TARGET}.conf"
fi
666
+ else
# Legacy single-IC path: no do/ic/ directory.
# IC_DEPLOYED_NAME is read right after the helper call — presumably the
# helper sets it; its definition is not visible here.
create_inference_component_legacy

echo "⏳ Waiting for inference component to reach InService status..."
echo " This may take 5-10 minutes..."
echo " If this times out, re-run ./do/deploy to resume."

wait_ic "${IC_DEPLOYED_NAME}"

echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"

# Record inference component in manifest (non-blocking). The tool is
# addressed via SCRIPT_DIR, like every other project file, so recording also
# works when the script is not started from the project root.
IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_DEPLOYED_NAME}"
"${SCRIPT_DIR}/manifest" add \
  --type sagemaker-inference-component \
  --id "${IC_ARN}" \
  --project "${PROJECT_NAME}" \
  --meta "{\"inferenceComponentName\":\"${IC_DEPLOYED_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE:-external}\",\"region\":\"${AWS_REGION}\"}" \
  2>/dev/null || true
686
+ fi
687
+
688
+ elif [ "${SKIP_TO}" = "wait_ic" ]; then
# Resume path: the IC was already being created by an earlier run, so the
# only remaining work is to wait for it.
printf '%s\n' \
  "⏳ Waiting for inference component to reach InService status..." \
  " This may take 5-10 minutes..." \
  " If this times out, re-run ./do/deploy to resume."

wait_ic "${IC_DEPLOYED_NAME}"

echo "✅ Inference component is InService: ${IC_DEPLOYED_NAME}"
697
+ fi
698
+
699
+ echo "✅ Deployment complete!"
700
+ echo ""
701
+ echo "📋 Deployment Details:"
702
+ echo " Endpoint: ${ENDPOINT_NAME}"
703
+ if [ "${ENDPOINT_EXTERNAL:-false}" = "true" ]; then
704
+ echo " Endpoint Config: (external — not managed by this project)"
705
+ echo " Region: ${AWS_REGION}"
706
+ else
707
+ echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME:-N/A}"
708
+ echo " Region: ${AWS_REGION}"
709
+ echo " Instance Type: ${INSTANCE_TYPE}"
710
+ fi
711
+ echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
712
+ echo ""
713
+ echo "📋 What's next?"
714
+ echo " • Test your endpoint: ./do/test"
715
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
716
+ echo " • Benchmark performance: ./do/benchmark"
717
+ <% } %>
718
+ <% if (typeof enableLora !== 'undefined' && enableLora) { %>
719
+ echo " • Add a LoRA adapter: ./do/adapter add <name> --weights s3://..."
720
+ <% } %>
721
+ echo " • View endpoint status: ./do/status"
722
+ echo " • Register this deployment: ./do/register"
723
+ echo " • View logs: ./do/logs"
724
+ <% if (!(typeof existingEndpointName !== 'undefined' && existingEndpointName)) { %>
725
+ echo " • Clean up when done: ./do/clean endpoint"
726
+ <% } %>