@aws/ml-container-creator 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/bin/cli.js +31 -137
  2. package/config/parameter-schema-v2.json +2065 -0
  3. package/package.json +6 -3
  4. package/servers/lib/catalogs/jumpstart-public.json +101 -16
  5. package/servers/lib/catalogs/models.json +182 -26
  6. package/src/app.js +6 -389
  7. package/src/lib/bootstrap-command-handler.js +75 -1078
  8. package/src/lib/bootstrap-profile-manager.js +634 -0
  9. package/src/lib/bootstrap-provisioners.js +421 -0
  10. package/src/lib/config-loader.js +405 -0
  11. package/src/lib/config-manager.js +59 -1668
  12. package/src/lib/config-mcp-client.js +118 -0
  13. package/src/lib/config-validator.js +634 -0
  14. package/src/lib/cuda-resolver.js +140 -0
  15. package/src/lib/e2e-catalog-validator.js +251 -3
  16. package/src/lib/e2e-ci-recorder.js +103 -0
  17. package/src/lib/generated/cli-options.js +471 -0
  18. package/src/lib/generated/parameter-matrix.js +671 -0
  19. package/src/lib/generated/validation-rules.js +202 -0
  20. package/src/lib/marketplace-flow.js +276 -0
  21. package/src/lib/mcp-query-runner.js +768 -0
  22. package/src/lib/parameter-schema-validator.js +62 -18
  23. package/src/lib/prompt-runner.js +41 -1504
  24. package/src/lib/prompts/feature-prompts.js +172 -0
  25. package/src/lib/prompts/index.js +48 -0
  26. package/src/lib/prompts/infrastructure-prompts.js +690 -0
  27. package/src/lib/prompts/model-prompts.js +552 -0
  28. package/src/lib/prompts/project-prompts.js +70 -0
  29. package/src/lib/prompts.js +2 -1446
  30. package/src/lib/registry-command-handler.js +135 -3
  31. package/src/lib/secrets-prompt-runner.js +251 -0
  32. package/src/lib/template-variable-resolver.js +398 -0
  33. package/templates/code/serve +5 -134
  34. package/templates/code/serve.d/lmi.ejs +19 -0
  35. package/templates/code/serve.d/sglang.ejs +47 -0
  36. package/templates/code/serve.d/tensorrt-llm.ejs +53 -0
  37. package/templates/code/serve.d/vllm.ejs +48 -0
  38. package/templates/do/clean +1 -1387
  39. package/templates/do/clean.d/async-inference.ejs +508 -0
  40. package/templates/do/clean.d/batch-transform.ejs +512 -0
  41. package/templates/do/clean.d/hyperpod-eks.ejs +481 -0
  42. package/templates/do/clean.d/managed-inference.ejs +1043 -0
  43. package/templates/do/deploy +1 -1766
  44. package/templates/do/deploy.d/async-inference.ejs +501 -0
  45. package/templates/do/deploy.d/batch-transform.ejs +529 -0
  46. package/templates/do/deploy.d/hyperpod-eks.ejs +339 -0
  47. package/templates/do/deploy.d/managed-inference.ejs +726 -0
  48. package/config/parameter-schema.json +0 -88
@@ -0,0 +1,339 @@
1
+ #!/bin/bash
2
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Deploys the project's ECR image to a HyperPod EKS cluster by applying the manifests under hyperpod/.
5
+ set -e # abort on the first command that fails outside a condition
6
+ set -u # expanding an unset variable is an error
7
+ set -o pipefail # a pipeline fails when any of its stages fails
8
+
9
+ # Parse flags: each flag only sets a boolean; any unrecognised argument prints a hint and exits 1
10
+ FORCE_NEW=false
11
+ FORCE_IC=false
12
+ IC_TARGET="" # initialised empty; the flag loop below never assigns it
13
+ while [ $# -gt 0 ]; do
14
+ case "$1" in
15
+ --force) FORCE_NEW=true; shift ;;
16
+ --force-ic)
17
+ FORCE_IC=true
18
+ shift
19
+ ;;
20
+ --help|-h)
21
+ echo "Usage: ./do/deploy [--force] [--force-ic]"
22
+ echo ""
23
+ echo "Options:"
24
+ echo " --force Create a new endpoint, even if one already exists."
25
+ echo " --force-ic Recreate the inference component on the existing endpoint."
26
+ echo ""
27
+ echo "Without flags, deploy resumes from the last run."
28
+ exit 0
29
+ ;;
30
+ *)
31
+ echo "❌ Unknown option: $1"
32
+ echo " Run ./do/deploy --help for usage."
33
+ exit 1
34
+ ;;
35
+ esac
36
+ done
37
+
38
+ # Source configuration from the "config" file located in the same directory as this script
39
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
40
+ source "${SCRIPT_DIR}/config"
41
+ # Banner: the variables below are expanded without defaults, so under "set -u" an unset one aborts the script here
42
+ echo "🚀 Deploying to AWS"
43
+ echo " Project: ${PROJECT_NAME}"
44
+ echo " Deployment config: ${DEPLOYMENT_CONFIG}"
45
+ echo " Region: ${AWS_REGION}"
46
+ echo " Build target: ${BUILD_TARGET}"
47
+ echo " Deployment target: ${DEPLOYMENT_TARGET}"
48
+ echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
49
+ echo " Namespace: ${HYPERPOD_NAMESPACE}"
50
+ echo " Replicas: ${HYPERPOD_REPLICAS}"
51
+
52
+ # Check AWS credentials: a failing "aws sts get-caller-identity" is reported and the script exits with code 4
53
+ echo "🔍 Validating AWS credentials..."
54
+ if ! aws sts get-caller-identity &> /dev/null; then
55
+ echo "❌ AWS credentials not configured"
56
+ echo " Run: aws configure"
57
+ echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
58
+ exit 4
59
+ fi
60
+ # Second STS call: this time the account id is captured; it is used to build the ECR repository URL
61
+ AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
62
+ echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"
63
+
64
+ # Construct ECR repository URL (used in messages here and in the ECR_IMAGE export before the manifests are applied)
65
+ ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"
66
+
67
+ # ============================================================
68
+ # Shared: Verify ECR image exists (the tag checked is always "${PROJECT_NAME}-latest"; exit 4 when it is missing)
69
+ # ============================================================
70
+ echo "🔍 Verifying ECR image exists..."
71
+ if ! aws ecr describe-images \
72
+ --repository-name "${ECR_REPOSITORY_NAME}" \
73
+ --image-ids imageTag="${PROJECT_NAME}-latest" \
74
+ --region "${AWS_REGION}" &> /dev/null; then
75
+
76
+ echo "❌ ECR image not found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
77
+ echo ""
78
+ echo "Please build and push your image first:"
79
+ echo " ./do/submit"
80
+ echo ""
81
+ echo "After the build completes successfully, run this deploy script again."
82
+ exit 4
83
+ fi
84
+
85
+ echo "✅ ECR image found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
86
+ IMAGE_TAG="${PROJECT_NAME}-latest"
87
+
88
+ # ============================================================
89
+ # Shared: Resolve secrets for container environment (an *_ARN variable wins over the plain variable; a failed lookup exits 3)
90
+ # ============================================================
91
+ CONTAINER_ENV_JSON="" # comma-separated "KEY":"value" pairs without surrounding braces; not referenced again in this file
92
+ # NOTE(review): secret values are interpolated verbatim - a value containing " or \ would produce invalid JSON; confirm with consumers
93
+ if [ -n "${HF_TOKEN_ARN:-}" ]; then
94
+ echo "🔐 Resolving HuggingFace token from Secrets Manager..."
95
+ RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
96
+ echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
97
+ exit 3
98
+ }
99
+ CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${RESOLVED_HF_TOKEN}\""
100
+ elif [ -n "${HF_TOKEN:-}" ]; then
101
+ CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${HF_TOKEN}\""
102
+ fi
103
+ # NGC key: same precedence as above; the pair is appended with a leading comma when an HF_TOKEN pair is already present
104
+ if [ -n "${NGC_API_KEY_ARN:-}" ]; then
105
+ echo "🔐 Resolving NGC API key from Secrets Manager..."
106
+ RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
107
+ echo "❌ Failed to resolve NGC API key from Secrets Manager"
108
+ exit 3
109
+ }
110
+ if [ -n "${CONTAINER_ENV_JSON}" ]; then
111
+ CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
112
+ else
113
+ CONTAINER_ENV_JSON="\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
114
+ fi
115
+ elif [ -n "${NGC_API_KEY:-}" ]; then
116
+ if [ -n "${CONTAINER_ENV_JSON}" ]; then
117
+ CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},\"NGC_API_KEY\":\"${NGC_API_KEY}\""
118
+ else
119
+ CONTAINER_ENV_JSON="\"NGC_API_KEY\":\"${NGC_API_KEY}\""
120
+ fi
121
+ fi
122
+
123
+ # ============================================================
124
+ # HyperPod EKS Deployment
125
+ # ============================================================
126
+
127
+ # Get kubeconfig for HyperPod cluster (written to a dedicated per-cluster file under ~/.kube)
128
+ echo "🔑 Configuring kubectl for HyperPod cluster..."
129
+ KUBECONFIG_PATH="${HOME}/.kube/hyperpod-${HYPERPOD_CLUSTER_NAME}"
130
+
131
+ # Step 1: Describe the HyperPod cluster to get the underlying EKS cluster ARN (stderr is captured too so a failure can be shown)
132
+ EKS_CLUSTER_ARN=$(aws sagemaker describe-cluster \
133
+ --cluster-name "${HYPERPOD_CLUSTER_NAME}" \
134
+ --region "${AWS_REGION}" \
135
+ --query "Orchestrator.Eks.ClusterArn" \
136
+ --output text 2>&1) || {
137
+ echo "❌ Failed to describe HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
138
+ echo ""
139
+ echo " Error details:"
140
+ echo " ${EKS_CLUSTER_ARN}"
141
+ echo ""
142
+ echo " Check that:"
143
+ echo " • The cluster name is correct"
144
+ echo " • The cluster exists in region: ${AWS_REGION}"
145
+ echo " • Your IAM user/role has permission to access the cluster"
146
+ echo ""
147
+ echo " Required IAM permissions:"
148
+ echo " • sagemaker:DescribeCluster"
149
+ echo " • eks:DescribeCluster"
150
+ exit 4
151
+ }
152
+ if [ -z "${EKS_CLUSTER_ARN}" ] || [ "${EKS_CLUSTER_ARN}" = "None" ]; then echo "❌ HyperPod cluster ${HYPERPOD_CLUSTER_NAME} has no EKS orchestrator (no EKS cluster ARN returned)"; exit 4; fi # --output text prints "None" for a null ARN
153
+ # Step 2: Extract the EKS cluster name from the ARN (the text after the last "/")
154
+ EKS_CLUSTER_NAME=$(echo "${EKS_CLUSTER_ARN}" | awk -F'/' '{print $NF}')
155
+ echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
156
+ echo " EKS cluster: ${EKS_CLUSTER_NAME}"
157
+
158
+ # Step 3: Update kubeconfig using the EKS cluster; exit 4 when the AWS CLI call fails
159
+ if ! aws eks update-kubeconfig \
160
+ --name "${EKS_CLUSTER_NAME}" \
161
+ --region "${AWS_REGION}" \
162
+ --kubeconfig "${KUBECONFIG_PATH}" 2>&1; then
163
+ echo "❌ Failed to configure kubectl for EKS cluster: ${EKS_CLUSTER_NAME}"
164
+ echo ""
165
+ echo " Required IAM permissions:"
166
+ echo " • eks:DescribeCluster"
167
+ echo " • eks:AccessKubernetesApi"
168
+ exit 4
169
+ fi
170
+ # Exported so every kubectl call that follows in this script uses the file written above
171
+ export KUBECONFIG="${KUBECONFIG_PATH}"
172
+ echo "✅ Kubeconfig saved to: ${KUBECONFIG_PATH}"
173
+
174
+ # Verify cluster connectivity: exit 4 when "kubectl cluster-info" fails with the kubeconfig exported above
175
+ echo "🔍 Verifying cluster connectivity..."
176
+ if ! kubectl cluster-info &> /dev/null; then
177
+ echo "❌ Cannot connect to HyperPod cluster"
178
+ echo ""
179
+ echo " Check that:"
180
+ echo " • The cluster is in 'InService' status"
181
+ echo " • Your network can reach the cluster API server"
182
+ echo " • Your IAM credentials are valid"
183
+ exit 4
184
+ fi
185
+ echo "✅ Connected to HyperPod cluster"
186
+
187
+ # Create namespace if it doesn't exist (client-side dry run piped into apply); a failure only prints a warning
188
+ echo "📁 Ensuring namespace exists: ${HYPERPOD_NAMESPACE}"
189
+ if ! kubectl create namespace "${HYPERPOD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>&1; then
190
+ echo "⚠️ Warning: Could not create/verify namespace"
191
+ fi
192
+
193
+ # Apply Kubernetes manifests (the hyperpod/ path is relative, so the script must be run from the project root)
194
+ echo "📄 Applying Kubernetes manifests from hyperpod/..."
195
+
196
+ # Substitute shell variables (e.g. ${AWS_ACCOUNT_ID}) in manifests before applying; envsubst only sees exported variables
197
+ export AWS_ACCOUNT_ID
198
+ export ECR_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"
199
+ # Every manifest is attempted; kubectl output is accumulated and the last non-zero kubectl status is remembered
200
+ APPLY_OUTPUT=""
201
+ APPLY_EXIT_CODE=0
202
+ for manifest in hyperpod/*.yaml; do
203
+ # Skip files that contain no Kubernetes objects (e.g. comment-only PVC stubs); the here-string avoids a SIGPIPE failure under pipefail when grep -q exits early
204
+ RENDERED=$(envsubst < "${manifest}")
205
+ if grep -q '^kind:' <<< "${RENDERED}"; then
206
+ FILE_OUTPUT=$(echo "${RENDERED}" | kubectl apply -n "${HYPERPOD_NAMESPACE}" -f - 2>&1) || {
207
+ APPLY_EXIT_CODE=$?
208
+ }
209
+ APPLY_OUTPUT="${APPLY_OUTPUT}${FILE_OUTPUT}"$'\n'
210
+ fi
211
+ done
212
+
213
+ if [ "${APPLY_EXIT_CODE}" -ne 0 ]; then
214
+ echo ""
215
+ echo "❌ Failed to apply Kubernetes manifests"
216
+ echo ""
217
+ echo " Error details:"
218
+ echo " ${APPLY_OUTPUT}"
219
+ echo ""
220
+ echo " Common issues:"
221
+ echo " • Insufficient node capacity - check available GPU nodes"
222
+ echo " • Resource requests exceed node capacity"
223
+ echo " • RBAC permissions - ensure you have permission to create resources in namespace '${HYPERPOD_NAMESPACE}'"
224
+ echo " • Invalid manifest syntax"
225
+ <% if (fsxVolumeHandle) { %>
226
+ echo " • PVC creation failure - verify the FSx CSI driver is installed on the cluster"
227
+ echo " kubectl get csidriver -o name | grep fsx"
228
+ <% } %>
229
+ echo ""
230
+ echo " Debug commands:"
231
+ echo " kubectl get nodes -o wide"
232
+ echo " kubectl describe nodes"
233
+ echo " kubectl get events -n ${HYPERPOD_NAMESPACE}"
234
+ exit ${APPLY_EXIT_CODE}
235
+ fi
236
+
237
+ echo "✅ Kubernetes manifests applied"
238
+
239
+ # Record k8s deployment and service in manifest (non-blocking): stderr is discarded and a failure is ignored via "|| true"
240
+ ./do/manifest add \
241
+ --type k8s-deployment \
242
+ --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
243
+ --project "${PROJECT_NAME}" \
244
+ --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"deploymentName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
245
+ 2>/dev/null || true
246
+ # Same id and metadata shape as above, recorded under type k8s-service with a serviceName key
247
+ ./do/manifest add \
248
+ --type k8s-service \
249
+ --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
250
+ --project "${PROJECT_NAME}" \
251
+ --meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"serviceName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
252
+ 2>/dev/null || true
253
+
254
+ # Wait for deployment to be ready (DEPLOY_TIMEOUT may be preset in the environment or config; default 1200 seconds)
255
+ DEPLOY_TIMEOUT=${DEPLOY_TIMEOUT:-1200}
256
+ echo "⏳ Waiting for deployment to be ready (timeout: ${DEPLOY_TIMEOUT}s)..."
257
+ echo " This may take several minutes for GPU workloads..."
258
+ echo ""
259
+
260
+ # Poll pod status every 30s while rollout is in progress; a failed poll is ignored so one transient kubectl error cannot end the poller
261
+ (
262
+ while true; do
263
+ sleep 30
264
+ POD_STATUS=$(kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l "app=${PROJECT_NAME}" \
265
+ --no-headers 2>/dev/null | head -5) || true
266
+ if [ -n "${POD_STATUS}" ]; then
267
+ echo " 📊 $(date +%H:%M:%S) Pod status:"
268
+ echo "${POD_STATUS}" | while read -r line; do echo " ${line}"; done
269
+ fi
270
+ done
271
+ ) &
272
+ STATUS_PID=$!
273
+ trap "kill ${STATUS_PID} 2>/dev/null; wait ${STATUS_PID} 2>/dev/null" EXIT
274
+ # Rollout output is captured, so the poller is the only progress shown; cleanup steps in the handler are "|| true" so diagnostics always print
275
+ ROLLOUT_OUTPUT=$(kubectl rollout status "deployment/${PROJECT_NAME}" -n "${HYPERPOD_NAMESPACE}" --timeout="${DEPLOY_TIMEOUT}s" 2>&1) || {
276
+ ROLLOUT_EXIT_CODE=$?
277
+ kill "${STATUS_PID}" 2>/dev/null || true
278
+ echo ""
279
+ echo "❌ Deployment failed to become ready within timeout"
280
+ echo ""
281
+ echo " Error details:"
282
+ echo " ${ROLLOUT_OUTPUT}"
283
+ echo ""
284
+ echo " Current pod state:"
285
+ kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l "app=${PROJECT_NAME}" -o wide 2>/dev/null || true
286
+ echo ""
287
+ echo " Debug commands:"
288
+ echo " kubectl describe pods -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
289
+ echo " kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME} --tail=100"
290
+ echo ""
291
+ echo " Common issues:"
292
+ echo " • Image pull errors - check ECR permissions"
293
+ echo " • Resource scheduling - insufficient GPU nodes"
294
+ echo " • Container crash - check application logs"
295
+ <% if (fsxVolumeHandle) { %>
296
+ echo " • PVC binding errors - verify FSx CSI driver is installed on the cluster"
297
+ echo " kubectl get pvc -n ${HYPERPOD_NAMESPACE}"
298
+ echo " kubectl describe pvc -n ${HYPERPOD_NAMESPACE}"
299
+ echo " kubectl get csidriver -o name | grep fsx"
300
+ <% } %>
301
+ exit ${ROLLOUT_EXIT_CODE}
302
+ }
303
+ # Stop the poller. Both commands may fail: wait reports 143 for the SIGTERM-killed subshell, which would otherwise trip "set -e" on success
304
+ kill "${STATUS_PID}" 2>/dev/null || true
305
+ wait "${STATUS_PID}" 2>/dev/null || true
306
+
307
+ echo "✅ HyperPod EKS deployment complete!" # the rollout failure branch above exits, so this is the success path
308
+ echo ""
309
+ echo "📋 Deployment Details:"
310
+ echo " Cluster: ${HYPERPOD_CLUSTER_NAME}"
311
+ echo " Namespace: ${HYPERPOD_NAMESPACE}"
312
+ echo " Deployment: ${PROJECT_NAME}"
313
+ echo " Replicas: ${HYPERPOD_REPLICAS}"
314
+ echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
315
+ echo ""
316
+ echo "📋 What's next?"
317
+ echo " • Test your deployment: ./do/test"
318
+ echo " • Check pod status: kubectl get pods -n ${HYPERPOD_NAMESPACE}"
319
+ echo " • View pod logs: kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
320
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
321
+ echo " • Benchmark performance: ./do/benchmark"
322
+ <% } %>
323
+ echo " • Register this deployment: ./do/register"
324
+ echo " • View logs: ./do/logs"
325
+ echo " • Clean up when done: ./do/clean hyperpod"
326
+
327
+ # _update_config_var NAME VALUE: set 'export NAME="VALUE"' in ${SCRIPT_DIR}/config, replacing an existing export line or appending one (idempotent)
328
+ _update_config_var() {
329
+ local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config" escaped_value
330
+ if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
331
+ escaped_value=$(printf '%s' "${var_value}" | sed -e 's/[\\&|]/\\&/g') # neutralise backslash, & and the | delimiter for the replacement below
332
+ sed -i.bak "s|^export ${var_name}=.*|export ${var_name}=\"${escaped_value}\"|" "${config_file}"; rm -f "${config_file}.bak"
333
+ else
334
+ echo "" >> "${config_file}"
335
+ echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
336
+ fi
337
+ }
338
+ # Write kubeconfig path to config so other scripts can use it
339
+ _update_config_var "KUBECONFIG" "${KUBECONFIG_PATH}"