@aws/ml-container-creator 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,481 @@
1
+ #!/bin/bash
2
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Load the project configuration from the "config" file located in the same
# directory as this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
. "${SCRIPT_DIR}/config"
12
+
13
# Parse command-line arguments.
#   --force          sets FORCE_CLEAN=true (confirmation prompts are skipped)
#   any other -flag  accepted and ignored
#   1st positional   stored in CLEANUP_TARGET
#   2nd positional   stored in CLEANUP_ARG; further positionals are dropped
CLEANUP_TARGET=""
CLEANUP_ARG=""
FORCE_CLEAN=false

for arg in "$@"; do
    if [[ "${arg}" == "--force" ]]; then
        FORCE_CLEAN=true
    elif [[ "${arg}" == -* ]]; then
        : # ignore other flags
    elif [[ -z "${CLEANUP_TARGET}" ]]; then
        CLEANUP_TARGET="${arg}"
    elif [[ -z "${CLEANUP_ARG}" ]]; then
        CLEANUP_ARG="${arg}"
    fi
done
31
+
32
# Function to display usage
#
# Prints the supported cleanup targets, the --force option and example
# invocations to stdout. Takes no arguments and always succeeds.
show_usage() {
    # Quoted delimiter: the text is emitted literally, nothing is expanded.
    cat <<'EOF'
Usage: ./do/clean [local|ecr|hyperpod|codebuild|all] [--force]

Cleanup targets:
 local - Remove local Docker images
 ecr - Remove images from Amazon ECR
 hyperpod - Delete HyperPod EKS deployment and services
 codebuild - Delete CodeBuild project, IAM role, and S3 source artifacts
 all - Perform all cleanup operations

Options:
 --force - Skip confirmation prompts

Examples:
 ./do/clean local # Remove local Docker images only
 ./do/clean hyperpod # Delete HyperPod EKS resources only
 ./do/clean codebuild # Delete CodeBuild project and rebuild fresh
 ./do/clean all # Clean up everything
 ./do/clean all --force # Clean up everything without prompting
EOF
}
49
+
50
# Ask the user to confirm a destructive action (skipped when --force is set).
#
# Arguments: $1 - description of what is about to happen
# Globals:   FORCE_CLEAN (read), REPLY (written by read)
# Returns:   0 when FORCE_CLEAN is true or the answer is "yes" in any letter
#            case; 1 for every other answer.
confirm_action() {
    local message="$1"

    if [ "${FORCE_CLEAN}" != true ]; then
        echo ""
        echo "⚠️ ${message}"
        read -p " Are you sure? (yes/no): " -r
        echo ""
        # Only the exact three-letter word is accepted as consent.
        case "${REPLY}" in
            [Yy][Ee][Ss]) ;;
            *)
                echo "❌ Operation cancelled"
                return 1
                ;;
        esac
    fi

    return 0
}
66
+
67
# Function to clean local Docker images
#
# Removes every local image whose "repository:tag" reference either starts
# with "<PROJECT_NAME>:" or contains
# ".dkr.ecr.<...>.amazonaws.com/<ECR_REPOSITORY_NAME>:<PROJECT_NAME>-".
#
# Globals:  PROJECT_NAME, ECR_REPOSITORY_NAME (read)
# Returns:  0 when nothing matched or the removal pass ran (individual
#           `docker rmi` failures are reported but do not fail the function);
#           1 when the user declines the confirmation.
clean_local() {
    echo "🧹 Cleaning local Docker images"
    echo " Project: ${PROJECT_NAME}"

    local listing image
    local -a targets=()

    # List once so the images shown for confirmation are exactly the ones
    # removed afterwards. A failing `docker images` is treated as "no images".
    listing=$(docker images --format "{{.Repository}}:{{.Tag}}") || listing=""

    # Match with glob patterns and quoted variables so that characters such as
    # "." in the project or repository name are compared literally instead of
    # being interpreted as regular-expression operators.
    while IFS= read -r image; do
        case "${image}" in
            "${PROJECT_NAME}":*)
                targets+=("${image}")
                ;;
            *.dkr.ecr.*.amazonaws.com/"${ECR_REPOSITORY_NAME}":"${PROJECT_NAME}"-*)
                targets+=("${image}")
                ;;
        esac
    done <<< "${listing}"

    if [ "${#targets[@]}" -eq 0 ]; then
        echo "ℹ️ No local images found for ${PROJECT_NAME}"
        return 0
    fi

    echo ""
    echo "Images to be removed:"
    for image in "${targets[@]}"; do
        echo " ${image}"
    done

    if ! confirm_action "This will remove all local Docker images for ${PROJECT_NAME}"; then
        return 1
    fi

    echo "🗑️ Removing local images..."
    for image in "${targets[@]}"; do
        echo " Removing: ${image}"
        # Best effort: keep going when a single image cannot be removed.
        docker rmi "${image}" || echo " ⚠️ Failed to remove ${image}"
    done

    echo "✅ Local images cleaned"
}
98
+
99
# Function to clean ECR images
#
# Deletes the images in ${ECR_REPOSITORY_NAME} whose tag starts with
# "${PROJECT_NAME}-". Other tags and untagged images are left alone.
#
# Globals:  ECR_REPOSITORY_NAME, AWS_REGION, PROJECT_NAME (read);
#           AWS_ACCOUNT_ID (written)
# Returns:  0 when the images were deleted or there was nothing to delete;
#           1 when the user declines or an ECR call fails.
# Exits:    4 when `aws sts get-caller-identity` fails.
clean_ecr() {
    echo "🧹 Cleaning ECR images"
    echo " Repository: ${ECR_REPOSITORY_NAME}"
    echo " Region: ${AWS_REGION}"

    if ! aws sts get-caller-identity &> /dev/null; then
        echo "❌ AWS credentials not configured"
        echo " Run: aws configure"
        exit 4
    fi

    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

    if ! aws ecr describe-repositories \
        --repository-names "${ECR_REPOSITORY_NAME}" \
        --region "${AWS_REGION}" &> /dev/null; then
        echo "ℹ️ ECR repository ${ECR_REPOSITORY_NAME} does not exist"
        return 0
    fi

    echo ""
    echo "Checking for images in repository..."

    local project_images="imageIds[?starts_with(imageTag, '${PROJECT_NAME}-')]"
    local image_tags image_ids_json tag

    # --filter tagStatus=TAGGED keeps untagged images out of the response:
    # starts_with() raises a JMESPath type error on a null imageTag, which
    # would make the whole command fail as soon as one untagged image exists.
    # stderr is not captured, so warnings never end up in the tag list and a
    # real failure is reported instead of being mistaken for "no images".
    if ! image_tags=$(aws ecr list-images \
        --repository-name "${ECR_REPOSITORY_NAME}" \
        --region "${AWS_REGION}" \
        --filter tagStatus=TAGGED \
        --query "${project_images}.[imageTag]" \
        --output text); then
        echo "❌ Failed to list images in ECR repository ${ECR_REPOSITORY_NAME}"
        return 1
    fi

    if [ -z "${image_tags}" ] || [ "${image_tags}" = "None" ]; then
        echo "ℹ️ No images found for project: ${PROJECT_NAME}"
        return 0
    fi

    echo "Images for project ${PROJECT_NAME}:"
    while read -r tag; do
        if [ -n "${tag}" ] && [ "${tag}" != "None" ]; then
            echo " - ${tag}"
        fi
    done <<< "${image_tags}"

    # Only this project's tags are deleted, so the prompt says exactly that.
    if ! confirm_action "This will remove all ${PROJECT_NAME} images from ECR repository ${ECR_REPOSITORY_NAME}"; then
        return 1
    fi

    echo "🗑️ Removing ECR images..."

    if ! image_ids_json=$(aws ecr list-images \
        --repository-name "${ECR_REPOSITORY_NAME}" \
        --region "${AWS_REGION}" \
        --filter tagStatus=TAGGED \
        --query "${project_images}" \
        --output json); then
        echo "❌ Failed to list images in ECR repository ${ECR_REPOSITORY_NAME}"
        return 1
    fi

    if [ "${image_ids_json}" != "[]" ] && [ -n "${image_ids_json}" ]; then
        # NOTE(review): the ECR API limits the number of image IDs accepted by
        # one batch-delete-image call; a project with very many tags may need
        # chunking here - confirm against the API reference.
        # Only stdout is discarded so the reason for a failure stays visible.
        if aws ecr batch-delete-image \
            --repository-name "${ECR_REPOSITORY_NAME}" \
            --region "${AWS_REGION}" \
            --image-ids "${image_ids_json}" > /dev/null; then
            echo "✅ ECR images removed for project: ${PROJECT_NAME}"
        else
            echo "❌ Failed to remove some ECR images"
            return 1
        fi
    else
        echo "ℹ️ No images to remove for project: ${PROJECT_NAME}"
    fi
}
170
+
171
# Function to clean HyperPod EKS deployment
#
# Resolves the EKS cluster behind ${HYPERPOD_CLUSTER_NAME}, writes a dedicated
# kubeconfig for it, then renders every hyperpod/*.yaml manifest (relative to
# the current directory) with envsubst and deletes the resulting objects from
# ${HYPERPOD_NAMESPACE}.
#
# Globals:  HYPERPOD_CLUSTER_NAME, HYPERPOD_NAMESPACE, AWS_REGION,
#           PROJECT_NAME, HOME (read); KUBECONFIG_PATH, EKS_CLUSTER_ARN,
#           EKS_CLUSTER_NAME (written); KUBECONFIG, AWS_ACCOUNT_ID (exported)
# Returns:  0 on success; 1 when the cluster cannot be resolved, kubectl
#           cannot be configured, the user declines, or a delete fails.
# Exits:    4 when `aws sts get-caller-identity` fails.
clean_hyperpod() {
    echo "🧹 Cleaning HyperPod EKS resources"
    echo " Cluster: ${HYPERPOD_CLUSTER_NAME}"
    echo " Namespace: ${HYPERPOD_NAMESPACE}"

    if ! aws sts get-caller-identity &> /dev/null; then
        echo "❌ AWS credentials not configured"
        echo " Run: aws configure"
        exit 4
    fi

    # Get kubeconfig for HyperPod cluster
    echo "🔑 Configuring kubectl for HyperPod cluster..."
    KUBECONFIG_PATH="${HOME}/.kube/hyperpod-${HYPERPOD_CLUSTER_NAME}"

    # stderr is deliberately not captured: anything written there would
    # otherwise become part of the ARN parsed below.
    EKS_CLUSTER_ARN=$(aws sagemaker describe-cluster \
        --cluster-name "${HYPERPOD_CLUSTER_NAME}" \
        --region "${AWS_REGION}" \
        --query "Orchestrator.Eks.ClusterArn" \
        --output text) || {
        echo "❌ Failed to describe HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
        echo " Check that the cluster exists and you have permission to access it"
        return 1
    }

    # With --output text a missing field is printed as "None".
    if [ -z "${EKS_CLUSTER_ARN}" ] || [ "${EKS_CLUSTER_ARN}" = "None" ]; then
        echo "❌ No EKS cluster ARN returned for HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
        return 1
    fi

    # The cluster name is the last "/"-separated component of the ARN.
    EKS_CLUSTER_NAME="${EKS_CLUSTER_ARN##*/}"

    if ! aws eks update-kubeconfig \
        --name "${EKS_CLUSTER_NAME}" \
        --region "${AWS_REGION}" \
        --kubeconfig "${KUBECONFIG_PATH}" 2>&1; then
        echo "❌ Failed to configure kubectl for EKS cluster: ${EKS_CLUSTER_NAME}"
        return 1
    fi

    export KUBECONFIG="${KUBECONFIG_PATH}"

    if ! confirm_action "This will delete the HyperPod deployment in namespace ${HYPERPOD_NAMESPACE}"; then
        return 1
    fi

    # Delete Kubernetes resources
    echo "🗑️ Deleting Kubernetes resources..."
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    export AWS_ACCOUNT_ID # exported so envsubst can substitute it

    local manifest rendered
    local delete_failed=false
    local found_manifest=false
    for manifest in hyperpod/*.yaml; do
        # An unmatched glob stays as the literal pattern; skip it instead of
        # failing on the input redirection below.
        [ -f "${manifest}" ] || continue
        found_manifest=true

        if ! rendered=$(envsubst < "${manifest}"); then
            echo " ⚠️ Failed to render ${manifest}"
            delete_failed=true
            continue
        fi

        # Skip files that contain no Kubernetes objects (e.g. comment-only PVC stubs)
        if grep -q '^kind:' <<< "${rendered}"; then
            if ! kubectl delete -n "${HYPERPOD_NAMESPACE}" --ignore-not-found -f - <<< "${rendered}" 2>&1; then
                delete_failed=true
            fi
        fi
    done

    if [ "${found_manifest}" = false ]; then
        echo "ℹ️ No manifests found under hyperpod/ - nothing to delete"
    fi

    if [ "${delete_failed}" = true ]; then
        echo "❌ Failed to delete some Kubernetes resources"
        echo " You may need to manually clean up:"
        echo " kubectl get all -n ${HYPERPOD_NAMESPACE}"
        return 1
    fi

    # Mark k8s resources as deleted in manifest (non-blocking)
    ./do/manifest delete --id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" 2>/dev/null || true

    echo "✅ HyperPod EKS resources cleaned"
}
239
+
240
# Function to clean CodeBuild project and related resources
#
# Deletes the CodeBuild project ${CODEBUILD_PROJECT_NAME}, its
# "<project>-service-role" IAM role (inline policies first) and the objects
# under s3://codebuild-source-<account>-<region>/<PROJECT_NAME>/.
#
# Globals:  CODEBUILD_PROJECT_NAME (optional), AWS_REGION, PROJECT_NAME (read);
#           AWS_ACCOUNT_ID (written)
# Returns:  0 when everything that existed was deleted, or when there is no
#           project configured / found; 1 when the user declines, the project
#           lookup fails, or any deletion fails.
# Exits:    4 when `aws sts get-caller-identity` fails.
clean_codebuild() {
    echo "🧹 Cleaning CodeBuild resources"
    echo " Project: ${CODEBUILD_PROJECT_NAME:-not set}"
    echo " Region: ${AWS_REGION}"

    if [ -z "${CODEBUILD_PROJECT_NAME:-}" ]; then
        echo "ℹ️ No CodeBuild project name configured (build target may not be codebuild)"
        return 0
    fi

    if ! aws sts get-caller-identity &> /dev/null; then
        echo "❌ AWS credentials not configured"
        echo " Run: aws configure"
        exit 4
    fi

    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

    # Checked explicitly: a bare assignment would abort the whole script under
    # `set -e` without printing anything, because stderr is discarded here.
    local project_check
    if ! project_check=$(aws codebuild batch-get-projects \
        --names "${CODEBUILD_PROJECT_NAME}" \
        --region "${AWS_REGION}" \
        --query 'projects[0].name' \
        --output text 2>/dev/null); then
        echo "❌ Failed to look up CodeBuild project: ${CODEBUILD_PROJECT_NAME}"
        return 1
    fi

    if [ "${project_check}" = "None" ] || [ -z "${project_check}" ] || [ "${project_check}" = "null" ]; then
        echo "ℹ️ CodeBuild project not found: ${CODEBUILD_PROJECT_NAME}"
        return 0
    fi

    echo ""
    echo "Resources to be removed:"
    echo " • CodeBuild project: ${CODEBUILD_PROJECT_NAME}"

    local role_name="${CODEBUILD_PROJECT_NAME}-service-role"
    local role_exists=false
    if aws iam get-role --role-name "${role_name}" &> /dev/null; then
        role_exists=true
        echo " • IAM service role: ${role_name}"
    fi

    local s3_bucket="codebuild-source-${AWS_ACCOUNT_ID}-${AWS_REGION}"
    local s3_prefix="${PROJECT_NAME}/"
    local s3_exists=false
    local s3_count=0
    if aws s3api head-bucket --bucket "${s3_bucket}" --region "${AWS_REGION}" &> /dev/null; then
        # `aws s3 ls` can exit non-zero (e.g. when nothing matches the prefix).
        # With pipefail that status would propagate through the assignment and
        # terminate the script under `set -e`, so it is neutralised here and
        # only the line count is used.
        s3_count=$({ aws s3 ls "s3://${s3_bucket}/${s3_prefix}" --region "${AWS_REGION}" 2>/dev/null || true; } | wc -l | tr -d ' ')
        if [ "${s3_count}" -gt 0 ]; then
            s3_exists=true
            echo " • S3 source artifacts: s3://${s3_bucket}/${s3_prefix} (${s3_count} objects)"
        fi
    fi

    if ! confirm_action "This will delete the CodeBuild project and associated resources"; then
        return 1
    fi

    # Every step below is attempted even if an earlier one failed; the overall
    # result is reported at the end.
    local failed=false

    echo "🗑️ Deleting CodeBuild project: ${CODEBUILD_PROJECT_NAME}"
    if aws codebuild delete-project \
        --name "${CODEBUILD_PROJECT_NAME}" \
        --region "${AWS_REGION}" &> /dev/null; then
        echo "✅ CodeBuild project deleted"

        # Best effort bookkeeping; a missing or failing helper is ignored.
        ./do/manifest delete --id "arn:aws:codebuild:${AWS_REGION}:${AWS_ACCOUNT_ID}:project/${CODEBUILD_PROJECT_NAME}" 2>/dev/null || true
    else
        echo "❌ Failed to delete CodeBuild project"
        failed=true
    fi

    if [ "${role_exists}" = true ]; then
        echo "🗑️ Deleting IAM service role: ${role_name}"
        local policies policy
        policies=$(aws iam list-role-policies --role-name "${role_name}" --query 'PolicyNames' --output text 2>/dev/null || echo "")
        # Intentionally unquoted: the text output is a whitespace-separated list.
        # shellcheck disable=SC2086
        for policy in ${policies}; do
            aws iam delete-role-policy --role-name "${role_name}" --policy-name "${policy}" 2>/dev/null || true
        done
        if aws iam delete-role --role-name "${role_name}" &> /dev/null; then
            echo "✅ IAM service role deleted"

            ./do/manifest delete --id "arn:aws:iam::${AWS_ACCOUNT_ID}:role/${role_name}" 2>/dev/null || true
        else
            echo "❌ Failed to delete IAM service role"
            failed=true
        fi
    fi

    if [ "${s3_exists}" = true ]; then
        echo "🗑️ Deleting S3 source artifacts: s3://${s3_bucket}/${s3_prefix}"
        if aws s3 rm "s3://${s3_bucket}/${s3_prefix}" --recursive --region "${AWS_REGION}" &> /dev/null; then
            echo "✅ S3 source artifacts deleted"
        else
            echo "❌ Failed to delete S3 source artifacts"
            failed=true
        fi
    fi

    # Report failure to the caller, as the other clean_* functions do.
    if [ "${failed}" = true ]; then
        echo "❌ Some CodeBuild resources could not be deleted"
        return 1
    fi

    echo "✅ CodeBuild resources cleaned"
}
333
+
334
+ # Main script logic
335
+ echo "🧹 Cleanup script for ${PROJECT_NAME}"
336
+ echo ""
337
+
338
+ if [ -z "${CLEANUP_TARGET}" ]; then
339
+ show_usage
340
+ exit 0
341
+ fi
342
+
343
+ case "${CLEANUP_TARGET}" in
344
+ local)
345
+ clean_local
346
+ ;;
347
+ ecr)
348
+ clean_ecr
349
+ ;;
350
+ hyperpod)
351
+ clean_hyperpod
352
+ ;;
353
+ codebuild)
354
+ clean_codebuild
355
+ ;;
356
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
357
+ benchmark)
358
+ echo "🧹 Cleaning benchmark resources..."
359
+ WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config"
360
+
361
+ if aws sagemaker describe-ai-workload-config \
362
+ --ai-workload-config-name "$WORKLOAD_CONFIG_NAME" \
363
+ --region "$AWS_REGION" 2>/dev/null; then
364
+ aws sagemaker delete-ai-workload-config \
365
+ --ai-workload-config-name "$WORKLOAD_CONFIG_NAME" \
366
+ --region "$AWS_REGION"
367
+ echo " ✓ Deleted workload config: $WORKLOAD_CONFIG_NAME"
368
+ fi
369
+
370
+ aws sagemaker list-ai-benchmark-jobs \
371
+ --name-contains "${PROJECT_NAME}-benchmark-" \
372
+ --region "$AWS_REGION" \
373
+ --query 'AIBenchmarkJobs[?AIBenchmarkJobStatus!=`InProgress`].AIBenchmarkJobName' \
374
+ --output text | tr '\t' '\n' | while read -r job; do
375
+ [ -z "$job" ] && continue
376
+ aws sagemaker delete-ai-benchmark-job \
377
+ --ai-benchmark-job-name "$job" \
378
+ --region "$AWS_REGION"
379
+ echo " ✓ Deleted benchmark job: $job"
380
+ done
381
+
382
+ if [ -d "${SCRIPT_DIR}/../benchmarks" ]; then
383
+ read -p "Delete local benchmark results? (Y/n) " CONFIRM_DELETE
384
+ CONFIRM_DELETE="${CONFIRM_DELETE:-Y}"
385
+ if [[ "${CONFIRM_DELETE}" =~ ^[Yy]$ ]]; then
386
+ rm -rf "${SCRIPT_DIR}/../benchmarks"
387
+ echo " ✓ Deleted local benchmarks/ directory"
388
+ else
389
+ echo " ⏭ Skipped local benchmarks/ deletion"
390
+ fi
391
+ fi
392
+
393
+ echo "✅ Benchmark cleanup complete"
394
+ ;;
395
+ <% } %>
396
+ all)
397
+ echo "🧹 Performing complete cleanup"
398
+ echo ""
399
+
400
+ CLEANED_ITEMS=()
401
+
402
+ if clean_local; then
403
+ CLEANED_ITEMS+=("Local Docker images")
404
+ fi
405
+
406
+ echo ""
407
+
408
+ if clean_ecr; then
409
+ CLEANED_ITEMS+=("ECR images")
410
+ fi
411
+
412
+ echo ""
413
+
414
+ # Clean HyperPod EKS resources
415
+ if clean_hyperpod; then
416
+ CLEANED_ITEMS+=("HyperPod EKS resources")
417
+ fi
418
+
419
+ echo ""
420
+
421
+ if clean_codebuild; then
422
+ CLEANED_ITEMS+=("CodeBuild resources")
423
+ fi
424
+
425
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
426
+ echo ""
427
+
428
+ WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config"
429
+
430
+ if aws sagemaker describe-ai-workload-config \
431
+ --ai-workload-config-name "$WORKLOAD_CONFIG_NAME" \
432
+ --region "$AWS_REGION" 2>/dev/null; then
433
+ aws sagemaker delete-ai-workload-config \
434
+ --ai-workload-config-name "$WORKLOAD_CONFIG_NAME" \
435
+ --region "$AWS_REGION"
436
+ echo " ✓ Deleted workload config: $WORKLOAD_CONFIG_NAME"
437
+ fi
438
+
439
+ aws sagemaker list-ai-benchmark-jobs \
440
+ --name-contains "${PROJECT_NAME}-benchmark-" \
441
+ --region "$AWS_REGION" \
442
+ --query 'AIBenchmarkJobs[?AIBenchmarkJobStatus!=`InProgress`].AIBenchmarkJobName' \
443
+ --output text | tr '\t' '\n' | while read -r job; do
444
+ [ -z "$job" ] && continue
445
+ aws sagemaker delete-ai-benchmark-job \
446
+ --ai-benchmark-job-name "$job" \
447
+ --region "$AWS_REGION"
448
+ echo " ✓ Deleted benchmark job: $job"
449
+ done
450
+
451
+ if [ -d "${SCRIPT_DIR}/../benchmarks" ]; then
452
+ read -p "Delete local benchmark results? (Y/n) " CONFIRM_DELETE
453
+ CONFIRM_DELETE="${CONFIRM_DELETE:-Y}"
454
+ if [[ "${CONFIRM_DELETE}" =~ ^[Yy]$ ]]; then
455
+ rm -rf "${SCRIPT_DIR}/../benchmarks"
456
+ echo " ✓ Deleted local benchmarks/ directory"
457
+ else
458
+ echo " ⏭ Skipped local benchmarks/ deletion"
459
+ fi
460
+ fi
461
+
462
+ CLEANED_ITEMS+=("Benchmark resources")
463
+ <% } %>
464
+ echo ""
465
+ echo "✅ Cleanup complete!"
466
+ echo ""
467
+ echo "Summary of cleaned resources:"
468
+ for item in "${CLEANED_ITEMS[@]}"; do
469
+ echo " ✓ ${item}"
470
+ done
471
+ ;;
472
+ *)
473
+ echo "❌ Unknown cleanup target: ${CLEANUP_TARGET}"
474
+ echo ""
475
+ show_usage
476
+ exit 1
477
+ ;;
478
+ esac
479
+
480
+ echo ""
481
+ echo "Cleanup finished!"