@aws/ml-container-creator 0.10.3 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/config/parameter-schema-v2.json +28 -1
  2. package/infra/ci-harness/lib/ci-harness-stack.ts +50 -36
  3. package/package.json +6 -5
  4. package/servers/instance-sizer/index.js +30 -17
  5. package/servers/instance-sizer/lib/instance-ranker.js +44 -0
  6. package/servers/lib/catalogs/instances.json +27 -0
  7. package/src/app.js +8 -1
  8. package/src/lib/bootstrap-command-handler.js +32 -3
  9. package/src/lib/config-validator.js +1 -1
  10. package/src/lib/generated/cli-options.js +7 -2
  11. package/src/lib/generated/parameter-matrix.js +16 -5
  12. package/src/lib/generated/validation-rules.js +7 -3
  13. package/src/lib/path-prover-brain.js +58 -1
  14. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  15. package/src/lib/prompts/model-prompts.js +6 -0
  16. package/src/lib/secrets-prompt-runner.js +4 -0
  17. package/src/lib/template-manager.js +1 -1
  18. package/src/lib/template-variable-resolver.js +62 -0
  19. package/templates/do/adapter +5 -0
  20. package/templates/do/build +5 -0
  21. package/templates/do/clean.d/async-inference.ejs +5 -0
  22. package/templates/do/clean.d/batch-transform.ejs +5 -0
  23. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  24. package/templates/do/clean.d/managed-inference.ejs +5 -0
  25. package/templates/do/config +12 -45
  26. package/templates/do/deploy.d/async-inference.ejs +30 -3
  27. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  28. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  29. package/templates/do/deploy.d/managed-inference.ejs +24 -3
  30. package/templates/do/lib/endpoint-config.sh +1 -1
  31. package/templates/do/lib/profile.sh +44 -0
  32. package/templates/do/push +5 -0
  33. package/templates/do/register +5 -0
  34. package/templates/do/stage +567 -0
  35. package/templates/do/submit +7 -0
  36. package/templates/do/test +1 -0
  37. package/templates/do/tune +4 -0
@@ -0,0 +1,567 @@
1
+ #!/bin/bash
2
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # do/stage — Pre-stage model weights from HuggingFace to S3
6
+ # Downloads the model using huggingface-cli and syncs to S3 so that
7
+ # vLLM can load directly from S3 at deploy time (fast cold-start).
8
+ #
9
+ # Idempotent: if the model is already staged (config.json exists at
10
+ # the target S3 path), the script exits early.
11
+ #
12
+ # Usage:
13
+ # ./do/stage Stage model to S3
14
+ # ./do/stage --force Re-stage even if already present in S3
15
+ # ./do/stage --update-config Stage and update MODEL_NAME in do/config
16
+ # ./do/stage --submit Submit as SageMaker Processing Job (for models >500GB)
17
+ # ./do/stage --submit --no-wait Submit and exit without polling
18
+
19
+ set -e
20
+ set -u
21
+ set -o pipefail
22
+
23
+ # ── Source project configuration ──────────────────────────────────────────────
24
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
25
+ source "${SCRIPT_DIR}/config"
26
+
27
+ # ── Parse flags ───────────────────────────────────────────────────────────────
28
# Flag defaults; _parse_flags flips them according to the command line.
FORCE=false
UPDATE_CONFIG=false
SUBMIT_MODE=false
NO_WAIT=false

# Print the help text to stdout.
_print_usage() {
  cat <<'USAGE_EOF'
Usage: ./do/stage [--force] [--update-config] [--submit] [--no-wait]

Pre-stage model weights from HuggingFace to S3.

Modes:
 (default) Download locally then sync to S3
 --submit Submit as SageMaker Processing Job (for models >500GB)

Options:
 --force Re-stage even if model already exists in S3
 --update-config Update MODEL_NAME in do/config to the staged S3 URI
 --no-wait (with --submit) Exit without polling for completion

Environment:
 HF_TOKEN HuggingFace token (for gated models)

The staged S3 URI will be printed on completion.
Pass --update-config to automatically update do/config for S3-backed deploys.

The --submit mode uses a SageMaker Processing Job with 2TB attached
storage, suitable for very large models that exceed local disk capacity.
USAGE_EOF
}

# Parse command-line flags into FORCE, UPDATE_CONFIG, SUBMIT_MODE and NO_WAIT.
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for --help/-h (then exits 0);
#            warnings on stderr for arguments that have no effect
_parse_flags() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --force) FORCE=true ;;
      --update-config) UPDATE_CONFIG=true ;;
      --submit) SUBMIT_MODE=true ;;
      --no-wait) NO_WAIT=true ;;
      --help|-h)
        _print_usage
        exit 0
        ;;
      *)
        # Unknown arguments are still skipped (as before), but no longer
        # silently: a typo such as --forse would otherwise go unnoticed.
        echo "⚠️ Ignoring unrecognized argument: $1" >&2
        ;;
    esac
    shift
  done

  if [ "${NO_WAIT}" = true ] && [ "${SUBMIT_MODE}" = false ]; then
    echo "⚠️ --no-wait has no effect without --submit" >&2
  fi
}

_parse_flags "$@"
65
+
66
+ # ── Processing Job submission function ────────────────────────────────────────
67
+ # Submits a SageMaker Processing Job that downloads model weights from HuggingFace
68
+ # and syncs them to S3. Uses 2TB attached storage to handle any model size.
69
# Tunables for --submit mode. Each may be overridden from the environment.
POLL_INTERVAL="${POLL_INTERVAL:-30}"
PROCESSING_JOB_INSTANCE_TYPE="${PROCESSING_JOB_INSTANCE_TYPE:-ml.m5.xlarge}"
PROCESSING_JOB_VOLUME_GB="${PROCESSING_JOB_VOLUME_GB:-2048}"

# Emit the script that runs inside the processing container.
# It reads MODEL_ID, S3_OUTPUT_URI and (optionally) HF_TOKEN from the
# container environment; nothing is expanded on the submitting host.
_stage_entrypoint_script() {
  cat <<'ENTRYPOINT_EOF'
#!/bin/bash
set -e
set -o pipefail

echo "=== MCC Model Staging Processing Job ==="
echo "Model: ${MODEL_ID}"
echo "Target: ${S3_OUTPUT_URI}"
echo ""

# Install dependencies (quoted so the brackets are never treated as a glob)
echo "📦 Installing huggingface-cli and hf_transfer..."
pip install -q 'huggingface_hub[cli]' hf_transfer

# Enable fast parallel downloads
export HF_HUB_ENABLE_HF_TRANSFER=1

# The token, when present, is picked up from the HF_TOKEN environment
# variable; it is deliberately not passed on the command line.
if [ -n "${HF_TOKEN:-}" ]; then
  echo "🔐 Using provided HuggingFace token"
fi

# Download model from HuggingFace
echo ""
echo "⬇️ Downloading model: ${MODEL_ID}"
huggingface-cli download "${MODEL_ID}"

echo ""
echo "✅ Download complete"

# Locate downloaded files (model id is read from the environment rather
# than spliced into the Python source)
CACHE_PATH=$(python3 -c '
import os
from huggingface_hub import snapshot_download
print(snapshot_download(os.environ["MODEL_ID"], local_files_only=True))
')

echo "📁 Cache path: ${CACHE_PATH}"

# Sync to S3
echo ""
echo "☁️ Syncing to S3: ${S3_OUTPUT_URI}"
aws s3 sync "${CACHE_PATH}" "${S3_OUTPUT_URI}" \
  --no-progress \
  --exclude "*.lock" \
  --exclude ".gitattributes"

echo ""
echo "✅ Model staged successfully to: ${S3_OUTPUT_URI}"
ENTRYPOINT_EOF
}

# Submit a SageMaker Processing Job that downloads the model from HuggingFace
# and syncs it to S3, then (unless NO_WAIT is true) poll it to completion.
# Globals (read): MODEL_NAME, MODEL_S3_URI, PROJECT_NAME, STAGE_S3_BUCKET,
#                 AWS_REGION, _PROFILE_JSON, NO_WAIT, HF_TOKEN_ARN, HF_TOKEN,
#                 PROCESSING_JOB_INSTANCE_TYPE, PROCESSING_JOB_VOLUME_GB
# Returns: 0 when submitted with --no-wait, otherwise the status of
#          _poll_processing_job. Exits 4 on missing AWS credentials and
#          1 on a missing execution role or a failed create call.
_submit_processing_job() {
  echo "🚀 Submitting SageMaker Processing Job for model staging"
  echo " Model: ${MODEL_NAME}"
  echo " Target: ${MODEL_S3_URI}"
  echo " Instance: ${PROCESSING_JOB_INSTANCE_TYPE}"
  echo " Storage: ${PROCESSING_JOB_VOLUME_GB} GB"
  echo ""

  # Validate AWS credentials
  if ! aws sts get-caller-identity &>/dev/null; then
    echo "❌ AWS credentials not configured or expired."
    echo " Run: aws configure"
    exit 4
  fi

  # Resolve execution role from profile
  local execution_role
  execution_role=$(python3 -c '
import sys, json
print(json.load(sys.stdin).get("executionRoleArn", ""))
' <<<"${_PROFILE_JSON}" 2>/dev/null) || execution_role=""

  if [ -z "${execution_role}" ]; then
    echo "❌ No execution role configured."
    echo " Run 'ml-container-creator bootstrap' to set up your profile."
    echo " The role needs: SageMaker, S3, and Secrets Manager permissions."
    exit 1
  fi

  # Resolve the HF token (optional — for gated models). A configured secret
  # ARN takes precedence over a token already present in the environment.
  # NOTE(review): the token still ends up in the job's Environment map, which
  # is readable through DescribeProcessingJob; consider passing the secret ARN
  # and resolving it inside the container instead.
  local hf_token_value=""
  if [ -n "${HF_TOKEN_ARN:-}" ]; then
    hf_token_value=$(aws secretsmanager get-secret-value \
      --secret-id "${HF_TOKEN_ARN}" \
      --query SecretString --output text 2>/dev/null) || hf_token_value=""
  elif [ -n "${HF_TOKEN:-}" ]; then
    hf_token_value="${HF_TOKEN}"
  fi

  # Build the job name. SageMaker job names are at most 63 chars and must
  # match [a-zA-Z0-9](-*[a-zA-Z0-9])*. Only the project part is truncated so
  # the timestamp, which makes the name unique, always survives:
  # "mlcc-stage-" (11) + project (<=36) + "-" (1) + timestamp (15) = 63.
  local timestamp
  timestamp=$(date +%Y%m%d-%H%M%S)
  local project_part="${PROJECT_NAME//[^a-zA-Z0-9-]/-}"
  project_part="${project_part:0:36}"
  while [[ "${project_part}" == *- ]]; do
    project_part="${project_part%-}"
  done
  local job_name="mlcc-stage-${project_part}-${timestamp}"

  echo " Job name: ${job_name}"
  echo ""

  # Upload the entrypoint script the container will fetch and execute
  local entrypoint_s3_key="staging-jobs/${job_name}/entrypoint.sh"
  local entrypoint_s3_uri="s3://${STAGE_S3_BUCKET}/${entrypoint_s3_key}"

  echo "📤 Uploading entrypoint script..."
  _stage_entrypoint_script | aws s3 cp - "${entrypoint_s3_uri}" --region "${AWS_REGION}"

  # Create the processing job
  # Uses a lightweight Python image with AWS CLI pre-installed
  local container_image="763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.1.0-cpu-py310-ubuntu20.04-sagemaker"

  # Values reach Python through the environment, never through string
  # interpolation, so quotes or commas in them cannot corrupt the request.
  local processing_request
  processing_request=$(
    MLCC_JOB_NAME="${job_name}" \
    MLCC_INSTANCE_TYPE="${PROCESSING_JOB_INSTANCE_TYPE}" \
    MLCC_VOLUME_GB="${PROCESSING_JOB_VOLUME_GB}" \
    MLCC_IMAGE_URI="${container_image}" \
    MLCC_ENTRYPOINT_URI="${entrypoint_s3_uri}" \
    MLCC_ROLE_ARN="${execution_role}" \
    MLCC_MODEL_ID="${MODEL_NAME}" \
    MLCC_S3_OUTPUT_URI="${MODEL_S3_URI}" \
    MLCC_HF_TOKEN="${hf_token_value}" \
    python3 -c '
import json, os

e = os.environ
container_env = {
    "MODEL_ID": e["MLCC_MODEL_ID"],
    "S3_OUTPUT_URI": e["MLCC_S3_OUTPUT_URI"],
}
if e.get("MLCC_HF_TOKEN"):
    container_env["HF_TOKEN"] = e["MLCC_HF_TOKEN"]

job = {
    "ProcessingJobName": e["MLCC_JOB_NAME"],
    "ProcessingResources": {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": e["MLCC_INSTANCE_TYPE"],
            "VolumeSizeInGB": int(e["MLCC_VOLUME_GB"]),
        }
    },
    "AppSpecification": {
        "ImageUri": e["MLCC_IMAGE_URI"],
        "ContainerEntrypoint": ["bash", "-c"],
        "ContainerArguments": [
            "aws s3 cp " + e["MLCC_ENTRYPOINT_URI"]
            + " /tmp/entrypoint.sh && chmod +x /tmp/entrypoint.sh && /tmp/entrypoint.sh"
        ],
    },
    "Environment": container_env,
    "RoleArn": e["MLCC_ROLE_ARN"],
    "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
}

print(json.dumps(job, indent=2))
'
  )

  # The request may carry the HF token, so write it to a private,
  # unpredictable temp file rather than a guessable /tmp path.
  local request_file
  request_file=$(mktemp "${TMPDIR:-/tmp}/mlcc-stage-request.XXXXXX")
  printf '%s\n' "${processing_request}" > "${request_file}"

  echo "🚀 Creating Processing Job: ${job_name}"
  echo ""

  local create_output
  local create_exit_code=0
  create_output=$(aws sagemaker create-processing-job \
    --cli-input-json "file://${request_file}" \
    --region "${AWS_REGION}" 2>&1) || create_exit_code=$?

  rm -f -- "${request_file}"

  if [ "${create_exit_code}" -ne 0 ]; then
    echo "❌ Failed to create Processing Job"
    echo " ${create_output}"
    echo ""
    if grep -q "AccessDeniedException" <<<"${create_output}"; then
      echo " Remediation: ensure the execution role has sagemaker:CreateProcessingJob permission"
    fi
    exit 1
  fi

  echo " ✅ Processing Job submitted: ${job_name}"
  echo ""

  # Handle --no-wait
  if [ "${NO_WAIT}" = true ]; then
    echo " --no-wait specified. Job submitted, exiting without polling."
    echo ""
    echo " Check status:"
    echo " aws sagemaker describe-processing-job --processing-job-name ${job_name} --region ${AWS_REGION}"
    echo ""
    echo " On completion, the staged model will be at:"
    echo " ${MODEL_S3_URI}"
    return 0
  fi

  # Poll for completion
  _poll_processing_job "${job_name}"
}
272
+
273
+ # ── Poll Processing Job status ────────────────────────────────────────────────
274
# Read a JSON object on stdin and print one top-level field.
# Arguments: $1 - field name, $2 - value printed when the field is absent
_poll_json_field() {
  python3 -c '
import sys, json
d = json.load(sys.stdin)
print(d.get(sys.argv[1], sys.argv[2]))
' "$1" "$2" 2>/dev/null
}

# Poll a SageMaker Processing Job until it reaches a terminal state.
# Globals (read): AWS_REGION, POLL_INTERVAL, MODEL_S3_URI, UPDATE_CONFIG,
#                 SCRIPT_DIR
# Arguments: $1 - processing job name
# Returns:   0 when the job Completed, 1 when it Failed, 2 when it was Stopped
_poll_processing_job() {
  local job_name="$1"
  local describe_output describe_exit_code
  local job_status failure_reason now config_file

  echo "⏳ Polling Processing Job status (every ${POLL_INTERVAL}s)..."
  echo " (Ctrl+C to stop polling — job continues in background)"
  echo ""

  while true; do
    # Reset on every iteration: re-declaring a local does not clear its
    # value, so one failed describe call used to poison all later polls.
    describe_exit_code=0
    describe_output=$(aws sagemaker describe-processing-job \
      --processing-job-name "${job_name}" \
      --region "${AWS_REGION}" 2>&1) || describe_exit_code=$?

    if [ "${describe_exit_code}" -ne 0 ]; then
      echo " ⚠️ Failed to describe job (will retry): ${describe_output}"
      sleep "${POLL_INTERVAL}"
      continue
    fi

    # Parse status from response
    job_status=$(_poll_json_field ProcessingJobStatus Unknown <<<"${describe_output}") \
      || job_status="Unknown"
    failure_reason=$(_poll_json_field FailureReason "" <<<"${describe_output}") \
      || failure_reason=""

    # Print status
    now=$(date +%H:%M:%S)
    echo " [${now}] Status: ${job_status}"

    # Handle terminal states; any other status falls through to the sleep
    case "${job_status}" in
      Completed)
        echo ""
        echo "✅ Processing Job completed: ${job_name}"
        echo ""
        echo " S3 URI: ${MODEL_S3_URI}"
        echo ""
        if [ "${UPDATE_CONFIG}" = true ]; then
          config_file="${SCRIPT_DIR}/config"
          # -i.bak plus rm keeps the in-place edit portable across GNU and BSD sed
          sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${config_file}"
          rm -f "${config_file}.bak"
          echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
          echo ""
          echo " Re-deploy with S3-backed model: ./do/deploy"
        else
          echo " To use this staged model, update do/config:"
          echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
          echo ""
          echo " Or re-run with --update-config:"
          echo " ./do/stage --submit --update-config"
        fi
        return 0
        ;;
      Failed)
        echo ""
        echo "❌ Processing Job failed: ${job_name}"
        if [ -n "${failure_reason}" ]; then
          echo " Reason: ${failure_reason}"
        fi
        echo ""
        echo " Check CloudWatch logs:"
        echo " /aws/sagemaker/ProcessingJobs/${job_name}"
        echo ""
        echo " To retry: ./do/stage --submit --force"
        return 1
        ;;
      Stopped)
        echo ""
        echo "⏹️ Processing Job was stopped: ${job_name}"
        echo ""
        echo " To retry: ./do/stage --submit --force"
        return 2
        ;;
    esac

    sleep "${POLL_INTERVAL}"
  done
}
364
+
365
+ # ── Check if model is already an S3 URI ──────────────────────────────────────
366
# Nothing to do when the configured model already points at S3.
if [[ "${MODEL_NAME}" == s3://* ]]; then
  echo "✅ Model is already an S3 URI: ${MODEL_NAME}"
  echo " Nothing to stage."
  exit 0
fi

echo "📦 Staging model: ${MODEL_NAME}"
echo " Project: ${PROJECT_NAME}"
echo ""

# ── Resolve profile for S3 bucket ────────────────────────────────────────────
# Load the active profile from ~/.ml-container-creator/config.json as a JSON
# object; any read or parse problem yields "{}".
_PROFILE_JSON=""
if command -v python3 &>/dev/null; then
  _PROFILE_JSON=$(python3 -c '
import json, os
config_path = os.path.expanduser("~/.ml-container-creator/config.json")
try:
    with open(config_path) as f:
        config = json.load(f)
    print(json.dumps(config["profiles"][config["activeProfile"]]))
except Exception:
    print("{}")
' 2>/dev/null) || _PROFILE_JSON="{}"
fi

# Extract the benchmark S3 bucket from profile (used for model staging).
# The conventional bucket name is only derived when the account id is known.
# Falling back to an "...-unknown" bucket would bypass the guard below and
# direct uploads (including the entrypoint script that --submit later
# executes) at a bucket name that does not belong to this account.
STAGE_S3_BUCKET=$(python3 -c '
import sys, json
p = json.load(sys.stdin)
bucket = p.get("benchmarkS3Bucket", "")
acct = p.get("accountId", "")
if not bucket and acct:
    region = p.get("awsRegion", "us-east-1")
    bucket = f"ml-container-creator-benchmark-{region}-{acct}"
print(bucket)
' <<<"${_PROFILE_JSON}" 2>/dev/null) || STAGE_S3_BUCKET=""

if [ -z "${STAGE_S3_BUCKET}" ]; then
  echo "❌ Could not determine S3 bucket for staging."
  echo " Run 'ml-container-creator bootstrap' to set up your profile."
  exit 1
fi

# Target S3 path for staged model
MODEL_S3_URI="s3://${STAGE_S3_BUCKET}/models/${PROJECT_NAME}/"

echo " Target: ${MODEL_S3_URI}"
echo ""

# ── Submit mode: SageMaker Processing Job ─────────────────────────────────────
# For very large models (>500GB) that exceed local disk, submit a Processing Job
# with 2TB attached storage. The job downloads from HuggingFace and syncs to S3.
if [ "${SUBMIT_MODE}" = true ]; then
  _submit_processing_job
  exit $?
fi
423
+
424
+ # ── Idempotency: check if model is already staged ────────────────────────────
425
# Skip all work when a previous run already staged the model: the presence of
# config.json under the target prefix is the marker. --force bypasses this.
if [[ "${FORCE}" == false ]] \
  && aws s3 ls "${MODEL_S3_URI}config.json" --region "${AWS_REGION}" &>/dev/null; then
  echo "✅ Model already staged at: ${MODEL_S3_URI}"
  echo " Use --force to re-stage."
  echo ""
  if [[ "${UPDATE_CONFIG}" == true ]]; then
    CONFIG_FILE="${SCRIPT_DIR}/config"
    # Rewrite the MODEL_NAME export in place, then drop sed's backup copy.
    sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
    rm -f "${CONFIG_FILE}.bak"
    echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
  else
    echo " To use this staged model, set in do/config:"
    echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
  fi
  exit 0
fi

# ── Validate prerequisites ───────────────────────────────────────────────────
# Both CLIs must be on PATH before a local download is attempted.
command -v huggingface-cli &>/dev/null || {
  echo "❌ huggingface-cli is not installed"
  echo " Install: pip install huggingface_hub[cli] hf_transfer"
  exit 2
}

command -v aws &>/dev/null || {
  echo "❌ AWS CLI is not installed"
  echo " Install: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html"
  exit 2
}

# Validate AWS credentials
aws sts get-caller-identity &>/dev/null || {
  echo "❌ AWS credentials not configured or expired."
  echo " Run: aws configure"
  exit 4
}

# ── Resolve HuggingFace token (for gated models) ─────────────────────────────
# Only consult Secrets Manager when an ARN is configured and no token was
# supplied directly; a failed lookup is non-fatal.
if [[ -n "${HF_TOKEN_ARN:-}" && -z "${HF_TOKEN:-}" ]]; then
  echo "🔐 Resolving HuggingFace token from Secrets Manager..."
  if ! HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text); then
    echo "⚠️ Failed to resolve HF token from Secrets Manager (continuing without token)"
    HF_TOKEN=""
  fi
  export HF_TOKEN
fi
472
+
473
# ── Download model from HuggingFace ──────────────────────────────────────────
echo "⬇️ Downloading model from HuggingFace: ${MODEL_NAME}"
echo " Using hf_transfer for fast parallel downloads..."
echo ""

# Enable fast parallel downloads via hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1

# Hand the token to huggingface-cli through the environment instead of
# --token, so it does not show up in the process list.
if [ -n "${HF_TOKEN:-}" ]; then
  export HF_TOKEN
fi

# Download to HF cache (huggingface-cli manages cache location)
if ! huggingface-cli download "${MODEL_NAME}"; then
  echo "❌ Failed to download model from HuggingFace: ${MODEL_NAME}"
  echo ""
  echo "Possible causes:"
  echo " • Model name is incorrect"
  echo " • Model is gated and requires HF_TOKEN"
  echo " • Network connectivity issues"
  exit 3
fi

echo ""
echo "✅ Download complete"

# ── Locate downloaded files in HF cache ───────────────────────────────────────
# huggingface-cli downloads to ~/.cache/huggingface/hub/models--<org>--<name>/snapshots/<rev>/
# The model name is passed through the environment rather than spliced into
# the Python source.
HF_CACHE_DIR=$(MLCC_MODEL_ID="${MODEL_NAME}" python3 -c '
import os
from huggingface_hub import snapshot_download
print(snapshot_download(os.environ["MLCC_MODEL_ID"], local_files_only=True))
' 2>/dev/null) || HF_CACHE_DIR=""

if [ -z "${HF_CACHE_DIR}" ] || [ ! -d "${HF_CACHE_DIR}" ]; then
  # Fallback: construct the path manually. Each "/" in the model id becomes
  # "--" in the cache directory name; tr cannot express a one-to-two character
  # mapping (and GNU tr parses "--" as end-of-options), so use expansion.
  MODEL_DIR_NAME="${MODEL_NAME//\//--}"
  HF_SNAPSHOTS_DIR="${HOME}/.cache/huggingface/hub/models--${MODEL_DIR_NAME}/snapshots"
  HF_CACHE_DIR=""
  # Use the latest snapshot (newest modification time), without parsing ls
  if [ -d "${HF_SNAPSHOTS_DIR}" ]; then
    for snapshot_dir in "${HF_SNAPSHOTS_DIR}"/*/; do
      [ -d "${snapshot_dir}" ] || continue
      if [ -z "${HF_CACHE_DIR}" ] || [ "${snapshot_dir}" -nt "${HF_CACHE_DIR}" ]; then
        HF_CACHE_DIR="${snapshot_dir}"
      fi
    done
  fi
fi

if [ -z "${HF_CACHE_DIR}" ] || [ ! -d "${HF_CACHE_DIR}" ]; then
  echo "❌ Could not locate downloaded model files in HuggingFace cache"
  echo " Expected location: ~/.cache/huggingface/hub/models--${MODEL_NAME//\//--}/snapshots/"
  exit 3
fi

echo "📁 Model cache: ${HF_CACHE_DIR}"

# ── Sync to S3 ───────────────────────────────────────────────────────────────
echo ""
echo "☁️ Syncing model to S3: ${MODEL_S3_URI}"
echo " This may take a while for large models..."
echo ""

if ! aws s3 sync "${HF_CACHE_DIR}" "${MODEL_S3_URI}" \
  --region "${AWS_REGION}" \
  --no-progress \
  --exclude "*.lock" \
  --exclude ".gitattributes"; then
  echo "❌ Failed to sync model to S3"
  echo ""
  echo "Possible causes:"
  echo " • Missing S3 write permissions (s3:PutObject)"
  echo " • Bucket does not exist (run 'ml-container-creator bootstrap')"
  echo " • Network connectivity issues"
  exit 4
fi

echo ""
echo "✅ Model staged successfully!"
echo ""
echo " S3 URI: ${MODEL_S3_URI}"
echo ""
if [ "${UPDATE_CONFIG}" = true ]; then
  CONFIG_FILE="${SCRIPT_DIR}/config"
  # -i.bak plus rm keeps the in-place edit portable across GNU and BSD sed
  sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
  rm -f "${CONFIG_FILE}.bak"
  echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
  echo ""
  echo " Re-deploy with S3-backed model: ./do/deploy"
else
  echo " To use this staged model, update do/config:"
  echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
  echo ""
  echo " Or re-run with --update-config to do it automatically:"
  echo " ./do/stage --update-config"
  echo ""
  echo " Then re-deploy: ./do/deploy"
fi
@@ -9,6 +9,13 @@ set -o pipefail
9
9
  # Source configuration
10
10
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
11
  source "${SCRIPT_DIR}/config"
12
+ source "${SCRIPT_DIR}/lib/profile.sh"
13
+
14
+ # ── Profile-resolved variables (env var > profile > default) ──────────────────
15
+ ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
16
+
17
+ # ── Derived variables (env var > computed default) ────────────────────────────
18
+ CODEBUILD_PROJECT_NAME="${CODEBUILD_PROJECT_NAME:-${PROJECT_NAME}-build-$(date +%Y%m%d)}"
12
19
 
13
20
  echo "🚀 Submitting CodeBuild job for ${PROJECT_NAME}"
14
21
  echo " Deployment config: ${DEPLOYMENT_CONFIG}"
package/templates/do/test CHANGED
@@ -9,6 +9,7 @@ set -o pipefail
9
9
  # Source configuration
10
10
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
11
  source "${SCRIPT_DIR}/config"
12
+ source "${SCRIPT_DIR}/lib/profile.sh"
12
13
 
13
14
  <% if (deploymentTarget === 'realtime-inference') { %>
14
15
  # ============================================================
package/templates/do/tune CHANGED
@@ -13,6 +13,10 @@ set -o pipefail
13
13
  # ── Source project configuration ──────────────────────────────────────────────
14
14
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
15
15
  source "${SCRIPT_DIR}/config"
16
+ source "${SCRIPT_DIR}/lib/profile.sh"
17
+
18
+ # ── Profile-resolved variables (env var > profile > default) ──────────────────
19
+ TUNE_S3_BUCKET="${TUNE_S3_BUCKET:-mlcc-tune-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
16
20
 
17
21
  # ── Constants ─────────────────────────────────────────────────────────────────
18
22
  CATALOG_FILE="${SCRIPT_DIR}/.tune_catalog.json"