@aws/ml-container-creator 0.13.3 → 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +23 -5
  2. package/infra/ci-harness/package-lock.json +1 -5
  3. package/package.json +5 -3
  4. package/pyproject.toml +21 -0
  5. package/requirements.txt +19 -0
  6. package/servers/instance-sizer/lib/model-resolver.js +127 -185
  7. package/servers/instance-sizer/lib/vram-estimator.js +86 -0
  8. package/servers/lib/catalogs/instances.json +0 -27
  9. package/src/app.js +2 -0
  10. package/src/lib/bootstrap-command-handler.js +35 -25
  11. package/src/lib/generated/cli-options.js +1 -1
  12. package/src/lib/generated/parameter-matrix.js +1 -1
  13. package/src/lib/generated/validation-rules.js +1 -1
  14. package/src/lib/prompt-runner.js +14 -31
  15. package/templates/IAM_PERMISSIONS.md +64 -13
  16. package/templates/do/.adapter_helper.py +451 -0
  17. package/templates/do/.benchmark_writer.py +13 -0
  18. package/templates/do/.stage_helper.py +419 -0
  19. package/templates/do/.tune_helper.py +218 -67
  20. package/templates/do/README.md +50 -604
  21. package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
  22. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  23. package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
  24. package/templates/do/adapter +109 -4
  25. package/templates/do/benchmark +150 -12
  26. package/templates/do/build +2 -5
  27. package/templates/do/clean.d/async-inference.ejs +2 -5
  28. package/templates/do/clean.d/batch-transform.ejs +2 -5
  29. package/templates/do/clean.d/hyperpod-eks.ejs +2 -5
  30. package/templates/do/clean.d/managed-inference.ejs +2 -5
  31. package/templates/do/config +4 -0
  32. package/templates/do/deploy.d/async-inference.ejs +6 -9
  33. package/templates/do/deploy.d/batch-transform.ejs +4 -7
  34. package/templates/do/deploy.d/hyperpod-eks.ejs +1 -4
  35. package/templates/do/deploy.d/managed-inference.ejs +15 -6
  36. package/templates/do/lib/profile.sh +24 -15
  37. package/templates/do/push +2 -5
  38. package/templates/do/register +2 -5
  39. package/templates/do/stage +114 -292
  40. package/templates/do/submit +1 -4
  41. package/templates/do/tune +64 -10
  42. package/templates/MIGRATION.md +0 -488
  43. package/templates/TEMPLATE_SYSTEM.md +0 -243
@@ -1,21 +1,27 @@
1
1
  #!/usr/bin/env bash
2
- # Profile loader — reads active bootstrap profile into _PROFILE[] associative array.
2
+ # Profile loader — reads active bootstrap profile into _PROFILE_<key> variables.
3
3
  # Source this file after do/config. Values provide defaults; explicit env vars take precedence.
4
4
  #
5
- # Requires bash 4+ for associative array support.
6
- # macOS ships with bash 3.2 — install bash 4+ via Homebrew: brew install bash
5
+ # Portable across bash versions: works on bash 3.2+ (macOS default) and bash 4+/5+.
6
+ # No associative arrays required.
7
7
  #
8
- # Expected keys in _PROFILE:
8
+ # After sourcing, access values via:
9
+ # ${_PROFILE_roleArn:-}
10
+ # ${_PROFILE_ecrRepositoryName:-ml-container-creator}
11
+ # ${_PROFILE_awsRegion:-us-east-1}
12
+ # ${_PROFILE_accountId:-}
13
+ # ${_PROFILE_benchmarkS3Bucket:-}
14
+ # ${_PROFILE_asyncS3Bucket:-}
15
+ # ${_PROFILE_batchS3Bucket:-}
16
+ #
17
+ # Expected keys (set as _PROFILE_<key>):
9
18
  # awsRegion, accountId, awsProfile, roleArn, ecrRepositoryName,
10
19
  # benchmarkS3Bucket, ciBenchmarkResultsBucket, asyncS3Bucket, batchS3Bucket,
11
20
  # ciTableName, ciInfraProvisioned
12
21
 
13
22
  # Temporarily disable unbound variable checking for profile loading
14
- # (keys may not exist in the profile config, and declare -A behavior
15
- # varies across bash versions with set -u)
16
23
  set +u 2>/dev/null || true
17
24
 
18
- declare -A _PROFILE 2>/dev/null || true
19
25
  if command -v python3 &>/dev/null; then
20
26
  _PROFILE_RAW=$(python3 -c "
21
27
  import json, os
@@ -23,22 +29,25 @@ try:
23
29
  with open(os.path.expanduser('~/.ml-container-creator/config.json')) as f:
24
30
  c = json.load(f)
25
31
  p = c['profiles'][c['activeProfile']]
26
- # Output as KEY=VALUE lines (simple, no JSON parsing in bash)
32
+ # Output as _PROFILE_KEY=VALUE lines for eval; key names are validated below, but values are not shell-escaped
27
33
  for k, v in p.items():
28
34
  if isinstance(v, (str, int, float, bool)):
29
- print(f'{k}={v}')
35
+ # Sanitize: only allow key names made of alphanumerics and underscores
36
+ if k.isalnum() or all(c.isalnum() or c == '_' for c in k):
37
+ print(f'_PROFILE_{k}=\"{v}\"')
30
38
  except:
31
39
  pass
32
40
  " 2>/dev/null) || _PROFILE_RAW=""
33
41
 
34
42
  if [ -n "${_PROFILE_RAW}" ]; then
35
- while IFS='=' read -r key value; do
36
- [ -n "${key}" ] && _PROFILE["${key}"]="${value}"
37
- done <<< "${_PROFILE_RAW}"
43
+ eval "${_PROFILE_RAW}"
38
44
  fi
39
45
  fi
40
46
 
47
+ # Map commonly-used profile values to the variable names scripts expect.
48
+ # Explicit env vars take precedence (${X:-...} pattern).
49
+ ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
50
+ CI_BENCHMARK_RESULTS_BUCKET="${CI_BENCHMARK_RESULTS_BUCKET:-${_PROFILE_ciBenchmarkResultsBucket:-}}"
51
+
41
52
  # NOTE: set -u is NOT re-enabled here. The caller is responsible for managing
42
- # their own shell options. Re-enabling set -u would cause "unbound variable"
43
- # errors when accessing _PROFILE keys on bash versions where empty associative
44
- # arrays are treated as unset (bash 5.x on some platforms).
53
+ # their own shell options.
package/templates/do/push CHANGED
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
12
12
  source "${SCRIPT_DIR}/lib/profile.sh"
13
13
 
14
14
  # ── Profile-resolved variables (env var > profile > default) ──────────────────
15
- # Disable unbound-variable checking for associative array access (bash 3.2 compat)
16
- set +u
17
- ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
18
- export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
19
- set -u
15
+ ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
16
+ export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
20
17
 
21
18
  echo "🚀 Pushing Docker image to Amazon ECR"
22
19
  echo " Project: ${PROJECT_NAME}"
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
12
12
  source "${SCRIPT_DIR}/lib/profile.sh"
13
13
 
14
14
  # ── Profile-resolved variables (env var > profile > default) ──────────────────
15
- # Disable unbound-variable checking for associative array access (bash 3.2 compat)
16
- set +u
17
- ROLE_ARN="${ROLE_ARN:-${_PROFILE[roleArn]:-}}"
18
- ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
19
- set -u
15
+ ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
16
+ ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
20
17
 
21
18
  # ============================================================
22
19
  # Register deployment to the deployment registry
@@ -3,18 +3,17 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  # do/stage — Pre-stage model weights from HuggingFace to S3
6
- # Downloads the model using huggingface-cli and syncs to S3 so that
7
- # vLLM can load directly from S3 at deploy time (fast cold-start).
6
+ # Submits a SageMaker Processing Job that downloads from HuggingFace
7
+ # and writes directly to S3 — no local disk usage.
8
8
  #
9
9
  # Idempotent: if the model is already staged (config.json exists at
10
10
  # the target S3 path), the script exits early.
11
11
  #
12
12
  # Usage:
13
- # ./do/stage Stage model to S3
13
+ # ./do/stage Submit Processing Job to stage model (default)
14
+ # ./do/stage --local Download locally then sync to S3
15
+ # ./do/stage --no-wait Submit and exit without polling
14
16
  # ./do/stage --force Re-stage even if already present in S3
15
- # ./do/stage --update-config Stage and update MODEL_NAME in do/config
16
- # ./do/stage --submit Submit as SageMaker Processing Job (for models >500GB)
17
- # ./do/stage --submit --no-wait Submit and exit without polling
18
17
 
19
18
  set -e
20
19
  set -u
@@ -28,47 +27,46 @@ source "${SCRIPT_DIR}/lib/staged-assets.sh"
28
27
 
29
28
  # ── Parse flags ───────────────────────────────────────────────────────────────
30
29
  FORCE=false
31
- UPDATE_CONFIG=false
32
- SUBMIT_MODE=false
30
+ UPDATE_CONFIG=true
31
+ LOCAL_MODE=false
33
32
  NO_WAIT=false
34
33
  while [ $# -gt 0 ]; do
35
34
  case "$1" in
36
35
  --force) FORCE=true; shift ;;
37
- --update-config) UPDATE_CONFIG=true; shift ;;
38
- --submit) SUBMIT_MODE=true; shift ;;
36
+ --update-config) UPDATE_CONFIG=true; shift ;; # default, kept for backward compat
37
+ --no-update-config) UPDATE_CONFIG=false; shift ;;
38
+ --local) LOCAL_MODE=true; shift ;;
39
+ --submit) shift ;; # Deprecated — now the default; kept for backward compat
39
40
  --no-wait) NO_WAIT=true; shift ;;
40
41
  --help|-h)
41
- echo "Usage: ./do/stage [--force] [--update-config] [--submit] [--no-wait]"
42
+ echo "Usage: ./do/stage [--force] [--local] [--no-wait] [--no-update-config]"
42
43
  echo ""
43
44
  echo "Pre-stage model weights from HuggingFace to S3."
45
+ echo "On success, updates MODEL_NAME in do/config so subsequent tasks"
46
+ echo "(submit, deploy) pull from S3 with HuggingFace as fallback."
44
47
  echo ""
45
48
  echo "Modes:"
46
- echo " (default) Download locally then sync to S3"
47
- echo " --submit Submit as SageMaker Processing Job (for models >500GB)"
49
+ echo " (default) Submit SageMaker Processing Job (no local disk usage)"
50
+ echo " --local Download locally then sync to S3 (legacy behavior)"
51
+ echo " --submit Deprecated — Processing Job is now the default"
48
52
  echo ""
49
53
  echo "Options:"
50
- echo " --force Re-stage even if model already exists in S3"
51
- echo " --update-config Update MODEL_NAME in do/config to the staged S3 URI"
52
- echo " --no-wait (with --submit) Exit without polling for completion"
54
+ echo " --force Re-stage even if model already exists in S3"
55
+ echo " --no-update-config Do NOT update MODEL_NAME in do/config after staging"
56
+ echo " --no-wait Return immediately with job name (Processing Job mode)"
53
57
  echo ""
54
58
  echo "Environment:"
55
59
  echo " HF_TOKEN HuggingFace token (for gated models)"
56
60
  echo ""
57
61
  echo "The staged S3 URI will be printed on completion."
58
- echo "Pass --update-config to automatically update do/config for S3-backed deploys."
59
- echo ""
60
- echo "The --submit mode uses a SageMaker Processing Job with 2TB attached"
61
- echo "storage, suitable for very large models that exceed local disk capacity."
62
+ echo "MODEL_NAME in do/config is updated automatically unless --no-update-config is passed."
62
63
  exit 0
63
64
  ;;
64
65
  *) shift ;;
65
66
  esac
66
67
  done
67
68
 
68
- # ── Processing Job submission function ────────────────────────────────────────
69
- # Submits a SageMaker Processing Job that downloads model weights from HuggingFace
70
- # and syncs them to S3. Uses 2TB attached storage to handle any model size.
71
- POLL_INTERVAL=30
69
+ # ── Processing Job submission via .stage_helper.py ────────────────────────────
72
70
  PROCESSING_JOB_INSTANCE_TYPE="ml.m5.xlarge"
73
71
  PROCESSING_JOB_VOLUME_GB=2048
74
72
 
@@ -80,19 +78,12 @@ _submit_processing_job() {
80
78
  echo " Storage: ${PROCESSING_JOB_VOLUME_GB} GB"
81
79
  echo ""
82
80
 
83
- # Validate AWS credentials
84
- if ! aws sts get-caller-identity &>/dev/null; then
85
- echo "❌ AWS credentials not configured or expired."
86
- echo " Run: aws configure"
87
- exit 4
88
- fi
89
-
90
81
  # Resolve execution role from profile
91
82
  local execution_role
92
83
  execution_role=$(echo "${_PROFILE_JSON}" | python3 -c "
93
84
  import sys, json
94
85
  p = json.load(sys.stdin)
95
- print(p.get('executionRoleArn', ''))
86
+ print(p.get('roleArn', ''))
96
87
  " 2>/dev/null) || execution_role=""
97
88
 
98
89
  if [ -z "${execution_role}" ]; then
@@ -102,266 +93,88 @@ print(p.get('executionRoleArn', ''))
102
93
  exit 1
103
94
  fi
104
95
 
105
- # Resolve HF token ARN for the processing job (optional — for gated models)
96
+ # Resolve HF token (optional — for gated models)
97
+ local hf_token_value=""
106
98
  local hf_token_secret_arn="${HF_TOKEN_ARN:-}"
107
-
108
- # Generate job name with timestamp
109
- local timestamp
110
- timestamp=$(date +%Y%m%d-%H%M%S)
111
- local job_name="mlcc-stage-${PROJECT_NAME}-${timestamp}"
112
- # SageMaker job names max 63 chars, must match [a-zA-Z0-9](-*[a-zA-Z0-9])*
113
- job_name=$(echo "${job_name}" | cut -c1-63 | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/-*$//')
114
-
115
- echo " Job name: ${job_name}"
116
- echo ""
117
-
118
- # Build the entrypoint script that runs inside the processing container
119
- local entrypoint_script
120
- entrypoint_script=$(cat <<'ENTRYPOINT_EOF'
121
- #!/bin/bash
122
- set -e
123
- set -o pipefail
124
-
125
- echo "=== MCC Model Staging Processing Job ==="
126
- echo "Model: ${MODEL_ID}"
127
- echo "Target: ${S3_OUTPUT_URI}"
128
- echo ""
129
-
130
- # Install dependencies
131
- echo "📦 Installing huggingface-cli and hf_transfer..."
132
- pip install -q huggingface_hub[cli] hf_transfer
133
-
134
- # Enable fast parallel downloads
135
- export HF_HUB_ENABLE_HF_TRANSFER=1
136
-
137
- # Set HF token if provided
138
- if [ -n "${HF_TOKEN:-}" ]; then
139
- echo "🔐 Using provided HuggingFace token"
140
- fi
141
-
142
- # Download model from HuggingFace
143
- echo ""
144
- echo "⬇️ Downloading model: ${MODEL_ID}"
145
- DOWNLOAD_ARGS="${MODEL_ID}"
146
- if [ -n "${HF_TOKEN:-}" ]; then
147
- DOWNLOAD_ARGS="${DOWNLOAD_ARGS} --token ${HF_TOKEN}"
148
- fi
149
- huggingface-cli download ${DOWNLOAD_ARGS}
150
-
151
- echo ""
152
- echo "✅ Download complete"
153
-
154
- # Locate downloaded files
155
- CACHE_PATH=$(python3 -c "
156
- from huggingface_hub import snapshot_download
157
- path = snapshot_download('${MODEL_ID}', local_files_only=True)
158
- print(path)
159
- ")
160
-
161
- echo "📁 Cache path: ${CACHE_PATH}"
162
-
163
- # Sync to S3
164
- echo ""
165
- echo "☁️ Syncing to S3: ${S3_OUTPUT_URI}"
166
- aws s3 sync "${CACHE_PATH}" "${S3_OUTPUT_URI}" \
167
- --no-progress \
168
- --exclude "*.lock" \
169
- --exclude ".gitattributes"
170
-
171
- echo ""
172
- echo "✅ Model staged successfully to: ${S3_OUTPUT_URI}"
173
- ENTRYPOINT_EOF
174
- )
175
-
176
- # Build environment variables for the container
177
- local env_vars="MODEL_ID=${MODEL_NAME},S3_OUTPUT_URI=${MODEL_S3_URI}"
178
99
  if [ -n "${hf_token_secret_arn}" ]; then
179
- # Resolve token and pass as env var to the job
180
- local hf_token_value=""
181
100
  hf_token_value=$(aws secretsmanager get-secret-value \
182
101
  --secret-id "${hf_token_secret_arn}" \
183
102
  --query SecretString --output text 2>/dev/null) || hf_token_value=""
184
- if [ -n "${hf_token_value}" ]; then
185
- env_vars="${env_vars},HF_TOKEN=${hf_token_value}"
186
- fi
187
103
  elif [ -n "${HF_TOKEN:-}" ]; then
188
- env_vars="${env_vars},HF_TOKEN=${HF_TOKEN}"
104
+ hf_token_value="${HF_TOKEN}"
189
105
  fi
190
106
 
191
- # Write entrypoint to a temp file for the processing job input
192
- local entrypoint_s3_key="staging-jobs/${job_name}/entrypoint.sh"
193
- local entrypoint_s3_uri="s3://${STAGE_S3_BUCKET}/${entrypoint_s3_key}"
194
-
195
- echo "📤 Uploading entrypoint script..."
196
- echo "${entrypoint_script}" | aws s3 cp - "${entrypoint_s3_uri}" --region "${AWS_REGION}"
197
-
198
- # Create the processing job
199
- # Uses a lightweight Python image with AWS CLI pre-installed
200
- local container_image="763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.1.0-cpu-py310-ubuntu20.04-sagemaker"
201
-
202
- local processing_request
203
- processing_request=$(python3 -c "
204
- import json, sys
205
-
206
- job = {
207
- 'ProcessingJobName': '${job_name}',
208
- 'ProcessingResources': {
209
- 'ClusterConfig': {
210
- 'InstanceCount': 1,
211
- 'InstanceType': '${PROCESSING_JOB_INSTANCE_TYPE}',
212
- 'VolumeSizeInGB': ${PROCESSING_JOB_VOLUME_GB}
213
- }
214
- },
215
- 'AppSpecification': {
216
- 'ImageUri': '${container_image}',
217
- 'ContainerEntrypoint': ['bash', '-c'],
218
- 'ContainerArguments': ['aws s3 cp ${entrypoint_s3_uri} /tmp/entrypoint.sh && chmod +x /tmp/entrypoint.sh && /tmp/entrypoint.sh']
219
- },
220
- 'Environment': dict(item.split('=', 1) for item in '${env_vars}'.split(',')),
221
- 'RoleArn': '${execution_role}',
222
- 'StoppingCondition': {
223
- 'MaxRuntimeInSeconds': 86400
224
- }
225
- }
226
-
227
- print(json.dumps(job, indent=2))
228
- ")
229
-
230
- # Write request JSON to temp file
231
- local request_file="/tmp/mlcc-stage-request-${timestamp}.json"
232
- echo "${processing_request}" > "${request_file}"
233
-
234
- echo "🚀 Creating Processing Job: ${job_name}"
235
- echo ""
236
-
237
- local create_output
238
- local create_exit_code
239
- create_output=$(aws sagemaker create-processing-job \
240
- --cli-input-json "file://${request_file}" \
241
- --region "${AWS_REGION}" 2>&1) || create_exit_code=$?
242
- create_exit_code=${create_exit_code:-0}
107
+ # Build helper arguments
108
+ local helper_args=(
109
+ submit
110
+ --model-name "${MODEL_NAME}"
111
+ --bucket "${STAGE_S3_BUCKET}"
112
+ --project "${PROJECT_NAME}"
113
+ --role-arn "${execution_role}"
114
+ --region "${AWS_REGION}"
115
+ --instance-type "${PROCESSING_JOB_INSTANCE_TYPE}"
116
+ --volume-size-gb "${PROCESSING_JOB_VOLUME_GB}"
117
+ )
118
+ if [ -n "${hf_token_value}" ]; then
119
+ helper_args+=(--hf-token "${hf_token_value}")
120
+ fi
121
+ if [ "${FORCE}" = true ]; then
122
+ helper_args+=(--force)
123
+ fi
124
+ if [ "${NO_WAIT}" = true ]; then
125
+ helper_args+=(--no-wait)
126
+ fi
243
127
 
244
- rm -f "${request_file}"
128
+ # Call .stage_helper.py (sagemaker-core ProcessingJob.create())
129
+ # stdout = JSON result, stderr = progress messages (piped to user)
130
+ local json_output
131
+ local helper_exit_code=0
132
+ json_output=$(python3 "${SCRIPT_DIR}/.stage_helper.py" "${helper_args[@]}") || helper_exit_code=$?
245
133
 
246
- if [ ${create_exit_code} -ne 0 ]; then
247
- echo "❌ Failed to create Processing Job"
248
- echo " ${create_output}"
134
+ if [ ${helper_exit_code} -ne 0 ]; then
249
135
  echo ""
250
- if echo "${create_output}" | grep -q "AccessDeniedException"; then
251
- echo " Remediation: ensure the execution role has sagemaker:CreateProcessingJob permission"
252
- fi
253
- exit 1
136
+ echo " Processing Job failed"
137
+ echo " To retry: ./do/stage --force"
138
+ exit ${helper_exit_code}
254
139
  fi
255
140
 
256
- echo " ✅ Processing Job submitted: ${job_name}"
257
- echo ""
141
+ # Parse JSON output
142
+ local job_status
143
+ local job_name
144
+ local s3_uri
145
+ job_status=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null) || job_status=""
146
+ job_name=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('job_name',''))" 2>/dev/null) || job_name=""
147
+ s3_uri=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('s3_uri',''))" 2>/dev/null) || s3_uri="${MODEL_S3_URI}"
258
148
 
259
- # Handle --no-wait
260
- if [ "${NO_WAIT}" = true ]; then
149
+ if [ "${job_status}" = "AlreadyStaged" ]; then
150
+ echo "✅ Model already staged at: ${s3_uri}"
151
+ echo " Use --force to re-stage."
152
+ elif [ "${job_status}" = "Submitted" ]; then
153
+ echo " ✅ Processing Job submitted: ${job_name}"
154
+ echo ""
261
155
  echo " --no-wait specified. Job submitted, exiting without polling."
262
156
  echo ""
263
157
  echo " Check status:"
264
- echo " aws sagemaker describe-processing-job --processing-job-name ${job_name} --region ${AWS_REGION}"
158
+ echo " python3 ${SCRIPT_DIR}/.stage_helper.py status --job-name ${job_name}"
265
159
  echo ""
266
160
  echo " On completion, the staged model will be at:"
267
- echo " ${MODEL_S3_URI}"
268
- return 0
161
+ echo " ${s3_uri}"
162
+ elif [ "${job_status}" = "Completed" ]; then
163
+ echo ""
164
+ echo "✅ Processing Job completed: ${job_name}"
165
+ echo ""
166
+ echo " S3 URI: ${s3_uri}"
269
167
  fi
270
168
 
271
- # Poll for completion
272
- _poll_processing_job "${job_name}"
273
- }
274
-
275
- # ── Poll Processing Job status ────────────────────────────────────────────────
276
- _poll_processing_job() {
277
- local job_name="$1"
278
-
279
- echo "⏳ Polling Processing Job status (every ${POLL_INTERVAL}s)..."
280
- echo " (Ctrl+C to stop polling — job continues in background)"
281
- echo ""
282
-
283
- while true; do
284
- local describe_output
285
- local describe_exit_code
286
- describe_output=$(aws sagemaker describe-processing-job \
287
- --processing-job-name "${job_name}" \
288
- --region "${AWS_REGION}" 2>&1) || describe_exit_code=$?
289
- describe_exit_code=${describe_exit_code:-0}
290
-
291
- if [ ${describe_exit_code} -ne 0 ]; then
292
- echo " ⚠️ Failed to describe job (will retry): ${describe_output}"
293
- sleep "${POLL_INTERVAL}"
294
- continue
295
- fi
296
-
297
- # Parse status from response
298
- local job_status
299
- local failure_reason
300
- job_status=$(echo "${describe_output}" | python3 -c "
301
- import sys, json
302
- d = json.load(sys.stdin)
303
- print(d.get('ProcessingJobStatus', 'Unknown'))
304
- " 2>/dev/null) || job_status="Unknown"
305
-
306
- failure_reason=$(echo "${describe_output}" | python3 -c "
307
- import sys, json
308
- d = json.load(sys.stdin)
309
- print(d.get('FailureReason', ''))
310
- " 2>/dev/null) || failure_reason=""
311
-
312
- # Print status
313
- local now
314
- now=$(date +%H:%M:%S)
315
- echo " [${now}] Status: ${job_status}"
316
-
317
- # Handle terminal states
318
- case "${job_status}" in
319
- Completed)
320
- echo ""
321
- echo "✅ Processing Job completed: ${job_name}"
322
- echo ""
323
- echo " S3 URI: ${MODEL_S3_URI}"
324
- echo ""
325
- if [ "${UPDATE_CONFIG}" = true ]; then
326
- CONFIG_FILE="${SCRIPT_DIR}/config"
327
- sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
328
- rm -f "${CONFIG_FILE}.bak"
329
- echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
330
- echo ""
331
- echo " Re-deploy with S3-backed model: ./do/deploy"
332
- else
333
- echo " To use this staged model, update do/config:"
334
- echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
335
- echo ""
336
- echo " Or re-run with --update-config:"
337
- echo " ./do/stage --submit --update-config"
338
- fi
339
- return 0
340
- ;;
341
- Failed)
342
- echo ""
343
- echo "❌ Processing Job failed: ${job_name}"
344
- if [ -n "${failure_reason}" ]; then
345
- echo " Reason: ${failure_reason}"
346
- fi
347
- echo ""
348
- echo " Check CloudWatch logs:"
349
- echo " /aws/sagemaker/ProcessingJobs/${job_name}"
350
- echo ""
351
- echo " To retry: ./do/stage --submit --force"
352
- return 1
353
- ;;
354
- Stopped)
355
- echo ""
356
- echo "⏹️ Processing Job was stopped: ${job_name}"
357
- echo ""
358
- echo " To retry: ./do/stage --submit --force"
359
- return 2
360
- ;;
361
- esac
362
-
363
- sleep "${POLL_INTERVAL}"
364
- done
169
+ # Update config if requested and we have a valid S3 URI
170
+ if [ "${UPDATE_CONFIG}" = true ] && [ -n "${s3_uri}" ] && [ "${job_status}" != "Submitted" ]; then
171
+ CONFIG_FILE="${SCRIPT_DIR}/config"
172
+ sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${s3_uri}\"|" "${CONFIG_FILE}"
173
+ rm -f "${CONFIG_FILE}.bak"
174
+ echo ""
175
+ echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
176
+ echo " Subsequent tasks (submit, deploy) will pull from S3."
177
+ fi
365
178
  }
366
179
 
367
180
  # ── Check if model is already an S3 URI ──────────────────────────────────────
@@ -409,21 +222,28 @@ if [ -z "${STAGE_S3_BUCKET}" ]; then
409
222
  exit 1
410
223
  fi
411
224
 
412
- # Target S3 path for staged model
413
- MODEL_S3_URI="s3://${STAGE_S3_BUCKET}/models/${PROJECT_NAME}/"
225
+ # Target S3 path for staged model: s3://{bucket}/{project}/models/{model-slug}/
226
+ # Sanitize MODEL_NAME for use as an S3 path segment:
227
+ # - Replace / with - (e.g., "nvidia/Nemotron-3-Ultra..." → "nvidia-Nemotron-3-Ultra...")
228
+ # - This prevents HF org/repo IDs from creating nested S3 prefixes
229
+ MODEL_SLUG="${MODEL_NAME//\//-}"
230
+ MODEL_S3_URI="s3://${STAGE_S3_BUCKET}/${PROJECT_NAME}/models/${MODEL_SLUG}/"
414
231
 
415
232
  echo " Target: ${MODEL_S3_URI}"
416
233
  echo ""
417
234
 
418
- # ── Submit mode: SageMaker Processing Job ─────────────────────────────────────
419
- # For very large models (>500GB) that exceed local disk, submit a Processing Job
420
- # with 2TB attached storage. The job downloads from HuggingFace and syncs to S3.
421
- if [ "${SUBMIT_MODE}" = true ]; then
235
+ # ── Default mode: SageMaker Processing Job via .stage_helper.py ───────────────
236
+ # Submits a Processing Job that downloads model weights from HuggingFace and
237
+ # syncs to S3 directly — no local disk usage. Uses sagemaker-core SDK v3.
238
+ if [ "${LOCAL_MODE}" = false ]; then
422
239
  _submit_processing_job
423
240
  exit $?
424
241
  fi
425
242
 
426
- # ── Idempotency: check if model is already staged ────────────────────────────
243
+ # ── Local mode: download locally then sync to S3 (--local flag) ───────────────
244
+ # Preserved for offline work, debugging, or when Processing Jobs are unavailable.
245
+
246
+ # Idempotency: check if model is already staged
427
247
  if [ "${FORCE}" = false ]; then
428
248
  if aws s3 ls "${MODEL_S3_URI}config.json" --region "${AWS_REGION}" &>/dev/null; then
429
249
  echo "✅ Model already staged at: ${MODEL_S3_URI}"
@@ -433,7 +253,7 @@ if [ "${FORCE}" = false ]; then
433
253
  CONFIG_FILE="${SCRIPT_DIR}/config"
434
254
  sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
435
255
  rm -f "${CONFIG_FILE}.bak"
436
- echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
256
+ echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
437
257
  else
438
258
  echo " To use this staged model, set in do/config:"
439
259
  echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
@@ -442,7 +262,7 @@ if [ "${FORCE}" = false ]; then
442
262
  fi
443
263
  fi
444
264
 
445
- # ── Validate prerequisites ───────────────────────────────────────────────────
265
+ # Validate prerequisites
446
266
  if ! command -v huggingface-cli &>/dev/null; then
447
267
  echo "❌ huggingface-cli is not installed"
448
268
  echo " Install: pip install huggingface_hub[cli] hf_transfer"
@@ -474,13 +294,21 @@ fi
474
294
 
475
295
  # ── Download model from HuggingFace ──────────────────────────────────────────
476
296
  echo "⬇️ Downloading model from HuggingFace: ${MODEL_NAME}"
477
- echo " Using hf_transfer for fast parallel downloads..."
297
+ if python3 -c "import hf_transfer" 2>/dev/null; then
298
+ echo " Using hf_transfer for fast parallel downloads..."
299
+ else
300
+ echo " Using standard downloads (install hf_transfer for faster staging)..."
301
+ fi
478
302
  echo ""
479
303
 
480
- # Enable fast parallel downloads via hf_transfer
481
- export HF_HUB_ENABLE_HF_TRANSFER=1
304
+ # Enable fast parallel downloads via hf_transfer (if available)
305
+ if python3 -c "import hf_transfer" 2>/dev/null; then
306
+ export HF_HUB_ENABLE_HF_TRANSFER=1
307
+ else
308
+ unset HF_HUB_ENABLE_HF_TRANSFER 2>/dev/null || true
309
+ fi
482
310
 
483
- # Download to HF cache (huggingface-cli manages cache location)
311
+ # Download to HF cache
484
312
  DOWNLOAD_ARGS=("${MODEL_NAME}")
485
313
  if [ -n "${HF_TOKEN:-}" ]; then
486
314
  DOWNLOAD_ARGS+=("--token" "${HF_TOKEN}")
@@ -555,15 +383,9 @@ if [ "${UPDATE_CONFIG}" = true ]; then
555
383
  CONFIG_FILE="${SCRIPT_DIR}/config"
556
384
  sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
557
385
  rm -f "${CONFIG_FILE}.bak"
558
- echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
559
- echo ""
560
- echo " Re-deploy with S3-backed model: ./do/deploy"
386
+ echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
387
+ echo " Subsequent tasks (submit, deploy) will pull from S3."
561
388
  else
562
389
  echo " To use this staged model, update do/config:"
563
390
  echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
564
- echo ""
565
- echo " Or re-run with --update-config to do it automatically:"
566
- echo " ./do/stage --update-config"
567
- echo ""
568
- echo " Then re-deploy: ./do/deploy"
569
391
  fi
@@ -12,10 +12,7 @@ source "${SCRIPT_DIR}/config"
12
12
  source "${SCRIPT_DIR}/lib/profile.sh"
13
13
 
14
14
  # ── Profile-resolved variables (env var > profile > default) ──────────────────
15
- # Disable unbound-variable checking for associative array access (bash 3.2 compat)
16
- set +u
17
- ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
18
- set -u
15
+ ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
19
16
 
20
17
  # ── Derived variables (env var > computed default) ────────────────────────────
21
18
  CODEBUILD_PROJECT_NAME="${CODEBUILD_PROJECT_NAME:-${PROJECT_NAME}-build-$(date +%Y%m%d)}"