@aws/ml-container-creator 0.13.3 → 0.13.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -5
- package/infra/ci-harness/package-lock.json +1 -5
- package/package.json +5 -3
- package/pyproject.toml +21 -0
- package/requirements.txt +19 -0
- package/servers/instance-sizer/lib/model-resolver.js +127 -185
- package/servers/instance-sizer/lib/vram-estimator.js +86 -0
- package/servers/lib/catalogs/instances.json +0 -27
- package/src/app.js +2 -0
- package/src/lib/bootstrap-command-handler.js +35 -25
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/prompt-runner.js +14 -31
- package/templates/IAM_PERMISSIONS.md +64 -13
- package/templates/do/.adapter_helper.py +451 -0
- package/templates/do/.benchmark_writer.py +13 -0
- package/templates/do/.stage_helper.py +419 -0
- package/templates/do/.tune_helper.py +218 -67
- package/templates/do/README.md +50 -604
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +109 -4
- package/templates/do/benchmark +150 -12
- package/templates/do/build +2 -5
- package/templates/do/clean.d/async-inference.ejs +2 -5
- package/templates/do/clean.d/batch-transform.ejs +2 -5
- package/templates/do/clean.d/hyperpod-eks.ejs +2 -5
- package/templates/do/clean.d/managed-inference.ejs +2 -5
- package/templates/do/config +4 -0
- package/templates/do/deploy.d/async-inference.ejs +6 -9
- package/templates/do/deploy.d/batch-transform.ejs +4 -7
- package/templates/do/deploy.d/hyperpod-eks.ejs +1 -4
- package/templates/do/deploy.d/managed-inference.ejs +15 -6
- package/templates/do/lib/profile.sh +24 -15
- package/templates/do/push +2 -5
- package/templates/do/register +2 -5
- package/templates/do/stage +114 -292
- package/templates/do/submit +1 -4
- package/templates/do/tune +64 -10
- package/templates/MIGRATION.md +0 -488
- package/templates/TEMPLATE_SYSTEM.md +0 -243
|
@@ -1,21 +1,27 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
|
-
# Profile loader — reads active bootstrap profile into
|
|
2
|
+
# Profile loader — reads active bootstrap profile into _PROFILE_<key> variables.
|
|
3
3
|
# Source this file after do/config. Values provide defaults; explicit env vars take precedence.
|
|
4
4
|
#
|
|
5
|
-
#
|
|
6
|
-
#
|
|
5
|
+
# Portable across bash versions: works on bash 3.2+ (macOS default) and bash 4+/5+.
|
|
6
|
+
# No associative arrays required.
|
|
7
7
|
#
|
|
8
|
-
#
|
|
8
|
+
# After sourcing, access values via:
|
|
9
|
+
# ${_PROFILE_roleArn:-}
|
|
10
|
+
# ${_PROFILE_ecrRepositoryName:-ml-container-creator}
|
|
11
|
+
# ${_PROFILE_awsRegion:-us-east-1}
|
|
12
|
+
# ${_PROFILE_accountId:-}
|
|
13
|
+
# ${_PROFILE_benchmarkS3Bucket:-}
|
|
14
|
+
# ${_PROFILE_asyncS3Bucket:-}
|
|
15
|
+
# ${_PROFILE_batchS3Bucket:-}
|
|
16
|
+
#
|
|
17
|
+
# Expected keys (set as _PROFILE_<key>):
|
|
9
18
|
# awsRegion, accountId, awsProfile, roleArn, ecrRepositoryName,
|
|
10
19
|
# benchmarkS3Bucket, ciBenchmarkResultsBucket, asyncS3Bucket, batchS3Bucket,
|
|
11
20
|
# ciTableName, ciInfraProvisioned
|
|
12
21
|
|
|
13
22
|
# Disable unbound variable checking for profile loading (not re-enabled; see NOTE at end of file)
|
|
14
|
-
# (keys may not exist in the profile config, and declare -A behavior
|
|
15
|
-
# varies across bash versions with set -u)
|
|
16
23
|
set +u 2>/dev/null || true
|
|
17
24
|
|
|
18
|
-
declare -A _PROFILE 2>/dev/null || true
|
|
19
25
|
if command -v python3 &>/dev/null; then
|
|
20
26
|
_PROFILE_RAW=$(python3 -c "
|
|
21
27
|
import json, os
|
|
@@ -23,22 +29,25 @@ try:
|
|
|
23
29
|
with open(os.path.expanduser('~/.ml-container-creator/config.json')) as f:
|
|
24
30
|
c = json.load(f)
|
|
25
31
|
p = c['profiles'][c['activeProfile']]
|
|
26
|
-
# Output as
|
|
32
|
+
# Output as _PROFILE_KEY=VALUE lines for eval. Key names are restricted below; values are NOT shell-escaped, so the profile config file must be trusted
|
|
27
33
|
for k, v in p.items():
|
|
28
34
|
if isinstance(v, (str, int, float, bool)):
|
|
29
|
-
|
|
35
|
+
# Sanitize: only allow key names made of alphanumeric characters and underscores
|
|
36
|
+
if k.isalnum() or all(c.isalnum() or c == '_' for c in k):
|
|
37
|
+
print(f'_PROFILE_{k}=\"{v}\"')
|
|
30
38
|
except:
|
|
31
39
|
pass
|
|
32
40
|
" 2>/dev/null) || _PROFILE_RAW=""
|
|
33
41
|
|
|
34
42
|
if [ -n "${_PROFILE_RAW}" ]; then
|
|
35
|
-
|
|
36
|
-
[ -n "${key}" ] && _PROFILE["${key}"]="${value}"
|
|
37
|
-
done <<< "${_PROFILE_RAW}"
|
|
43
|
+
eval "${_PROFILE_RAW}"
|
|
38
44
|
fi
|
|
39
45
|
fi
|
|
40
46
|
|
|
47
|
+
# Map commonly-used profile values to the variable names scripts expect.
|
|
48
|
+
# Explicit env vars take precedence (${X:-...} pattern).
|
|
49
|
+
ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
|
|
50
|
+
CI_BENCHMARK_RESULTS_BUCKET="${CI_BENCHMARK_RESULTS_BUCKET:-${_PROFILE_ciBenchmarkResultsBucket:-}}"
|
|
51
|
+
|
|
41
52
|
# NOTE: set -u is NOT re-enabled here. The caller is responsible for managing
|
|
42
|
-
# their own shell options.
|
|
43
|
-
# errors when accessing _PROFILE keys on bash versions where empty associative
|
|
44
|
-
# arrays are treated as unset (bash 5.x on some platforms).
|
|
53
|
+
# their own shell options.
|
package/templates/do/push
CHANGED
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
19
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
16
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
20
17
|
|
|
21
18
|
echo "🚀 Pushing Docker image to Amazon ECR"
|
|
22
19
|
echo " Project: ${PROJECT_NAME}"
|
package/templates/do/register
CHANGED
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ROLE_ARN="${ROLE_ARN:-${_PROFILE[roleArn]:-}}"
|
|
18
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
19
|
-
set -u
|
|
15
|
+
ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
|
|
16
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
20
17
|
|
|
21
18
|
# ============================================================
|
|
22
19
|
# Register deployment to the deployment registry
|
package/templates/do/stage
CHANGED
|
@@ -3,18 +3,17 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
# do/stage — Pre-stage model weights from HuggingFace to S3
|
|
6
|
-
#
|
|
7
|
-
#
|
|
6
|
+
# Submits a SageMaker Processing Job that downloads from HuggingFace
|
|
7
|
+
# and writes directly to S3 — no local disk usage.
|
|
8
8
|
#
|
|
9
9
|
# Idempotent: if the model is already staged (config.json exists at
|
|
10
10
|
# the target S3 path), the script exits early.
|
|
11
11
|
#
|
|
12
12
|
# Usage:
|
|
13
|
-
# ./do/stage
|
|
13
|
+
# ./do/stage Submit Processing Job to stage model (default)
|
|
14
|
+
# ./do/stage --local Download locally then sync to S3
|
|
15
|
+
# ./do/stage --no-wait Submit and exit without polling
|
|
14
16
|
# ./do/stage --force Re-stage even if already present in S3
|
|
15
|
-
# ./do/stage --update-config Stage and update MODEL_NAME in do/config
|
|
16
|
-
# ./do/stage --submit Submit as SageMaker Processing Job (for models >500GB)
|
|
17
|
-
# ./do/stage --submit --no-wait Submit and exit without polling
|
|
18
17
|
|
|
19
18
|
set -e
|
|
20
19
|
set -u
|
|
@@ -28,47 +27,46 @@ source "${SCRIPT_DIR}/lib/staged-assets.sh"
|
|
|
28
27
|
|
|
29
28
|
# ── Parse flags ───────────────────────────────────────────────────────────────
|
|
30
29
|
FORCE=false
|
|
31
|
-
UPDATE_CONFIG=
|
|
32
|
-
|
|
30
|
+
UPDATE_CONFIG=true
|
|
31
|
+
LOCAL_MODE=false
|
|
33
32
|
NO_WAIT=false
|
|
34
33
|
while [ $# -gt 0 ]; do
|
|
35
34
|
case "$1" in
|
|
36
35
|
--force) FORCE=true; shift ;;
|
|
37
|
-
--update-config) UPDATE_CONFIG=true; shift ;;
|
|
38
|
-
--
|
|
36
|
+
--update-config) UPDATE_CONFIG=true; shift ;; # default, kept for backward compat
|
|
37
|
+
--no-update-config) UPDATE_CONFIG=false; shift ;;
|
|
38
|
+
--local) LOCAL_MODE=true; shift ;;
|
|
39
|
+
--submit) shift ;; # Deprecated — now the default; kept for backward compat
|
|
39
40
|
--no-wait) NO_WAIT=true; shift ;;
|
|
40
41
|
--help|-h)
|
|
41
|
-
echo "Usage: ./do/stage [--force] [--
|
|
42
|
+
echo "Usage: ./do/stage [--force] [--local] [--no-wait] [--no-update-config]"
|
|
42
43
|
echo ""
|
|
43
44
|
echo "Pre-stage model weights from HuggingFace to S3."
|
|
45
|
+
echo "On success, updates MODEL_NAME in do/config so subsequent tasks"
|
|
46
|
+
echo "(submit, deploy) pull from S3 with HuggingFace as fallback."
|
|
44
47
|
echo ""
|
|
45
48
|
echo "Modes:"
|
|
46
|
-
echo " (default)
|
|
47
|
-
echo " --
|
|
49
|
+
echo " (default) Submit SageMaker Processing Job (no local disk usage)"
|
|
50
|
+
echo " --local Download locally then sync to S3 (legacy behavior)"
|
|
51
|
+
echo " --submit Deprecated — Processing Job is now the default"
|
|
48
52
|
echo ""
|
|
49
53
|
echo "Options:"
|
|
50
|
-
echo " --force
|
|
51
|
-
echo " --update-config
|
|
52
|
-
echo " --no-wait
|
|
54
|
+
echo " --force Re-stage even if model already exists in S3"
|
|
55
|
+
echo " --no-update-config Do NOT update MODEL_NAME in do/config after staging"
|
|
56
|
+
echo " --no-wait Return immediately with job name (Processing Job mode)"
|
|
53
57
|
echo ""
|
|
54
58
|
echo "Environment:"
|
|
55
59
|
echo " HF_TOKEN HuggingFace token (for gated models)"
|
|
56
60
|
echo ""
|
|
57
61
|
echo "The staged S3 URI will be printed on completion."
|
|
58
|
-
echo "
|
|
59
|
-
echo ""
|
|
60
|
-
echo "The --submit mode uses a SageMaker Processing Job with 2TB attached"
|
|
61
|
-
echo "storage, suitable for very large models that exceed local disk capacity."
|
|
62
|
+
echo "MODEL_NAME in do/config is updated automatically unless --no-update-config is passed."
|
|
62
63
|
exit 0
|
|
63
64
|
;;
|
|
64
65
|
*) shift ;;
|
|
65
66
|
esac
|
|
66
67
|
done
|
|
67
68
|
|
|
68
|
-
# ── Processing Job submission
|
|
69
|
-
# Submits a SageMaker Processing Job that downloads model weights from HuggingFace
|
|
70
|
-
# and syncs them to S3. Uses 2TB attached storage to handle any model size.
|
|
71
|
-
POLL_INTERVAL=30
|
|
69
|
+
# ── Processing Job submission via .stage_helper.py ────────────────────────────
|
|
72
70
|
PROCESSING_JOB_INSTANCE_TYPE="ml.m5.xlarge"
|
|
73
71
|
PROCESSING_JOB_VOLUME_GB=2048
|
|
74
72
|
|
|
@@ -80,19 +78,12 @@ _submit_processing_job() {
|
|
|
80
78
|
echo " Storage: ${PROCESSING_JOB_VOLUME_GB} GB"
|
|
81
79
|
echo ""
|
|
82
80
|
|
|
83
|
-
# Validate AWS credentials
|
|
84
|
-
if ! aws sts get-caller-identity &>/dev/null; then
|
|
85
|
-
echo "❌ AWS credentials not configured or expired."
|
|
86
|
-
echo " Run: aws configure"
|
|
87
|
-
exit 4
|
|
88
|
-
fi
|
|
89
|
-
|
|
90
81
|
# Resolve execution role from profile
|
|
91
82
|
local execution_role
|
|
92
83
|
execution_role=$(echo "${_PROFILE_JSON}" | python3 -c "
|
|
93
84
|
import sys, json
|
|
94
85
|
p = json.load(sys.stdin)
|
|
95
|
-
print(p.get('
|
|
86
|
+
print(p.get('roleArn', ''))
|
|
96
87
|
" 2>/dev/null) || execution_role=""
|
|
97
88
|
|
|
98
89
|
if [ -z "${execution_role}" ]; then
|
|
@@ -102,266 +93,88 @@ print(p.get('executionRoleArn', ''))
|
|
|
102
93
|
exit 1
|
|
103
94
|
fi
|
|
104
95
|
|
|
105
|
-
# Resolve HF token
|
|
96
|
+
# Resolve HF token (optional — for gated models)
|
|
97
|
+
local hf_token_value=""
|
|
106
98
|
local hf_token_secret_arn="${HF_TOKEN_ARN:-}"
|
|
107
|
-
|
|
108
|
-
# Generate job name with timestamp
|
|
109
|
-
local timestamp
|
|
110
|
-
timestamp=$(date +%Y%m%d-%H%M%S)
|
|
111
|
-
local job_name="mlcc-stage-${PROJECT_NAME}-${timestamp}"
|
|
112
|
-
# SageMaker job names max 63 chars, must match [a-zA-Z0-9](-*[a-zA-Z0-9])*
|
|
113
|
-
job_name=$(echo "${job_name}" | cut -c1-63 | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/-*$//')
|
|
114
|
-
|
|
115
|
-
echo " Job name: ${job_name}"
|
|
116
|
-
echo ""
|
|
117
|
-
|
|
118
|
-
# Build the entrypoint script that runs inside the processing container
|
|
119
|
-
local entrypoint_script
|
|
120
|
-
entrypoint_script=$(cat <<'ENTRYPOINT_EOF'
|
|
121
|
-
#!/bin/bash
|
|
122
|
-
set -e
|
|
123
|
-
set -o pipefail
|
|
124
|
-
|
|
125
|
-
echo "=== MCC Model Staging Processing Job ==="
|
|
126
|
-
echo "Model: ${MODEL_ID}"
|
|
127
|
-
echo "Target: ${S3_OUTPUT_URI}"
|
|
128
|
-
echo ""
|
|
129
|
-
|
|
130
|
-
# Install dependencies
|
|
131
|
-
echo "📦 Installing huggingface-cli and hf_transfer..."
|
|
132
|
-
pip install -q huggingface_hub[cli] hf_transfer
|
|
133
|
-
|
|
134
|
-
# Enable fast parallel downloads
|
|
135
|
-
export HF_HUB_ENABLE_HF_TRANSFER=1
|
|
136
|
-
|
|
137
|
-
# Set HF token if provided
|
|
138
|
-
if [ -n "${HF_TOKEN:-}" ]; then
|
|
139
|
-
echo "🔐 Using provided HuggingFace token"
|
|
140
|
-
fi
|
|
141
|
-
|
|
142
|
-
# Download model from HuggingFace
|
|
143
|
-
echo ""
|
|
144
|
-
echo "⬇️ Downloading model: ${MODEL_ID}"
|
|
145
|
-
DOWNLOAD_ARGS="${MODEL_ID}"
|
|
146
|
-
if [ -n "${HF_TOKEN:-}" ]; then
|
|
147
|
-
DOWNLOAD_ARGS="${DOWNLOAD_ARGS} --token ${HF_TOKEN}"
|
|
148
|
-
fi
|
|
149
|
-
huggingface-cli download ${DOWNLOAD_ARGS}
|
|
150
|
-
|
|
151
|
-
echo ""
|
|
152
|
-
echo "✅ Download complete"
|
|
153
|
-
|
|
154
|
-
# Locate downloaded files
|
|
155
|
-
CACHE_PATH=$(python3 -c "
|
|
156
|
-
from huggingface_hub import snapshot_download
|
|
157
|
-
path = snapshot_download('${MODEL_ID}', local_files_only=True)
|
|
158
|
-
print(path)
|
|
159
|
-
")
|
|
160
|
-
|
|
161
|
-
echo "📁 Cache path: ${CACHE_PATH}"
|
|
162
|
-
|
|
163
|
-
# Sync to S3
|
|
164
|
-
echo ""
|
|
165
|
-
echo "☁️ Syncing to S3: ${S3_OUTPUT_URI}"
|
|
166
|
-
aws s3 sync "${CACHE_PATH}" "${S3_OUTPUT_URI}" \
|
|
167
|
-
--no-progress \
|
|
168
|
-
--exclude "*.lock" \
|
|
169
|
-
--exclude ".gitattributes"
|
|
170
|
-
|
|
171
|
-
echo ""
|
|
172
|
-
echo "✅ Model staged successfully to: ${S3_OUTPUT_URI}"
|
|
173
|
-
ENTRYPOINT_EOF
|
|
174
|
-
)
|
|
175
|
-
|
|
176
|
-
# Build environment variables for the container
|
|
177
|
-
local env_vars="MODEL_ID=${MODEL_NAME},S3_OUTPUT_URI=${MODEL_S3_URI}"
|
|
178
99
|
if [ -n "${hf_token_secret_arn}" ]; then
|
|
179
|
-
# Resolve token and pass as env var to the job
|
|
180
|
-
local hf_token_value=""
|
|
181
100
|
hf_token_value=$(aws secretsmanager get-secret-value \
|
|
182
101
|
--secret-id "${hf_token_secret_arn}" \
|
|
183
102
|
--query SecretString --output text 2>/dev/null) || hf_token_value=""
|
|
184
|
-
if [ -n "${hf_token_value}" ]; then
|
|
185
|
-
env_vars="${env_vars},HF_TOKEN=${hf_token_value}"
|
|
186
|
-
fi
|
|
187
103
|
elif [ -n "${HF_TOKEN:-}" ]; then
|
|
188
|
-
|
|
104
|
+
hf_token_value="${HF_TOKEN}"
|
|
189
105
|
fi
|
|
190
106
|
|
|
191
|
-
#
|
|
192
|
-
local
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
'InstanceType': '${PROCESSING_JOB_INSTANCE_TYPE}',
|
|
212
|
-
'VolumeSizeInGB': ${PROCESSING_JOB_VOLUME_GB}
|
|
213
|
-
}
|
|
214
|
-
},
|
|
215
|
-
'AppSpecification': {
|
|
216
|
-
'ImageUri': '${container_image}',
|
|
217
|
-
'ContainerEntrypoint': ['bash', '-c'],
|
|
218
|
-
'ContainerArguments': ['aws s3 cp ${entrypoint_s3_uri} /tmp/entrypoint.sh && chmod +x /tmp/entrypoint.sh && /tmp/entrypoint.sh']
|
|
219
|
-
},
|
|
220
|
-
'Environment': dict(item.split('=', 1) for item in '${env_vars}'.split(',')),
|
|
221
|
-
'RoleArn': '${execution_role}',
|
|
222
|
-
'StoppingCondition': {
|
|
223
|
-
'MaxRuntimeInSeconds': 86400
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
print(json.dumps(job, indent=2))
|
|
228
|
-
")
|
|
229
|
-
|
|
230
|
-
# Write request JSON to temp file
|
|
231
|
-
local request_file="/tmp/mlcc-stage-request-${timestamp}.json"
|
|
232
|
-
echo "${processing_request}" > "${request_file}"
|
|
233
|
-
|
|
234
|
-
echo "🚀 Creating Processing Job: ${job_name}"
|
|
235
|
-
echo ""
|
|
236
|
-
|
|
237
|
-
local create_output
|
|
238
|
-
local create_exit_code
|
|
239
|
-
create_output=$(aws sagemaker create-processing-job \
|
|
240
|
-
--cli-input-json "file://${request_file}" \
|
|
241
|
-
--region "${AWS_REGION}" 2>&1) || create_exit_code=$?
|
|
242
|
-
create_exit_code=${create_exit_code:-0}
|
|
107
|
+
# Build helper arguments
|
|
108
|
+
local helper_args=(
|
|
109
|
+
submit
|
|
110
|
+
--model-name "${MODEL_NAME}"
|
|
111
|
+
--bucket "${STAGE_S3_BUCKET}"
|
|
112
|
+
--project "${PROJECT_NAME}"
|
|
113
|
+
--role-arn "${execution_role}"
|
|
114
|
+
--region "${AWS_REGION}"
|
|
115
|
+
--instance-type "${PROCESSING_JOB_INSTANCE_TYPE}"
|
|
116
|
+
--volume-size-gb "${PROCESSING_JOB_VOLUME_GB}"
|
|
117
|
+
)
|
|
118
|
+
if [ -n "${hf_token_value}" ]; then
|
|
119
|
+
helper_args+=(--hf-token "${hf_token_value}")
|
|
120
|
+
fi
|
|
121
|
+
if [ "${FORCE}" = true ]; then
|
|
122
|
+
helper_args+=(--force)
|
|
123
|
+
fi
|
|
124
|
+
if [ "${NO_WAIT}" = true ]; then
|
|
125
|
+
helper_args+=(--no-wait)
|
|
126
|
+
fi
|
|
243
127
|
|
|
244
|
-
|
|
128
|
+
# Call .stage_helper.py (sagemaker-core ProcessingJob.create())
|
|
129
|
+
# stdout = JSON result (captured), stderr = progress messages (not captured; shown directly to the user)
|
|
130
|
+
local json_output
|
|
131
|
+
local helper_exit_code=0
|
|
132
|
+
json_output=$(python3 "${SCRIPT_DIR}/.stage_helper.py" "${helper_args[@]}") || helper_exit_code=$?
|
|
245
133
|
|
|
246
|
-
if [ ${
|
|
247
|
-
echo "❌ Failed to create Processing Job"
|
|
248
|
-
echo " ${create_output}"
|
|
134
|
+
if [ ${helper_exit_code} -ne 0 ]; then
|
|
249
135
|
echo ""
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
exit 1
|
|
136
|
+
echo "❌ Processing Job failed"
|
|
137
|
+
echo " To retry: ./do/stage --force"
|
|
138
|
+
exit ${helper_exit_code}
|
|
254
139
|
fi
|
|
255
140
|
|
|
256
|
-
|
|
257
|
-
|
|
141
|
+
# Parse JSON output
|
|
142
|
+
local job_status
|
|
143
|
+
local job_name
|
|
144
|
+
local s3_uri
|
|
145
|
+
job_status=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null) || job_status=""
|
|
146
|
+
job_name=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('job_name',''))" 2>/dev/null) || job_name=""
|
|
147
|
+
s3_uri=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('s3_uri',''))" 2>/dev/null) || s3_uri="${MODEL_S3_URI}"
|
|
258
148
|
|
|
259
|
-
|
|
260
|
-
|
|
149
|
+
if [ "${job_status}" = "AlreadyStaged" ]; then
|
|
150
|
+
echo "✅ Model already staged at: ${s3_uri}"
|
|
151
|
+
echo " Use --force to re-stage."
|
|
152
|
+
elif [ "${job_status}" = "Submitted" ]; then
|
|
153
|
+
echo " ✅ Processing Job submitted: ${job_name}"
|
|
154
|
+
echo ""
|
|
261
155
|
echo " --no-wait specified. Job submitted, exiting without polling."
|
|
262
156
|
echo ""
|
|
263
157
|
echo " Check status:"
|
|
264
|
-
echo "
|
|
158
|
+
echo " python3 ${SCRIPT_DIR}/.stage_helper.py status --job-name ${job_name}"
|
|
265
159
|
echo ""
|
|
266
160
|
echo " On completion, the staged model will be at:"
|
|
267
|
-
echo " ${
|
|
268
|
-
|
|
161
|
+
echo " ${s3_uri}"
|
|
162
|
+
elif [ "${job_status}" = "Completed" ]; then
|
|
163
|
+
echo ""
|
|
164
|
+
echo "✅ Processing Job completed: ${job_name}"
|
|
165
|
+
echo ""
|
|
166
|
+
echo " S3 URI: ${s3_uri}"
|
|
269
167
|
fi
|
|
270
168
|
|
|
271
|
-
#
|
|
272
|
-
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
echo " (Ctrl+C to stop polling — job continues in background)"
|
|
281
|
-
echo ""
|
|
282
|
-
|
|
283
|
-
while true; do
|
|
284
|
-
local describe_output
|
|
285
|
-
local describe_exit_code
|
|
286
|
-
describe_output=$(aws sagemaker describe-processing-job \
|
|
287
|
-
--processing-job-name "${job_name}" \
|
|
288
|
-
--region "${AWS_REGION}" 2>&1) || describe_exit_code=$?
|
|
289
|
-
describe_exit_code=${describe_exit_code:-0}
|
|
290
|
-
|
|
291
|
-
if [ ${describe_exit_code} -ne 0 ]; then
|
|
292
|
-
echo " ⚠️ Failed to describe job (will retry): ${describe_output}"
|
|
293
|
-
sleep "${POLL_INTERVAL}"
|
|
294
|
-
continue
|
|
295
|
-
fi
|
|
296
|
-
|
|
297
|
-
# Parse status from response
|
|
298
|
-
local job_status
|
|
299
|
-
local failure_reason
|
|
300
|
-
job_status=$(echo "${describe_output}" | python3 -c "
|
|
301
|
-
import sys, json
|
|
302
|
-
d = json.load(sys.stdin)
|
|
303
|
-
print(d.get('ProcessingJobStatus', 'Unknown'))
|
|
304
|
-
" 2>/dev/null) || job_status="Unknown"
|
|
305
|
-
|
|
306
|
-
failure_reason=$(echo "${describe_output}" | python3 -c "
|
|
307
|
-
import sys, json
|
|
308
|
-
d = json.load(sys.stdin)
|
|
309
|
-
print(d.get('FailureReason', ''))
|
|
310
|
-
" 2>/dev/null) || failure_reason=""
|
|
311
|
-
|
|
312
|
-
# Print status
|
|
313
|
-
local now
|
|
314
|
-
now=$(date +%H:%M:%S)
|
|
315
|
-
echo " [${now}] Status: ${job_status}"
|
|
316
|
-
|
|
317
|
-
# Handle terminal states
|
|
318
|
-
case "${job_status}" in
|
|
319
|
-
Completed)
|
|
320
|
-
echo ""
|
|
321
|
-
echo "✅ Processing Job completed: ${job_name}"
|
|
322
|
-
echo ""
|
|
323
|
-
echo " S3 URI: ${MODEL_S3_URI}"
|
|
324
|
-
echo ""
|
|
325
|
-
if [ "${UPDATE_CONFIG}" = true ]; then
|
|
326
|
-
CONFIG_FILE="${SCRIPT_DIR}/config"
|
|
327
|
-
sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
|
|
328
|
-
rm -f "${CONFIG_FILE}.bak"
|
|
329
|
-
echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
|
|
330
|
-
echo ""
|
|
331
|
-
echo " Re-deploy with S3-backed model: ./do/deploy"
|
|
332
|
-
else
|
|
333
|
-
echo " To use this staged model, update do/config:"
|
|
334
|
-
echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
|
|
335
|
-
echo ""
|
|
336
|
-
echo " Or re-run with --update-config:"
|
|
337
|
-
echo " ./do/stage --submit --update-config"
|
|
338
|
-
fi
|
|
339
|
-
return 0
|
|
340
|
-
;;
|
|
341
|
-
Failed)
|
|
342
|
-
echo ""
|
|
343
|
-
echo "❌ Processing Job failed: ${job_name}"
|
|
344
|
-
if [ -n "${failure_reason}" ]; then
|
|
345
|
-
echo " Reason: ${failure_reason}"
|
|
346
|
-
fi
|
|
347
|
-
echo ""
|
|
348
|
-
echo " Check CloudWatch logs:"
|
|
349
|
-
echo " /aws/sagemaker/ProcessingJobs/${job_name}"
|
|
350
|
-
echo ""
|
|
351
|
-
echo " To retry: ./do/stage --submit --force"
|
|
352
|
-
return 1
|
|
353
|
-
;;
|
|
354
|
-
Stopped)
|
|
355
|
-
echo ""
|
|
356
|
-
echo "⏹️ Processing Job was stopped: ${job_name}"
|
|
357
|
-
echo ""
|
|
358
|
-
echo " To retry: ./do/stage --submit --force"
|
|
359
|
-
return 2
|
|
360
|
-
;;
|
|
361
|
-
esac
|
|
362
|
-
|
|
363
|
-
sleep "${POLL_INTERVAL}"
|
|
364
|
-
done
|
|
169
|
+
# Update config if requested, an S3 URI is available, and the job is not merely Submitted (i.e. still pending)
|
|
170
|
+
if [ "${UPDATE_CONFIG}" = true ] && [ -n "${s3_uri}" ] && [ "${job_status}" != "Submitted" ]; then
|
|
171
|
+
CONFIG_FILE="${SCRIPT_DIR}/config"
|
|
172
|
+
sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${s3_uri}\"|" "${CONFIG_FILE}"
|
|
173
|
+
rm -f "${CONFIG_FILE}.bak"
|
|
174
|
+
echo ""
|
|
175
|
+
echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
|
|
176
|
+
echo " Subsequent tasks (submit, deploy) will pull from S3."
|
|
177
|
+
fi
|
|
365
178
|
}
|
|
366
179
|
|
|
367
180
|
# ── Check if model is already an S3 URI ──────────────────────────────────────
|
|
@@ -409,21 +222,28 @@ if [ -z "${STAGE_S3_BUCKET}" ]; then
|
|
|
409
222
|
exit 1
|
|
410
223
|
fi
|
|
411
224
|
|
|
412
|
-
# Target S3 path for staged model
|
|
413
|
-
|
|
225
|
+
# Target S3 path for staged model: s3://{bucket}/{project}/models/{model-slug}/
|
|
226
|
+
# Sanitize MODEL_NAME for use as an S3 path segment:
|
|
227
|
+
# - Replace / with - (e.g., "nvidia/Nemotron-3-Ultra..." → "nvidia-Nemotron-3-Ultra...")
|
|
228
|
+
# - This prevents HF org/repo IDs from creating nested S3 prefixes
|
|
229
|
+
MODEL_SLUG="${MODEL_NAME//\//-}"
|
|
230
|
+
MODEL_S3_URI="s3://${STAGE_S3_BUCKET}/${PROJECT_NAME}/models/${MODEL_SLUG}/"
|
|
414
231
|
|
|
415
232
|
echo " Target: ${MODEL_S3_URI}"
|
|
416
233
|
echo ""
|
|
417
234
|
|
|
418
|
-
# ──
|
|
419
|
-
#
|
|
420
|
-
#
|
|
421
|
-
if [ "${
|
|
235
|
+
# ── Default mode: SageMaker Processing Job via .stage_helper.py ───────────────
|
|
236
|
+
# Submits a Processing Job that downloads model weights from HuggingFace and
|
|
237
|
+
# syncs to S3 directly — no local disk usage. Uses sagemaker-core SDK v3.
|
|
238
|
+
if [ "${LOCAL_MODE}" = false ]; then
|
|
422
239
|
_submit_processing_job
|
|
423
240
|
exit $?
|
|
424
241
|
fi
|
|
425
242
|
|
|
426
|
-
# ──
|
|
243
|
+
# ── Local mode: download locally then sync to S3 (--local flag) ───────────────
|
|
244
|
+
# Preserved for offline work, debugging, or when Processing Jobs are unavailable.
|
|
245
|
+
|
|
246
|
+
# Idempotency: check if model is already staged
|
|
427
247
|
if [ "${FORCE}" = false ]; then
|
|
428
248
|
if aws s3 ls "${MODEL_S3_URI}config.json" --region "${AWS_REGION}" &>/dev/null; then
|
|
429
249
|
echo "✅ Model already staged at: ${MODEL_S3_URI}"
|
|
@@ -433,7 +253,7 @@ if [ "${FORCE}" = false ]; then
|
|
|
433
253
|
CONFIG_FILE="${SCRIPT_DIR}/config"
|
|
434
254
|
sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
|
|
435
255
|
rm -f "${CONFIG_FILE}.bak"
|
|
436
|
-
echo " ✅ Updated MODEL_NAME in do/config →
|
|
256
|
+
echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
|
|
437
257
|
else
|
|
438
258
|
echo " To use this staged model, set in do/config:"
|
|
439
259
|
echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
|
|
@@ -442,7 +262,7 @@ if [ "${FORCE}" = false ]; then
|
|
|
442
262
|
fi
|
|
443
263
|
fi
|
|
444
264
|
|
|
445
|
-
#
|
|
265
|
+
# Validate prerequisites
|
|
446
266
|
if ! command -v huggingface-cli &>/dev/null; then
|
|
447
267
|
echo "❌ huggingface-cli is not installed"
|
|
448
268
|
echo " Install: pip install huggingface_hub[cli] hf_transfer"
|
|
@@ -474,13 +294,21 @@ fi
|
|
|
474
294
|
|
|
475
295
|
# ── Download model from HuggingFace ──────────────────────────────────────────
|
|
476
296
|
echo "⬇️ Downloading model from HuggingFace: ${MODEL_NAME}"
|
|
477
|
-
|
|
297
|
+
if python3 -c "import hf_transfer" 2>/dev/null; then
|
|
298
|
+
echo " Using hf_transfer for fast parallel downloads..."
|
|
299
|
+
else
|
|
300
|
+
echo " Using standard downloads (install hf_transfer for faster staging)..."
|
|
301
|
+
fi
|
|
478
302
|
echo ""
|
|
479
303
|
|
|
480
|
-
# Enable fast parallel downloads via hf_transfer
|
|
481
|
-
|
|
304
|
+
# Enable fast parallel downloads via hf_transfer (if available)
|
|
305
|
+
if python3 -c "import hf_transfer" 2>/dev/null; then
|
|
306
|
+
export HF_HUB_ENABLE_HF_TRANSFER=1
|
|
307
|
+
else
|
|
308
|
+
unset HF_HUB_ENABLE_HF_TRANSFER 2>/dev/null || true
|
|
309
|
+
fi
|
|
482
310
|
|
|
483
|
-
# Download to HF cache
|
|
311
|
+
# Download to HF cache
|
|
484
312
|
DOWNLOAD_ARGS=("${MODEL_NAME}")
|
|
485
313
|
if [ -n "${HF_TOKEN:-}" ]; then
|
|
486
314
|
DOWNLOAD_ARGS+=("--token" "${HF_TOKEN}")
|
|
@@ -555,15 +383,9 @@ if [ "${UPDATE_CONFIG}" = true ]; then
|
|
|
555
383
|
CONFIG_FILE="${SCRIPT_DIR}/config"
|
|
556
384
|
sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
|
|
557
385
|
rm -f "${CONFIG_FILE}.bak"
|
|
558
|
-
echo " ✅ Updated MODEL_NAME in do/config →
|
|
559
|
-
echo ""
|
|
560
|
-
echo " Re-deploy with S3-backed model: ./do/deploy"
|
|
386
|
+
echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
|
|
387
|
+
echo " Subsequent tasks (submit, deploy) will pull from S3."
|
|
561
388
|
else
|
|
562
389
|
echo " To use this staged model, update do/config:"
|
|
563
390
|
echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
|
|
564
|
-
echo ""
|
|
565
|
-
echo " Or re-run with --update-config to do it automatically:"
|
|
566
|
-
echo " ./do/stage --update-config"
|
|
567
|
-
echo ""
|
|
568
|
-
echo " Then re-deploy: ./do/deploy"
|
|
569
391
|
fi
|
package/templates/do/submit
CHANGED
|
@@ -12,10 +12,7 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
set +u
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
19
16
|
|
|
20
17
|
# ── Derived variables (env var > computed default) ────────────────────────────
|
|
21
18
|
CODEBUILD_PROJECT_NAME="${CODEBUILD_PROJECT_NAME:-${PROJECT_NAME}-build-$(date +%Y%m%d)}"
|