@aws/ml-container-creator 0.13.4 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -5
- package/config/parameter-schema-v2.json +32 -4
- package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
- package/infra/ci-harness/package-lock.json +122 -116
- package/infra/ci-harness/package.json +1 -1
- package/package.json +5 -3
- package/pyproject.toml +21 -0
- package/requirements.txt +19 -0
- package/servers/instance-sizer/index.js +72 -4
- package/servers/instance-sizer/lib/model-resolver.js +28 -2
- package/src/app.js +17 -0
- package/src/lib/bootstrap-command-handler.js +33 -23
- package/src/lib/config-loader.js +18 -0
- package/src/lib/config-manager.js +6 -1
- package/src/lib/dataset-slug.js +152 -0
- package/src/lib/generated/cli-options.js +9 -3
- package/src/lib/generated/parameter-matrix.js +14 -3
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/mcp-query-runner.js +6 -0
- package/src/lib/prompt-runner.js +5 -0
- package/src/lib/prompts/feature-prompts.js +1 -1
- package/src/lib/template-manager.js +0 -7
- package/src/lib/template-variable-resolver.js +51 -1
- package/src/lib/tune-config-state.js +14 -1
- package/templates/do/.adapter_helper.py +451 -0
- package/templates/do/.benchmark_writer.py +22 -0
- package/templates/do/.register_helper.py +1163 -0
- package/templates/do/.stage_helper.py +419 -0
- package/templates/do/.tune_helper.py +379 -65
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +427 -27
- package/templates/do/add-ic +85 -3
- package/templates/do/benchmark +173 -15
- package/templates/do/config +24 -0
- package/templates/do/lib/inference-component.sh +56 -3
- package/templates/do/lib/profile.sh +5 -0
- package/templates/do/register +552 -6
- package/templates/do/stage +91 -272
- package/templates/do/test +12 -2
- package/templates/do/tune +264 -12
package/templates/do/stage
CHANGED
|
@@ -3,18 +3,17 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
# do/stage — Pre-stage model weights from HuggingFace to S3
|
|
6
|
-
#
|
|
7
|
-
#
|
|
6
|
+
# Submits a SageMaker Processing Job that downloads from HuggingFace
|
|
7
|
+
# and writes directly to S3 — no local disk usage.
|
|
8
8
|
#
|
|
9
9
|
# Idempotent: if the model is already staged (config.json exists at
|
|
10
10
|
# the target S3 path), the script exits early.
|
|
11
11
|
#
|
|
12
12
|
# Usage:
|
|
13
|
-
# ./do/stage
|
|
13
|
+
# ./do/stage Submit Processing Job to stage model (default)
|
|
14
|
+
# ./do/stage --local Download locally then sync to S3
|
|
15
|
+
# ./do/stage --no-wait Submit and exit without polling
|
|
14
16
|
# ./do/stage --force Re-stage even if already present in S3
|
|
15
|
-
# ./do/stage --update-config Stage and update MODEL_NAME in do/config
|
|
16
|
-
# ./do/stage --submit Submit as SageMaker Processing Job (for models >500GB)
|
|
17
|
-
# ./do/stage --submit --no-wait Submit and exit without polling
|
|
18
17
|
|
|
19
18
|
set -e
|
|
20
19
|
set -u
|
|
@@ -29,30 +28,32 @@ source "${SCRIPT_DIR}/lib/staged-assets.sh"
|
|
|
29
28
|
# ── Parse flags ───────────────────────────────────────────────────────────────
|
|
30
29
|
FORCE=false
|
|
31
30
|
UPDATE_CONFIG=true
|
|
32
|
-
|
|
31
|
+
LOCAL_MODE=false
|
|
33
32
|
NO_WAIT=false
|
|
34
33
|
while [ $# -gt 0 ]; do
|
|
35
34
|
case "$1" in
|
|
36
35
|
--force) FORCE=true; shift ;;
|
|
37
36
|
--update-config) UPDATE_CONFIG=true; shift ;; # default, kept for backward compat
|
|
38
37
|
--no-update-config) UPDATE_CONFIG=false; shift ;;
|
|
39
|
-
--
|
|
38
|
+
--local) LOCAL_MODE=true; shift ;;
|
|
39
|
+
--submit) shift ;; # Deprecated — now the default; kept for backward compat
|
|
40
40
|
--no-wait) NO_WAIT=true; shift ;;
|
|
41
41
|
--help|-h)
|
|
42
|
-
echo "Usage: ./do/stage [--force] [--
|
|
42
|
+
echo "Usage: ./do/stage [--force] [--local] [--no-wait] [--no-update-config]"
|
|
43
43
|
echo ""
|
|
44
44
|
echo "Pre-stage model weights from HuggingFace to S3."
|
|
45
45
|
echo "On success, updates MODEL_NAME in do/config so subsequent tasks"
|
|
46
46
|
echo "(submit, deploy) pull from S3 with HuggingFace as fallback."
|
|
47
47
|
echo ""
|
|
48
48
|
echo "Modes:"
|
|
49
|
-
echo " (default)
|
|
50
|
-
echo " --
|
|
49
|
+
echo " (default) Submit SageMaker Processing Job (no local disk usage)"
|
|
50
|
+
echo " --local Download locally then sync to S3 (legacy behavior)"
|
|
51
|
+
echo " --submit Deprecated — Processing Job is now the default"
|
|
51
52
|
echo ""
|
|
52
53
|
echo "Options:"
|
|
53
54
|
echo " --force Re-stage even if model already exists in S3"
|
|
54
55
|
echo " --no-update-config Do NOT update MODEL_NAME in do/config after staging"
|
|
55
|
-
echo " --no-wait
|
|
56
|
+
echo " --no-wait Return immediately with job name (Processing Job mode)"
|
|
56
57
|
echo ""
|
|
57
58
|
echo "Environment:"
|
|
58
59
|
echo " HF_TOKEN HuggingFace token (for gated models)"
|
|
@@ -65,10 +66,7 @@ while [ $# -gt 0 ]; do
|
|
|
65
66
|
esac
|
|
66
67
|
done
|
|
67
68
|
|
|
68
|
-
# ── Processing Job submission
|
|
69
|
-
# Submits a SageMaker Processing Job that downloads model weights from HuggingFace
|
|
70
|
-
# and syncs them to S3. Uses 2TB attached storage to handle any model size.
|
|
71
|
-
POLL_INTERVAL=30
|
|
69
|
+
# ── Processing Job submission via .stage_helper.py ────────────────────────────
|
|
72
70
|
PROCESSING_JOB_INSTANCE_TYPE="ml.m5.xlarge"
|
|
73
71
|
PROCESSING_JOB_VOLUME_GB=2048
|
|
74
72
|
|
|
@@ -80,19 +78,12 @@ _submit_processing_job() {
|
|
|
80
78
|
echo " Storage: ${PROCESSING_JOB_VOLUME_GB} GB"
|
|
81
79
|
echo ""
|
|
82
80
|
|
|
83
|
-
# Validate AWS credentials
|
|
84
|
-
if ! aws sts get-caller-identity &>/dev/null; then
|
|
85
|
-
echo "❌ AWS credentials not configured or expired."
|
|
86
|
-
echo " Run: aws configure"
|
|
87
|
-
exit 4
|
|
88
|
-
fi
|
|
89
|
-
|
|
90
81
|
# Resolve execution role from profile
|
|
91
82
|
local execution_role
|
|
92
83
|
execution_role=$(echo "${_PROFILE_JSON}" | python3 -c "
|
|
93
84
|
import sys, json
|
|
94
85
|
p = json.load(sys.stdin)
|
|
95
|
-
print(p.get('
|
|
86
|
+
print(p.get('roleArn', ''))
|
|
96
87
|
" 2>/dev/null) || execution_role=""
|
|
97
88
|
|
|
98
89
|
if [ -z "${execution_role}" ]; then
|
|
@@ -102,267 +93,88 @@ print(p.get('executionRoleArn', ''))
|
|
|
102
93
|
exit 1
|
|
103
94
|
fi
|
|
104
95
|
|
|
105
|
-
# Resolve HF token
|
|
96
|
+
# Resolve HF token (optional — for gated models)
|
|
97
|
+
local hf_token_value=""
|
|
106
98
|
local hf_token_secret_arn="${HF_TOKEN_ARN:-}"
|
|
107
|
-
|
|
108
|
-
# Generate job name with timestamp
|
|
109
|
-
local timestamp
|
|
110
|
-
timestamp=$(date +%Y%m%d-%H%M%S)
|
|
111
|
-
local job_name="mlcc-stage-${PROJECT_NAME}-${timestamp}"
|
|
112
|
-
# SageMaker job names max 63 chars, must match [a-zA-Z0-9](-*[a-zA-Z0-9])*
|
|
113
|
-
job_name=$(echo "${job_name}" | cut -c1-63 | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/-*$//')
|
|
114
|
-
|
|
115
|
-
echo " Job name: ${job_name}"
|
|
116
|
-
echo ""
|
|
117
|
-
|
|
118
|
-
# Build the entrypoint script that runs inside the processing container
|
|
119
|
-
local entrypoint_script
|
|
120
|
-
entrypoint_script=$(cat <<'ENTRYPOINT_EOF'
|
|
121
|
-
#!/bin/bash
|
|
122
|
-
set -e
|
|
123
|
-
set -o pipefail
|
|
124
|
-
|
|
125
|
-
echo "=== MCC Model Staging Processing Job ==="
|
|
126
|
-
echo "Model: ${MODEL_ID}"
|
|
127
|
-
echo "Target: ${S3_OUTPUT_URI}"
|
|
128
|
-
echo ""
|
|
129
|
-
|
|
130
|
-
# Install dependencies
|
|
131
|
-
echo "📦 Checking huggingface-cli and hf_transfer..."
|
|
132
|
-
pip install -q huggingface_hub[cli] hf_transfer 2>/dev/null || true
|
|
133
|
-
|
|
134
|
-
# Enable fast parallel downloads only if hf_transfer is available
|
|
135
|
-
if python3 -c "import hf_transfer" 2>/dev/null; then
|
|
136
|
-
export HF_HUB_ENABLE_HF_TRANSFER=1
|
|
137
|
-
else
|
|
138
|
-
echo " ℹ️ hf_transfer not available — using standard download (install with: pip install hf_transfer)"
|
|
139
|
-
unset HF_HUB_ENABLE_HF_TRANSFER 2>/dev/null || true
|
|
140
|
-
fi
|
|
141
|
-
|
|
142
|
-
# Set HF token if provided
|
|
143
|
-
if [ -n "${HF_TOKEN:-}" ]; then
|
|
144
|
-
echo "🔐 Using provided HuggingFace token"
|
|
145
|
-
fi
|
|
146
|
-
|
|
147
|
-
# Download model from HuggingFace
|
|
148
|
-
echo ""
|
|
149
|
-
echo "⬇️ Downloading model: ${MODEL_ID}"
|
|
150
|
-
DOWNLOAD_ARGS="${MODEL_ID}"
|
|
151
|
-
if [ -n "${HF_TOKEN:-}" ]; then
|
|
152
|
-
DOWNLOAD_ARGS="${DOWNLOAD_ARGS} --token ${HF_TOKEN}"
|
|
153
|
-
fi
|
|
154
|
-
huggingface-cli download ${DOWNLOAD_ARGS}
|
|
155
|
-
|
|
156
|
-
echo ""
|
|
157
|
-
echo "✅ Download complete"
|
|
158
|
-
|
|
159
|
-
# Locate downloaded files
|
|
160
|
-
CACHE_PATH=$(python3 -c "
|
|
161
|
-
from huggingface_hub import snapshot_download
|
|
162
|
-
path = snapshot_download('${MODEL_ID}', local_files_only=True)
|
|
163
|
-
print(path)
|
|
164
|
-
")
|
|
165
|
-
|
|
166
|
-
echo "📁 Cache path: ${CACHE_PATH}"
|
|
167
|
-
|
|
168
|
-
# Sync to S3
|
|
169
|
-
echo ""
|
|
170
|
-
echo "☁️ Syncing to S3: ${S3_OUTPUT_URI}"
|
|
171
|
-
aws s3 sync "${CACHE_PATH}" "${S3_OUTPUT_URI}" \
|
|
172
|
-
--no-progress \
|
|
173
|
-
--exclude "*.lock" \
|
|
174
|
-
--exclude ".gitattributes"
|
|
175
|
-
|
|
176
|
-
echo ""
|
|
177
|
-
echo "✅ Model staged successfully to: ${S3_OUTPUT_URI}"
|
|
178
|
-
ENTRYPOINT_EOF
|
|
179
|
-
)
|
|
180
|
-
|
|
181
|
-
# Build environment variables for the container
|
|
182
|
-
local env_vars="MODEL_ID=${MODEL_NAME},S3_OUTPUT_URI=${MODEL_S3_URI}"
|
|
183
99
|
if [ -n "${hf_token_secret_arn}" ]; then
|
|
184
|
-
# Resolve token and pass as env var to the job
|
|
185
|
-
local hf_token_value=""
|
|
186
100
|
hf_token_value=$(aws secretsmanager get-secret-value \
|
|
187
101
|
--secret-id "${hf_token_secret_arn}" \
|
|
188
102
|
--query SecretString --output text 2>/dev/null) || hf_token_value=""
|
|
189
|
-
if [ -n "${hf_token_value}" ]; then
|
|
190
|
-
env_vars="${env_vars},HF_TOKEN=${hf_token_value}"
|
|
191
|
-
fi
|
|
192
103
|
elif [ -n "${HF_TOKEN:-}" ]; then
|
|
193
|
-
|
|
104
|
+
hf_token_value="${HF_TOKEN}"
|
|
194
105
|
fi
|
|
195
106
|
|
|
196
|
-
#
|
|
197
|
-
local
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
'InstanceType': '${PROCESSING_JOB_INSTANCE_TYPE}',
|
|
217
|
-
'VolumeSizeInGB': ${PROCESSING_JOB_VOLUME_GB}
|
|
218
|
-
}
|
|
219
|
-
},
|
|
220
|
-
'AppSpecification': {
|
|
221
|
-
'ImageUri': '${container_image}',
|
|
222
|
-
'ContainerEntrypoint': ['bash', '-c'],
|
|
223
|
-
'ContainerArguments': ['aws s3 cp ${entrypoint_s3_uri} /tmp/entrypoint.sh && chmod +x /tmp/entrypoint.sh && /tmp/entrypoint.sh']
|
|
224
|
-
},
|
|
225
|
-
'Environment': dict(item.split('=', 1) for item in '${env_vars}'.split(',')),
|
|
226
|
-
'RoleArn': '${execution_role}',
|
|
227
|
-
'StoppingCondition': {
|
|
228
|
-
'MaxRuntimeInSeconds': 86400
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
print(json.dumps(job, indent=2))
|
|
233
|
-
")
|
|
234
|
-
|
|
235
|
-
# Write request JSON to temp file
|
|
236
|
-
local request_file="/tmp/mlcc-stage-request-${timestamp}.json"
|
|
237
|
-
echo "${processing_request}" > "${request_file}"
|
|
238
|
-
|
|
239
|
-
echo "🚀 Creating Processing Job: ${job_name}"
|
|
240
|
-
echo ""
|
|
241
|
-
|
|
242
|
-
local create_output
|
|
243
|
-
local create_exit_code
|
|
244
|
-
create_output=$(aws sagemaker create-processing-job \
|
|
245
|
-
--cli-input-json "file://${request_file}" \
|
|
246
|
-
--region "${AWS_REGION}" 2>&1) || create_exit_code=$?
|
|
247
|
-
create_exit_code=${create_exit_code:-0}
|
|
107
|
+
# Build helper arguments
|
|
108
|
+
local helper_args=(
|
|
109
|
+
submit
|
|
110
|
+
--model-name "${MODEL_NAME}"
|
|
111
|
+
--bucket "${STAGE_S3_BUCKET}"
|
|
112
|
+
--project "${PROJECT_NAME}"
|
|
113
|
+
--role-arn "${execution_role}"
|
|
114
|
+
--region "${AWS_REGION}"
|
|
115
|
+
--instance-type "${PROCESSING_JOB_INSTANCE_TYPE}"
|
|
116
|
+
--volume-size-gb "${PROCESSING_JOB_VOLUME_GB}"
|
|
117
|
+
)
|
|
118
|
+
if [ -n "${hf_token_value}" ]; then
|
|
119
|
+
helper_args+=(--hf-token "${hf_token_value}")
|
|
120
|
+
fi
|
|
121
|
+
if [ "${FORCE}" = true ]; then
|
|
122
|
+
helper_args+=(--force)
|
|
123
|
+
fi
|
|
124
|
+
if [ "${NO_WAIT}" = true ]; then
|
|
125
|
+
helper_args+=(--no-wait)
|
|
126
|
+
fi
|
|
248
127
|
|
|
249
|
-
|
|
128
|
+
# Call .stage_helper.py (sagemaker-core ProcessingJob.create())
|
|
129
|
+
# stdout = JSON result (captured below), stderr = progress messages (not captured; shown directly to the user)
|
|
130
|
+
local json_output
|
|
131
|
+
local helper_exit_code=0
|
|
132
|
+
json_output=$(python3 "${SCRIPT_DIR}/.stage_helper.py" "${helper_args[@]}") || helper_exit_code=$?
|
|
250
133
|
|
|
251
|
-
if [ ${
|
|
252
|
-
echo "❌ Failed to create Processing Job"
|
|
253
|
-
echo " ${create_output}"
|
|
134
|
+
if [ ${helper_exit_code} -ne 0 ]; then
|
|
254
135
|
echo ""
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
exit 1
|
|
136
|
+
echo "❌ Processing Job failed"
|
|
137
|
+
echo " To retry: ./do/stage --force"
|
|
138
|
+
exit ${helper_exit_code}
|
|
259
139
|
fi
|
|
260
140
|
|
|
261
|
-
|
|
262
|
-
|
|
141
|
+
# Parse JSON output
|
|
142
|
+
local job_status
|
|
143
|
+
local job_name
|
|
144
|
+
local s3_uri
|
|
145
|
+
job_status=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null) || job_status=""
|
|
146
|
+
job_name=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('job_name',''))" 2>/dev/null) || job_name=""
|
|
147
|
+
s3_uri=$(echo "${json_output}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('s3_uri',''))" 2>/dev/null) || s3_uri="${MODEL_S3_URI}"
|
|
263
148
|
|
|
264
|
-
|
|
265
|
-
|
|
149
|
+
if [ "${job_status}" = "AlreadyStaged" ]; then
|
|
150
|
+
echo "✅ Model already staged at: ${s3_uri}"
|
|
151
|
+
echo " Use --force to re-stage."
|
|
152
|
+
elif [ "${job_status}" = "Submitted" ]; then
|
|
153
|
+
echo " ✅ Processing Job submitted: ${job_name}"
|
|
154
|
+
echo ""
|
|
266
155
|
echo " --no-wait specified. Job submitted, exiting without polling."
|
|
267
156
|
echo ""
|
|
268
157
|
echo " Check status:"
|
|
269
|
-
echo "
|
|
158
|
+
echo " python3 ${SCRIPT_DIR}/.stage_helper.py status --job-name ${job_name}"
|
|
270
159
|
echo ""
|
|
271
160
|
echo " On completion, the staged model will be at:"
|
|
272
|
-
echo " ${
|
|
273
|
-
|
|
161
|
+
echo " ${s3_uri}"
|
|
162
|
+
elif [ "${job_status}" = "Completed" ]; then
|
|
163
|
+
echo ""
|
|
164
|
+
echo "✅ Processing Job completed: ${job_name}"
|
|
165
|
+
echo ""
|
|
166
|
+
echo " S3 URI: ${s3_uri}"
|
|
274
167
|
fi
|
|
275
168
|
|
|
276
|
-
#
|
|
277
|
-
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
echo " (Ctrl+C to stop polling — job continues in background)"
|
|
286
|
-
echo ""
|
|
287
|
-
|
|
288
|
-
while true; do
|
|
289
|
-
local describe_output
|
|
290
|
-
local describe_exit_code
|
|
291
|
-
describe_output=$(aws sagemaker describe-processing-job \
|
|
292
|
-
--processing-job-name "${job_name}" \
|
|
293
|
-
--region "${AWS_REGION}" 2>&1) || describe_exit_code=$?
|
|
294
|
-
describe_exit_code=${describe_exit_code:-0}
|
|
295
|
-
|
|
296
|
-
if [ ${describe_exit_code} -ne 0 ]; then
|
|
297
|
-
echo " ⚠️ Failed to describe job (will retry): ${describe_output}"
|
|
298
|
-
sleep "${POLL_INTERVAL}"
|
|
299
|
-
continue
|
|
300
|
-
fi
|
|
301
|
-
|
|
302
|
-
# Parse status from response
|
|
303
|
-
local job_status
|
|
304
|
-
local failure_reason
|
|
305
|
-
job_status=$(echo "${describe_output}" | python3 -c "
|
|
306
|
-
import sys, json
|
|
307
|
-
d = json.load(sys.stdin)
|
|
308
|
-
print(d.get('ProcessingJobStatus', 'Unknown'))
|
|
309
|
-
" 2>/dev/null) || job_status="Unknown"
|
|
310
|
-
|
|
311
|
-
failure_reason=$(echo "${describe_output}" | python3 -c "
|
|
312
|
-
import sys, json
|
|
313
|
-
d = json.load(sys.stdin)
|
|
314
|
-
print(d.get('FailureReason', ''))
|
|
315
|
-
" 2>/dev/null) || failure_reason=""
|
|
316
|
-
|
|
317
|
-
# Print status
|
|
318
|
-
local now
|
|
319
|
-
now=$(date +%H:%M:%S)
|
|
320
|
-
echo " [${now}] Status: ${job_status}"
|
|
321
|
-
|
|
322
|
-
# Handle terminal states
|
|
323
|
-
case "${job_status}" in
|
|
324
|
-
Completed)
|
|
325
|
-
echo ""
|
|
326
|
-
echo "✅ Processing Job completed: ${job_name}"
|
|
327
|
-
echo ""
|
|
328
|
-
echo " S3 URI: ${MODEL_S3_URI}"
|
|
329
|
-
echo ""
|
|
330
|
-
if [ "${UPDATE_CONFIG}" = true ]; then
|
|
331
|
-
CONFIG_FILE="${SCRIPT_DIR}/config"
|
|
332
|
-
sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
|
|
333
|
-
rm -f "${CONFIG_FILE}.bak"
|
|
334
|
-
echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
|
|
335
|
-
echo " Subsequent tasks (submit, deploy) will pull from S3."
|
|
336
|
-
else
|
|
337
|
-
echo " To use this staged model, update do/config:"
|
|
338
|
-
echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
|
|
339
|
-
fi
|
|
340
|
-
return 0
|
|
341
|
-
;;
|
|
342
|
-
Failed)
|
|
343
|
-
echo ""
|
|
344
|
-
echo "❌ Processing Job failed: ${job_name}"
|
|
345
|
-
if [ -n "${failure_reason}" ]; then
|
|
346
|
-
echo " Reason: ${failure_reason}"
|
|
347
|
-
fi
|
|
348
|
-
echo ""
|
|
349
|
-
echo " Check CloudWatch logs:"
|
|
350
|
-
echo " /aws/sagemaker/ProcessingJobs/${job_name}"
|
|
351
|
-
echo ""
|
|
352
|
-
echo " To retry: ./do/stage --submit --force"
|
|
353
|
-
return 1
|
|
354
|
-
;;
|
|
355
|
-
Stopped)
|
|
356
|
-
echo ""
|
|
357
|
-
echo "⏹️ Processing Job was stopped: ${job_name}"
|
|
358
|
-
echo ""
|
|
359
|
-
echo " To retry: ./do/stage --submit --force"
|
|
360
|
-
return 2
|
|
361
|
-
;;
|
|
362
|
-
esac
|
|
363
|
-
|
|
364
|
-
sleep "${POLL_INTERVAL}"
|
|
365
|
-
done
|
|
169
|
+
# Update config if requested, an S3 URI is available, and the job status is not "Submitted" (job still running)
|
|
170
|
+
if [ "${UPDATE_CONFIG}" = true ] && [ -n "${s3_uri}" ] && [ "${job_status}" != "Submitted" ]; then
|
|
171
|
+
CONFIG_FILE="${SCRIPT_DIR}/config"
|
|
172
|
+
sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${s3_uri}\"|" "${CONFIG_FILE}"
|
|
173
|
+
rm -f "${CONFIG_FILE}.bak"
|
|
174
|
+
echo ""
|
|
175
|
+
echo " ✅ Updated MODEL_NAME in do/config → S3-backed"
|
|
176
|
+
echo " Subsequent tasks (submit, deploy) will pull from S3."
|
|
177
|
+
fi
|
|
366
178
|
}
|
|
367
179
|
|
|
368
180
|
# ── Check if model is already an S3 URI ──────────────────────────────────────
|
|
@@ -410,21 +222,28 @@ if [ -z "${STAGE_S3_BUCKET}" ]; then
|
|
|
410
222
|
exit 1
|
|
411
223
|
fi
|
|
412
224
|
|
|
413
|
-
# Target S3 path for staged model
|
|
414
|
-
|
|
225
|
+
# Target S3 path for staged model: s3://{bucket}/{project}/models/{model-slug}/
|
|
226
|
+
# Sanitize MODEL_NAME for use as an S3 path segment:
|
|
227
|
+
# - Replace / with - (e.g., "nvidia/Nemotron-3-Ultra..." → "nvidia-Nemotron-3-Ultra...")
|
|
228
|
+
# - This prevents HF org/repo IDs from creating nested S3 prefixes
|
|
229
|
+
MODEL_SLUG="${MODEL_NAME//\//-}"
|
|
230
|
+
MODEL_S3_URI="s3://${STAGE_S3_BUCKET}/${PROJECT_NAME}/models/${MODEL_SLUG}/"
|
|
415
231
|
|
|
416
232
|
echo " Target: ${MODEL_S3_URI}"
|
|
417
233
|
echo ""
|
|
418
234
|
|
|
419
|
-
# ──
|
|
420
|
-
#
|
|
421
|
-
#
|
|
422
|
-
if [ "${
|
|
235
|
+
# ── Default mode: SageMaker Processing Job via .stage_helper.py ───────────────
|
|
236
|
+
# Submits a Processing Job that downloads model weights from HuggingFace and
|
|
237
|
+
# syncs to S3 directly — no local disk usage. Uses sagemaker-core SDK v3.
|
|
238
|
+
if [ "${LOCAL_MODE}" = false ]; then
|
|
423
239
|
_submit_processing_job
|
|
424
240
|
exit $?
|
|
425
241
|
fi
|
|
426
242
|
|
|
427
|
-
# ──
|
|
243
|
+
# ── Local mode: download locally then sync to S3 (--local flag) ───────────────
|
|
244
|
+
# Preserved for offline work, debugging, or when Processing Jobs are unavailable.
|
|
245
|
+
|
|
246
|
+
# Idempotency: check if model is already staged
|
|
428
247
|
if [ "${FORCE}" = false ]; then
|
|
429
248
|
if aws s3 ls "${MODEL_S3_URI}config.json" --region "${AWS_REGION}" &>/dev/null; then
|
|
430
249
|
echo "✅ Model already staged at: ${MODEL_S3_URI}"
|
|
@@ -443,7 +262,7 @@ if [ "${FORCE}" = false ]; then
|
|
|
443
262
|
fi
|
|
444
263
|
fi
|
|
445
264
|
|
|
446
|
-
#
|
|
265
|
+
# Validate prerequisites
|
|
447
266
|
if ! command -v huggingface-cli &>/dev/null; then
|
|
448
267
|
echo "❌ huggingface-cli is not installed"
|
|
449
268
|
echo " Install: pip install huggingface_hub[cli] hf_transfer"
|
package/templates/do/test
CHANGED
|
@@ -16,8 +16,18 @@ source "${SCRIPT_DIR}/lib/profile.sh"
|
|
|
16
16
|
# SageMaker Real-Time Inference Testing
|
|
17
17
|
# ============================================================
|
|
18
18
|
|
|
19
|
-
# Parse arguments: ./do/test [<ic-name>]
|
|
20
|
-
IC_ARG="
|
|
19
|
+
# Parse arguments: ./do/test [<ic-name>] or ./do/test --adapter|-a <name>
|
|
20
|
+
IC_ARG=""
|
|
21
|
+
if [ "${1:-}" = "--adapter" ] || [ "${1:-}" = "-a" ]; then
|
|
22
|
+
if [ -z "${2:-}" ]; then
|
|
23
|
+
echo "❌ --adapter requires an adapter name argument"
|
|
24
|
+
echo " Usage: ./do/test --adapter <name>"
|
|
25
|
+
exit 1
|
|
26
|
+
fi
|
|
27
|
+
IC_ARG="$2"
|
|
28
|
+
else
|
|
29
|
+
IC_ARG="${1:-}"
|
|
30
|
+
fi
|
|
21
31
|
|
|
22
32
|
# Determine test mode based on ENDPOINT_NAME in config
|
|
23
33
|
if [ -z "${ENDPOINT_NAME:-}" ]; then
|