@aws/ml-container-creator 0.10.3 → 0.13.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/parameter-schema-v2.json +28 -1
- package/infra/ci-harness/lib/ci-harness-stack.ts +50 -36
- package/package.json +14 -5
- package/servers/instance-sizer/index.js +30 -17
- package/servers/instance-sizer/lib/instance-ranker.js +44 -0
- package/servers/lib/catalogs/instances.json +27 -0
- package/src/app.js +22 -1
- package/src/lib/bootstrap-command-handler.js +32 -3
- package/src/lib/config-validator.js +1 -1
- package/src/lib/generated/cli-options.js +7 -2
- package/src/lib/generated/parameter-matrix.js +16 -5
- package/src/lib/generated/validation-rules.js +7 -3
- package/src/lib/path-prover-brain.js +58 -1
- package/src/lib/prompts/infrastructure-prompts.js +2 -2
- package/src/lib/prompts/model-prompts.js +6 -0
- package/src/lib/prove-pipeline-executor.js +294 -0
- package/src/lib/secrets-prompt-runner.js +4 -0
- package/src/lib/template-manager.js +1 -1
- package/src/lib/template-variable-resolver.js +62 -0
- package/templates/do/README.md +37 -0
- package/templates/do/adapter +8 -0
- package/templates/do/build +8 -0
- package/templates/do/clean.d/async-inference.ejs +8 -0
- package/templates/do/clean.d/batch-transform.ejs +8 -0
- package/templates/do/clean.d/hyperpod-eks.ejs +8 -0
- package/templates/do/clean.d/managed-inference.ejs +8 -0
- package/templates/do/config +12 -45
- package/templates/do/deploy.d/async-inference.ejs +33 -3
- package/templates/do/deploy.d/batch-transform.ejs +32 -3
- package/templates/do/deploy.d/hyperpod-eks.ejs +7 -0
- package/templates/do/deploy.d/managed-inference.ejs +27 -3
- package/templates/do/lib/endpoint-config.sh +1 -1
- package/templates/do/lib/profile.sh +44 -0
- package/templates/do/lib/staged-assets.sh +217 -0
- package/templates/do/push +8 -0
- package/templates/do/register +8 -0
- package/templates/do/stage +569 -0
- package/templates/do/submit +10 -0
- package/templates/do/test +1 -0
- package/templates/do/tune +7 -0
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# do/stage — Pre-stage model weights from HuggingFace to S3
|
|
6
|
+
# Downloads the model using huggingface-cli and syncs to S3 so that
|
|
7
|
+
# vLLM can load directly from S3 at deploy time (fast cold-start).
|
|
8
|
+
#
|
|
9
|
+
# Idempotent: if the model is already staged (config.json exists at
|
|
10
|
+
# the target S3 path), the script exits early.
|
|
11
|
+
#
|
|
12
|
+
# Usage:
|
|
13
|
+
# ./do/stage Stage model to S3
|
|
14
|
+
# ./do/stage --force Re-stage even if already present in S3
|
|
15
|
+
# ./do/stage --update-config Stage and update MODEL_NAME in do/config
|
|
16
|
+
# ./do/stage --submit Submit as SageMaker Processing Job (for models >500GB)
|
|
17
|
+
# ./do/stage --submit --no-wait Submit and exit without polling
|
|
18
|
+
|
|
19
|
+
set -e
|
|
20
|
+
set -u
|
|
21
|
+
set -o pipefail
|
|
22
|
+
|
|
23
|
+
# ── Source project configuration ──────────────────────────────────────────────
|
|
24
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
25
|
+
source "${SCRIPT_DIR}/config"
|
|
26
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
27
|
+
source "${SCRIPT_DIR}/lib/staged-assets.sh"
|
|
28
|
+
|
|
29
|
+
# ── Parse flags ───────────────────────────────────────────────────────────────
|
|
30
|
+
# Flag defaults; overwritten by _parse_stage_flags below.
FORCE=false
UPDATE_CONFIG=false
SUBMIT_MODE=false
NO_WAIT=false

# Print the do/stage help text on stdout.
_stage_usage() {
  cat <<'USAGE_EOF'
Usage: ./do/stage [--force] [--update-config] [--submit] [--no-wait]

Pre-stage model weights from HuggingFace to S3.

Modes:
 (default) Download locally then sync to S3
 --submit Submit as SageMaker Processing Job (for models >500GB)

Options:
 --force Re-stage even if model already exists in S3
 --update-config Update MODEL_NAME in do/config to the staged S3 URI
 --no-wait (with --submit) Exit without polling for completion

Environment:
 HF_TOKEN HuggingFace token (for gated models)

The staged S3 URI will be printed on completion.
Pass --update-config to automatically update do/config for S3-backed deploys.

The --submit mode uses a SageMaker Processing Job with 2TB attached
storage, suitable for very large models that exceed local disk capacity.
USAGE_EOF
}

# Parse command-line flags.
#   Globals written: FORCE, UPDATE_CONFIG, SUBMIT_MODE, NO_WAIT
#   Arguments:       the script's command-line arguments
#   Outputs:         help text on stdout for --help/-h; one warning on stderr
#                    per argument that is not a known flag
#   Returns:         0 (exits the script with status 0 on --help/-h)
# Unknown arguments are still skipped (as before) but are no longer skipped
# silently, so a mistyped flag is visible instead of quietly changing the mode.
_parse_stage_flags() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --force) FORCE=true ;;
      --update-config) UPDATE_CONFIG=true ;;
      --submit) SUBMIT_MODE=true ;;
      --no-wait) NO_WAIT=true ;;
      --help|-h)
        _stage_usage
        exit 0
        ;;
      *) echo "⚠️ Ignoring unrecognized argument: $1" >&2 ;;
    esac
    shift
  done
}

# ${1+"$@"} expands to nothing when no arguments were given, which stays safe
# under `set -u` on every bash version.
_parse_stage_flags ${1+"$@"}
|
|
67
|
+
|
|
68
|
+
# ── Processing Job submission function ────────────────────────────────────────
|
|
69
|
+
# Submits a SageMaker Processing Job that downloads model weights from HuggingFace
|
|
70
|
+
# and syncs them to S3. Uses 2TB attached storage to handle any model size.
|
|
71
|
+
# Seconds to wait between DescribeProcessingJob calls while polling.
POLL_INTERVAL=30
# Instance type and attached volume size (GB) requested for the staging job.
PROCESSING_JOB_INSTANCE_TYPE="ml.m5.xlarge"
PROCESSING_JOB_VOLUME_GB=2048

# Build a Processing Job name of the form mlcc-stage-<project>-<timestamp>.
#   Arguments: $1 - project name, $2 - timestamp (e.g. 20240101-120000)
#   Outputs:   the job name on stdout
# SageMaker job names are limited to 63 characters from [a-zA-Z0-9-]. The
# project part is sanitized and truncated so the timestamp suffix always
# survives; cutting the whole string instead would drop the timestamp for long
# project names and make successive submissions collide.
_stage_job_name() {
  local project="$1"
  local timestamp="$2"
  local prefix="mlcc-stage-"
  local safe_project
  safe_project=$(printf '%s' "${project}" | sed 's/[^a-zA-Z0-9-]/-/g')

  # Room left for the project once prefix, separator and timestamp are counted.
  local budget=$(( 63 - ${#prefix} - ${#timestamp} - 1 ))
  if [ "${budget}" -lt 0 ]; then
    budget=0
  fi
  safe_project="${safe_project:0:${budget}}"

  printf '%s\n' "${prefix}${safe_project}-${timestamp}" \
    | cut -c1-63 \
    | sed 's/[^a-zA-Z0-9-]/-/g; s/-*$//'
}

# Submit a SageMaker Processing Job that downloads the model from HuggingFace
# inside the job container and syncs it to MODEL_S3_URI.
#   Globals read: MODEL_NAME, MODEL_S3_URI, PROJECT_NAME, STAGE_S3_BUCKET,
#                 AWS_REGION, _PROFILE_JSON, NO_WAIT, PROCESSING_JOB_*,
#                 HF_TOKEN_ARN / HF_TOKEN (both optional)
#   Exits:        4 when AWS credentials are unusable; 1 when no execution role
#                 is configured or the job cannot be built/uploaded/created
#   Returns:      0 after a --no-wait submission, otherwise the status of
#                 _poll_processing_job
_submit_processing_job() {
  echo "🚀 Submitting SageMaker Processing Job for model staging"
  echo " Model: ${MODEL_NAME}"
  echo " Target: ${MODEL_S3_URI}"
  echo " Instance: ${PROCESSING_JOB_INSTANCE_TYPE}"
  echo " Storage: ${PROCESSING_JOB_VOLUME_GB} GB"
  echo ""

  # Validate AWS credentials
  if ! aws sts get-caller-identity &>/dev/null; then
    echo "❌ AWS credentials not configured or expired."
    echo " Run: aws configure"
    exit 4
  fi

  # Resolve execution role from profile
  local execution_role
  execution_role=$(printf '%s' "${_PROFILE_JSON}" | python3 -c "
import sys, json
p = json.load(sys.stdin)
print(p.get('executionRoleArn', ''))
" 2>/dev/null) || execution_role=""

  if [ -z "${execution_role}" ]; then
    echo "❌ No execution role configured."
    echo " Run 'ml-container-creator bootstrap' to set up your profile."
    echo " The role needs: SageMaker, S3, and Secrets Manager permissions."
    exit 1
  fi

  # Resolve the HF token (optional — for gated models). A configured secret
  # ARN takes precedence over an HF_TOKEN already present in the environment.
  local hf_token_value=""
  if [ -n "${HF_TOKEN_ARN:-}" ]; then
    hf_token_value=$(aws secretsmanager get-secret-value \
      --secret-id "${HF_TOKEN_ARN}" \
      --query SecretString --output text 2>/dev/null) || hf_token_value=""
    if [ -z "${hf_token_value}" ]; then
      echo "⚠️ Failed to resolve HF token from Secrets Manager (continuing without token)" >&2
    fi
  elif [ -n "${HF_TOKEN:-}" ]; then
    hf_token_value="${HF_TOKEN}"
  fi

  # Generate job name with timestamp
  local timestamp
  timestamp=$(date +%Y%m%d-%H%M%S)
  local job_name
  job_name=$(_stage_job_name "${PROJECT_NAME}" "${timestamp}")

  echo " Job name: ${job_name}"
  echo ""

  # Entrypoint script that runs inside the processing container. The heredoc
  # delimiter is quoted, so every ${...} below is expanded in the container,
  # from the job's Environment, not here.
  local entrypoint_script
  entrypoint_script=$(cat <<'ENTRYPOINT_EOF'
#!/bin/bash
set -e
set -o pipefail

echo "=== MCC Model Staging Processing Job ==="
echo "Model: ${MODEL_ID}"
echo "Target: ${S3_OUTPUT_URI}"
echo ""

# Install dependencies
echo "📦 Installing huggingface-cli and hf_transfer..."
pip install -q 'huggingface_hub[cli]' hf_transfer

# Enable fast parallel downloads
export HF_HUB_ENABLE_HF_TRANSFER=1

# Set HF token if provided
if [ -n "${HF_TOKEN:-}" ]; then
  echo "🔐 Using provided HuggingFace token"
fi

# Download model from HuggingFace
echo ""
echo "⬇️ Downloading model: ${MODEL_ID}"
DOWNLOAD_ARGS=("${MODEL_ID}")
if [ -n "${HF_TOKEN:-}" ]; then
  DOWNLOAD_ARGS+=("--token" "${HF_TOKEN}")
fi
huggingface-cli download "${DOWNLOAD_ARGS[@]}"

echo ""
echo "✅ Download complete"

# Locate downloaded files (model id is read from the environment rather than
# spliced into the Python source)
CACHE_PATH=$(python3 -c '
import os
from huggingface_hub import snapshot_download
print(snapshot_download(os.environ["MODEL_ID"], local_files_only=True))
')

echo "📁 Cache path: ${CACHE_PATH}"

# Sync to S3
echo ""
echo "☁️ Syncing to S3: ${S3_OUTPUT_URI}"
aws s3 sync "${CACHE_PATH}" "${S3_OUTPUT_URI}" \
  --no-progress \
  --exclude "*.lock" \
  --exclude ".gitattributes"

echo ""
echo "✅ Model staged successfully to: ${S3_OUTPUT_URI}"
ENTRYPOINT_EOF
)

  # Upload the entrypoint so the container can fetch it at start-up
  local entrypoint_s3_key="staging-jobs/${job_name}/entrypoint.sh"
  local entrypoint_s3_uri="s3://${STAGE_S3_BUCKET}/${entrypoint_s3_key}"

  echo "📤 Uploading entrypoint script..."
  if ! printf '%s\n' "${entrypoint_script}" \
    | aws s3 cp - "${entrypoint_s3_uri}" --region "${AWS_REGION}"; then
    echo "❌ Failed to upload entrypoint script to ${entrypoint_s3_uri}"
    exit 1
  fi

  # Container image for the job (SageMaker PyTorch CPU training image).
  local container_image="763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.1.0-cpu-py310-ubuntu20.04-sagemaker"

  # Build the CreateProcessingJob request. Every value reaches Python through
  # the environment, so commas, quotes or backslashes in a value cannot break
  # the request or be executed as Python.
  # NOTE(review): the HF token is placed in the job's Environment, which
  # DescribeProcessingJob returns — consider having the container read the
  # secret itself instead.
  local processing_request
  processing_request=$(
    MLCC_JOB_NAME="${job_name}" \
    MLCC_INSTANCE_TYPE="${PROCESSING_JOB_INSTANCE_TYPE}" \
    MLCC_VOLUME_GB="${PROCESSING_JOB_VOLUME_GB}" \
    MLCC_IMAGE_URI="${container_image}" \
    MLCC_ENTRYPOINT_URI="${entrypoint_s3_uri}" \
    MLCC_ROLE_ARN="${execution_role}" \
    MLCC_MODEL_ID="${MODEL_NAME}" \
    MLCC_S3_OUTPUT_URI="${MODEL_S3_URI}" \
    MLCC_HF_TOKEN="${hf_token_value}" \
    python3 -c '
import json, os

env = os.environ
container_env = {
    "MODEL_ID": env["MLCC_MODEL_ID"],
    "S3_OUTPUT_URI": env["MLCC_S3_OUTPUT_URI"],
}
if env.get("MLCC_HF_TOKEN"):
    container_env["HF_TOKEN"] = env["MLCC_HF_TOKEN"]

job = {
    "ProcessingJobName": env["MLCC_JOB_NAME"],
    "ProcessingResources": {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": env["MLCC_INSTANCE_TYPE"],
            "VolumeSizeInGB": int(env["MLCC_VOLUME_GB"]),
        }
    },
    "AppSpecification": {
        "ImageUri": env["MLCC_IMAGE_URI"],
        "ContainerEntrypoint": ["bash", "-c"],
        "ContainerArguments": [
            "aws s3 cp " + env["MLCC_ENTRYPOINT_URI"]
            + " /tmp/entrypoint.sh && chmod +x /tmp/entrypoint.sh && /tmp/entrypoint.sh"
        ],
    },
    "Environment": container_env,
    "RoleArn": env["MLCC_ROLE_ARN"],
    "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
}

print(json.dumps(job, indent=2))
'
  ) || {
    echo "❌ Failed to build the Processing Job request"
    exit 1
  }

  # The request may embed the HF token: write it to a private, unpredictable
  # temp file (mktemp creates it with owner-only permissions).
  local request_file
  request_file=$(mktemp "${TMPDIR:-/tmp}/mlcc-stage-request.XXXXXX") || {
    echo "❌ Could not create a temporary file for the job request"
    exit 1
  }
  printf '%s\n' "${processing_request}" > "${request_file}"

  echo "🚀 Creating Processing Job: ${job_name}"
  echo ""

  local create_output
  local create_exit_code=0
  create_output=$(aws sagemaker create-processing-job \
    --cli-input-json "file://${request_file}" \
    --region "${AWS_REGION}" 2>&1) || create_exit_code=$?

  rm -f "${request_file}"

  if [ "${create_exit_code}" -ne 0 ]; then
    echo "❌ Failed to create Processing Job"
    echo " ${create_output}"
    echo ""
    if [[ "${create_output}" == *AccessDeniedException* ]]; then
      echo " Remediation: ensure the execution role has sagemaker:CreateProcessingJob permission"
    fi
    exit 1
  fi

  echo " ✅ Processing Job submitted: ${job_name}"
  echo ""

  # Handle --no-wait
  if [ "${NO_WAIT}" = true ]; then
    echo " --no-wait specified. Job submitted, exiting without polling."
    echo ""
    echo " Check status:"
    echo " aws sagemaker describe-processing-job --processing-job-name ${job_name} --region ${AWS_REGION}"
    echo ""
    echo " On completion, the staged model will be at:"
    echo " ${MODEL_S3_URI}"
    return 0
  fi

  # Poll for completion (kept as a plain call so `set -e` stays active inside)
  _poll_processing_job "${job_name}"
}
|
|
274
|
+
|
|
275
|
+
# ── Poll Processing Job status ────────────────────────────────────────────────
|
|
276
|
+
# Poll a SageMaker Processing Job until it reaches a terminal state.
#   Arguments:    $1 - processing job name
#   Globals read: AWS_REGION, POLL_INTERVAL, MODEL_S3_URI, UPDATE_CONFIG,
#                 SCRIPT_DIR
#   Side effects: on completion with UPDATE_CONFIG=true, rewrites the
#                 `export MODEL_NAME=` line in ${SCRIPT_DIR}/config
#   Returns:      0 Completed, 1 Failed, 2 Stopped
# Describe failures are treated as transient and retried indefinitely.
_poll_processing_job() {
  local job_name="$1"
  local describe_output
  local describe_exit_code
  local job_status
  local failure_reason
  local now

  echo "⏳ Polling Processing Job status (every ${POLL_INTERVAL}s)..."
  echo " (Ctrl+C to stop polling — job continues in background)"
  echo ""

  while true; do
    # Must be reset on every pass: re-declaring a variable with `local` does
    # not clear its value, so a single failed describe would otherwise make
    # every later (successful) describe look failed and the loop never end.
    describe_exit_code=0
    describe_output=$(aws sagemaker describe-processing-job \
      --processing-job-name "${job_name}" \
      --region "${AWS_REGION}" 2>&1) || describe_exit_code=$?

    if [ "${describe_exit_code}" -ne 0 ]; then
      echo " ⚠️ Failed to describe job (will retry): ${describe_output}"
      sleep "${POLL_INTERVAL}"
      continue
    fi

    # Parse status from response
    job_status=$(echo "${describe_output}" | python3 -c "
import sys, json
d = json.load(sys.stdin)
print(d.get('ProcessingJobStatus', 'Unknown'))
" 2>/dev/null) || job_status="Unknown"

    failure_reason=$(echo "${describe_output}" | python3 -c "
import sys, json
d = json.load(sys.stdin)
print(d.get('FailureReason', ''))
" 2>/dev/null) || failure_reason=""

    # Print status
    now=$(date +%H:%M:%S)
    echo " [${now}] Status: ${job_status}"

    # Handle terminal states; any other status keeps polling
    case "${job_status}" in
      Completed)
        echo ""
        echo "✅ Processing Job completed: ${job_name}"
        echo ""
        echo " S3 URI: ${MODEL_S3_URI}"
        echo ""
        if [ "${UPDATE_CONFIG}" = true ]; then
          CONFIG_FILE="${SCRIPT_DIR}/config"
          # -i.bak followed by rm works with both GNU and BSD sed
          sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
          rm -f "${CONFIG_FILE}.bak"
          echo " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
          echo ""
          echo " Re-deploy with S3-backed model: ./do/deploy"
        else
          echo " To use this staged model, update do/config:"
          echo " export MODEL_NAME=\"${MODEL_S3_URI}\""
          echo ""
          echo " Or re-run with --update-config:"
          echo " ./do/stage --submit --update-config"
        fi
        return 0
        ;;
      Failed)
        echo ""
        echo "❌ Processing Job failed: ${job_name}"
        if [ -n "${failure_reason}" ]; then
          echo " Reason: ${failure_reason}"
        fi
        echo ""
        echo " Check CloudWatch logs:"
        echo " /aws/sagemaker/ProcessingJobs/${job_name}"
        echo ""
        echo " To retry: ./do/stage --submit --force"
        return 1
        ;;
      Stopped)
        echo ""
        echo "⏹️ Processing Job was stopped: ${job_name}"
        echo ""
        echo " To retry: ./do/stage --submit --force"
        return 2
        ;;
    esac

    sleep "${POLL_INTERVAL}"
  done
}
|
|
366
|
+
|
|
367
|
+
# ── Check if model is already an S3 URI ──────────────────────────────────────
|
|
368
|
+
# A MODEL_NAME that is already an s3:// URI needs no staging: report and stop.
case "${MODEL_NAME}" in
  s3://*)
    printf '%s\n' \
      "✅ Model is already an S3 URI: ${MODEL_NAME}" \
      " Nothing to stage."
    exit 0
    ;;
esac

# Banner for the staging run that follows.
printf '%s\n' \
  "📦 Staging model: ${MODEL_NAME}" \
  " Project: ${PROJECT_NAME}" \
  ""
|
|
377
|
+
|
|
378
|
+
# ── Resolve profile for S3 bucket ────────────────────────────────────────────
|
|
379
|
+
# Load the active profile from ~/.ml-container-creator/config.json as a JSON
# object. Any read/parse problem yields "{}" so the lookups below still work.
_PROFILE_JSON="{}"
if command -v python3 &>/dev/null; then
  _PROFILE_JSON=$(python3 -c "
import json, os
config_path = os.path.expanduser('~/.ml-container-creator/config.json')
try:
    with open(config_path) as f:
        config = json.load(f)
    profile = config['profiles'][config['activeProfile']]
    print(json.dumps(profile))
except Exception:
    print('{}')
" 2>/dev/null) || _PROFILE_JSON="{}"
fi

# Staging bucket: the profile's benchmarkS3Bucket when set, otherwise the
# conventional name derived from the profile's region and account id.
STAGE_S3_BUCKET=$(printf '%s' "${_PROFILE_JSON}" | python3 -c "
import sys, json
p = json.load(sys.stdin)
bucket = p.get('benchmarkS3Bucket', '')
if not bucket:
    acct = p.get('accountId', 'unknown')
    region = p.get('awsRegion', 'us-east-1')
    bucket = f'ml-container-creator-benchmark-{region}-{acct}'
print(bucket)
" 2>/dev/null) || STAGE_S3_BUCKET=""

if [ -z "${STAGE_S3_BUCKET}" ]; then
  echo "❌ Could not determine S3 bucket for staging."
  if ! command -v python3 &>/dev/null; then
    # Without python3 the profile cannot be read at all; say so explicitly.
    echo " python3 is required to read the ml-container-creator profile."
  fi
  echo " Run 'ml-container-creator bootstrap' to set up your profile."
  exit 1
fi
|
|
411
|
+
|
|
412
|
+
# Target S3 path for staged model
|
|
413
|
+
# Staged weights live under models/<project>/ in the staging bucket.
MODEL_S3_URI=$(printf 's3://%s/models/%s/' "${STAGE_S3_BUCKET}" "${PROJECT_NAME}")

printf '%s\n' " Target: ${MODEL_S3_URI}" ""

# With --submit the work is handed to a SageMaker Processing Job and the
# script ends with that function's status; nothing below this point runs.
case "${SUBMIT_MODE}" in
  true)
    _submit_processing_job
    exit $?
    ;;
esac
|
|
425
|
+
|
|
426
|
+
# ── Idempotency: check if model is already staged ────────────────────────────
|
|
427
|
+
# Idempotency guard: unless --force was given, a config.json already present
# at the target prefix means the model is staged and the script can stop.
if [ "${FORCE}" = false ] \
  && aws s3 ls "${MODEL_S3_URI}config.json" --region "${AWS_REGION}" &>/dev/null; then
  printf '%s\n' \
    "✅ Model already staged at: ${MODEL_S3_URI}" \
    " Use --force to re-stage." \
    ""

  if [ "${UPDATE_CONFIG}" != true ]; then
    printf '%s\n' \
      " To use this staged model, set in do/config:" \
      " export MODEL_NAME=\"${MODEL_S3_URI}\""
    exit 0
  fi

  # --update-config: point MODEL_NAME in do/config at the staged URI.
  CONFIG_FILE="${SCRIPT_DIR}/config"
  sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
  rm -f "${CONFIG_FILE}.bak"
  printf '%s\n' " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}"
  exit 0
fi
|
|
444
|
+
|
|
445
|
+
# ── Validate prerequisites ───────────────────────────────────────────────────
|
|
446
|
+
# Required tools for the local download-and-sync path (exit 2 when missing).
command -v huggingface-cli &>/dev/null || {
  printf '%s\n' \
    "❌ huggingface-cli is not installed" \
    " Install: pip install huggingface_hub[cli] hf_transfer"
  exit 2
}

command -v aws &>/dev/null || {
  printf '%s\n' \
    "❌ AWS CLI is not installed" \
    " Install: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html"
  exit 2
}

# Usable AWS credentials are needed for the S3 sync (exit 4 otherwise).
aws sts get-caller-identity &>/dev/null || {
  printf '%s\n' \
    "❌ AWS credentials not configured or expired." \
    " Run: aws configure"
  exit 4
}
|
|
464
|
+
|
|
465
|
+
# ── Resolve HuggingFace token (for gated models) ─────────────────────────────
|
|
466
|
+
# When a Secrets Manager ARN is configured and HF_TOKEN is not already set,
# fetch the token and export it. A failed lookup is not fatal: the download
# proceeds without a token.
if [[ -n "${HF_TOKEN_ARN:-}" && -z "${HF_TOKEN:-}" ]]; then
  echo "🔐 Resolving HuggingFace token from Secrets Manager..."
  if ! HF_TOKEN=$(aws secretsmanager get-secret-value \
    --secret-id "${HF_TOKEN_ARN}" \
    --query SecretString \
    --output text); then
    echo "⚠️ Failed to resolve HF token from Secrets Manager (continuing without token)"
    HF_TOKEN=""
  fi
  export HF_TOKEN
fi
|
|
474
|
+
|
|
475
|
+
# ── Download model from HuggingFace ──────────────────────────────────────────
|
|
476
|
+
printf '%s\n' \
  "⬇️ Downloading model from HuggingFace: ${MODEL_NAME}" \
  " Using hf_transfer for fast parallel downloads..." \
  ""

# hf_transfer is switched on through this environment variable.
export HF_HUB_ENABLE_HF_TRANSFER=1

# Arguments for `huggingface-cli download`; the token is added only when set.
# The download lands in the HuggingFace cache, whose location the CLI manages.
DOWNLOAD_ARGS=("${MODEL_NAME}")
[ -z "${HF_TOKEN:-}" ] || DOWNLOAD_ARGS+=("--token" "${HF_TOKEN}")

huggingface-cli download "${DOWNLOAD_ARGS[@]}" || {
  printf '%s\n' \
    "❌ Failed to download model from HuggingFace: ${MODEL_NAME}" \
    "" \
    "Possible causes:" \
    " • Model name is incorrect" \
    " • Model is gated and requires HF_TOKEN" \
    " • Network connectivity issues"
  exit 3
}

printf '%s\n' "" "✅ Download complete"
|
|
501
|
+
|
|
502
|
+
# ── Locate downloaded files in HF cache ───────────────────────────────────────
|
|
503
|
+
# huggingface-cli downloads to ~/.cache/huggingface/hub/models--<org>--<name>/snapshots/<rev>/
|
|
504
|
+
# Print the most recently modified snapshot directory of a model in the
# default HuggingFace cache, or an empty line when none exists.
#   Arguments: $1 - model id (e.g. org/name)
#   Outputs:   snapshot directory path (with trailing slash) on stdout
# The cache folder joins org and repo with a double dash
# (models--<org>--<name>), so every "/" in the id becomes "--".
_hf_latest_snapshot_dir() {
  local model_id="$1"
  local snapshots_root="${HOME}/.cache/huggingface/hub/models--${model_id//\//--}/snapshots"
  local newest=""
  local candidate

  for candidate in "${snapshots_root}"/*/; do
    # An unmatched glob stays literal; the -d test filters it out.
    [ -d "${candidate}" ] || continue
    if [ -z "${newest}" ] || [ "${candidate}" -nt "${newest}" ]; then
      newest="${candidate}"
    fi
  done

  printf '%s\n' "${newest}"
}

# Preferred: ask huggingface_hub where the snapshot is. The model id is passed
# through the environment so it is never spliced into the Python source.
HF_CACHE_DIR=$(MODEL_NAME="${MODEL_NAME}" python3 -c '
import os
from huggingface_hub import snapshot_download
print(snapshot_download(os.environ["MODEL_NAME"], local_files_only=True))
' 2>/dev/null) || HF_CACHE_DIR=""

# Fallback: construct the default cache path and take the latest snapshot.
if [ -z "${HF_CACHE_DIR}" ] || [ ! -d "${HF_CACHE_DIR}" ]; then
  HF_CACHE_DIR=$(_hf_latest_snapshot_dir "${MODEL_NAME}")
fi
|
|
520
|
+
|
|
521
|
+
# Stop (exit 3) when no snapshot directory could be found for the model.
if [ -z "${HF_CACHE_DIR}" ] || [ ! -d "${HF_CACHE_DIR}" ]; then
  echo "❌ Could not locate downloaded model files in HuggingFace cache"
  # The cache folder name replaces each "/" in the model id with "--".
  echo " Expected location: ~/.cache/huggingface/hub/models--${MODEL_NAME//\//--}/snapshots/"
  exit 3
fi

echo "📁 Model cache: ${HF_CACHE_DIR}"
|
|
528
|
+
|
|
529
|
+
# ── Sync to S3 ───────────────────────────────────────────────────────────────
|
|
530
|
+
printf '%s\n' \
  "" \
  "☁️ Syncing model to S3: ${MODEL_S3_URI}" \
  " This may take a while for large models..." \
  ""

# Upload the snapshot, leaving out lock files and .gitattributes.
aws s3 sync "${HF_CACHE_DIR}" "${MODEL_S3_URI}" \
  --region "${AWS_REGION}" \
  --no-progress \
  --exclude "*.lock" \
  --exclude ".gitattributes" || {
  printf '%s\n' \
    "❌ Failed to sync model to S3" \
    "" \
    "Possible causes:" \
    " • Missing S3 write permissions (s3:PutObject)" \
    " • Bucket does not exist (run 'ml-container-creator bootstrap')" \
    " • Network connectivity issues"
  exit 4
}

printf '%s\n' \
  "" \
  "✅ Model staged successfully!" \
  "" \
  " S3 URI: ${MODEL_S3_URI}" \
  ""

# Either rewrite MODEL_NAME in do/config (--update-config) or tell the user
# how to do it by hand.
case "${UPDATE_CONFIG}" in
  true)
    CONFIG_FILE="${SCRIPT_DIR}/config"
    sed -i.bak "s|^export MODEL_NAME=.*|export MODEL_NAME=\"${MODEL_S3_URI}\"|" "${CONFIG_FILE}"
    rm -f "${CONFIG_FILE}.bak"
    printf '%s\n' \
      " ✅ Updated MODEL_NAME in do/config → ${MODEL_S3_URI}" \
      "" \
      " Re-deploy with S3-backed model: ./do/deploy"
    ;;
  *)
    printf '%s\n' \
      " To use this staged model, update do/config:" \
      " export MODEL_NAME=\"${MODEL_S3_URI}\"" \
      "" \
      " Or re-run with --update-config to do it automatically:" \
      " ./do/stage --update-config" \
      "" \
      " Then re-deploy: ./do/deploy"
    ;;
esac
|
package/templates/do/submit
CHANGED
|
@@ -9,6 +9,16 @@ set -o pipefail
|
|
|
9
9
|
# Source configuration
|
|
10
10
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
11
11
|
source "${SCRIPT_DIR}/config"
|
|
12
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
|
+
|
|
14
|
+
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
+
# Disable unbound-variable checking for associative array access (bash 3.2 compat)
|
|
16
|
+
set +u
|
|
17
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
+
set -u
|
|
19
|
+
|
|
20
|
+
# ── Derived variables (env var > computed default) ────────────────────────────
|
|
21
|
+
CODEBUILD_PROJECT_NAME="${CODEBUILD_PROJECT_NAME:-${PROJECT_NAME}-build-$(date +%Y%m%d)}"
|
|
12
22
|
|
|
13
23
|
echo "🚀 Submitting CodeBuild job for ${PROJECT_NAME}"
|
|
14
24
|
echo " Deployment config: ${DEPLOYMENT_CONFIG}"
|
package/templates/do/test
CHANGED
|
@@ -9,6 +9,7 @@ set -o pipefail
|
|
|
9
9
|
# Source configuration
|
|
10
10
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
11
11
|
source "${SCRIPT_DIR}/config"
|
|
12
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
12
13
|
|
|
13
14
|
<% if (deploymentTarget === 'realtime-inference') { %>
|
|
14
15
|
# ============================================================
|
package/templates/do/tune
CHANGED
|
@@ -13,6 +13,13 @@ set -o pipefail
|
|
|
13
13
|
# ── Source project configuration ──────────────────────────────────────────────
|
|
14
14
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
15
15
|
source "${SCRIPT_DIR}/config"
|
|
16
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
17
|
+
|
|
18
|
+
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
19
|
+
# Disable unbound-variable checking for associative array access (bash 3.2 compat)
|
|
20
|
+
set +u
|
|
21
|
+
TUNE_S3_BUCKET="${TUNE_S3_BUCKET:-mlcc-tune-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
22
|
+
set -u
|
|
16
23
|
|
|
17
24
|
# ── Constants ─────────────────────────────────────────────────────────────────
|
|
18
25
|
CATALOG_FILE="${SCRIPT_DIR}/.tune_catalog.json"
|