@aws/ml-container-creator 0.13.3 → 0.13.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -5
- package/infra/ci-harness/package-lock.json +1 -5
- package/package.json +5 -3
- package/pyproject.toml +21 -0
- package/requirements.txt +19 -0
- package/servers/instance-sizer/lib/model-resolver.js +127 -185
- package/servers/instance-sizer/lib/vram-estimator.js +86 -0
- package/servers/lib/catalogs/instances.json +0 -27
- package/src/app.js +2 -0
- package/src/lib/bootstrap-command-handler.js +35 -25
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/prompt-runner.js +14 -31
- package/templates/IAM_PERMISSIONS.md +64 -13
- package/templates/do/.adapter_helper.py +451 -0
- package/templates/do/.benchmark_writer.py +13 -0
- package/templates/do/.stage_helper.py +419 -0
- package/templates/do/.tune_helper.py +218 -67
- package/templates/do/README.md +50 -604
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +109 -4
- package/templates/do/benchmark +150 -12
- package/templates/do/build +2 -5
- package/templates/do/clean.d/async-inference.ejs +2 -5
- package/templates/do/clean.d/batch-transform.ejs +2 -5
- package/templates/do/clean.d/hyperpod-eks.ejs +2 -5
- package/templates/do/clean.d/managed-inference.ejs +2 -5
- package/templates/do/config +4 -0
- package/templates/do/deploy.d/async-inference.ejs +6 -9
- package/templates/do/deploy.d/batch-transform.ejs +4 -7
- package/templates/do/deploy.d/hyperpod-eks.ejs +1 -4
- package/templates/do/deploy.d/managed-inference.ejs +15 -6
- package/templates/do/lib/profile.sh +24 -15
- package/templates/do/push +2 -5
- package/templates/do/register +2 -5
- package/templates/do/stage +114 -292
- package/templates/do/submit +1 -4
- package/templates/do/tune +64 -10
- package/templates/MIGRATION.md +0 -488
- package/templates/TEMPLATE_SYSTEM.md +0 -243
package/templates/do/adapter
CHANGED
|
@@ -21,10 +21,7 @@ source "${SCRIPT_DIR}/config"
|
|
|
21
21
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
22
22
|
|
|
23
23
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
24
|
-
|
|
25
|
-
set +u
|
|
26
|
-
ADAPTER_S3_BUCKET="${ADAPTER_S3_BUCKET:-mlcc-adapters-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
27
|
-
set -u
|
|
24
|
+
ADAPTER_S3_BUCKET="${ADAPTER_S3_BUCKET:-mlcc-adapters-${_PROFILE_accountId:-unknown}-${_PROFILE_awsRegion:-us-east-1}}"
|
|
28
25
|
|
|
29
26
|
source "${SCRIPT_DIR}/lib/wait.sh"
|
|
30
27
|
|
|
@@ -46,12 +43,16 @@ _usage() {
|
|
|
46
43
|
echo ""
|
|
47
44
|
echo "Options:"
|
|
48
45
|
echo " --help, -h Show this help message"
|
|
46
|
+
echo " --local Use local aws s3 cp instead of Processing Job (--from-tune)"
|
|
47
|
+
echo " --no-wait Submit Processing Job and return immediately (--from-tune)"
|
|
49
48
|
echo ""
|
|
50
49
|
echo "Examples:"
|
|
51
50
|
echo " ./do/adapter add ectsum --weights s3://my-bucket/adapters/ectsum/adapter.tar.gz"
|
|
52
51
|
echo " ./do/adapter add ectsum --from-hub predibase/llama-3.1-8b-ectsum"
|
|
53
52
|
echo " ./do/adapter add tuned-sft --from-tune"
|
|
54
53
|
echo " ./do/adapter add tuned-sft --from-tune sft"
|
|
54
|
+
echo " ./do/adapter add tuned-sft --from-tune --local"
|
|
55
|
+
echo " ./do/adapter add tuned-sft --from-tune --no-wait"
|
|
55
56
|
echo " ./do/adapter list"
|
|
56
57
|
echo " ./do/adapter remove ectsum"
|
|
57
58
|
echo " ./do/adapter update ectsum --weights s3://my-bucket/adapters/ectsum-v2/adapter.tar.gz"
|
|
@@ -370,6 +371,8 @@ _adapter_add() {
|
|
|
370
371
|
local from_hub=""
|
|
371
372
|
local from_tune=""
|
|
372
373
|
local from_tune_technique=""
|
|
374
|
+
local use_local=""
|
|
375
|
+
local no_wait=""
|
|
373
376
|
|
|
374
377
|
# Parse add arguments
|
|
375
378
|
shift # remove 'add' from args
|
|
@@ -403,6 +406,14 @@ _adapter_add() {
|
|
|
403
406
|
shift
|
|
404
407
|
fi
|
|
405
408
|
;;
|
|
409
|
+
--local)
|
|
410
|
+
use_local="true"
|
|
411
|
+
shift
|
|
412
|
+
;;
|
|
413
|
+
--no-wait)
|
|
414
|
+
no_wait="true"
|
|
415
|
+
shift
|
|
416
|
+
;;
|
|
406
417
|
--help|-h)
|
|
407
418
|
echo "Usage: ./do/adapter add <name> --weights <s3-uri>"
|
|
408
419
|
echo " ./do/adapter add <name> --from-hub <hf-repo-id>"
|
|
@@ -417,6 +428,8 @@ _adapter_add() {
|
|
|
417
428
|
echo " --from-tune [technique] Use adapter output from do/tune"
|
|
418
429
|
echo " Without technique: uses latest tune output"
|
|
419
430
|
echo " With technique (e.g., sft, dpo): uses technique-specific output"
|
|
431
|
+
echo " --local Use local aws s3 cp instead of Processing Job (--from-tune only)"
|
|
432
|
+
echo " --no-wait Submit Processing Job and return immediately (--from-tune only)"
|
|
420
433
|
echo ""
|
|
421
434
|
echo "Note: --weights, --from-hub, and --from-tune are mutually exclusive."
|
|
422
435
|
echo ""
|
|
@@ -425,6 +438,8 @@ _adapter_add() {
|
|
|
425
438
|
echo " ./do/adapter add ectsum --from-hub predibase/llama-3.1-8b-ectsum"
|
|
426
439
|
echo " ./do/adapter add tuned-sft --from-tune"
|
|
427
440
|
echo " ./do/adapter add tuned-sft --from-tune sft"
|
|
441
|
+
echo " ./do/adapter add tuned-sft --from-tune --local"
|
|
442
|
+
echo " ./do/adapter add tuned-sft --from-tune --no-wait"
|
|
428
443
|
exit 0
|
|
429
444
|
;;
|
|
430
445
|
-*)
|
|
@@ -532,6 +547,95 @@ _adapter_add() {
|
|
|
532
547
|
fi
|
|
533
548
|
echo ""
|
|
534
549
|
|
|
550
|
+
# ── Route to Processing Job helper (default) or local path ────────
|
|
551
|
+
if [ -z "${use_local}" ]; then
|
|
552
|
+
# Default: use Processing Job via .adapter_helper.py
|
|
553
|
+
echo "🚀 Submitting Processing Job to stage adapter..."
|
|
554
|
+
echo ""
|
|
555
|
+
|
|
556
|
+
# Resolve execution role
|
|
557
|
+
local exec_role="${EXECUTION_ROLE_ARN:-}"
|
|
558
|
+
if [ -z "${exec_role}" ]; then
|
|
559
|
+
exec_role="${ROLE_ARN:-}"
|
|
560
|
+
fi
|
|
561
|
+
if [ -z "${exec_role}" ]; then
|
|
562
|
+
exec_role="${SAGEMAKER_ROLE_ARN:-}"
|
|
563
|
+
fi
|
|
564
|
+
if [ -z "${exec_role}" ]; then
|
|
565
|
+
echo "❌ No execution role found."
|
|
566
|
+
echo ""
|
|
567
|
+
echo " Run 'ml-container-creator bootstrap' to set up your profile,"
|
|
568
|
+
echo " or set ROLE_ARN / EXECUTION_ROLE_ARN in do/config."
|
|
569
|
+
exit 1
|
|
570
|
+
fi
|
|
571
|
+
|
|
572
|
+
# Resolve S3 bucket
|
|
573
|
+
local adapter_bucket="${ADAPTER_S3_BUCKET:-}"
|
|
574
|
+
if [ -z "${adapter_bucket}" ]; then
|
|
575
|
+
local account_id
|
|
576
|
+
account_id=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "")
|
|
577
|
+
if [ -z "${account_id}" ]; then
|
|
578
|
+
echo "❌ Could not determine AWS account ID."
|
|
579
|
+
echo " Ensure AWS credentials are configured."
|
|
580
|
+
exit 4
|
|
581
|
+
fi
|
|
582
|
+
adapter_bucket="mlcc-adapters-${account_id}-${AWS_REGION}"
|
|
583
|
+
fi
|
|
584
|
+
|
|
585
|
+
# Build helper args
|
|
586
|
+
local helper_args=(
|
|
587
|
+
"stage-from-tune"
|
|
588
|
+
"--training-output-s3-uri" "${weights_uri}"
|
|
589
|
+
"--adapter-name" "${adapter_name}"
|
|
590
|
+
"--bucket" "${adapter_bucket}"
|
|
591
|
+
"--project" "${PROJECT_NAME}"
|
|
592
|
+
"--role-arn" "${exec_role}"
|
|
593
|
+
"--region" "${AWS_REGION}"
|
|
594
|
+
)
|
|
595
|
+
if [ -n "${no_wait}" ]; then
|
|
596
|
+
helper_args+=("--no-wait")
|
|
597
|
+
fi
|
|
598
|
+
|
|
599
|
+
# Invoke the Python helper
|
|
600
|
+
local helper_output
|
|
601
|
+
# Capture only stdout (the helper's JSON result) for parsing below. stderr is
# deliberately NOT redirected so the helper's diagnostics reach the terminal;
# the failure branch tells the user to "See error above".
if ! helper_output=$(python3 "${SCRIPT_DIR}/.adapter_helper.py" "${helper_args[@]}"); then
|
|
602
|
+
echo "❌ Processing Job failed. See error above."
|
|
603
|
+
exit 1
|
|
604
|
+
fi
|
|
605
|
+
|
|
606
|
+
# Parse JSON output from helper (extract only the JSON line, skip any log noise)
|
|
607
|
+
local json_line
|
|
608
|
+
json_line=$(echo "${helper_output}" | grep -E '^\{' | tail -1)
|
|
609
|
+
local job_status
|
|
610
|
+
job_status=$(echo "${json_line}" | python3 -c "import sys,json; print(json.loads(sys.stdin.read()).get('status',''))" 2>/dev/null || echo "")
|
|
611
|
+
|
|
612
|
+
if [ "${job_status}" = "Completed" ] || [ "${job_status}" = "InProgress" ]; then
|
|
613
|
+
echo "${json_line}"
|
|
614
|
+
# Extract adapter_s3_uri for downstream use
|
|
615
|
+
local staged_adapter_uri
|
|
616
|
+
staged_adapter_uri=$(echo "${json_line}" | python3 -c "import sys,json; print(json.loads(sys.stdin.read()).get('adapter_s3_uri',''))" 2>/dev/null || echo "")
|
|
617
|
+
|
|
618
|
+
if [ -n "${no_wait}" ]; then
|
|
619
|
+
echo ""
|
|
620
|
+
echo "✅ Processing Job submitted. Check status with:"
|
|
621
|
+
echo " python3 ${SCRIPT_DIR}/.adapter_helper.py status --job-name <job-name>"
|
|
622
|
+
echo ""
|
|
623
|
+
echo " Once complete, re-run without --no-wait to register the adapter."
|
|
624
|
+
exit 0
|
|
625
|
+
fi
|
|
626
|
+
|
|
627
|
+
# Update weights_uri to point to the staged adapter
|
|
628
|
+
weights_uri="${staged_adapter_uri}"
|
|
629
|
+
echo ""
|
|
630
|
+
echo "✅ Adapter staged via Processing Job: ${weights_uri}"
|
|
631
|
+
else
|
|
632
|
+
echo "❌ Unexpected status from Processing Job helper: ${job_status}"
|
|
633
|
+
echo " Output: ${helper_output}"
|
|
634
|
+
exit 1
|
|
635
|
+
fi
|
|
636
|
+
else
|
|
637
|
+
# ── --local flag: Package tune artifacts locally (original behavior) ──
|
|
638
|
+
|
|
535
639
|
# ── Package tune artifacts as tar.gz if needed ────────────────────
|
|
536
640
|
# Tune output is an S3 path that may be:
|
|
537
641
|
# 1. Already a tar.gz file (s3://...adapter.tar.gz) → use directly
|
|
@@ -680,6 +784,7 @@ _adapter_add() {
|
|
|
680
784
|
weights_uri="${s3_tar_path}"
|
|
681
785
|
fi
|
|
682
786
|
echo ""
|
|
787
|
+
fi # end --local else branch
|
|
683
788
|
fi
|
|
684
789
|
|
|
685
790
|
# ── Validate HF repo ID format (if --from-hub) ───────────────────────
|
package/templates/do/benchmark
CHANGED
|
@@ -12,10 +12,12 @@ set -o pipefail
|
|
|
12
12
|
# ── Source project configuration ──────────────────────────────────────────────
|
|
13
13
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
14
14
|
source "${SCRIPT_DIR}/config"
|
|
15
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
15
16
|
|
|
16
17
|
# ── Parse flags ───────────────────────────────────────────────────────────────
|
|
17
18
|
CLEAN_AFTER=false
|
|
18
19
|
FORCE=false
|
|
20
|
+
ARG_STATUS=false
|
|
19
21
|
IC_ARG=""
|
|
20
22
|
ADAPTER_ARG=""
|
|
21
23
|
ARG_NO_STALE_WARNING=false
|
|
@@ -24,30 +26,33 @@ while [ $# -gt 0 ]; do
|
|
|
24
26
|
case "$1" in
|
|
25
27
|
--clean) CLEAN_AFTER=true; shift ;;
|
|
26
28
|
--force) FORCE=true; shift ;;
|
|
29
|
+
--status) ARG_STATUS=true; shift ;;
|
|
27
30
|
--no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
|
|
28
31
|
--workload) shift; ARG_WORKLOAD="${1:-}"; shift ;;
|
|
29
32
|
--ic) shift; IC_ARG="${1:-}"; shift ;;
|
|
30
33
|
--adapter) shift; ADAPTER_ARG="${1:-}"; shift ;;
|
|
31
34
|
--help|-h)
|
|
32
|
-
echo "Usage: ./do/benchmark [--workload <name>] [--ic <name>] [--adapter <name>] [--force] [--clean]
|
|
35
|
+
echo "Usage: ./do/benchmark [--workload <name>] [--status] [--ic <name>] [--adapter <name>] [--force] [--clean]"
|
|
33
36
|
echo ""
|
|
34
37
|
echo "Run SageMaker AI Benchmark against the deployed endpoint."
|
|
35
38
|
echo ""
|
|
36
39
|
echo "Options:"
|
|
40
|
+
echo " --status Check job status; if completed, download results + write to Athena"
|
|
37
41
|
echo " --ic <name> Benchmark a specific inference component"
|
|
38
42
|
echo " --adapter <name> Benchmark a specific LoRA adapter IC"
|
|
39
43
|
echo " --force Create a new benchmark job even if one is already running"
|
|
40
44
|
echo " --clean Delete workload config and benchmark job after displaying results"
|
|
41
45
|
echo " --no-stale-warning Suppress schema registry staleness warning"
|
|
46
|
+
echo " --no-stale-warning Suppress schema registry staleness warning"
|
|
42
47
|
echo ""
|
|
43
48
|
echo "IC resolution:"
|
|
44
49
|
echo " --adapter <name> Use ADAPTER_IC_NAME from do/adapters/<name>.conf"
|
|
45
50
|
echo " --ic <name> Use IC_DEPLOYED_NAME from do/ic/<name>.conf"
|
|
46
51
|
echo " (no flag) Use first IC in do/ic/ alphabetically, or legacy config"
|
|
47
52
|
echo ""
|
|
48
|
-
echo "
|
|
49
|
-
echo "
|
|
50
|
-
echo "
|
|
53
|
+
echo "Status:"
|
|
54
|
+
echo " After interrupting a running benchmark, use --status to check completion"
|
|
55
|
+
echo " and trigger results download + Athena write."
|
|
51
56
|
echo ""
|
|
52
57
|
echo "Prerequisites:"
|
|
53
58
|
echo " • Endpoint must be deployed and InService (run ./do/deploy first)"
|
|
@@ -59,6 +64,112 @@ while [ $# -gt 0 ]; do
|
|
|
59
64
|
done
|
|
60
65
|
|
|
61
66
|
|
|
67
|
+
# ── Handle --status (early exit) ─────────────────────────────────────────────
|
|
68
|
+
# Query the tracked benchmark job, display status, and if completed:
|
|
69
|
+
# download results, display metrics, and write to Athena (if not already done).
|
|
70
|
+
if [ "${ARG_STATUS}" = true ]; then
|
|
71
|
+
JOB_NAME="${BENCHMARK_JOB_NAME:-}"
|
|
72
|
+
if [ -z "${JOB_NAME}" ]; then
|
|
73
|
+
echo "❌ No benchmark job tracked"
|
|
74
|
+
echo " Run ./do/benchmark --workload <name> to start one."
|
|
75
|
+
exit 1
|
|
76
|
+
fi
|
|
77
|
+
|
|
78
|
+
echo "📊 Benchmark Job Status"
|
|
79
|
+
echo ""
|
|
80
|
+
echo " Job: ${JOB_NAME}"
|
|
81
|
+
|
|
82
|
+
STATUS=$(aws sagemaker describe-ai-benchmark-job \
|
|
83
|
+
--ai-benchmark-job-name "${JOB_NAME}" \
|
|
84
|
+
--region "${AWS_REGION}" \
|
|
85
|
+
--query 'AIBenchmarkJobStatus' \
|
|
86
|
+
--output text 2>/dev/null) || STATUS=""
|
|
87
|
+
|
|
88
|
+
if [ -z "${STATUS}" ]; then
|
|
89
|
+
echo " Status: Unknown (job not found or credentials expired)"
|
|
90
|
+
exit 1
|
|
91
|
+
fi
|
|
92
|
+
|
|
93
|
+
echo " Status: ${STATUS}"
|
|
94
|
+
|
|
95
|
+
case "${STATUS}" in
|
|
96
|
+
Completed)
|
|
97
|
+
# Check if results already exist locally
|
|
98
|
+
PROJECT_ROOT="${SCRIPT_DIR}/.."
|
|
99
|
+
LOCAL_RESULTS_DIR="${PROJECT_ROOT}/benchmarks/${JOB_NAME}"
|
|
100
|
+
RESULTS_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
|
|
101
|
+
|
|
102
|
+
if [ -z "${RESULTS_JSONL}" ]; then
|
|
103
|
+
echo ""
|
|
104
|
+
echo " 📥 Downloading results..."
|
|
105
|
+
RESULTS_S3_PATH=$(aws sagemaker describe-ai-benchmark-job \
|
|
106
|
+
--ai-benchmark-job-name "${JOB_NAME}" \
|
|
107
|
+
--region "${AWS_REGION}" \
|
|
108
|
+
--query 'OutputConfig.S3OutputLocation' \
|
|
109
|
+
--output text 2>/dev/null)
|
|
110
|
+
|
|
111
|
+
if [ -n "${RESULTS_S3_PATH}" ]; then
|
|
112
|
+
mkdir -p "${LOCAL_RESULTS_DIR}/output"
|
|
113
|
+
aws s3 sync "${RESULTS_S3_PATH}" "${LOCAL_RESULTS_DIR}/output/" \
|
|
114
|
+
--region "${AWS_REGION}" --quiet
|
|
115
|
+
# Untar if output.tar.gz exists
|
|
116
|
+
tar_file=""  # plain assignment: this --status handler runs at script top level, where 'local' is an error
|
|
117
|
+
tar_file=$(find "${LOCAL_RESULTS_DIR}" -name "output.tar.gz" -type f 2>/dev/null | head -1)
|
|
118
|
+
if [ -n "${tar_file}" ]; then
|
|
119
|
+
tar -xzf "${tar_file}" --strip-components=1 -C "${LOCAL_RESULTS_DIR}/output/" 2>/dev/null || true
|
|
120
|
+
fi
|
|
121
|
+
# Re-search after extraction
|
|
122
|
+
RESULTS_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
|
|
123
|
+
echo " ✅ Results downloaded to: benchmarks/${JOB_NAME}/"
|
|
124
|
+
fi
|
|
125
|
+
else
|
|
126
|
+
echo " ✅ Results already available locally"
|
|
127
|
+
fi
|
|
128
|
+
|
|
129
|
+
# Write to Athena if CI bucket is configured and results exist
|
|
130
|
+
if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ]; then
|
|
131
|
+
_WRITER_INPUT=""
|
|
132
|
+
if [ -n "${RESULTS_JSONL}" ] && [ -f "${RESULTS_JSONL}" ]; then
|
|
133
|
+
_WRITER_INPUT="${RESULTS_JSONL}"
|
|
134
|
+
else
|
|
135
|
+
_WRITER_INPUT=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export_aiperf.json" -type f 2>/dev/null | head -1)
|
|
136
|
+
fi
|
|
137
|
+
|
|
138
|
+
if [ -n "${_WRITER_INPUT}" ]; then
|
|
139
|
+
echo ""
|
|
140
|
+
echo " 📊 Writing to Athena..."
|
|
141
|
+
if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
|
|
142
|
+
--results-file "${_WRITER_INPUT}" \
|
|
143
|
+
--config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
|
|
144
|
+
--project-name "${PROJECT_NAME}" \
|
|
145
|
+
--workload "${BENCHMARK_WORKLOAD:-manual}" \
|
|
146
|
+
--concurrency "${BENCHMARK_CONCURRENCY:-2}" \
|
|
147
|
+
--bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
|
|
148
|
+
--region "${AWS_REGION:-${REGION}}"; then
|
|
149
|
+
echo " ✅ Results persisted to Athena"
|
|
150
|
+
else
|
|
151
|
+
echo " ⚠️ Athena write failed (non-fatal)"
|
|
152
|
+
fi
|
|
153
|
+
fi
|
|
154
|
+
fi
|
|
155
|
+
;;
|
|
156
|
+
InProgress|Starting|Pending)
|
|
157
|
+
echo ""
|
|
158
|
+
echo " Job is still running. Check again with: ./do/benchmark --status"
|
|
159
|
+
;;
|
|
160
|
+
Failed)
|
|
161
|
+
FAILURE_REASON=$(aws sagemaker describe-ai-benchmark-job \
|
|
162
|
+
--ai-benchmark-job-name "${JOB_NAME}" \
|
|
163
|
+
--region "${AWS_REGION}" \
|
|
164
|
+
--query 'FailureReason' \
|
|
165
|
+
--output text 2>/dev/null) || FAILURE_REASON="unknown"
|
|
166
|
+
echo " Reason: ${FAILURE_REASON}"
|
|
167
|
+
;;
|
|
168
|
+
esac
|
|
169
|
+
exit 0
|
|
170
|
+
fi
|
|
171
|
+
|
|
172
|
+
|
|
62
173
|
# ── Require --workload flag ───────────────────────────────────────────────────
|
|
63
174
|
if [ -z "${ARG_WORKLOAD}" ]; then
|
|
64
175
|
echo "❌ --workload <name> is required"
|
|
@@ -172,8 +283,11 @@ print(f's3://{bucket}/${PROJECT_NAME}/')
|
|
|
172
283
|
|
|
173
284
|
CI_BENCHMARK_RESULTS_BUCKET=$(echo "${_PROFILE_JSON}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ciBenchmarkResultsBucket', ''))" 2>/dev/null) || CI_BENCHMARK_RESULTS_BUCKET=""
|
|
174
285
|
|
|
175
|
-
|
|
176
|
-
|
|
286
|
+
ROLE_ARN=$(echo "${_PROFILE_JSON}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('roleArn', ''))" 2>/dev/null) || ROLE_ARN=""
|
|
287
|
+
|
|
288
|
+
# Derive job names at runtime (unique per invocation).
|
|
289
|
+
# Preserve BENCHMARK_JOB_NAME if already set (from do/config or env) for resume logic.
|
|
290
|
+
BENCHMARK_JOB_NAME="${BENCHMARK_JOB_NAME:-${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)}"
|
|
177
291
|
BENCHMARK_WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
|
|
178
292
|
|
|
179
293
|
# Ensure benchmark params have defaults (in case workload catalog wasn't found)
|
|
@@ -228,7 +342,15 @@ if [ -n "${BENCHMARK_CONCURRENCY_LEVELS:-}" ] && [ -z "${_BENCHMARK_SINGLE_LEVEL
|
|
|
228
342
|
if [ -n "${IC_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --ic ${IC_ARG}"; fi
|
|
229
343
|
if [ -n "${ADAPTER_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --adapter ${ADAPTER_ARG}"; fi
|
|
230
344
|
|
|
231
|
-
|
|
345
|
+
_CHILD_EXIT=0
|
|
346
|
+
"${BASH_SOURCE[0]}" ${_REINVOKE_ARGS} || _CHILD_EXIT=$?
|
|
347
|
+
|
|
348
|
+
if [ ${_CHILD_EXIT} -eq 130 ]; then
|
|
349
|
+
# Child was interrupted (Ctrl+C) — propagate cleanly
|
|
350
|
+
exit 130
|
|
351
|
+
fi
|
|
352
|
+
|
|
353
|
+
if [ ${_CHILD_EXIT} -eq 0 ]; then
|
|
232
354
|
# Copy results to aggregation directory — find the child's results
|
|
233
355
|
# Try the marker file first (set by child), then fall back to ls -td
|
|
234
356
|
_LATEST_JOB_DIR=""
|
|
@@ -747,7 +869,10 @@ echo ""
|
|
|
747
869
|
echo "⚙️ Step 1: Creating AI Workload Config: ${WORKLOAD_CONFIG_NAME}"
|
|
748
870
|
|
|
749
871
|
# Build parameters block
|
|
750
|
-
|
|
872
|
+
# Use HF_MODEL_ID for tokenizer (the original HuggingFace repo ID, e.g. "Qwen/Qwen3-0.6B").
|
|
873
|
+
# MODEL_NAME may have been rewritten to an S3 URI by do/stage, which AIPerf can't use as a tokenizer source.
|
|
874
|
+
BENCHMARK_TOKENIZER="${HF_MODEL_ID:-${MODEL_NAME}}"
|
|
875
|
+
PARAMS_JSON="{\"prompt_input_tokens_mean\":${BENCHMARK_INPUT_TOKENS_MEAN},\"output_tokens_mean\":${BENCHMARK_OUTPUT_TOKENS_MEAN},\"concurrency\":${BENCHMARK_CONCURRENCY},\"streaming\":${BENCHMARK_STREAMING},\"tokenizer\":\"${BENCHMARK_TOKENIZER}\""
|
|
751
876
|
|
|
752
877
|
# Add optional request_count if specified
|
|
753
878
|
if [ -n "${BENCHMARK_REQUEST_COUNT:-}" ]; then
|
|
@@ -856,6 +981,18 @@ fi # end of RESUME_EXISTING=false block
|
|
|
856
981
|
# Skip polling if we already know the job completed (resumed a finished job)
|
|
857
982
|
if [ "${JOB_STATUS:-}" != "Completed" ] && [ "${JOB_STATUS:-}" != "Failed" ] && [ "${JOB_STATUS:-}" != "Stopped" ]; then
|
|
858
983
|
|
|
984
|
+
# Handle Ctrl+C during polling — exit cleanly without stopping the remote job.
|
|
985
|
+
_handle_benchmark_interrupt() {
|
|
986
|
+
echo ""
|
|
987
|
+
echo ""
|
|
988
|
+
echo "⚠️ Interrupted — job continues running in background"
|
|
989
|
+
echo " Job: ${BENCHMARK_JOB_NAME}"
|
|
990
|
+
echo ""
|
|
991
|
+
echo " Check status: aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"
|
|
992
|
+
exit 130
|
|
993
|
+
}
|
|
994
|
+
trap '_handle_benchmark_interrupt' INT
|
|
995
|
+
|
|
859
996
|
echo "⏳ Step 3: Waiting for benchmark to complete..."
|
|
860
997
|
echo " Polling every ${POLL_INTERVAL}s (max ${MAX_POLL_ATTEMPTS} attempts = 30 min)"
|
|
861
998
|
echo ""
|
|
@@ -897,13 +1034,14 @@ while [ ${POLL_COUNT} -lt ${MAX_POLL_ATTEMPTS} ]; do
|
|
|
897
1034
|
esac
|
|
898
1035
|
done
|
|
899
1036
|
|
|
1037
|
+
trap - INT
|
|
1038
|
+
|
|
900
1039
|
# Check for timeout
|
|
901
1040
|
if [ ${POLL_COUNT} -ge ${MAX_POLL_ATTEMPTS} ]; then
|
|
902
1041
|
echo ""
|
|
903
1042
|
echo "⚠️ Benchmark timed out after 30 minutes (status: ${JOB_STATUS})"
|
|
904
|
-
echo " The job may still be running.
|
|
905
|
-
echo "
|
|
906
|
-
echo " aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"
|
|
1043
|
+
echo " The job may still be running."
|
|
1044
|
+
echo " Check status: ./do/benchmark --status"
|
|
907
1045
|
exit 1
|
|
908
1046
|
fi
|
|
909
1047
|
|
|
@@ -949,7 +1087,7 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
|
|
|
949
1087
|
# Extract any tar.gz archives (benchmark service packages results as output.tar.gz)
|
|
950
1088
|
for ARCHIVE in $(find "${LOCAL_RESULTS_DIR}" -name "*.tar.gz" -type f 2>/dev/null); do
|
|
951
1089
|
ARCHIVE_DIR=$(dirname "${ARCHIVE}")
|
|
952
|
-
tar -xzf "${ARCHIVE}" -C "${ARCHIVE_DIR}" 2>/dev/null || true
|
|
1090
|
+
tar -xzf "${ARCHIVE}" --strip-components=1 -C "${ARCHIVE_DIR}" 2>/dev/null || true
|
|
953
1091
|
done
|
|
954
1092
|
|
|
955
1093
|
# Look for specific result files (priority: JSONL > aiperf JSON)
|
package/templates/do/build
CHANGED
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
19
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
16
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
20
17
|
|
|
21
18
|
echo "🚀 Building Docker image for ${PROJECT_NAME}"
|
|
22
19
|
echo " Deployment config: ${DEPLOYMENT_CONFIG}"
|
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
19
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
16
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
20
17
|
|
|
21
18
|
# Parse arguments
|
|
22
19
|
CLEANUP_TARGET=""
|
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
19
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
16
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
20
17
|
|
|
21
18
|
# Parse arguments
|
|
22
19
|
CLEANUP_TARGET=""
|
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
19
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
16
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
20
17
|
|
|
21
18
|
# Parse arguments
|
|
22
19
|
CLEANUP_TARGET=""
|
|
@@ -12,11 +12,8 @@ source "${SCRIPT_DIR}/config"
|
|
|
12
12
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
13
13
|
|
|
14
14
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
18
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
19
|
-
set -u
|
|
15
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
16
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
20
17
|
|
|
21
18
|
# Parse arguments
|
|
22
19
|
CLEANUP_TARGET=""
|
package/templates/do/config
CHANGED
|
@@ -214,6 +214,9 @@ export <%= key %>=${<%= key %>:-<%= value %>}
|
|
|
214
214
|
# Framework-specific configuration
|
|
215
215
|
<% if (framework === 'transformers') { %>
|
|
216
216
|
export MODEL_NAME="<%= modelName %>"
|
|
217
|
+
# HuggingFace Model ID — preserved even after do/stage rewrites MODEL_NAME to S3.
|
|
218
|
+
# Used by do/benchmark (tokenizer), do/tune (model catalog), and do/test (chat template).
|
|
219
|
+
export HF_MODEL_ID="<%= modelName %>"
|
|
217
220
|
# Secrets Manager integration: when an ARN is configured, do-scripts resolve the
|
|
218
221
|
# secret at the appropriate stage (build-time or runtime). When a plaintext value
|
|
219
222
|
# is configured, it is exported directly. The _ARN suffix signals resolution is needed.
|
|
@@ -253,6 +256,7 @@ export TUNE_MODEL_ID="<%= tuneModelId %>"
|
|
|
253
256
|
|
|
254
257
|
<% if (framework === 'diffusors') { %>
|
|
255
258
|
export MODEL_NAME="<%= modelName %>"
|
|
259
|
+
export HF_MODEL_ID="<%= modelName %>"
|
|
256
260
|
# Secrets Manager integration: when an ARN is configured, do-scripts resolve the
|
|
257
261
|
# secret at the appropriate stage (build-time or runtime). When a plaintext value
|
|
258
262
|
# is configured, it is exported directly. The _ARN suffix signals resolution is needed.
|
|
@@ -41,18 +41,15 @@ source "${SCRIPT_DIR}/config"
|
|
|
41
41
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
42
42
|
|
|
43
43
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
48
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
44
|
+
ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
|
|
45
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
46
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
49
47
|
|
|
50
48
|
# Async-specific derived variables
|
|
51
|
-
_ASYNC_BUCKET="${
|
|
49
|
+
_ASYNC_BUCKET="${_PROFILE_asyncS3Bucket:-mlcc-async-${_PROFILE_accountId:-unknown}-${_PROFILE_awsRegion:-us-east-1}}"
|
|
52
50
|
ASYNC_S3_OUTPUT_PATH="${ASYNC_S3_OUTPUT_PATH:-s3://${_ASYNC_BUCKET}/${PROJECT_NAME}/output/}"
|
|
53
|
-
ASYNC_SNS_SUCCESS_TOPIC="${ASYNC_SNS_SUCCESS_TOPIC:-arn:aws:sns:${
|
|
54
|
-
ASYNC_SNS_ERROR_TOPIC="${ASYNC_SNS_ERROR_TOPIC:-arn:aws:sns:${
|
|
55
|
-
set -u
|
|
51
|
+
ASYNC_SNS_SUCCESS_TOPIC="${ASYNC_SNS_SUCCESS_TOPIC:-arn:aws:sns:${_PROFILE_awsRegion:-us-east-1}:${_PROFILE_accountId:-unknown}:ml-container-creator-${PROJECT_NAME}-async-success}"
|
|
52
|
+
ASYNC_SNS_ERROR_TOPIC="${ASYNC_SNS_ERROR_TOPIC:-arn:aws:sns:${_PROFILE_awsRegion:-us-east-1}:${_PROFILE_accountId:-unknown}:ml-container-creator-${PROJECT_NAME}-async-error}"
|
|
56
53
|
|
|
57
54
|
echo "🚀 Deploying to AWS"
|
|
58
55
|
echo " Project: ${PROJECT_NAME}"
|
|
@@ -41,17 +41,14 @@ source "${SCRIPT_DIR}/config"
|
|
|
41
41
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
42
42
|
|
|
43
43
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
48
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
44
|
+
ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
|
|
45
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
46
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
49
47
|
|
|
50
48
|
# Batch-specific derived variables
|
|
51
|
-
_BATCH_BUCKET="${
|
|
49
|
+
_BATCH_BUCKET="${_PROFILE_batchS3Bucket:-mlcc-batch-${_PROFILE_accountId:-unknown}-${_PROFILE_awsRegion:-us-east-1}}"
|
|
52
50
|
BATCH_INPUT_PATH="${BATCH_INPUT_PATH:-s3://${_BATCH_BUCKET}/${PROJECT_NAME}/input/}"
|
|
53
51
|
BATCH_OUTPUT_PATH="${BATCH_OUTPUT_PATH:-s3://${_BATCH_BUCKET}/${PROJECT_NAME}/output/}"
|
|
54
|
-
set -u
|
|
55
52
|
|
|
56
53
|
echo "🚀 Deploying to AWS"
|
|
57
54
|
echo " Project: ${PROJECT_NAME}"
|
|
@@ -41,10 +41,7 @@ source "${SCRIPT_DIR}/config"
|
|
|
41
41
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
42
42
|
|
|
43
43
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
44
|
-
|
|
45
|
-
set +u
|
|
46
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
47
|
-
set -u
|
|
44
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
48
45
|
|
|
49
46
|
echo "🚀 Deploying to AWS"
|
|
50
47
|
echo " Project: ${PROJECT_NAME}"
|
|
@@ -214,12 +214,9 @@ source "${SCRIPT_DIR}/config"
|
|
|
214
214
|
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
215
215
|
|
|
216
216
|
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE[ecrRepositoryName]:-ml-container-creator}}"
|
|
221
|
-
export AWS_REGION="${AWS_REGION:-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
222
|
-
set -u
|
|
217
|
+
ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
|
|
218
|
+
ECR_REPOSITORY_NAME="${ECR_REPOSITORY_NAME:-${_PROFILE_ecrRepositoryName:-ml-container-creator}}"
|
|
219
|
+
export AWS_REGION="${AWS_REGION:-${_PROFILE_awsRegion:-us-east-1}}"
|
|
223
220
|
|
|
224
221
|
echo "🚀 Deploying to AWS"
|
|
225
222
|
echo " Project: ${PROJECT_NAME}"
|
|
@@ -332,6 +329,18 @@ if [ -z "${ROLE_ARN:-}" ]; then
|
|
|
332
329
|
exit 3
|
|
333
330
|
fi
|
|
334
331
|
|
|
332
|
+
# Validate ROLE_ARN looks like an IAM role ARN
|
|
333
|
+
if ! echo "${ROLE_ARN}" | grep -qE '^arn:aws[a-z-]*:iam::[0-9]{12}:role/.+'; then
|
|
334
|
+
echo "❌ ROLE_ARN is not a valid IAM role ARN:"
|
|
335
|
+
echo " Got: ${ROLE_ARN}"
|
|
336
|
+
echo " Expected format: arn:aws:iam::123456789012:role/RoleName"
|
|
337
|
+
echo ""
|
|
338
|
+
echo " This may indicate a misconfigured bootstrap profile."
|
|
339
|
+
echo " Check ~/.ml-container-creator/config.json 'roleArn' field,"
|
|
340
|
+
echo " or set the correct value: export ROLE_ARN=arn:aws:iam::ACCOUNT:role/YOUR_ROLE"
|
|
341
|
+
exit 3
|
|
342
|
+
fi
|
|
343
|
+
|
|
335
344
|
echo " Using execution role: ${ROLE_ARN}"
|
|
336
345
|
|
|
337
346
|
# Validate --ic argument if specified (set by --ic <name> or --force-ic <name>)
|