@aws/ml-container-creator 0.13.4 → 0.13.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -5
- package/infra/ci-harness/package-lock.json +1 -5
- package/package.json +4 -2
- package/pyproject.toml +21 -0
- package/requirements.txt +19 -0
- package/src/app.js +2 -0
- package/src/lib/bootstrap-command-handler.js +33 -23
- package/templates/do/.adapter_helper.py +451 -0
- package/templates/do/.benchmark_writer.py +13 -0
- package/templates/do/.stage_helper.py +419 -0
- package/templates/do/.tune_helper.py +213 -65
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +108 -0
- package/templates/do/benchmark +150 -12
- package/templates/do/config +4 -0
- package/templates/do/lib/profile.sh +5 -0
- package/templates/do/stage +91 -272
- package/templates/do/tune +63 -6
package/templates/do/adapter
CHANGED
|
@@ -43,12 +43,16 @@ _usage() {
|
|
|
43
43
|
echo ""
|
|
44
44
|
echo "Options:"
|
|
45
45
|
echo " --help, -h Show this help message"
|
|
46
|
+
echo " --local Use local aws s3 cp instead of Processing Job (--from-tune)"
|
|
47
|
+
echo " --no-wait Submit Processing Job and return immediately (--from-tune)"
|
|
46
48
|
echo ""
|
|
47
49
|
echo "Examples:"
|
|
48
50
|
echo " ./do/adapter add ectsum --weights s3://my-bucket/adapters/ectsum/adapter.tar.gz"
|
|
49
51
|
echo " ./do/adapter add ectsum --from-hub predibase/llama-3.1-8b-ectsum"
|
|
50
52
|
echo " ./do/adapter add tuned-sft --from-tune"
|
|
51
53
|
echo " ./do/adapter add tuned-sft --from-tune sft"
|
|
54
|
+
echo " ./do/adapter add tuned-sft --from-tune --local"
|
|
55
|
+
echo " ./do/adapter add tuned-sft --from-tune --no-wait"
|
|
52
56
|
echo " ./do/adapter list"
|
|
53
57
|
echo " ./do/adapter remove ectsum"
|
|
54
58
|
echo " ./do/adapter update ectsum --weights s3://my-bucket/adapters/ectsum-v2/adapter.tar.gz"
|
|
@@ -367,6 +371,8 @@ _adapter_add() {
|
|
|
367
371
|
local from_hub=""
|
|
368
372
|
local from_tune=""
|
|
369
373
|
local from_tune_technique=""
|
|
374
|
+
local use_local=""
|
|
375
|
+
local no_wait=""
|
|
370
376
|
|
|
371
377
|
# Parse add arguments
|
|
372
378
|
shift # remove 'add' from args
|
|
@@ -400,6 +406,14 @@ _adapter_add() {
|
|
|
400
406
|
shift
|
|
401
407
|
fi
|
|
402
408
|
;;
|
|
409
|
+
--local)
|
|
410
|
+
use_local="true"
|
|
411
|
+
shift
|
|
412
|
+
;;
|
|
413
|
+
--no-wait)
|
|
414
|
+
no_wait="true"
|
|
415
|
+
shift
|
|
416
|
+
;;
|
|
403
417
|
--help|-h)
|
|
404
418
|
echo "Usage: ./do/adapter add <name> --weights <s3-uri>"
|
|
405
419
|
echo " ./do/adapter add <name> --from-hub <hf-repo-id>"
|
|
@@ -414,6 +428,8 @@ _adapter_add() {
|
|
|
414
428
|
echo " --from-tune [technique] Use adapter output from do/tune"
|
|
415
429
|
echo " Without technique: uses latest tune output"
|
|
416
430
|
echo " With technique (e.g., sft, dpo): uses technique-specific output"
|
|
431
|
+
echo " --local Use local aws s3 cp instead of Processing Job (--from-tune only)"
|
|
432
|
+
echo " --no-wait Submit Processing Job and return immediately (--from-tune only)"
|
|
417
433
|
echo ""
|
|
418
434
|
echo "Note: --weights, --from-hub, and --from-tune are mutually exclusive."
|
|
419
435
|
echo ""
|
|
@@ -422,6 +438,8 @@ _adapter_add() {
|
|
|
422
438
|
echo " ./do/adapter add ectsum --from-hub predibase/llama-3.1-8b-ectsum"
|
|
423
439
|
echo " ./do/adapter add tuned-sft --from-tune"
|
|
424
440
|
echo " ./do/adapter add tuned-sft --from-tune sft"
|
|
441
|
+
echo " ./do/adapter add tuned-sft --from-tune --local"
|
|
442
|
+
echo " ./do/adapter add tuned-sft --from-tune --no-wait"
|
|
425
443
|
exit 0
|
|
426
444
|
;;
|
|
427
445
|
-*)
|
|
@@ -529,6 +547,95 @@ _adapter_add() {
|
|
|
529
547
|
fi
|
|
530
548
|
echo ""
|
|
531
549
|
|
|
550
|
+
# ── Route to Processing Job helper (default) or local path ────────
|
|
551
|
+
if [ -z "${use_local}" ]; then
|
|
552
|
+
# Default: use Processing Job via .adapter_helper.py
|
|
553
|
+
echo "🚀 Submitting Processing Job to stage adapter..."
|
|
554
|
+
echo ""
|
|
555
|
+
|
|
556
|
+
# Resolve execution role
|
|
557
|
+
local exec_role="${EXECUTION_ROLE_ARN:-}"
|
|
558
|
+
if [ -z "${exec_role}" ]; then
|
|
559
|
+
exec_role="${ROLE_ARN:-}"
|
|
560
|
+
fi
|
|
561
|
+
if [ -z "${exec_role}" ]; then
|
|
562
|
+
exec_role="${SAGEMAKER_ROLE_ARN:-}"
|
|
563
|
+
fi
|
|
564
|
+
if [ -z "${exec_role}" ]; then
|
|
565
|
+
echo "❌ No execution role found."
|
|
566
|
+
echo ""
|
|
567
|
+
echo " Run 'ml-container-creator bootstrap' to set up your profile,"
|
|
568
|
+
echo " or set ROLE_ARN / EXECUTION_ROLE_ARN in do/config."
|
|
569
|
+
exit 1
|
|
570
|
+
fi
|
|
571
|
+
|
|
572
|
+
# Resolve S3 bucket
|
|
573
|
+
local adapter_bucket="${ADAPTER_S3_BUCKET:-}"
|
|
574
|
+
if [ -z "${adapter_bucket}" ]; then
|
|
575
|
+
local account_id
|
|
576
|
+
account_id=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "")
|
|
577
|
+
if [ -z "${account_id}" ]; then
|
|
578
|
+
echo "❌ Could not determine AWS account ID."
|
|
579
|
+
echo " Ensure AWS credentials are configured."
|
|
580
|
+
exit 4
|
|
581
|
+
fi
|
|
582
|
+
adapter_bucket="mlcc-adapters-${account_id}-${AWS_REGION}"
|
|
583
|
+
fi
|
|
584
|
+
|
|
585
|
+
# Build helper args
|
|
586
|
+
local helper_args=(
|
|
587
|
+
"stage-from-tune"
|
|
588
|
+
"--training-output-s3-uri" "${weights_uri}"
|
|
589
|
+
"--adapter-name" "${adapter_name}"
|
|
590
|
+
"--bucket" "${adapter_bucket}"
|
|
591
|
+
"--project" "${PROJECT_NAME}"
|
|
592
|
+
"--role-arn" "${exec_role}"
|
|
593
|
+
"--region" "${AWS_REGION}"
|
|
594
|
+
)
|
|
595
|
+
if [ -n "${no_wait}" ]; then
|
|
596
|
+
helper_args+=("--no-wait")
|
|
597
|
+
fi
|
|
598
|
+
|
|
599
|
+
# Invoke the Python helper
|
|
600
|
+
local helper_output
|
|
601
|
+
if ! helper_output=$(python3 "${SCRIPT_DIR}/.adapter_helper.py" "${helper_args[@]}"); then  # stderr stays on the terminal so the "See error above" message below has an error to point at; only stdout is captured for JSON parsing
|
|
602
|
+
echo "❌ Processing Job failed. See error above."
|
|
603
|
+
exit 1
|
|
604
|
+
fi
|
|
605
|
+
|
|
606
|
+
# Parse JSON output from helper (extract only the JSON line, skip any log noise)
|
|
607
|
+
local json_line
|
|
608
|
+
json_line=$(echo "${helper_output}" | grep -E '^\{' | tail -1)
|
|
609
|
+
local job_status
|
|
610
|
+
job_status=$(echo "${json_line}" | python3 -c "import sys,json; print(json.loads(sys.stdin.read()).get('status',''))" 2>/dev/null || echo "")
|
|
611
|
+
|
|
612
|
+
if [ "${job_status}" = "Completed" ] || [ "${job_status}" = "InProgress" ]; then
|
|
613
|
+
echo "${json_line}"
|
|
614
|
+
# Extract adapter_s3_uri for downstream use
|
|
615
|
+
local staged_adapter_uri
|
|
616
|
+
staged_adapter_uri=$(echo "${json_line}" | python3 -c "import sys,json; print(json.loads(sys.stdin.read()).get('adapter_s3_uri',''))" 2>/dev/null || echo "")
|
|
617
|
+
|
|
618
|
+
if [ -n "${no_wait}" ]; then
|
|
619
|
+
echo ""
|
|
620
|
+
echo "✅ Processing Job submitted. Check status with:"
|
|
621
|
+
echo " python3 ${SCRIPT_DIR}/.adapter_helper.py status --job-name <job-name>"
|
|
622
|
+
echo ""
|
|
623
|
+
echo " Once complete, re-run without --no-wait to register the adapter."
|
|
624
|
+
exit 0
|
|
625
|
+
fi
|
|
626
|
+
|
|
627
|
+
# Update weights_uri to point to the staged adapter
|
|
628
|
+
weights_uri="${staged_adapter_uri}"
|
|
629
|
+
echo ""
|
|
630
|
+
echo "✅ Adapter staged via Processing Job: ${weights_uri}"
|
|
631
|
+
else
|
|
632
|
+
echo "❌ Unexpected status from Processing Job helper: ${job_status}"
|
|
633
|
+
echo " Output: ${helper_output}"
|
|
634
|
+
exit 1
|
|
635
|
+
fi
|
|
636
|
+
else
|
|
637
|
+
# ── --local flag: Package tune artifacts locally (original behavior) ──
|
|
638
|
+
|
|
532
639
|
# ── Package tune artifacts as tar.gz if needed ────────────────────
|
|
533
640
|
# Tune output is an S3 path that may be:
|
|
534
641
|
# 1. Already a tar.gz file (s3://...adapter.tar.gz) → use directly
|
|
@@ -677,6 +784,7 @@ _adapter_add() {
|
|
|
677
784
|
weights_uri="${s3_tar_path}"
|
|
678
785
|
fi
|
|
679
786
|
echo ""
|
|
787
|
+
fi # end --local else branch
|
|
680
788
|
fi
|
|
681
789
|
|
|
682
790
|
# ── Validate HF repo ID format (if --from-hub) ───────────────────────
|
package/templates/do/benchmark
CHANGED
|
@@ -12,10 +12,12 @@ set -o pipefail
|
|
|
12
12
|
# ── Source project configuration ──────────────────────────────────────────────
|
|
13
13
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
14
14
|
source "${SCRIPT_DIR}/config"
|
|
15
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
15
16
|
|
|
16
17
|
# ── Parse flags ───────────────────────────────────────────────────────────────
|
|
17
18
|
CLEAN_AFTER=false
|
|
18
19
|
FORCE=false
|
|
20
|
+
ARG_STATUS=false
|
|
19
21
|
IC_ARG=""
|
|
20
22
|
ADAPTER_ARG=""
|
|
21
23
|
ARG_NO_STALE_WARNING=false
|
|
@@ -24,30 +26,33 @@ while [ $# -gt 0 ]; do
|
|
|
24
26
|
case "$1" in
|
|
25
27
|
--clean) CLEAN_AFTER=true; shift ;;
|
|
26
28
|
--force) FORCE=true; shift ;;
|
|
29
|
+
--status) ARG_STATUS=true; shift ;;
|
|
27
30
|
--no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
|
|
28
31
|
--workload) shift; ARG_WORKLOAD="${1:-}"; shift ;;
|
|
29
32
|
--ic) shift; IC_ARG="${1:-}"; shift ;;
|
|
30
33
|
--adapter) shift; ADAPTER_ARG="${1:-}"; shift ;;
|
|
31
34
|
--help|-h)
|
|
32
|
-
echo "Usage: ./do/benchmark [--workload <name>] [--ic <name>] [--adapter <name>] [--force] [--clean]
|
|
35
|
+
echo "Usage: ./do/benchmark [--workload <name>] [--status] [--ic <name>] [--adapter <name>] [--force] [--clean]"
|
|
33
36
|
echo ""
|
|
34
37
|
echo "Run SageMaker AI Benchmark against the deployed endpoint."
|
|
35
38
|
echo ""
|
|
36
39
|
echo "Options:"
|
|
40
|
+
echo " --status Check job status; if completed, download results + write to Athena"
|
|
37
41
|
echo " --ic <name> Benchmark a specific inference component"
|
|
38
42
|
echo " --adapter <name> Benchmark a specific LoRA adapter IC"
|
|
39
43
|
echo " --force Create a new benchmark job even if one is already running"
|
|
40
44
|
echo " --clean Delete workload config and benchmark job after displaying results"
|
|
41
45
|
echo " --no-stale-warning Suppress schema registry staleness warning"
|
|
46
|
+
echo " --no-stale-warning Suppress schema registry staleness warning"
|
|
42
47
|
echo ""
|
|
43
48
|
echo "IC resolution:"
|
|
44
49
|
echo " --adapter <name> Use ADAPTER_IC_NAME from do/adapters/<name>.conf"
|
|
45
50
|
echo " --ic <name> Use IC_DEPLOYED_NAME from do/ic/<name>.conf"
|
|
46
51
|
echo " (no flag) Use first IC in do/ic/ alphabetically, or legacy config"
|
|
47
52
|
echo ""
|
|
48
|
-
echo "
|
|
49
|
-
echo "
|
|
50
|
-
echo "
|
|
53
|
+
echo "Status:"
|
|
54
|
+
echo " After interrupting a running benchmark, use --status to check completion"
|
|
55
|
+
echo " and trigger results download + Athena write."
|
|
51
56
|
echo ""
|
|
52
57
|
echo "Prerequisites:"
|
|
53
58
|
echo " • Endpoint must be deployed and InService (run ./do/deploy first)"
|
|
@@ -59,6 +64,112 @@ while [ $# -gt 0 ]; do
|
|
|
59
64
|
done
|
|
60
65
|
|
|
61
66
|
|
|
67
|
+
# ── Handle --status (early exit) ─────────────────────────────────────────────
|
|
68
|
+
# Query the tracked benchmark job, display status, and if completed:
|
|
69
|
+
# download results, display metrics, and write to Athena (if not already done).
|
|
70
|
+
if [ "${ARG_STATUS}" = true ]; then
|
|
71
|
+
JOB_NAME="${BENCHMARK_JOB_NAME:-}"
|
|
72
|
+
if [ -z "${JOB_NAME}" ]; then
|
|
73
|
+
echo "❌ No benchmark job tracked"
|
|
74
|
+
echo " Run ./do/benchmark --workload <name> to start one."
|
|
75
|
+
exit 1
|
|
76
|
+
fi
|
|
77
|
+
|
|
78
|
+
echo "📊 Benchmark Job Status"
|
|
79
|
+
echo ""
|
|
80
|
+
echo " Job: ${JOB_NAME}"
|
|
81
|
+
|
|
82
|
+
STATUS=$(aws sagemaker describe-ai-benchmark-job \
|
|
83
|
+
--ai-benchmark-job-name "${JOB_NAME}" \
|
|
84
|
+
--region "${AWS_REGION}" \
|
|
85
|
+
--query 'AIBenchmarkJobStatus' \
|
|
86
|
+
--output text 2>/dev/null) || STATUS=""
|
|
87
|
+
|
|
88
|
+
if [ -z "${STATUS}" ]; then
|
|
89
|
+
echo " Status: Unknown (job not found or credentials expired)"
|
|
90
|
+
exit 1
|
|
91
|
+
fi
|
|
92
|
+
|
|
93
|
+
echo " Status: ${STATUS}"
|
|
94
|
+
|
|
95
|
+
case "${STATUS}" in
|
|
96
|
+
Completed)
|
|
97
|
+
# Check if results already exist locally
|
|
98
|
+
PROJECT_ROOT="${SCRIPT_DIR}/.."
|
|
99
|
+
LOCAL_RESULTS_DIR="${PROJECT_ROOT}/benchmarks/${JOB_NAME}"
|
|
100
|
+
RESULTS_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
|
|
101
|
+
|
|
102
|
+
if [ -z "${RESULTS_JSONL}" ]; then
|
|
103
|
+
echo ""
|
|
104
|
+
echo " 📥 Downloading results..."
|
|
105
|
+
RESULTS_S3_PATH=$(aws sagemaker describe-ai-benchmark-job \
|
|
106
|
+
--ai-benchmark-job-name "${JOB_NAME}" \
|
|
107
|
+
--region "${AWS_REGION}" \
|
|
108
|
+
--query 'OutputConfig.S3OutputLocation' \
|
|
109
|
+
--output text 2>/dev/null)
|
|
110
|
+
|
|
111
|
+
if [ -n "${RESULTS_S3_PATH}" ]; then
|
|
112
|
+
mkdir -p "${LOCAL_RESULTS_DIR}/output"
|
|
113
|
+
aws s3 sync "${RESULTS_S3_PATH}" "${LOCAL_RESULTS_DIR}/output/" \
|
|
114
|
+
--region "${AWS_REGION}" --quiet
|
|
115
|
+
# Untar if output.tar.gz exists
|
|
116
|
+
tar_file=""  # plain assignment: this handler runs at script top level, where 'local' is rejected by bash
|
|
117
|
+
tar_file=$(find "${LOCAL_RESULTS_DIR}" -name "output.tar.gz" -type f 2>/dev/null | head -1)
|
|
118
|
+
if [ -n "${tar_file}" ]; then
|
|
119
|
+
tar -xzf "${tar_file}" --strip-components=1 -C "${LOCAL_RESULTS_DIR}/output/" 2>/dev/null || true
|
|
120
|
+
fi
|
|
121
|
+
# Re-search after extraction
|
|
122
|
+
RESULTS_JSONL=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export.jsonl" -type f 2>/dev/null | head -1)
|
|
123
|
+
echo " ✅ Results downloaded to: benchmarks/${JOB_NAME}/"
|
|
124
|
+
fi
|
|
125
|
+
else
|
|
126
|
+
echo " ✅ Results already available locally"
|
|
127
|
+
fi
|
|
128
|
+
|
|
129
|
+
# Write to Athena if CI bucket is configured and results exist
|
|
130
|
+
if [ -n "${CI_BENCHMARK_RESULTS_BUCKET:-}" ]; then
|
|
131
|
+
_WRITER_INPUT=""
|
|
132
|
+
if [ -n "${RESULTS_JSONL}" ] && [ -f "${RESULTS_JSONL}" ]; then
|
|
133
|
+
_WRITER_INPUT="${RESULTS_JSONL}"
|
|
134
|
+
else
|
|
135
|
+
_WRITER_INPUT=$(find "${LOCAL_RESULTS_DIR}" -name "profile_export_aiperf.json" -type f 2>/dev/null | head -1)
|
|
136
|
+
fi
|
|
137
|
+
|
|
138
|
+
if [ -n "${_WRITER_INPUT}" ]; then
|
|
139
|
+
echo ""
|
|
140
|
+
echo " 📊 Writing to Athena..."
|
|
141
|
+
if python3 "$(dirname "${BASH_SOURCE[0]}")/.benchmark_writer.py" write \
|
|
142
|
+
--results-file "${_WRITER_INPUT}" \
|
|
143
|
+
--config-file "$(dirname "${BASH_SOURCE[0]}")/config" \
|
|
144
|
+
--project-name "${PROJECT_NAME}" \
|
|
145
|
+
--workload "${BENCHMARK_WORKLOAD:-manual}" \
|
|
146
|
+
--concurrency "${BENCHMARK_CONCURRENCY:-2}" \
|
|
147
|
+
--bucket "${CI_BENCHMARK_RESULTS_BUCKET}" \
|
|
148
|
+
--region "${AWS_REGION:-${REGION}}"; then
|
|
149
|
+
echo " ✅ Results persisted to Athena"
|
|
150
|
+
else
|
|
151
|
+
echo " ⚠️ Athena write failed (non-fatal)"
|
|
152
|
+
fi
|
|
153
|
+
fi
|
|
154
|
+
fi
|
|
155
|
+
;;
|
|
156
|
+
InProgress|Starting|Pending)
|
|
157
|
+
echo ""
|
|
158
|
+
echo " Job is still running. Check again with: ./do/benchmark --status"
|
|
159
|
+
;;
|
|
160
|
+
Failed)
|
|
161
|
+
FAILURE_REASON=$(aws sagemaker describe-ai-benchmark-job \
|
|
162
|
+
--ai-benchmark-job-name "${JOB_NAME}" \
|
|
163
|
+
--region "${AWS_REGION}" \
|
|
164
|
+
--query 'FailureReason' \
|
|
165
|
+
--output text 2>/dev/null) || FAILURE_REASON="unknown"
|
|
166
|
+
echo " Reason: ${FAILURE_REASON}"
|
|
167
|
+
;;
|
|
168
|
+
esac
|
|
169
|
+
exit 0
|
|
170
|
+
fi
|
|
171
|
+
|
|
172
|
+
|
|
62
173
|
# ── Require --workload flag ───────────────────────────────────────────────────
|
|
63
174
|
if [ -z "${ARG_WORKLOAD}" ]; then
|
|
64
175
|
echo "❌ --workload <name> is required"
|
|
@@ -172,8 +283,11 @@ print(f's3://{bucket}/${PROJECT_NAME}/')
|
|
|
172
283
|
|
|
173
284
|
CI_BENCHMARK_RESULTS_BUCKET=$(echo "${_PROFILE_JSON}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ciBenchmarkResultsBucket', ''))" 2>/dev/null) || CI_BENCHMARK_RESULTS_BUCKET=""
|
|
174
285
|
|
|
175
|
-
|
|
176
|
-
|
|
286
|
+
ROLE_ARN=$(echo "${_PROFILE_JSON}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('roleArn', ''))" 2>/dev/null) || ROLE_ARN=""
|
|
287
|
+
|
|
288
|
+
# Derive job names at runtime (unique per invocation).
|
|
289
|
+
# Preserve BENCHMARK_JOB_NAME if already set (from do/config or env) for resume logic.
|
|
290
|
+
BENCHMARK_JOB_NAME="${BENCHMARK_JOB_NAME:-${PROJECT_NAME}-benchmark-$(date +%Y%m%d-%H%M%S)}"
|
|
177
291
|
BENCHMARK_WORKLOAD_CONFIG_NAME="${PROJECT_NAME}-benchmark-config-$(date +%Y%m%d-%H%M%S)"
|
|
178
292
|
|
|
179
293
|
# Ensure benchmark params have defaults (in case workload catalog wasn't found)
|
|
@@ -228,7 +342,15 @@ if [ -n "${BENCHMARK_CONCURRENCY_LEVELS:-}" ] && [ -z "${_BENCHMARK_SINGLE_LEVEL
|
|
|
228
342
|
if [ -n "${IC_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --ic ${IC_ARG}"; fi
|
|
229
343
|
if [ -n "${ADAPTER_ARG}" ]; then _REINVOKE_ARGS="${_REINVOKE_ARGS} --adapter ${ADAPTER_ARG}"; fi
|
|
230
344
|
|
|
231
|
-
|
|
345
|
+
_CHILD_EXIT=0
|
|
346
|
+
"${BASH_SOURCE[0]}" ${_REINVOKE_ARGS} || _CHILD_EXIT=$?
|
|
347
|
+
|
|
348
|
+
if [ ${_CHILD_EXIT} -eq 130 ]; then
|
|
349
|
+
# Child was interrupted (Ctrl+C) — propagate cleanly
|
|
350
|
+
exit 130
|
|
351
|
+
fi
|
|
352
|
+
|
|
353
|
+
if [ ${_CHILD_EXIT} -eq 0 ]; then
|
|
232
354
|
# Copy results to aggregation directory — find the child's results
|
|
233
355
|
# Try the marker file first (set by child), then fall back to ls -td
|
|
234
356
|
_LATEST_JOB_DIR=""
|
|
@@ -747,7 +869,10 @@ echo ""
|
|
|
747
869
|
echo "⚙️ Step 1: Creating AI Workload Config: ${WORKLOAD_CONFIG_NAME}"
|
|
748
870
|
|
|
749
871
|
# Build parameters block
|
|
750
|
-
|
|
872
|
+
# Use HF_MODEL_ID for tokenizer (the original HuggingFace repo ID, e.g. "Qwen/Qwen3-0.6B").
|
|
873
|
+
# MODEL_NAME may have been rewritten to an S3 URI by do/stage, which AIPerf can't use as a tokenizer source.
|
|
874
|
+
BENCHMARK_TOKENIZER="${HF_MODEL_ID:-${MODEL_NAME}}"
|
|
875
|
+
PARAMS_JSON="{\"prompt_input_tokens_mean\":${BENCHMARK_INPUT_TOKENS_MEAN},\"output_tokens_mean\":${BENCHMARK_OUTPUT_TOKENS_MEAN},\"concurrency\":${BENCHMARK_CONCURRENCY},\"streaming\":${BENCHMARK_STREAMING},\"tokenizer\":\"${BENCHMARK_TOKENIZER}\""
|
|
751
876
|
|
|
752
877
|
# Add optional request_count if specified
|
|
753
878
|
if [ -n "${BENCHMARK_REQUEST_COUNT:-}" ]; then
|
|
@@ -856,6 +981,18 @@ fi # end of RESUME_EXISTING=false block
|
|
|
856
981
|
# Skip polling if we already know the job completed (resumed a finished job)
|
|
857
982
|
if [ "${JOB_STATUS:-}" != "Completed" ] && [ "${JOB_STATUS:-}" != "Failed" ] && [ "${JOB_STATUS:-}" != "Stopped" ]; then
|
|
858
983
|
|
|
984
|
+
# Handle Ctrl+C during polling — exit cleanly without stopping the remote job.
|
|
985
|
+
_handle_benchmark_interrupt() {
|
|
986
|
+
echo ""
|
|
987
|
+
echo ""
|
|
988
|
+
echo "⚠️ Interrupted — job continues running in background"
|
|
989
|
+
echo " Job: ${BENCHMARK_JOB_NAME}"
|
|
990
|
+
echo ""
|
|
991
|
+
echo " Check status: aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"
|
|
992
|
+
exit 130
|
|
993
|
+
}
|
|
994
|
+
trap '_handle_benchmark_interrupt' INT
|
|
995
|
+
|
|
859
996
|
echo "⏳ Step 3: Waiting for benchmark to complete..."
|
|
860
997
|
echo " Polling every ${POLL_INTERVAL}s (max ${MAX_POLL_ATTEMPTS} attempts = 30 min)"
|
|
861
998
|
echo ""
|
|
@@ -897,13 +1034,14 @@ while [ ${POLL_COUNT} -lt ${MAX_POLL_ATTEMPTS} ]; do
|
|
|
897
1034
|
esac
|
|
898
1035
|
done
|
|
899
1036
|
|
|
1037
|
+
trap - INT
|
|
1038
|
+
|
|
900
1039
|
# Check for timeout
|
|
901
1040
|
if [ ${POLL_COUNT} -ge ${MAX_POLL_ATTEMPTS} ]; then
|
|
902
1041
|
echo ""
|
|
903
1042
|
echo "⚠️ Benchmark timed out after 30 minutes (status: ${JOB_STATUS})"
|
|
904
|
-
echo " The job may still be running.
|
|
905
|
-
echo "
|
|
906
|
-
echo " aws sagemaker describe-ai-benchmark-job --ai-benchmark-job-name ${BENCHMARK_JOB_NAME} --region ${AWS_REGION}"
|
|
1043
|
+
echo " The job may still be running."
|
|
1044
|
+
echo " Check status: ./do/benchmark --status"
|
|
907
1045
|
exit 1
|
|
908
1046
|
fi
|
|
909
1047
|
|
|
@@ -949,7 +1087,7 @@ if [ "${JOB_STATUS}" = "Completed" ]; then
|
|
|
949
1087
|
# Extract any tar.gz archives (benchmark service packages results as output.tar.gz)
|
|
950
1088
|
for ARCHIVE in $(find "${LOCAL_RESULTS_DIR}" -name "*.tar.gz" -type f 2>/dev/null); do
|
|
951
1089
|
ARCHIVE_DIR=$(dirname "${ARCHIVE}")
|
|
952
|
-
tar -xzf "${ARCHIVE}" -C "${ARCHIVE_DIR}" 2>/dev/null || true
|
|
1090
|
+
tar -xzf "${ARCHIVE}" --strip-components=1 -C "${ARCHIVE_DIR}" 2>/dev/null || true
|
|
953
1091
|
done
|
|
954
1092
|
|
|
955
1093
|
# Look for specific result files (priority: JSONL > aiperf JSON)
|
package/templates/do/config
CHANGED
|
@@ -214,6 +214,9 @@ export <%= key %>=${<%= key %>:-<%= value %>}
|
|
|
214
214
|
# Framework-specific configuration
|
|
215
215
|
<% if (framework === 'transformers') { %>
|
|
216
216
|
export MODEL_NAME="<%= modelName %>"
|
|
217
|
+
# HuggingFace Model ID — preserved even after do/stage rewrites MODEL_NAME to S3.
|
|
218
|
+
# Used by do/benchmark (tokenizer), do/tune (model catalog), and do/test (chat template).
|
|
219
|
+
export HF_MODEL_ID="<%= modelName %>"
|
|
217
220
|
# Secrets Manager integration: when an ARN is configured, do-scripts resolve the
|
|
218
221
|
# secret at the appropriate stage (build-time or runtime). When a plaintext value
|
|
219
222
|
# is configured, it is exported directly. The _ARN suffix signals resolution is needed.
|
|
@@ -253,6 +256,7 @@ export TUNE_MODEL_ID="<%= tuneModelId %>"
|
|
|
253
256
|
|
|
254
257
|
<% if (framework === 'diffusors') { %>
|
|
255
258
|
export MODEL_NAME="<%= modelName %>"
|
|
259
|
+
export HF_MODEL_ID="<%= modelName %>"
|
|
256
260
|
# Secrets Manager integration: when an ARN is configured, do-scripts resolve the
|
|
257
261
|
# secret at the appropriate stage (build-time or runtime). When a plaintext value
|
|
258
262
|
# is configured, it is exported directly. The _ARN suffix signals resolution is needed.
|
|
@@ -44,5 +44,10 @@ except:
|
|
|
44
44
|
fi
|
|
45
45
|
fi
|
|
46
46
|
|
|
47
|
+
# Map commonly-used profile values to the variable names scripts expect.
|
|
48
|
+
# Explicit env vars take precedence (${X:-...} pattern).
|
|
49
|
+
ROLE_ARN="${ROLE_ARN:-${_PROFILE_roleArn:-}}"
|
|
50
|
+
CI_BENCHMARK_RESULTS_BUCKET="${CI_BENCHMARK_RESULTS_BUCKET:-${_PROFILE_ciBenchmarkResultsBucket:-}}"
|
|
51
|
+
|
|
47
52
|
# NOTE: set -u is NOT re-enabled here. The caller is responsible for managing
|
|
48
53
|
# their own shell options.
|