@aws/ml-container-creator 0.10.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +33 -22
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -67
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +166 -153
- package/servers/instance-sizer/lib/instance-ranker.js +120 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/instances.json +27 -0
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +12 -3
- package/src/lib/bootstrap-command-handler.js +609 -15
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +48 -41
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +3 -0
- package/src/lib/config-manager.js +7 -0
- package/src/lib/config-validator.js +1 -1
- package/src/lib/cuda-resolver.js +17 -8
- package/src/lib/generated/cli-options.js +319 -314
- package/src/lib/generated/parameter-matrix.js +672 -661
- package/src/lib/generated/validation-rules.js +76 -72
- package/src/lib/path-prover-brain.js +664 -0
- package/src/lib/prompts/infrastructure-prompts.js +2 -2
- package/src/lib/prompts/model-prompts.js +6 -0
- package/src/lib/prompts/project-prompts.js +12 -0
- package/src/lib/secrets-prompt-runner.js +4 -0
- package/src/lib/template-manager.js +1 -1
- package/src/lib/template-variable-resolver.js +87 -1
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +154 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/build +5 -0
- package/templates/do/clean.d/async-inference.ejs +5 -0
- package/templates/do/clean.d/batch-transform.ejs +5 -0
- package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
- package/templates/do/clean.d/managed-inference.ejs +5 -0
- package/templates/do/config +115 -45
- package/templates/do/deploy.d/async-inference.ejs +30 -3
- package/templates/do/deploy.d/batch-transform.ejs +29 -3
- package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
- package/templates/do/deploy.d/managed-inference.ejs +216 -14
- package/templates/do/lib/endpoint-config.sh +1 -1
- package/templates/do/lib/profile.sh +44 -0
- package/templates/do/optimize +106 -37
- package/templates/do/push +5 -0
- package/templates/do/register +94 -0
- package/templates/do/stage +567 -0
- package/templates/do/submit +7 -0
- package/templates/do/test +14 -0
- package/templates/do/tune +382 -59
- package/templates/do/validate +44 -4
package/templates/do/tune
CHANGED
|
@@ -13,6 +13,10 @@ set -o pipefail
|
|
|
13
13
|
# ── Source project configuration ──────────────────────────────────────────────
|
|
14
14
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
15
15
|
source "${SCRIPT_DIR}/config"
|
|
16
|
+
source "${SCRIPT_DIR}/lib/profile.sh"
|
|
17
|
+
|
|
18
|
+
# ── Profile-resolved variables (env var > profile > default) ──────────────────
|
|
19
|
+
TUNE_S3_BUCKET="${TUNE_S3_BUCKET:-mlcc-tune-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
|
|
16
20
|
|
|
17
21
|
# ── Constants ─────────────────────────────────────────────────────────────────
|
|
18
22
|
CATALOG_FILE="${SCRIPT_DIR}/.tune_catalog.json"
|
|
@@ -40,6 +44,11 @@ ARG_STATUS=false
|
|
|
40
44
|
ARG_HELP=false
|
|
41
45
|
ARG_DRY_RUN=false
|
|
42
46
|
ARG_LIST_MODELS=false
|
|
47
|
+
ARG_NO_STALE_WARNING=false
|
|
48
|
+
ARG_DISCOVER=false
|
|
49
|
+
ARG_DISCOVER_FILTER=""
|
|
50
|
+
ARG_COLUMN_MAP=""
|
|
51
|
+
ARG_ACCEPT_EULA=false
|
|
43
52
|
|
|
44
53
|
|
|
45
54
|
# ── _parse_args() ─────────────────────────────────────────────────────────────
|
|
@@ -132,11 +141,27 @@ _parse_args() {
|
|
|
132
141
|
fi
|
|
133
142
|
ARG_ROLE="$2"; shift 2 ;;
|
|
134
143
|
--force) ARG_FORCE=true; shift ;;
|
|
144
|
+
--accept-eula) ARG_ACCEPT_EULA=true; shift ;;
|
|
135
145
|
--no-wait) ARG_NO_WAIT=true; shift ;;
|
|
136
146
|
--status) ARG_STATUS=true; shift ;;
|
|
137
147
|
--help|-h) ARG_HELP=true; shift ;;
|
|
138
148
|
--dry-run) ARG_DRY_RUN=true; shift ;;
|
|
139
149
|
--list-models) ARG_LIST_MODELS=true; shift ;;
|
|
150
|
+
--no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
|
|
151
|
+
--column-map)
|
|
152
|
+
if [ -z "${2:-}" ]; then
|
|
153
|
+
echo "❌ --column-map requires a value (e.g., prompt=question,completion=answer)"
|
|
154
|
+
exit 1
|
|
155
|
+
fi
|
|
156
|
+
ARG_COLUMN_MAP="$2"; shift 2 ;;
|
|
157
|
+
--discover)
|
|
158
|
+
ARG_DISCOVER=true
|
|
159
|
+
shift
|
|
160
|
+
if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
|
|
161
|
+
ARG_DISCOVER_FILTER="$1"
|
|
162
|
+
shift
|
|
163
|
+
fi
|
|
164
|
+
;;
|
|
140
165
|
*)
|
|
141
166
|
echo "❌ Unknown option: $1"
|
|
142
167
|
echo " Run ./do/tune --help for usage."
|
|
@@ -150,6 +175,8 @@ _parse_args() {
|
|
|
150
175
|
# ── _show_help() ──────────────────────────────────────────────────────────────
|
|
151
176
|
_show_help() {
|
|
152
177
|
echo "Usage: ./do/tune --technique <technique> --dataset <source> [options]"
|
|
178
|
+
echo " ./do/tune --model <id> --technique <technique> --dataset <source>"
|
|
179
|
+
echo " ./do/tune --discover [filter]"
|
|
153
180
|
echo " ./do/tune --status"
|
|
154
181
|
echo " ./do/tune --list-models"
|
|
155
182
|
echo " ./do/tune --help"
|
|
@@ -157,10 +184,48 @@ _show_help() {
|
|
|
157
184
|
echo "SageMaker AI Managed Model Customization — fine-tune supported foundation"
|
|
158
185
|
echo "models using SFT, DPO, RLAIF, or RLVR without managing infrastructure."
|
|
159
186
|
echo ""
|
|
187
|
+
echo "How it works:"
|
|
188
|
+
echo ""
|
|
189
|
+
echo " ┌─────────────────────────────────────────────────────────────────┐"
|
|
190
|
+
echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
|
|
191
|
+
echo " │ ↓ │"
|
|
192
|
+
echo " │ HuggingFace model (deploy) ←──── do/adapter add │"
|
|
193
|
+
echo " │ ↓ │"
|
|
194
|
+
echo " │ vLLM loads adapter at runtime │"
|
|
195
|
+
echo " └─────────────────────────────────────────────────────────────────┘"
|
|
196
|
+
echo ""
|
|
197
|
+
echo " Managed fine-tuning uses a JumpStart Hub model (identified by TUNE_MODEL_ID)"
|
|
198
|
+
echo " to produce LoRA adapter weights. These adapters are then attached to your"
|
|
199
|
+
echo " HuggingFace BYOC deployment via do/adapter add — no redeployment needed."
|
|
200
|
+
echo ""
|
|
201
|
+
echo "Supported model families:"
|
|
202
|
+
echo " • qwen-2.5 (Alibaba) — Qwen 2.5 7B/14B/32B/72B Instruct"
|
|
203
|
+
echo " • qwen-3 (Alibaba) — Qwen 3 0.6B/1.7B/4B/8B/14B/32B"
|
|
204
|
+
echo " • llama-3 (Meta) — Llama 3.1 8B, 3.2 1B/3B, 3.3 70B Instruct"
|
|
205
|
+
echo " • deepseek-r1 (DeepSeek) — R1 Distill Llama 8B/70B, Qwen 1.5B/7B/14B/32B"
|
|
206
|
+
echo " • gpt-oss (OpenAI) — GPT-OSS 20B/120B"
|
|
207
|
+
echo ""
|
|
208
|
+
echo " Only models registered in the SageMaker JumpStart Hub support managed"
|
|
209
|
+
echo " fine-tuning. Not all HuggingFace models have a JumpStart equivalent."
|
|
210
|
+
echo ""
|
|
211
|
+
echo "Finding your JumpStart model ID:"
|
|
212
|
+
echo ""
|
|
213
|
+
echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
214
|
+
echo " --hub-content-type Model \\"
|
|
215
|
+
echo " --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
|
|
216
|
+
echo ""
|
|
217
|
+
echo " Or use: ./do/tune --discover [filter]"
|
|
218
|
+
echo ""
|
|
160
219
|
echo "Required:"
|
|
161
220
|
echo " --technique <t> Customization technique: sft, dpo, rlaif, rlvr"
|
|
162
221
|
echo " --dataset <source> Dataset: s3://bucket/path.jsonl or hf://org/name[/split]"
|
|
163
222
|
echo ""
|
|
223
|
+
echo "Model selection:"
|
|
224
|
+
echo " --model <id> JumpStart Hub content name to use for fine-tuning."
|
|
225
|
+
echo " Takes precedence over TUNE_MODEL_ID in do/config."
|
|
226
|
+
echo " Accepts the Hub content name as-is (no catalog lookup)."
|
|
227
|
+
echo " Example: --model huggingface-reasoning-qwen3-8b"
|
|
228
|
+
echo ""
|
|
164
229
|
echo "Training type:"
|
|
165
230
|
echo " --training-type <t> lora (default) or full-rank"
|
|
166
231
|
echo ""
|
|
@@ -177,26 +242,55 @@ _show_help() {
|
|
|
177
242
|
echo " --reward-prompt <uri> S3 URI for reward prompt file"
|
|
178
243
|
echo ""
|
|
179
244
|
echo "Overrides:"
|
|
180
|
-
echo " --model <id> Override model (defaults to MODEL_ID from do/config)"
|
|
181
245
|
echo " --output-bucket <b> Override output bucket (defaults to TUNE_S3_BUCKET)"
|
|
182
246
|
echo " --role <arn> Override execution role (defaults to ROLE_ARN)"
|
|
183
247
|
echo ""
|
|
184
248
|
echo "Job control:"
|
|
185
249
|
echo " --force Force new job even if one exists for this technique"
|
|
250
|
+
echo " --accept-eula Accept model EULA (required for gated models like Llama)"
|
|
186
251
|
echo " --no-wait Submit and exit without polling for completion"
|
|
187
252
|
echo " --status Show status of all tracked tune jobs"
|
|
188
253
|
echo ""
|
|
254
|
+
echo "Discovery and diagnostics:"
|
|
255
|
+
echo " --discover [filter] Query JumpStart Hub for tune-eligible models."
|
|
256
|
+
echo " Without a filter, shows models for the current family."
|
|
257
|
+
echo " With a filter, narrows results by keyword."
|
|
258
|
+
echo " --no-stale-warning Suppress catalog staleness warnings (useful for CI)."
|
|
259
|
+
echo " Also suppressed by MCC_NO_STALE_WARNING=true env var."
|
|
260
|
+
echo ""
|
|
189
261
|
echo "Informational:"
|
|
190
262
|
echo " --help, -h Show this help message"
|
|
191
263
|
echo " --dry-run Validate inputs and show what would be submitted"
|
|
192
264
|
echo " --list-models Print supported models, techniques, and training types"
|
|
193
265
|
echo ""
|
|
194
266
|
echo "Examples:"
|
|
267
|
+
echo " # Fine-tune with pre-configured TUNE_MODEL_ID from do/config:"
|
|
195
268
|
echo " ./do/tune --technique sft --dataset s3://my-bucket/train.jsonl"
|
|
269
|
+
echo ""
|
|
270
|
+
echo " # Override model ID directly:"
|
|
271
|
+
echo " ./do/tune --model huggingface-reasoning-qwen3-8b --technique sft --dataset s3://bucket/data.jsonl"
|
|
272
|
+
echo ""
|
|
273
|
+
echo " # Use a HuggingFace dataset:"
|
|
196
274
|
echo " ./do/tune --technique dpo --dataset hf://my-org/pref-data --learning-rate 1e-5"
|
|
275
|
+
echo ""
|
|
276
|
+
echo " # Fine-tune a gated model (Meta Llama) — requires EULA acceptance:"
|
|
277
|
+
echo " ./do/tune --technique dpo --dataset hf://argilla/ultrafeedback-binarized-preferences-cleaned --accept-eula"
|
|
278
|
+
echo ""
|
|
279
|
+
echo " # Discover available models:"
|
|
280
|
+
echo " ./do/tune --discover # Models for current family"
|
|
281
|
+
echo " ./do/tune --discover qwen # Filter by keyword"
|
|
282
|
+
echo ""
|
|
283
|
+
echo " # Other:"
|
|
197
284
|
echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --training-type full-rank"
|
|
198
|
-
echo " ./do/tune --status"
|
|
199
285
|
echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --dry-run"
|
|
286
|
+
echo " ./do/tune --status"
|
|
287
|
+
echo ""
|
|
288
|
+
echo "Configuration:"
|
|
289
|
+
echo " TUNE_MODEL_ID is set in do/config at generation time when a matching"
|
|
290
|
+
echo " JumpStart model is found for your HuggingFace model. If not set, use"
|
|
291
|
+
echo " --model <id> or run --discover to find the correct Hub content name."
|
|
292
|
+
echo ""
|
|
293
|
+
echo " For custom training without JumpStart, see: ./do/train --help"
|
|
200
294
|
exit 0
|
|
201
295
|
}
|
|
202
296
|
|
|
@@ -213,7 +307,7 @@ _show_status() {
|
|
|
213
307
|
|
|
214
308
|
if [ -n "${job_name}" ]; then
|
|
215
309
|
found_any=true
|
|
216
|
-
echo " ${technique
|
|
310
|
+
echo " $(echo "${technique}" | tr "[:lower:]" "[:upper:]"):"
|
|
217
311
|
echo " Job: ${job_name}"
|
|
218
312
|
|
|
219
313
|
# Query status via Python helper
|
|
@@ -318,66 +412,203 @@ _update_config_var() {
|
|
|
318
412
|
fi
|
|
319
413
|
}
|
|
320
414
|
|
|
321
|
-
# ──
|
|
322
|
-
#
|
|
415
|
+
# ── _check_catalog_staleness() ─────────────────────────────────────────────────
# Warn if the tune catalog's lastSynced timestamp is older than the threshold.
# Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
# Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
#
# Globals:  CATALOG_FILE (read), ARG_NO_STALE_WARNING (read),
#           MCC_NO_STALE_WARNING (read), MCC_CATALOG_STALENESS_DAYS (read)
# Outputs:  one warning line on stdout when the catalog is older than the
#           threshold; nothing otherwise.
# Returns:  0 always. The check is best-effort: a missing/unreadable catalog,
#           a missing or unparsable lastSynced, a non-integer threshold, or an
#           unavailable python3 all result in no warning rather than an error.
_check_catalog_staleness() {
    if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
        return 0
    fi

    local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
    local days_old

    # The catalog path and threshold are handed to Python as argv values, not
    # spliced into the program text, so quotes or backslashes in either value
    # cannot break (or inject into) the Python source.
    days_old=$(python3 -c '
import json, sys
from datetime import datetime, timezone

try:
    with open(sys.argv[1]) as f:
        catalog = json.load(f)
    ls = catalog.get("lastSynced", "")
    if ls:
        synced = datetime.fromisoformat(ls.replace("Z", "+00:00"))
        # A timestamp without an offset is treated as UTC so it can be
        # compared against an aware "now" instead of raising TypeError.
        if synced.tzinfo is None:
            synced = synced.replace(tzinfo=timezone.utc)
        days = (datetime.now(timezone.utc) - synced).days
        if days > int(sys.argv[2]):
            print(days)
except Exception:
    # Best-effort check: any read/parse problem means "no warning".
    pass
' "${CATALOG_FILE:-}" "${threshold}" 2>/dev/null) || days_old=""

    if [ -n "${days_old}" ]; then
        echo "⚠️ Tune catalog is ${days_old} days old. Run 'ml-container-creator bootstrap sync-model-families' to update."
    fi
    return 0
}
|
|
445
|
+
|
|
446
|
+
# ── _resolve_tune_model() ─────────────────────────────────────────────────────
# Resolve the JumpStart Hub content name for managed fine-tuning.
# Priority: --model flag > TUNE_MODEL_ID config > discovery
# Sets RESOLVED_MODEL_ID on success.
#
# Globals:  ARG_MODEL (read), TUNE_MODEL_ID (read), RESOLVED_MODEL_ID (written)
# Returns:  0 when a model ID was resolved.
# Exits:    1 when --model is given but is not a well-formed Hub content name;
#           otherwise, when nothing is configured, control passes to
#           _discover_and_guide, which prints guidance and exits.
_resolve_tune_model() {
    # Same pattern the error message below advertises, including the {0,62}
    # repetition bound, so over-long names are rejected locally instead of
    # passing validation and failing later at submit time.
    local hub_name_re='^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$'

    # Priority 1: --model flag (format-check only, no catalog validation)
    if [ -n "${ARG_MODEL:-}" ]; then
        if [[ ! "${ARG_MODEL}" =~ ${hub_name_re} ]]; then
            echo "❌ Invalid model ID format: ${ARG_MODEL}"
            echo " Hub content names must match: [a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}"
            exit 1
        fi
        RESOLVED_MODEL_ID="${ARG_MODEL}"
        return 0
    fi

    # Priority 2: TUNE_MODEL_ID from do/config
    if [ -n "${TUNE_MODEL_ID:-}" ]; then
        RESOLVED_MODEL_ID="${TUNE_MODEL_ID}"
        return 0
    fi

    # Priority 3: Neither set — attempt runtime discovery, then show guidance
    _discover_and_guide
}
|
|
471
|
+
|
|
472
|
+
# ── _discover_and_guide() ─────────────────────────────────────────────────────
# Display guidance when no model ID is configured and attempt runtime discovery.
# Attempts Hub discovery via helper script, falls back to static guidance on failure.
#
# Globals:  MODEL_NAME, CATALOG_FILE, HELPER_SCRIPT, AWS_REGION,
#           ARG_TECHNIQUE, ARG_DATASET (all read)
# Outputs:  guidance text (and up to five suggested model IDs) on stdout
# Exits:    3 always — this function never returns to its caller.
_discover_and_guide() {
    echo ""
    echo "🔧 SageMaker AI Managed Model Customization"
    echo ""
    echo " This feature uses SageMaker Serverless Fine-Tuning, which requires"
    echo " the model to be registered in the SageMaker JumpStart Hub."
    echo ""
    echo " Your deployed model: ${MODEL_NAME:-unknown} (HuggingFace BYOC)"
    echo " JumpStart model ID: (not configured)"
    echo ""

    # Derive model family from the catalog based on MODEL_NAME (HuggingFace ID).
    # Both values travel as argv, never as Python source text, so a quote in
    # the model name or catalog path cannot break or inject into the lookup.
    local model_family=""
    if [ -f "${CATALOG_FILE}" ] && [ -n "${MODEL_NAME:-}" ]; then
        model_family=$(python3 -c '
import json, sys

try:
    with open(sys.argv[1]) as f:
        catalog = json.load(f)
    for entry in catalog.get("models", {}).values():
        if entry.get("huggingFaceId", "") == sys.argv[2]:
            print(entry.get("family", ""))
            break
except Exception:
    # Best-effort lookup: an unreadable catalog just means "family unknown".
    pass
' "${CATALOG_FILE}" "${MODEL_NAME}" 2>/dev/null) || model_family=""
    fi

    # Attempt runtime discovery via helper script (failure is non-fatal here)
    local discover_result=""
    if [ -f "${HELPER_SCRIPT}" ]; then
        discover_result=$(python3 "${HELPER_SCRIPT}" discover \
            --family "${model_family}" \
            --region "${AWS_REGION}" 2>/dev/null) || discover_result=""
    fi

    # Render at most five suggestions; an empty result means nothing usable
    # came back (no output, invalid JSON, or an empty "models" list).
    local suggestions=""
    if [ -n "${discover_result}" ]; then
        suggestions=$(printf '%s\n' "${discover_result}" | python3 -c '
import json, sys

d = json.load(sys.stdin)
for m in (d.get("models") or [])[:5]:
    print(f" • {m}")
' 2>/dev/null) || suggestions=""
    fi

    if [ -n "${suggestions}" ]; then
        echo " 📋 Suggested models for your family:"
        printf '%s\n' "${suggestions}"
        echo ""
    fi

    echo " To find your model's JumpStart ID:"
    echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
    echo " --hub-content-type Model --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
    echo ""
    echo " Then run:"
    echo " ./do/tune --model <jumpstart-id> --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET}"
    echo ""
    echo " Or set it permanently in do/config:"
    echo " export TUNE_MODEL_ID=\"<jumpstart-id>\""
    echo ""
    echo " ┌─────────────────────────────────────────────────────────────┐"
    echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
    echo " │ ↓ │"
    echo " │ HuggingFace model (deploy) ←── do/adapter add │"
    echo " │ ↓ │"
    echo " │ vLLM loads adapter at runtime │"
    echo " └─────────────────────────────────────────────────────────────┘"
    echo ""
    echo " For custom training without JumpStart, see: ./do/train --help"
    exit 3
}
|
|
366
544
|
|
|
367
|
-
|
|
368
|
-
|
|
545
|
+
# ── _run_discover() ───────────────────────────────────────────────────────────
# Explicit --discover mode: query the JumpStart Hub and display tune-eligible models.
# Accepts an optional filter keyword to narrow results.
#
# Globals:   AWS_REGION, MODEL_FAMILY, HELPER_SCRIPT (all read)
# Arguments: $1 - optional filter keyword; when empty, MODEL_FAMILY (if set)
#                 is passed to the helper as --family instead
# Outputs:   the discovered model list, or troubleshooting guidance, on stdout
# Returns:   0 after printing results (including the "none found" case)
# Exits:     1 when the helper script fails or a temp file cannot be created
_run_discover() {
    local filter="${1:-}"

    echo ""
    echo "🔍 Discovering tune-eligible models in SageMaker JumpStart Hub"
    echo " Region: ${AWS_REGION}"
    if [ -n "${filter}" ]; then
        echo " Filter: ${filter}"
    elif [ -n "${MODEL_FAMILY:-}" ]; then
        echo " Family: ${MODEL_FAMILY}"
    fi
    echo ""

    # Build discover arguments
    local discover_args=(--region "${AWS_REGION}")
    if [ -n "${filter}" ]; then
        discover_args+=(--filter "${filter}")
    elif [ -n "${MODEL_FAMILY:-}" ]; then
        discover_args+=(--family "${MODEL_FAMILY}")
    fi

    # Call helper script discover subcommand.
    # stderr is captured separately from stdout: mixing the two (2>&1) lets
    # any warning the helper prints corrupt the JSON payload, which would then
    # be misreported below as "no models found".
    local discover_stderr
    discover_stderr=$(mktemp) || {
        echo "❌ Discovery failed"
        echo " Could not create a temporary file."
        exit 1
    }

    local discover_result
    local discover_rc=0
    discover_result=$(python3 "${HELPER_SCRIPT}" discover "${discover_args[@]}" 2>"${discover_stderr}") || discover_rc=$?

    if [ "${discover_rc}" -ne 0 ]; then
        echo "❌ Discovery failed"
        if [ -s "${discover_stderr}" ]; then
            sed 's/^/ /' "${discover_stderr}"
        fi
        if [ -n "${discover_result}" ]; then
            echo " ${discover_result}"
        fi
        rm -f -- "${discover_stderr}"
        echo ""
        echo " Ensure AWS credentials are configured and you have sagemaker:ListHubContents permission."
        echo ""
        echo " Manual alternative:"
        echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
        echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
        exit 1
    fi
    rm -f -- "${discover_stderr}"

    # Parse and display results; an unparsable payload is treated as zero hits.
    local count
    count=$(printf '%s\n' "${discover_result}" | python3 -c '
import json, sys

d = json.load(sys.stdin)
print(d.get("count", 0))
' 2>/dev/null) || count="0"

    if [ "${count}" = "0" ]; then
        echo " No tune-eligible models found."
        echo ""
        echo " Try a different filter or check available models manually:"
        echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
        echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
    else
        echo " 📋 Tune-eligible models (${count} found):"
        echo ""
        printf '%s\n' "${discover_result}" | python3 -c '
import json, sys

d = json.load(sys.stdin)
for m in d.get("models", []):
    print(f" • {m}")
' 2>/dev/null
        echo ""
        echo " Use with:"
        echo " ./do/tune --model <id> --technique <sft|dpo|rlaif|rlvr> --dataset <source>"
        echo ""
        echo " Or set permanently in do/config:"
        echo " export TUNE_MODEL_ID=\"<id>\""
    fi
    echo ""
}
|
|
382
613
|
|
|
383
614
|
# ── _validate_technique() ─────────────────────────────────────────────────────
|
|
@@ -546,9 +777,17 @@ _validate_dataset() {
|
|
|
546
777
|
elif [[ "${dataset}" == hf://* ]]; then
|
|
547
778
|
# Hugging Face dataset — parse reference and stage to S3
|
|
548
779
|
local hf_path="${dataset#hf://}"
|
|
780
|
+
local hf_file=""
|
|
781
|
+
|
|
782
|
+
# Extract ?file= parameter before parsing path components
|
|
783
|
+
if [[ "${hf_path}" == *"?file="* ]]; then
|
|
784
|
+
hf_file="${hf_path#*?file=}"
|
|
785
|
+
hf_path="${hf_path%%\?file=*}"
|
|
786
|
+
fi
|
|
787
|
+
|
|
549
788
|
local hf_org hf_name hf_split
|
|
550
789
|
|
|
551
|
-
# Parse org/name/split
|
|
790
|
+
# Parse org/name/split from the cleaned path
|
|
552
791
|
hf_org=$(echo "${hf_path}" | cut -d'/' -f1)
|
|
553
792
|
hf_name=$(echo "${hf_path}" | cut -d'/' -f2)
|
|
554
793
|
hf_split=$(echo "${hf_path}" | cut -d'/' -f3-)
|
|
@@ -583,9 +822,16 @@ _validate_dataset() {
|
|
|
583
822
|
if [ -n "${HF_TOKEN_ARN:-}" ]; then
|
|
584
823
|
stage_args+=(--hf-secret-name "${HF_TOKEN_ARN}")
|
|
585
824
|
fi
|
|
825
|
+
if [ -n "${ARG_COLUMN_MAP}" ]; then
|
|
826
|
+
stage_args+=(--column-map "${ARG_COLUMN_MAP}")
|
|
827
|
+
fi
|
|
828
|
+
stage_args+=(--technique "${ARG_TECHNIQUE}")
|
|
829
|
+
if [ -n "${hf_file}" ]; then
|
|
830
|
+
stage_args+=(--hf-file "${hf_file}")
|
|
831
|
+
fi
|
|
586
832
|
|
|
587
833
|
local stage_result
|
|
588
|
-
stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}"
|
|
834
|
+
stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}") || {
|
|
589
835
|
local error_msg
|
|
590
836
|
error_msg=$(echo "${stage_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Failed to stage dataset'))" 2>/dev/null) || error_msg="Failed to stage HF dataset"
|
|
591
837
|
echo "❌ ${error_msg}"
|
|
@@ -663,7 +909,7 @@ _check_idempotency() {
|
|
|
663
909
|
return 0 # No existing job or --force: proceed with new job
|
|
664
910
|
fi
|
|
665
911
|
|
|
666
|
-
echo "🔍 Found existing ${ARG_TECHNIQUE
|
|
912
|
+
echo "🔍 Found existing $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") job: ${existing_job}"
|
|
667
913
|
|
|
668
914
|
# Query status via Python helper
|
|
669
915
|
local status_json
|
|
@@ -752,7 +998,26 @@ _submit_job() {
|
|
|
752
998
|
timestamp=$(date +%Y%m%d-%H%M%S)
|
|
753
999
|
JOB_NAME="${PROJECT_NAME}-tune-${ARG_TECHNIQUE}-${timestamp}"
|
|
754
1000
|
|
|
755
|
-
|
|
1001
|
+
# Check if model requires EULA acceptance (gated models from Meta, etc.)
|
|
1002
|
+
if [ "${ARG_ACCEPT_EULA}" != true ]; then
|
|
1003
|
+
local model_provider
|
|
1004
|
+
model_provider=$(python3 -c "
|
|
1005
|
+
import json
|
|
1006
|
+
with open('${CATALOG_FILE}') as f:
|
|
1007
|
+
catalog = json.load(f)
|
|
1008
|
+
entry = catalog.get('models', {}).get('${RESOLVED_MODEL_ID}', {})
|
|
1009
|
+
print(entry.get('provider', ''))
|
|
1010
|
+
" 2>/dev/null) || model_provider=""
|
|
1011
|
+
if [ "${model_provider}" = "meta" ]; then
|
|
1012
|
+
echo "⚠️ ${RESOLVED_MODEL_ID} is a gated model that requires EULA acceptance."
|
|
1013
|
+
echo " Add --accept-eula to proceed:"
|
|
1014
|
+
echo " ./do/tune --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET} --accept-eula"
|
|
1015
|
+
echo ""
|
|
1016
|
+
exit 1
|
|
1017
|
+
fi
|
|
1018
|
+
fi
|
|
1019
|
+
|
|
1020
|
+
echo "🚀 Submitting $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") customization job"
|
|
756
1021
|
echo " Job name: ${JOB_NAME}"
|
|
757
1022
|
echo " Model: ${RESOLVED_MODEL_ID}"
|
|
758
1023
|
echo " Technique: ${ARG_TECHNIQUE}"
|
|
@@ -764,6 +1029,7 @@ _submit_job() {
|
|
|
764
1029
|
# Build submit arguments
|
|
765
1030
|
local submit_args=(
|
|
766
1031
|
--model-id "${RESOLVED_MODEL_ID}"
|
|
1032
|
+
--region "${AWS_REGION}"
|
|
767
1033
|
--technique "${ARG_TECHNIQUE}"
|
|
768
1034
|
--training-type "${ARG_TRAINING_TYPE}"
|
|
769
1035
|
--dataset-s3-uri "${RESOLVED_DATASET_S3_URI}"
|
|
@@ -801,15 +1067,54 @@ _submit_job() {
|
|
|
801
1067
|
if [ -n "${ARG_REWARD_PROMPT}" ]; then
|
|
802
1068
|
submit_args+=(--reward-prompt "${ARG_REWARD_PROMPT}")
|
|
803
1069
|
fi
|
|
1070
|
+
if [ "${ARG_ACCEPT_EULA}" = true ]; then
|
|
1071
|
+
submit_args+=(--accept-eula)
|
|
1072
|
+
fi
|
|
804
1073
|
|
|
805
|
-
# Invoke Python helper
|
|
1074
|
+
# Invoke Python helper (stderr visible to user for diagnostics)
|
|
806
1075
|
local submit_result
|
|
807
|
-
|
|
1076
|
+
local submit_stderr
|
|
1077
|
+
submit_stderr=$(mktemp)
|
|
1078
|
+
submit_result=$(python3 "${HELPER_SCRIPT}" submit "${submit_args[@]}" 2>"${submit_stderr}") || {
|
|
808
1079
|
echo "❌ Failed to submit customization job"
|
|
809
|
-
echo "
|
|
1080
|
+
echo " Model ID used: ${RESOLVED_MODEL_ID}"
|
|
1081
|
+
echo ""
|
|
1082
|
+
# Show stderr from helper script
|
|
1083
|
+
if [ -s "${submit_stderr}" ]; then
|
|
1084
|
+
echo " Error output:"
|
|
1085
|
+
sed 's/^/ /' "${submit_stderr}"
|
|
1086
|
+
echo ""
|
|
1087
|
+
# Check for ResourceNotFound and suggest verification
|
|
1088
|
+
if grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found" "${submit_stderr}"; then
|
|
1089
|
+
echo " 💡 The model ID may not exist in the JumpStart Hub."
|
|
1090
|
+
echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
1091
|
+
echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
|
|
1092
|
+
echo ""
|
|
1093
|
+
fi
|
|
1094
|
+
fi
|
|
1095
|
+
# Show stdout error JSON if available
|
|
1096
|
+
if [ -n "${submit_result:-}" ]; then
|
|
1097
|
+
local err_msg
|
|
1098
|
+
err_msg=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',''))" 2>/dev/null) || err_msg=""
|
|
1099
|
+
if [ -n "${err_msg}" ]; then
|
|
1100
|
+
echo " SDK error: ${err_msg}"
|
|
1101
|
+
echo ""
|
|
1102
|
+
fi
|
|
1103
|
+
fi
|
|
1104
|
+
rm -f "${submit_stderr}"
|
|
810
1105
|
exit 1
|
|
811
1106
|
}
|
|
812
1107
|
|
|
1108
|
+
# Show any stderr warnings from helper script (non-fatal)
|
|
1109
|
+
if [ -s "${submit_stderr}" ]; then
|
|
1110
|
+
sed 's/^/ ⚠️ /' "${submit_stderr}"
|
|
1111
|
+
fi
|
|
1112
|
+
rm -f "${submit_stderr}"
|
|
1113
|
+
|
|
1114
|
+
# SDK may print status lines to stdout before our JSON (e.g., "Training Job Name: ...")
|
|
1115
|
+
# Extract only the JSON line (last line starting with '{')
|
|
1116
|
+
submit_result=$(echo "${submit_result}" | grep '^{' | tail -1)
|
|
1117
|
+
|
|
813
1118
|
# Check for error in response
|
|
814
1119
|
local has_error
|
|
815
1120
|
has_error=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print('yes' if 'error' in d else 'no')" 2>/dev/null) || has_error="yes"
|
|
@@ -818,6 +1123,14 @@ _submit_job() {
|
|
|
818
1123
|
local error_msg
|
|
819
1124
|
error_msg=$(echo "${submit_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Unknown error'))" 2>/dev/null) || error_msg="Unknown error"
|
|
820
1125
|
echo "❌ ${error_msg}"
|
|
1126
|
+
echo " Model ID used: ${RESOLVED_MODEL_ID}"
|
|
1127
|
+
# Check for ResourceNotFound in the error message
|
|
1128
|
+
if echo "${error_msg}" | grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found"; then
|
|
1129
|
+
echo ""
|
|
1130
|
+
echo " 💡 The model ID may not exist in the JumpStart Hub."
|
|
1131
|
+
echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
|
|
1132
|
+
echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
|
|
1133
|
+
fi
|
|
821
1134
|
exit 1
|
|
822
1135
|
fi
|
|
823
1136
|
|
|
@@ -1084,6 +1397,12 @@ if [ "${ARG_STATUS}" = true ]; then
|
|
|
1084
1397
|
_show_status
|
|
1085
1398
|
fi
|
|
1086
1399
|
|
|
1400
|
+
# Handle --discover flag (before requiring --technique and --dataset)
|
|
1401
|
+
if [ "${ARG_DISCOVER}" = true ]; then
|
|
1402
|
+
_run_discover "${ARG_DISCOVER_FILTER}"
|
|
1403
|
+
exit 0
|
|
1404
|
+
fi
|
|
1405
|
+
|
|
1087
1406
|
# Validate required arguments for job submission
|
|
1088
1407
|
if [ -z "${ARG_TECHNIQUE}" ]; then
|
|
1089
1408
|
echo "❌ --technique is required"
|
|
@@ -1099,11 +1418,14 @@ if [ -z "${ARG_DATASET}" ]; then
|
|
|
1099
1418
|
exit 1
|
|
1100
1419
|
fi
|
|
1101
1420
|
|
|
1102
|
-
#
|
|
1103
|
-
if [ "${TUNE_SUPPORTED:-}"
|
|
1104
|
-
echo "⚠️ Managed customization is not supported for the configured model."
|
|
1105
|
-
echo " Checking catalog for current support..."
|
|
1421
|
+
# Golden-path gating — check TUNE_SUPPORTED before any model resolution
|
|
1422
|
+
if [ "${TUNE_SUPPORTED:-}" != "true" ]; then
|
|
1106
1423
|
echo ""
|
|
1424
|
+
echo "❌ Managed fine-tuning is not available for this model family."
|
|
1425
|
+
echo ""
|
|
1426
|
+
echo " Use ./do/train for custom fine-tuning."
|
|
1427
|
+
echo ""
|
|
1428
|
+
exit 1
|
|
1107
1429
|
fi
|
|
1108
1430
|
|
|
1109
1431
|
# Validate Python availability
|
|
@@ -1117,7 +1439,8 @@ fi
|
|
|
1117
1439
|
echo "🔧 SageMaker AI Managed Model Customization"
|
|
1118
1440
|
echo ""
|
|
1119
1441
|
|
|
1120
|
-
|
|
1442
|
+
_check_catalog_staleness
|
|
1443
|
+
_resolve_tune_model
|
|
1121
1444
|
_validate_technique
|
|
1122
1445
|
_validate_training_type
|
|
1123
1446
|
_validate_dataset
|