@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
package/templates/do/tune CHANGED
@@ -13,6 +13,10 @@ set -o pipefail
13
13
  # ── Source project configuration ──────────────────────────────────────────────
14
14
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
15
15
  source "${SCRIPT_DIR}/config"
16
+ source "${SCRIPT_DIR}/lib/profile.sh"
17
+
18
+ # ── Profile-resolved variables (env var > profile > default) ──────────────────
19
+ TUNE_S3_BUCKET="${TUNE_S3_BUCKET:-mlcc-tune-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
16
20
 
17
21
  # ── Constants ─────────────────────────────────────────────────────────────────
18
22
  CATALOG_FILE="${SCRIPT_DIR}/.tune_catalog.json"
@@ -40,6 +44,11 @@ ARG_STATUS=false
40
44
  ARG_HELP=false
41
45
  ARG_DRY_RUN=false
42
46
  ARG_LIST_MODELS=false
47
+ ARG_NO_STALE_WARNING=false
48
+ ARG_DISCOVER=false
49
+ ARG_DISCOVER_FILTER=""
50
+ ARG_COLUMN_MAP=""
51
+ ARG_ACCEPT_EULA=false
43
52
 
44
53
 
45
54
  # ── _parse_args() ─────────────────────────────────────────────────────────────
@@ -132,11 +141,27 @@ _parse_args() {
132
141
  fi
133
142
  ARG_ROLE="$2"; shift 2 ;;
134
143
  --force) ARG_FORCE=true; shift ;;
144
+ --accept-eula) ARG_ACCEPT_EULA=true; shift ;;
135
145
  --no-wait) ARG_NO_WAIT=true; shift ;;
136
146
  --status) ARG_STATUS=true; shift ;;
137
147
  --help|-h) ARG_HELP=true; shift ;;
138
148
  --dry-run) ARG_DRY_RUN=true; shift ;;
139
149
  --list-models) ARG_LIST_MODELS=true; shift ;;
150
+ --no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
151
+ --column-map)
152
+ if [ -z "${2:-}" ]; then
153
+ echo "❌ --column-map requires a value (e.g., prompt=question,completion=answer)"
154
+ exit 1
155
+ fi
156
+ ARG_COLUMN_MAP="$2"; shift 2 ;;
157
+ --discover)
158
+ ARG_DISCOVER=true
159
+ shift
160
+ if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
161
+ ARG_DISCOVER_FILTER="$1"
162
+ shift
163
+ fi
164
+ ;;
140
165
  *)
141
166
  echo "❌ Unknown option: $1"
142
167
  echo " Run ./do/tune --help for usage."
@@ -150,6 +175,8 @@ _parse_args() {
150
175
  # ── _show_help() ──────────────────────────────────────────────────────────────
151
176
  _show_help() {
152
177
  echo "Usage: ./do/tune --technique <technique> --dataset <source> [options]"
178
+ echo " ./do/tune --model <id> --technique <technique> --dataset <source>"
179
+ echo " ./do/tune --discover [filter]"
153
180
  echo " ./do/tune --status"
154
181
  echo " ./do/tune --list-models"
155
182
  echo " ./do/tune --help"
@@ -157,10 +184,48 @@ _show_help() {
157
184
  echo "SageMaker AI Managed Model Customization — fine-tune supported foundation"
158
185
  echo "models using SFT, DPO, RLAIF, or RLVR without managing infrastructure."
159
186
  echo ""
187
+ echo "How it works:"
188
+ echo ""
189
+ echo " ┌─────────────────────────────────────────────────────────────────┐"
190
+ echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
191
+ echo " │ ↓ │"
192
+ echo " │ HuggingFace model (deploy) ←──── do/adapter add │"
193
+ echo " │ ↓ │"
194
+ echo " │ vLLM loads adapter at runtime │"
195
+ echo " └─────────────────────────────────────────────────────────────────┘"
196
+ echo ""
197
+ echo " Managed fine-tuning uses a JumpStart Hub model (identified by TUNE_MODEL_ID)"
198
+ echo " to produce LoRA adapter weights. These adapters are then attached to your"
199
+ echo " HuggingFace BYOC deployment via do/adapter add — no redeployment needed."
200
+ echo ""
201
+ echo "Supported model families:"
202
+ echo " • qwen-2.5 (Alibaba) — Qwen 2.5 7B/14B/32B/72B Instruct"
203
+ echo " • qwen-3 (Alibaba) — Qwen 3 0.6B/1.7B/4B/8B/14B/32B"
204
+ echo " • llama-3 (Meta) — Llama 3.1 8B, 3.2 1B/3B, 3.3 70B Instruct"
205
+ echo " • deepseek-r1 (DeepSeek) — R1 Distill Llama 8B/70B, Qwen 1.5B/7B/14B/32B"
206
+ echo " • gpt-oss (OpenAI) — GPT-OSS 20B/120B"
207
+ echo ""
208
+ echo " Only models registered in the SageMaker JumpStart Hub support managed"
209
+ echo " fine-tuning. Not all HuggingFace models have a JumpStart equivalent."
210
+ echo ""
211
+ echo "Finding your JumpStart model ID:"
212
+ echo ""
213
+ echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
214
+ echo " --hub-content-type Model \\"
215
+ echo " --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
216
+ echo ""
217
+ echo " Or use: ./do/tune --discover [filter]"
218
+ echo ""
160
219
  echo "Required:"
161
220
  echo " --technique <t> Customization technique: sft, dpo, rlaif, rlvr"
162
221
  echo " --dataset <source> Dataset: s3://bucket/path.jsonl or hf://org/name[/split]"
163
222
  echo ""
223
+ echo "Model selection:"
224
+ echo " --model <id> JumpStart Hub content name to use for fine-tuning."
225
+ echo " Takes precedence over TUNE_MODEL_ID in do/config."
226
+ echo " Accepts the Hub content name as-is (no catalog lookup)."
227
+ echo " Example: --model huggingface-reasoning-qwen3-8b"
228
+ echo ""
164
229
  echo "Training type:"
165
230
  echo " --training-type <t> lora (default) or full-rank"
166
231
  echo ""
@@ -177,26 +242,55 @@ _show_help() {
177
242
  echo " --reward-prompt <uri> S3 URI for reward prompt file"
178
243
  echo ""
179
244
  echo "Overrides:"
180
- echo " --model <id> Override model (defaults to MODEL_ID from do/config)"
181
245
  echo " --output-bucket <b> Override output bucket (defaults to TUNE_S3_BUCKET)"
182
246
  echo " --role <arn> Override execution role (defaults to ROLE_ARN)"
183
247
  echo ""
184
248
  echo "Job control:"
185
249
  echo " --force Force new job even if one exists for this technique"
250
+ echo " --accept-eula Accept model EULA (required for gated models like Llama)"
186
251
  echo " --no-wait Submit and exit without polling for completion"
187
252
  echo " --status Show status of all tracked tune jobs"
188
253
  echo ""
254
+ echo "Discovery and diagnostics:"
255
+ echo " --discover [filter] Query JumpStart Hub for tune-eligible models."
256
+ echo " Without a filter, shows models for the current family."
257
+ echo " With a filter, narrows results by keyword."
258
+ echo " --no-stale-warning Suppress catalog staleness warnings (useful for CI)."
259
+ echo " Also suppressed by MCC_NO_STALE_WARNING=true env var."
260
+ echo ""
189
261
  echo "Informational:"
190
262
  echo " --help, -h Show this help message"
191
263
  echo " --dry-run Validate inputs and show what would be submitted"
192
264
  echo " --list-models Print supported models, techniques, and training types"
193
265
  echo ""
194
266
  echo "Examples:"
267
+ echo " # Fine-tune with pre-configured TUNE_MODEL_ID from do/config:"
195
268
  echo " ./do/tune --technique sft --dataset s3://my-bucket/train.jsonl"
269
+ echo ""
270
+ echo " # Override model ID directly:"
271
+ echo " ./do/tune --model huggingface-reasoning-qwen3-8b --technique sft --dataset s3://bucket/data.jsonl"
272
+ echo ""
273
+ echo " # Use a HuggingFace dataset:"
196
274
  echo " ./do/tune --technique dpo --dataset hf://my-org/pref-data --learning-rate 1e-5"
275
+ echo ""
276
+ echo " # Fine-tune a gated model (Meta Llama) — requires EULA acceptance:"
277
+ echo " ./do/tune --technique dpo --dataset hf://argilla/ultrafeedback-binarized-preferences-cleaned --accept-eula"
278
+ echo ""
279
+ echo " # Discover available models:"
280
+ echo " ./do/tune --discover # Models for current family"
281
+ echo " ./do/tune --discover qwen # Filter by keyword"
282
+ echo ""
283
+ echo " # Other:"
197
284
  echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --training-type full-rank"
198
- echo " ./do/tune --status"
199
285
  echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --dry-run"
286
+ echo " ./do/tune --status"
287
+ echo ""
288
+ echo "Configuration:"
289
+ echo " TUNE_MODEL_ID is set in do/config at generation time when a matching"
290
+ echo " JumpStart model is found for your HuggingFace model. If not set, use"
291
+ echo " --model <id> or run --discover to find the correct Hub content name."
292
+ echo ""
293
+ echo " For custom training without JumpStart, see: ./do/train --help"
200
294
  exit 0
201
295
  }
202
296
 
@@ -213,7 +307,7 @@ _show_status() {
213
307
 
214
308
  if [ -n "${job_name}" ]; then
215
309
  found_any=true
216
- echo " ${technique^^}:"
310
+ echo " $(echo "${technique}" | tr "[:lower:]" "[:upper:]"):"
217
311
  echo " Job: ${job_name}"
218
312
 
219
313
  # Query status via Python helper
@@ -318,66 +412,203 @@ _update_config_var() {
318
412
  fi
319
413
  }
320
414
 
321
- # ── _validate_model() ─────────────────────────────────────────────────────────
322
- # Read MODEL_ID from do/config (or --model override), check against catalog.
415
# ── _check_catalog_staleness() ─────────────────────────────────────────────────
# Warn when the tune catalog's lastSynced timestamp is older than a threshold.
#
# Globals read:
#   CATALOG_FILE                 path of the tune catalog JSON
#   MCC_CATALOG_STALENESS_DAYS   threshold in days (default: 90)
#   MCC_NO_STALE_WARNING         "true" suppresses the warning
#   ARG_NO_STALE_WARNING         true (from --no-stale-warning) suppresses it
# Output:  a single warning line on stdout when the catalog is stale.
# Returns: always 0 — the check is best-effort and must never block a tune run
#          (missing/unparseable catalog, missing lastSynced, or a non-numeric
#          threshold simply produce no warning).
_check_catalog_staleness() {
    if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
        return 0
    fi

    local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
    local days_old

    # The catalog path and threshold are handed to Python through argv instead
    # of being spliced into the program text, so quotes or backslashes in the
    # path cannot break (or inject into) the Python source.
    days_old=$(python3 -c '
import json, sys
from datetime import datetime, timezone

catalog_path, threshold = sys.argv[1], sys.argv[2]
try:
    with open(catalog_path) as f:
        catalog = json.load(f)
    stamp = catalog.get("lastSynced", "")
    if not stamp:
        sys.exit(0)
    synced = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
    if synced.tzinfo is None:
        # A timestamp without an offset is interpreted as UTC so that it can
        # be compared with the timezone-aware "now" below.
        synced = synced.replace(tzinfo=timezone.utc)
    days = (datetime.now(timezone.utc) - synced).days
    if days > int(threshold):
        print(days)
except (OSError, ValueError, TypeError, AttributeError):
    # Best-effort: unreadable/invalid catalog or threshold means no warning.
    pass
' "${CATALOG_FILE}" "${threshold}" 2>/dev/null) || days_old=""

    if [ -n "${days_old}" ]; then
        echo "⚠️ Tune catalog is ${days_old} days old. Run 'ml-container-creator bootstrap sync-model-families' to update."
    fi
    return 0
}
445
+
446
# ── _resolve_tune_model() ─────────────────────────────────────────────────────
# Resolve the JumpStart Hub content name for managed fine-tuning.
# Priority: --model flag > TUNE_MODEL_ID config > discovery
#
# Globals read:    ARG_MODEL, TUNE_MODEL_ID
# Globals written: RESOLVED_MODEL_ID (on success)
# Returns: 0 once RESOLVED_MODEL_ID is set.
# Exits:   1 when --model does not look like a Hub content name; when neither
#          source is set, control passes to _discover_and_guide.
_resolve_tune_model() {
    # Exactly the pattern quoted in the error message below. The {0,62} bound
    # keeps the check in step with what the message tells the user.
    local hub_name_re='^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$'

    # Priority 1: --model flag (format-check only, no catalog validation)
    if [ -n "${ARG_MODEL}" ]; then
        # [[ =~ ]] matches the whole value; a line-oriented grep would accept a
        # multi-line value as soon as any single line looked valid.
        if [[ ! "${ARG_MODEL}" =~ ${hub_name_re} ]]; then
            echo "❌ Invalid model ID format: ${ARG_MODEL}"
            echo " Hub content names must match: [a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}"
            exit 1
        fi
        RESOLVED_MODEL_ID="${ARG_MODEL}"
        return 0
    fi

    # Priority 2: TUNE_MODEL_ID from do/config
    if [ -n "${TUNE_MODEL_ID:-}" ]; then
        RESOLVED_MODEL_ID="${TUNE_MODEL_ID}"
        return 0
    fi

    # Priority 3: neither is set — attempt runtime discovery, then show guidance
    _discover_and_guide
}
471
+
472
# ── _discover_and_guide() ─────────────────────────────────────────────────────
# Shown when no JumpStart model ID is configured: prints an explanation, tries
# a best-effort Hub discovery through the helper script, then prints static
# guidance on how to supply a model ID.
#
# Globals read: MODEL_NAME, CATALOG_FILE, HELPER_SCRIPT, AWS_REGION,
#               ARG_TECHNIQUE, ARG_DATASET
# Output:  guidance text on stdout.
# Exits:   always 3 — this function never returns to its caller.
_discover_and_guide() {
    echo ""
    echo "🔧 SageMaker AI Managed Model Customization"
    echo ""
    echo " This feature uses SageMaker Serverless Fine-Tuning, which requires"
    echo " the model to be registered in the SageMaker JumpStart Hub."
    echo ""
    echo " Your deployed model: ${MODEL_NAME:-unknown} (HuggingFace BYOC)"
    echo " JumpStart model ID: (not configured)"
    echo ""

    # Derive model family from the catalog based on MODEL_NAME (HuggingFace ID).
    # Both values travel via argv rather than being pasted into the Python
    # source, so a quote in either one cannot break the lookup.
    local model_family=""
    if [ -f "${CATALOG_FILE}" ] && [ -n "${MODEL_NAME:-}" ]; then
        model_family=$(python3 -c '
import json, sys

catalog_path, model_name = sys.argv[1], sys.argv[2]
try:
    with open(catalog_path) as f:
        catalog = json.load(f)
    for entry in catalog.get("models", {}).values():
        if entry.get("huggingFaceId", "") == model_name:
            print(entry.get("family", ""))
            break
except (OSError, ValueError, AttributeError):
    # Best-effort lookup: an unreadable or malformed catalog yields no family.
    pass
' "${CATALOG_FILE}" "${MODEL_NAME}" 2>/dev/null) || model_family=""
    fi

    # Attempt runtime discovery via helper script (failures are ignored; the
    # static guidance below is printed either way).
    local discover_result=""
    if [ -f "${HELPER_SCRIPT}" ]; then
        discover_result=$(python3 "${HELPER_SCRIPT}" discover \
            --family "${model_family}" \
            --region "${AWS_REGION}" 2>/dev/null) || discover_result=""
    fi

    # Parse the helper's JSON once and keep at most five suggestions.
    local suggestions=""
    if [ -n "${discover_result}" ]; then
        suggestions=$(printf '%s' "${discover_result}" | python3 -c '
import json, sys

try:
    found = json.load(sys.stdin).get("models") or []
except (ValueError, AttributeError):
    found = []
for name in list(found)[:5]:
    print(f" • {name}")
' 2>/dev/null) || suggestions=""
    fi

    if [ -n "${suggestions}" ]; then
        echo " 📋 Suggested models for your family:"
        echo "${suggestions}"
        echo ""
    fi

    echo " To find your model's JumpStart ID:"
    echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
    echo " --hub-content-type Model --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
    echo ""
    echo " Then run:"
    echo " ./do/tune --model <jumpstart-id> --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET}"
    echo ""
    echo " Or set it permanently in do/config:"
    echo " export TUNE_MODEL_ID=\"<jumpstart-id>\""
    echo ""
    echo " ┌─────────────────────────────────────────────────────────────┐"
    echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
    echo " │ ↓ │"
    echo " │ HuggingFace model (deploy) ←── do/adapter add │"
    echo " │ ↓ │"
    echo " │ vLLM loads adapter at runtime │"
    echo " └─────────────────────────────────────────────────────────────┘"
    echo ""
    echo " For custom training without JumpStart, see: ./do/train --help"
    # Distinct exit code (3) for the "no model ID configured" outcome.
    exit 3
}
366
544
 
367
- if [ "${result}" = "SUPPORTED" ]; then
368
- return 0
545
# ── _run_discover() ───────────────────────────────────────────────────────────
# Explicit --discover mode: query the JumpStart Hub through the helper script
# and display tune-eligible models.
#
# Arguments: $1 - optional filter keyword; when empty, MODEL_FAMILY (if set)
#                 is passed to the helper as --family instead.
# Globals read: AWS_REGION, MODEL_FAMILY, HELPER_SCRIPT
# Output:  human-readable results on stdout.
# Returns: 0 after printing results (including the "none found" case).
# Exits:   1 when the helper script fails or a temp file cannot be created.
_run_discover() {
    local filter="${1:-}"

    echo ""
    echo "🔍 Discovering tune-eligible models in SageMaker JumpStart Hub"
    echo " Region: ${AWS_REGION}"
    if [ -n "${filter}" ]; then
        echo " Filter: ${filter}"
    elif [ -n "${MODEL_FAMILY:-}" ]; then
        echo " Family: ${MODEL_FAMILY}"
    fi
    echo ""

    # Build discover arguments
    local discover_args=(
        --region "${AWS_REGION}"
    )
    if [ -n "${filter}" ]; then
        discover_args+=(--filter "${filter}")
    elif [ -n "${MODEL_FAMILY:-}" ]; then
        discover_args+=(--family "${MODEL_FAMILY}")
    fi

    # Call helper script discover subcommand. stderr is captured separately:
    # mixing it into stdout would corrupt the JSON payload whenever the helper
    # emits a warning, and the result would be misreported as "no models".
    local discover_result discover_err
    discover_err=$(mktemp) || {
        echo "❌ Discovery failed"
        echo " Could not create a temporary file."
        exit 1
    }
    if ! discover_result=$(python3 "${HELPER_SCRIPT}" discover "${discover_args[@]}" 2>"${discover_err}"); then
        echo "❌ Discovery failed"
        if [ -s "${discover_err}" ]; then
            sed 's/^/ /' "${discover_err}"
        fi
        if [ -n "${discover_result}" ]; then
            echo " ${discover_result}"
        fi
        rm -f -- "${discover_err}"
        echo ""
        echo " Ensure AWS credentials are configured and you have sagemaker:ListHubContents permission."
        echo ""
        echo " Manual alternative:"
        echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
        echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
        exit 1
    fi
    rm -f -- "${discover_err}"

    # Parse and display results; an unparseable payload counts as zero models.
    local count
    count=$(echo "${discover_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count', 0))" 2>/dev/null) || count="0"

    if [ "${count}" = "0" ]; then
        echo " No tune-eligible models found."
        echo ""
        echo " Try a different filter or check available models manually:"
        echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
        echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
    else
        echo " 📋 Tune-eligible models (${count} found):"
        echo ""
        echo "${discover_result}" | python3 -c "
import sys, json
d = json.load(sys.stdin)
for m in d.get('models', []):
    print(f' • {m}')
" 2>/dev/null
        echo ""
        echo " Use with:"
        echo " ./do/tune --model <id> --technique <sft|dpo|rlaif|rlvr> --dataset <source>"
        echo ""
        echo " Or set permanently in do/config:"
        echo " export TUNE_MODEL_ID=\"<id>\""
    fi
    echo ""
}
382
613
 
383
614
  # ── _validate_technique() ─────────────────────────────────────────────────────
@@ -546,9 +777,17 @@ _validate_dataset() {
546
777
  elif [[ "${dataset}" == hf://* ]]; then
547
778
  # Hugging Face dataset — parse reference and stage to S3
548
779
  local hf_path="${dataset#hf://}"
780
+ local hf_file=""
781
+
782
+ # Extract ?file= parameter before parsing path components
783
+ if [[ "${hf_path}" == *"?file="* ]]; then
784
+ hf_file="${hf_path#*?file=}"
785
+ hf_path="${hf_path%%\?file=*}"
786
+ fi
787
+
549
788
  local hf_org hf_name hf_split
550
789
 
551
- # Parse org/name/split
790
+ # Parse org/name/split from the cleaned path
552
791
  hf_org=$(echo "${hf_path}" | cut -d'/' -f1)
553
792
  hf_name=$(echo "${hf_path}" | cut -d'/' -f2)
554
793
  hf_split=$(echo "${hf_path}" | cut -d'/' -f3-)
@@ -583,9 +822,16 @@ _validate_dataset() {
583
822
  if [ -n "${HF_TOKEN_ARN:-}" ]; then
584
823
  stage_args+=(--hf-secret-name "${HF_TOKEN_ARN}")
585
824
  fi
825
+ if [ -n "${ARG_COLUMN_MAP}" ]; then
826
+ stage_args+=(--column-map "${ARG_COLUMN_MAP}")
827
+ fi
828
+ stage_args+=(--technique "${ARG_TECHNIQUE}")
829
+ if [ -n "${hf_file}" ]; then
830
+ stage_args+=(--hf-file "${hf_file}")
831
+ fi
586
832
 
587
833
  local stage_result
588
- stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}" 2>/dev/null) || {
834
+ stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}") || {
589
835
  local error_msg
590
836
  error_msg=$(echo "${stage_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Failed to stage dataset'))" 2>/dev/null) || error_msg="Failed to stage HF dataset"
591
837
  echo "❌ ${error_msg}"
@@ -663,7 +909,7 @@ _check_idempotency() {
663
909
  return 0 # No existing job or --force: proceed with new job
664
910
  fi
665
911
 
666
- echo "🔍 Found existing ${ARG_TECHNIQUE^^} job: ${existing_job}"
912
+ echo "🔍 Found existing $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") job: ${existing_job}"
667
913
 
668
914
  # Query status via Python helper
669
915
  local status_json
@@ -752,7 +998,26 @@ _submit_job() {
752
998
  timestamp=$(date +%Y%m%d-%H%M%S)
753
999
  JOB_NAME="${PROJECT_NAME}-tune-${ARG_TECHNIQUE}-${timestamp}"
754
1000
 
755
- echo "🚀 Submitting ${ARG_TECHNIQUE^^} customization job"
1001
+ # Check if model requires EULA acceptance (gated models from Meta, etc.)
1002
+ if [ "${ARG_ACCEPT_EULA}" != true ]; then
1003
+ local model_provider
1004
+ model_provider=$(python3 -c "
1005
+ import json
1006
+ with open('${CATALOG_FILE}') as f:
1007
+ catalog = json.load(f)
1008
+ entry = catalog.get('models', {}).get('${RESOLVED_MODEL_ID}', {})
1009
+ print(entry.get('provider', ''))
1010
+ " 2>/dev/null) || model_provider=""
1011
+ if [ "${model_provider}" = "meta" ]; then
1012
+ echo "⚠️ ${RESOLVED_MODEL_ID} is a gated model that requires EULA acceptance."
1013
+ echo " Add --accept-eula to proceed:"
1014
+ echo " ./do/tune --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET} --accept-eula"
1015
+ echo ""
1016
+ exit 1
1017
+ fi
1018
+ fi
1019
+
1020
+ echo "🚀 Submitting $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") customization job"
756
1021
  echo " Job name: ${JOB_NAME}"
757
1022
  echo " Model: ${RESOLVED_MODEL_ID}"
758
1023
  echo " Technique: ${ARG_TECHNIQUE}"
@@ -764,6 +1029,7 @@ _submit_job() {
764
1029
  # Build submit arguments
765
1030
  local submit_args=(
766
1031
  --model-id "${RESOLVED_MODEL_ID}"
1032
+ --region "${AWS_REGION}"
767
1033
  --technique "${ARG_TECHNIQUE}"
768
1034
  --training-type "${ARG_TRAINING_TYPE}"
769
1035
  --dataset-s3-uri "${RESOLVED_DATASET_S3_URI}"
@@ -801,15 +1067,54 @@ _submit_job() {
801
1067
  if [ -n "${ARG_REWARD_PROMPT}" ]; then
802
1068
  submit_args+=(--reward-prompt "${ARG_REWARD_PROMPT}")
803
1069
  fi
1070
+ if [ "${ARG_ACCEPT_EULA}" = true ]; then
1071
+ submit_args+=(--accept-eula)
1072
+ fi
804
1073
 
805
- # Invoke Python helper
1074
+ # Invoke Python helper (stderr visible to user for diagnostics)
806
1075
  local submit_result
807
- submit_result=$(python3 "${HELPER_SCRIPT}" submit "${submit_args[@]}" 2>/dev/null) || {
1076
+ local submit_stderr
1077
+ submit_stderr=$(mktemp)
1078
+ submit_result=$(python3 "${HELPER_SCRIPT}" submit "${submit_args[@]}" 2>"${submit_stderr}") || {
808
1079
  echo "❌ Failed to submit customization job"
809
- echo " Ensure the SageMaker Python SDK is installed: pip install 'sagemaker>=2.232.0'"
1080
+ echo " Model ID used: ${RESOLVED_MODEL_ID}"
1081
+ echo ""
1082
+ # Show stderr from helper script
1083
+ if [ -s "${submit_stderr}" ]; then
1084
+ echo " Error output:"
1085
+ sed 's/^/ /' "${submit_stderr}"
1086
+ echo ""
1087
+ # Check for ResourceNotFound and suggest verification
1088
+ if grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found" "${submit_stderr}"; then
1089
+ echo " 💡 The model ID may not exist in the JumpStart Hub."
1090
+ echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
1091
+ echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
1092
+ echo ""
1093
+ fi
1094
+ fi
1095
+ # Show stdout error JSON if available
1096
+ if [ -n "${submit_result:-}" ]; then
1097
+ local err_msg
1098
+ err_msg=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',''))" 2>/dev/null) || err_msg=""
1099
+ if [ -n "${err_msg}" ]; then
1100
+ echo " SDK error: ${err_msg}"
1101
+ echo ""
1102
+ fi
1103
+ fi
1104
+ rm -f "${submit_stderr}"
810
1105
  exit 1
811
1106
  }
812
1107
 
1108
+ # Show any stderr warnings from helper script (non-fatal)
1109
+ if [ -s "${submit_stderr}" ]; then
1110
+ sed 's/^/ ⚠️ /' "${submit_stderr}"
1111
+ fi
1112
+ rm -f "${submit_stderr}"
1113
+
1114
+ # SDK may print status lines to stdout before our JSON (e.g., "Training Job Name: ...")
1115
+ # Extract only the JSON line (last line starting with '{')
1116
+ submit_result=$(echo "${submit_result}" | grep '^{' | tail -1)
1117
+
813
1118
  # Check for error in response
814
1119
  local has_error
815
1120
  has_error=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print('yes' if 'error' in d else 'no')" 2>/dev/null) || has_error="yes"
@@ -818,6 +1123,14 @@ _submit_job() {
818
1123
  local error_msg
819
1124
  error_msg=$(echo "${submit_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Unknown error'))" 2>/dev/null) || error_msg="Unknown error"
820
1125
  echo "❌ ${error_msg}"
1126
+ echo " Model ID used: ${RESOLVED_MODEL_ID}"
1127
+ # Check for ResourceNotFound in the error message
1128
+ if echo "${error_msg}" | grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found"; then
1129
+ echo ""
1130
+ echo " 💡 The model ID may not exist in the JumpStart Hub."
1131
+ echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
1132
+ echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
1133
+ fi
821
1134
  exit 1
822
1135
  fi
823
1136
 
@@ -1084,6 +1397,12 @@ if [ "${ARG_STATUS}" = true ]; then
1084
1397
  _show_status
1085
1398
  fi
1086
1399
 
1400
+ # Handle --discover flag (before requiring --technique and --dataset)
1401
+ if [ "${ARG_DISCOVER}" = true ]; then
1402
+ _run_discover "${ARG_DISCOVER_FILTER}"
1403
+ exit 0
1404
+ fi
1405
+
1087
1406
  # Validate required arguments for job submission
1088
1407
  if [ -z "${ARG_TECHNIQUE}" ]; then
1089
1408
  echo "❌ --technique is required"
@@ -1099,11 +1418,14 @@ if [ -z "${ARG_DATASET}" ]; then
1099
1418
  exit 1
1100
1419
  fi
1101
1420
 
1102
- # Check runtime support
1103
- if [ "${TUNE_SUPPORTED:-}" = "false" ]; then
1104
- echo "⚠️ Managed customization is not supported for the configured model."
1105
- echo " Checking catalog for current support..."
1421
+ # Golden-path gating — check TUNE_SUPPORTED before any model resolution
1422
+ if [ "${TUNE_SUPPORTED:-}" != "true" ]; then
1106
1423
  echo ""
1424
+ echo "❌ Managed fine-tuning is not available for this model family."
1425
+ echo ""
1426
+ echo " Use ./do/train for custom fine-tuning."
1427
+ echo ""
1428
+ exit 1
1107
1429
  fi
1108
1430
 
1109
1431
  # Validate Python availability
@@ -1117,7 +1439,8 @@ fi
1117
1439
  echo "🔧 SageMaker AI Managed Model Customization"
1118
1440
  echo ""
1119
1441
 
1120
- _validate_model
1442
+ _check_catalog_staleness
1443
+ _resolve_tune_model
1121
1444
  _validate_technique
1122
1445
  _validate_training_type
1123
1446
  _validate_dataset