@aws/ml-container-creator 0.10.0 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +5 -21
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +51 -66
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +138 -138
  23. package/servers/instance-sizer/lib/instance-ranker.js +76 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/model-servers.json +201 -3
  29. package/servers/lib/custom-validators.js +13 -13
  30. package/servers/lib/dynamic-resolver.js +4 -4
  31. package/servers/marketplace-picker/index.js +342 -0
  32. package/servers/marketplace-picker/manifest.json +14 -0
  33. package/servers/marketplace-picker/package.json +18 -0
  34. package/servers/model-picker/index.js +382 -382
  35. package/servers/region-picker/index.js +56 -56
  36. package/servers/workload-picker/LICENSE +202 -0
  37. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  38. package/servers/workload-picker/index.js +171 -0
  39. package/servers/workload-picker/manifest.json +16 -0
  40. package/servers/workload-picker/package.json +16 -0
  41. package/src/app.js +4 -2
  42. package/src/lib/bootstrap-command-handler.js +579 -14
  43. package/src/lib/bootstrap-config.js +36 -0
  44. package/src/lib/bootstrap-profile-manager.js +48 -41
  45. package/src/lib/ci-register-helpers.js +74 -0
  46. package/src/lib/config-loader.js +3 -0
  47. package/src/lib/config-manager.js +7 -0
  48. package/src/lib/cuda-resolver.js +17 -8
  49. package/src/lib/generated/cli-options.js +315 -315
  50. package/src/lib/generated/parameter-matrix.js +661 -661
  51. package/src/lib/generated/validation-rules.js +71 -71
  52. package/src/lib/path-prover-brain.js +607 -0
  53. package/src/lib/prompts/project-prompts.js +12 -0
  54. package/src/lib/template-variable-resolver.js +25 -1
  55. package/src/lib/tune-catalog-validator.js +37 -4
  56. package/templates/Dockerfile +9 -0
  57. package/templates/code/adapter_sidecar.py +444 -0
  58. package/templates/code/serve +6 -0
  59. package/templates/code/serve.d/vllm.ejs +1 -1
  60. package/templates/do/.benchmark_writer.py +1476 -0
  61. package/templates/do/.tune_helper.py +982 -57
  62. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  63. package/templates/do/adapter +149 -0
  64. package/templates/do/benchmark +639 -85
  65. package/templates/do/config +108 -5
  66. package/templates/do/deploy.d/managed-inference.ejs +192 -11
  67. package/templates/do/optimize +106 -37
  68. package/templates/do/register +89 -0
  69. package/templates/do/test +13 -0
  70. package/templates/do/tune +378 -59
  71. package/templates/do/validate +44 -4
package/templates/do/tune CHANGED
@@ -40,6 +40,11 @@ ARG_STATUS=false
40
40
  ARG_HELP=false
41
41
  ARG_DRY_RUN=false
42
42
  ARG_LIST_MODELS=false
43
+ ARG_NO_STALE_WARNING=false
44
+ ARG_DISCOVER=false
45
+ ARG_DISCOVER_FILTER=""
46
+ ARG_COLUMN_MAP=""
47
+ ARG_ACCEPT_EULA=false
43
48
 
44
49
 
45
50
  # ── _parse_args() ─────────────────────────────────────────────────────────────
@@ -132,11 +137,27 @@ _parse_args() {
132
137
  fi
133
138
  ARG_ROLE="$2"; shift 2 ;;
134
139
  --force) ARG_FORCE=true; shift ;;
140
+ --accept-eula) ARG_ACCEPT_EULA=true; shift ;;
135
141
  --no-wait) ARG_NO_WAIT=true; shift ;;
136
142
  --status) ARG_STATUS=true; shift ;;
137
143
  --help|-h) ARG_HELP=true; shift ;;
138
144
  --dry-run) ARG_DRY_RUN=true; shift ;;
139
145
  --list-models) ARG_LIST_MODELS=true; shift ;;
146
+ --no-stale-warning) ARG_NO_STALE_WARNING=true; shift ;;
147
+ --column-map)
148
+ if [ -z "${2:-}" ]; then
149
+ echo "❌ --column-map requires a value (e.g., prompt=question,completion=answer)"
150
+ exit 1
151
+ fi
152
+ ARG_COLUMN_MAP="$2"; shift 2 ;;
153
+ --discover)
154
+ ARG_DISCOVER=true
155
+ shift
156
+ if [ $# -gt 0 ] && [[ ! "$1" == --* ]]; then
157
+ ARG_DISCOVER_FILTER="$1"
158
+ shift
159
+ fi
160
+ ;;
140
161
  *)
141
162
  echo "❌ Unknown option: $1"
142
163
  echo " Run ./do/tune --help for usage."
@@ -150,6 +171,8 @@ _parse_args() {
150
171
  # ── _show_help() ──────────────────────────────────────────────────────────────
151
172
  _show_help() {
152
173
  echo "Usage: ./do/tune --technique <technique> --dataset <source> [options]"
174
+ echo " ./do/tune --model <id> --technique <technique> --dataset <source>"
175
+ echo " ./do/tune --discover [filter]"
153
176
  echo " ./do/tune --status"
154
177
  echo " ./do/tune --list-models"
155
178
  echo " ./do/tune --help"
@@ -157,10 +180,48 @@ _show_help() {
157
180
  echo "SageMaker AI Managed Model Customization — fine-tune supported foundation"
158
181
  echo "models using SFT, DPO, RLAIF, or RLVR without managing infrastructure."
159
182
  echo ""
183
+ echo "How it works:"
184
+ echo ""
185
+ echo " ┌─────────────────────────────────────────────────────────────────┐"
186
+ echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
187
+ echo " │ ↓ │"
188
+ echo " │ HuggingFace model (deploy) ←──── do/adapter add │"
189
+ echo " │ ↓ │"
190
+ echo " │ vLLM loads adapter at runtime │"
191
+ echo " └─────────────────────────────────────────────────────────────────┘"
192
+ echo ""
193
+ echo " Managed fine-tuning uses a JumpStart Hub model (identified by TUNE_MODEL_ID)"
194
+ echo " to produce LoRA adapter weights. These adapters are then attached to your"
195
+ echo " HuggingFace BYOC deployment via do/adapter add — no redeployment needed."
196
+ echo ""
197
+ echo "Supported model families:"
198
+ echo " • qwen-2.5 (Alibaba) — Qwen 2.5 7B/14B/32B/72B Instruct"
199
+ echo " • qwen-3 (Alibaba) — Qwen 3 0.6B/1.7B/4B/8B/14B/32B"
200
+ echo " • llama-3 (Meta) — Llama 3.1 8B, 3.2 1B/3B, 3.3 70B Instruct"
201
+ echo " • deepseek-r1 (DeepSeek) — R1 Distill Llama 8B/70B, Qwen 1.5B/7B/14B/32B"
202
+ echo " • gpt-oss (OpenAI) — GPT-OSS 20B/120B"
203
+ echo ""
204
+ echo " Only models registered in the SageMaker JumpStart Hub support managed"
205
+ echo " fine-tuning. Not all HuggingFace models have a JumpStart equivalent."
206
+ echo ""
207
+ echo "Finding your JumpStart model ID:"
208
+ echo ""
209
+ echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
210
+ echo " --hub-content-type Model \\"
211
+ echo " --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
212
+ echo ""
213
+ echo " Or use: ./do/tune --discover [filter]"
214
+ echo ""
160
215
  echo "Required:"
161
216
  echo " --technique <t> Customization technique: sft, dpo, rlaif, rlvr"
162
217
  echo " --dataset <source> Dataset: s3://bucket/path.jsonl or hf://org/name[/split]"
163
218
  echo ""
219
+ echo "Model selection:"
220
+ echo " --model <id> JumpStart Hub content name to use for fine-tuning."
221
+ echo " Takes precedence over TUNE_MODEL_ID in do/config."
222
+ echo " Accepts the Hub content name as-is (no catalog lookup)."
223
+ echo " Example: --model huggingface-reasoning-qwen3-8b"
224
+ echo ""
164
225
  echo "Training type:"
165
226
  echo " --training-type <t> lora (default) or full-rank"
166
227
  echo ""
@@ -177,26 +238,55 @@ _show_help() {
177
238
  echo " --reward-prompt <uri> S3 URI for reward prompt file"
178
239
  echo ""
179
240
  echo "Overrides:"
180
- echo " --model <id> Override model (defaults to MODEL_ID from do/config)"
181
241
  echo " --output-bucket <b> Override output bucket (defaults to TUNE_S3_BUCKET)"
182
242
  echo " --role <arn> Override execution role (defaults to ROLE_ARN)"
183
243
  echo ""
184
244
  echo "Job control:"
185
245
  echo " --force Force new job even if one exists for this technique"
246
+ echo " --accept-eula Accept model EULA (required for gated models like Llama)"
186
247
  echo " --no-wait Submit and exit without polling for completion"
187
248
  echo " --status Show status of all tracked tune jobs"
188
249
  echo ""
250
+ echo "Discovery and diagnostics:"
251
+ echo " --discover [filter] Query JumpStart Hub for tune-eligible models."
252
+ echo " Without a filter, shows models for the current family."
253
+ echo " With a filter, narrows results by keyword."
254
+ echo " --no-stale-warning Suppress catalog staleness warnings (useful for CI)."
255
+ echo " Also suppressed by MCC_NO_STALE_WARNING=true env var."
256
+ echo ""
189
257
  echo "Informational:"
190
258
  echo " --help, -h Show this help message"
191
259
  echo " --dry-run Validate inputs and show what would be submitted"
192
260
  echo " --list-models Print supported models, techniques, and training types"
193
261
  echo ""
194
262
  echo "Examples:"
263
+ echo " # Fine-tune with pre-configured TUNE_MODEL_ID from do/config:"
195
264
  echo " ./do/tune --technique sft --dataset s3://my-bucket/train.jsonl"
265
+ echo ""
266
+ echo " # Override model ID directly:"
267
+ echo " ./do/tune --model huggingface-reasoning-qwen3-8b --technique sft --dataset s3://bucket/data.jsonl"
268
+ echo ""
269
+ echo " # Use a HuggingFace dataset:"
196
270
  echo " ./do/tune --technique dpo --dataset hf://my-org/pref-data --learning-rate 1e-5"
271
+ echo ""
272
+ echo " # Fine-tune a gated model (Meta Llama) — requires EULA acceptance:"
273
+ echo " ./do/tune --technique dpo --dataset hf://argilla/ultrafeedback-binarized-preferences-cleaned --accept-eula"
274
+ echo ""
275
+ echo " # Discover available models:"
276
+ echo " ./do/tune --discover # Models for current family"
277
+ echo " ./do/tune --discover qwen # Filter by keyword"
278
+ echo ""
279
+ echo " # Other:"
197
280
  echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --training-type full-rank"
198
- echo " ./do/tune --status"
199
281
  echo " ./do/tune --technique sft --dataset s3://bucket/data.jsonl --dry-run"
282
+ echo " ./do/tune --status"
283
+ echo ""
284
+ echo "Configuration:"
285
+ echo " TUNE_MODEL_ID is set in do/config at generation time when a matching"
286
+ echo " JumpStart model is found for your HuggingFace model. If not set, use"
287
+ echo " --model <id> or run --discover to find the correct Hub content name."
288
+ echo ""
289
+ echo " For custom training without JumpStart, see: ./do/train --help"
200
290
  exit 0
201
291
  }
202
292
 
@@ -213,7 +303,7 @@ _show_status() {
213
303
 
214
304
  if [ -n "${job_name}" ]; then
215
305
  found_any=true
216
- echo " ${technique^^}:"
306
+ echo " $(echo "${technique}" | tr "[:lower:]" "[:upper:]"):"
217
307
  echo " Job: ${job_name}"
218
308
 
219
309
  # Query status via Python helper
@@ -318,66 +408,203 @@ _update_config_var() {
318
408
  fi
319
409
  }
320
410
 
321
- # ── _validate_model() ─────────────────────────────────────────────────────────
322
- # Read MODEL_ID from do/config (or --model override), check against catalog.
411
# ── _check_catalog_staleness() ─────────────────────────────────────────────────
# Warn if the tune catalog's lastSynced timestamp is older than the threshold.
# Configurable via MCC_CATALOG_STALENESS_DAYS (default: 90).
# Suppressed by --no-stale-warning flag or MCC_NO_STALE_WARNING=true env var.
#
# Globals read: CATALOG_FILE, MCC_CATALOG_STALENESS_DAYS, MCC_NO_STALE_WARNING,
#               ARG_NO_STALE_WARNING
# Outputs:      at most one warning line on stdout
# Returns:      always 0. The check is best-effort: a missing/unparseable
#               catalog, a missing lastSynced field, a non-numeric threshold or
#               an unavailable python3 all result in no warning.
_check_catalog_staleness() {
  if [ "${MCC_NO_STALE_WARNING:-}" = "true" ] || [ "${ARG_NO_STALE_WARNING:-false}" = true ]; then
    return 0
  fi

  local threshold="${MCC_CATALOG_STALENESS_DAYS:-90}"
  local days_old

  # The catalog path and threshold are passed as argv instead of being spliced
  # into the Python source, so quotes or backslashes in either value cannot
  # break (or inject into) the script.
  days_old=$(python3 -c '
import json, sys
from datetime import datetime, timezone

catalog_path, threshold = sys.argv[1], sys.argv[2]
try:
    with open(catalog_path) as f:
        catalog = json.load(f)
    stamp = catalog.get("lastSynced", "")
    if not stamp:
        sys.exit(0)
    synced = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
    # A timestamp without an offset cannot be subtracted from an aware
    # datetime; treat it as UTC rather than silently skipping the check.
    if synced.tzinfo is None:
        synced = synced.replace(tzinfo=timezone.utc)
    age_days = (datetime.now(timezone.utc) - synced).days
    if age_days > int(threshold):
        print(age_days)
except Exception:
    # Best-effort diagnostic: never block a tune run on catalog problems.
    pass
' "${CATALOG_FILE:-}" "${threshold}" 2>/dev/null) || days_old=""

  if [ -n "${days_old}" ]; then
    echo "⚠️ Tune catalog is ${days_old} days old. Run 'ml-container-creator bootstrap sync-model-families' to update."
  fi
}
441
+
442
# ── _resolve_tune_model() ─────────────────────────────────────────────────────
# Resolve the JumpStart Hub content name for managed fine-tuning.
# Priority: --model flag > TUNE_MODEL_ID config > discovery
# Sets RESOLVED_MODEL_ID on success.
#
# Globals read: ARG_MODEL, TUNE_MODEL_ID
# Globals set:  RESOLVED_MODEL_ID
# Returns:      0 when a model ID was resolved.
# Exits:        1 when --model does not match the Hub content name format.
#               When neither source is set, control is handed to
#               _discover_and_guide.
_resolve_tune_model() {
  # Same pattern that the error message below advertises, including the
  # length bound. Kept in a variable so it is used unquoted with =~.
  local hub_name_re='^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$'

  # Priority 1: --model flag (format-check only, no catalog validation)
  if [ -n "${ARG_MODEL:-}" ]; then
    # [[ =~ ]] anchors against the whole value; piping through grep would
    # accept a multi-line value as soon as any single line matched.
    if [[ ! "${ARG_MODEL}" =~ ${hub_name_re} ]]; then
      echo "❌ Invalid model ID format: ${ARG_MODEL}"
      echo " Hub content names must match: [a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}"
      exit 1
    fi
    RESOLVED_MODEL_ID="${ARG_MODEL}"
    return 0
  fi

  # Priority 2: TUNE_MODEL_ID from do/config
  if [ -n "${TUNE_MODEL_ID:-}" ]; then
    RESOLVED_MODEL_ID="${TUNE_MODEL_ID}"
    return 0
  fi

  # Priority 3: Neither set — attempt runtime discovery, then show guidance
  _discover_and_guide
}
467
+
468
# ── _discover_and_guide() ─────────────────────────────────────────────────────
# Display guidance when no model ID is configured and attempt runtime discovery.
# Attempts Hub discovery via helper script, falls back to static guidance on failure.
#
# Globals read: MODEL_NAME, CATALOG_FILE, HELPER_SCRIPT, AWS_REGION,
#               ARG_TECHNIQUE, ARG_DATASET
# Outputs:      guidance text (and up to five suggested models) on stdout
# Exits:        always 3 — this function never returns to its caller.
_discover_and_guide() {
  echo ""
  echo "🔧 SageMaker AI Managed Model Customization"
  echo ""
  echo " This feature uses SageMaker Serverless Fine-Tuning, which requires"
  echo " the model to be registered in the SageMaker JumpStart Hub."
  echo ""
  echo " Your deployed model: ${MODEL_NAME:-unknown} (HuggingFace BYOC)"
  echo " JumpStart model ID: (not configured)"
  echo ""

  # Derive model family from the catalog based on MODEL_NAME (HuggingFace ID).
  # Values are passed as argv, not interpolated into the Python source, so a
  # quote in the model name or catalog path cannot break or alter the script.
  local model_family=""
  if [ -f "${CATALOG_FILE:-}" ] && [ -n "${MODEL_NAME:-}" ]; then
    model_family=$(python3 -c '
import json, sys

catalog_path, model_name = sys.argv[1], sys.argv[2]
try:
    with open(catalog_path) as f:
        catalog = json.load(f)
    for entry in catalog.get("models", {}).values():
        if entry.get("huggingFaceId", "") == model_name:
            print(entry.get("family", ""))
            break
except Exception:
    # Best-effort lookup: an unreadable catalog just means "family unknown".
    pass
' "${CATALOG_FILE}" "${MODEL_NAME}" 2>/dev/null) || model_family=""
  fi

  # Attempt runtime discovery via helper script. Failures are deliberately
  # silent here: the static guidance below is the fallback.
  local suggestions=""
  if [ -f "${HELPER_SCRIPT:-}" ]; then
    local discover_args=()
    # Only constrain by family when one was actually resolved; an empty
    # --family value is not a meaningful filter.
    if [ -n "${model_family}" ]; then
      discover_args+=(--family "${model_family}")
    fi
    discover_args+=(--region "${AWS_REGION}")

    local discover_result
    discover_result=$(python3 "${HELPER_SCRIPT}" discover "${discover_args[@]}" 2>/dev/null) || discover_result=""

    if [ -n "${discover_result}" ]; then
      # Parse once; an empty result means "nothing to suggest".
      suggestions=$(printf '%s\n' "${discover_result}" | python3 -c '
import json, sys

d = json.load(sys.stdin)
for m in d.get("models", [])[:5]:
    print(f" • {m}")
' 2>/dev/null) || suggestions=""
    fi
  fi

  if [ -n "${suggestions}" ]; then
    echo " 📋 Suggested models for your family:"
    printf '%s\n' "${suggestions}"
    echo ""
  fi

  echo " To find your model's JumpStart ID:"
  echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
  echo " --hub-content-type Model --query \"HubContentSummaries[?contains(HubContentName,'<family>')].HubContentName\""
  echo ""
  echo " Then run:"
  echo " ./do/tune --model <jumpstart-id> --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET}"
  echo ""
  echo " Or set it permanently in do/config:"
  echo " export TUNE_MODEL_ID=\"<jumpstart-id>\""
  echo ""
  echo " ┌─────────────────────────────────────────────────────────────┐"
  echo " │ JumpStart model (tune) ──→ LoRA adapter weights (S3) │"
  echo " │ ↓ │"
  echo " │ HuggingFace model (deploy) ←── do/adapter add │"
  echo " │ ↓ │"
  echo " │ vLLM loads adapter at runtime │"
  echo " └─────────────────────────────────────────────────────────────┘"
  echo ""
  echo " For custom training without JumpStart, see: ./do/train --help"
  exit 3
}
366
540
 
367
- if [ "${result}" = "SUPPORTED" ]; then
368
- return 0
541
# ── _run_discover() ───────────────────────────────────────────────────────────
# Explicit --discover mode: query the JumpStart Hub and display tune-eligible models.
# Accepts an optional filter keyword to narrow results.
#
# Arguments:    $1 - optional filter keyword; when empty, MODEL_FAMILY (if set)
#                    is used to narrow the query instead
# Globals read: AWS_REGION, MODEL_FAMILY, HELPER_SCRIPT
# Outputs:      model listing and usage hints on stdout; helper warnings on stderr
# Returns:      0 after printing results (including the "none found" case)
# Exits:        1 when the helper script fails
_run_discover() {
  local filter="${1:-}"

  echo ""
  echo "🔍 Discovering tune-eligible models in SageMaker JumpStart Hub"
  echo " Region: ${AWS_REGION}"

  # Build discover arguments alongside the banner lines that describe them.
  local discover_args=(--region "${AWS_REGION}")
  if [ -n "${filter}" ]; then
    echo " Filter: ${filter}"
    discover_args+=(--filter "${filter}")
  elif [ -n "${MODEL_FAMILY:-}" ]; then
    echo " Family: ${MODEL_FAMILY}"
    discover_args+=(--family "${MODEL_FAMILY}")
  fi
  echo ""

  # Capture stderr separately from stdout. Mixing them (2>&1) lets any SDK
  # warning corrupt the JSON payload, which would then be misreported as
  # "no models found".
  local stderr_file
  stderr_file=$(mktemp) || {
    echo "❌ Discovery failed"
    echo " Could not create a temporary file"
    exit 1
  }

  # Call helper script discover subcommand
  local discover_result
  discover_result=$(python3 "${HELPER_SCRIPT}" discover "${discover_args[@]}" 2>"${stderr_file}") || {
    echo "❌ Discovery failed"
    if [ -s "${stderr_file}" ]; then
      sed 's/^/ /' "${stderr_file}"
    fi
    if [ -n "${discover_result}" ]; then
      echo " ${discover_result}"
    fi
    rm -f -- "${stderr_file}"
    echo ""
    echo " Ensure AWS credentials are configured and you have sagemaker:ListHubContents permission."
    echo ""
    echo " Manual alternative:"
    echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
    echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
    exit 1
  }

  # Non-fatal helper diagnostics stay visible, but on stderr where they
  # cannot interfere with the result parsing below.
  if [ -s "${stderr_file}" ]; then
    cat -- "${stderr_file}" >&2
  fi
  rm -f -- "${stderr_file}"

  # Parse and display results
  local count
  count=$(printf '%s\n' "${discover_result}" \
    | python3 -c 'import sys,json; d=json.load(sys.stdin); print(d.get("count", 0))' 2>/dev/null) || count="0"

  if [ "${count}" = "0" ]; then
    echo " No tune-eligible models found."
    echo ""
    echo " Try a different filter or check available models manually:"
    echo " aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
    echo " --hub-content-type Model --query \"HubContentSummaries[].HubContentName\""
  else
    echo " 📋 Tune-eligible models (${count} found):"
    echo ""
    printf '%s\n' "${discover_result}" | python3 -c '
import sys, json

d = json.load(sys.stdin)
for m in d.get("models", []):
    print(f" • {m}")
' 2>/dev/null
    echo ""
    echo " Use with:"
    echo " ./do/tune --model <id> --technique <sft|dpo|rlaif|rlvr> --dataset <source>"
    echo ""
    echo " Or set permanently in do/config:"
    echo " export TUNE_MODEL_ID=\"<id>\""
  fi
  echo ""
}
382
609
 
383
610
  # ── _validate_technique() ─────────────────────────────────────────────────────
@@ -546,9 +773,17 @@ _validate_dataset() {
546
773
  elif [[ "${dataset}" == hf://* ]]; then
547
774
  # Hugging Face dataset — parse reference and stage to S3
548
775
  local hf_path="${dataset#hf://}"
776
+ local hf_file=""
777
+
778
+ # Extract ?file= parameter before parsing path components
779
+ if [[ "${hf_path}" == *"?file="* ]]; then
780
+ hf_file="${hf_path#*?file=}"
781
+ hf_path="${hf_path%%\?file=*}"
782
+ fi
783
+
549
784
  local hf_org hf_name hf_split
550
785
 
551
- # Parse org/name/split
786
+ # Parse org/name/split from the cleaned path
552
787
  hf_org=$(echo "${hf_path}" | cut -d'/' -f1)
553
788
  hf_name=$(echo "${hf_path}" | cut -d'/' -f2)
554
789
  hf_split=$(echo "${hf_path}" | cut -d'/' -f3-)
@@ -583,9 +818,16 @@ _validate_dataset() {
583
818
  if [ -n "${HF_TOKEN_ARN:-}" ]; then
584
819
  stage_args+=(--hf-secret-name "${HF_TOKEN_ARN}")
585
820
  fi
821
+ if [ -n "${ARG_COLUMN_MAP}" ]; then
822
+ stage_args+=(--column-map "${ARG_COLUMN_MAP}")
823
+ fi
824
+ stage_args+=(--technique "${ARG_TECHNIQUE}")
825
+ if [ -n "${hf_file}" ]; then
826
+ stage_args+=(--hf-file "${hf_file}")
827
+ fi
586
828
 
587
829
  local stage_result
588
- stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}" 2>/dev/null) || {
830
+ stage_result=$(python3 "${HELPER_SCRIPT}" stage-hf "${stage_args[@]}") || {
589
831
  local error_msg
590
832
  error_msg=$(echo "${stage_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Failed to stage dataset'))" 2>/dev/null) || error_msg="Failed to stage HF dataset"
591
833
  echo "❌ ${error_msg}"
@@ -663,7 +905,7 @@ _check_idempotency() {
663
905
  return 0 # No existing job or --force: proceed with new job
664
906
  fi
665
907
 
666
- echo "🔍 Found existing ${ARG_TECHNIQUE^^} job: ${existing_job}"
908
+ echo "🔍 Found existing $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") job: ${existing_job}"
667
909
 
668
910
  # Query status via Python helper
669
911
  local status_json
@@ -752,7 +994,26 @@ _submit_job() {
752
994
  timestamp=$(date +%Y%m%d-%H%M%S)
753
995
  JOB_NAME="${PROJECT_NAME}-tune-${ARG_TECHNIQUE}-${timestamp}"
754
996
 
755
- echo "🚀 Submitting ${ARG_TECHNIQUE^^} customization job"
997
+ # Check if model requires EULA acceptance (gated models from Meta, etc.)
998
+ if [ "${ARG_ACCEPT_EULA}" != true ]; then
999
+ local model_provider
1000
+ model_provider=$(python3 -c "
1001
+ import json
1002
+ with open('${CATALOG_FILE}') as f:
1003
+ catalog = json.load(f)
1004
+ entry = catalog.get('models', {}).get('${RESOLVED_MODEL_ID}', {})
1005
+ print(entry.get('provider', ''))
1006
+ " 2>/dev/null) || model_provider=""
1007
+ if [ "${model_provider}" = "meta" ]; then
1008
+ echo "⚠️ ${RESOLVED_MODEL_ID} is a gated model that requires EULA acceptance."
1009
+ echo " Add --accept-eula to proceed:"
1010
+ echo " ./do/tune --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET} --accept-eula"
1011
+ echo ""
1012
+ exit 1
1013
+ fi
1014
+ fi
1015
+
1016
+ echo "🚀 Submitting $(echo "${ARG_TECHNIQUE}" | tr "[:lower:]" "[:upper:]") customization job"
756
1017
  echo " Job name: ${JOB_NAME}"
757
1018
  echo " Model: ${RESOLVED_MODEL_ID}"
758
1019
  echo " Technique: ${ARG_TECHNIQUE}"
@@ -764,6 +1025,7 @@ _submit_job() {
764
1025
  # Build submit arguments
765
1026
  local submit_args=(
766
1027
  --model-id "${RESOLVED_MODEL_ID}"
1028
+ --region "${AWS_REGION}"
767
1029
  --technique "${ARG_TECHNIQUE}"
768
1030
  --training-type "${ARG_TRAINING_TYPE}"
769
1031
  --dataset-s3-uri "${RESOLVED_DATASET_S3_URI}"
@@ -801,15 +1063,54 @@ _submit_job() {
801
1063
  if [ -n "${ARG_REWARD_PROMPT}" ]; then
802
1064
  submit_args+=(--reward-prompt "${ARG_REWARD_PROMPT}")
803
1065
  fi
1066
+ if [ "${ARG_ACCEPT_EULA}" = true ]; then
1067
+ submit_args+=(--accept-eula)
1068
+ fi
804
1069
 
805
- # Invoke Python helper
1070
+ # Invoke Python helper (stderr visible to user for diagnostics)
806
1071
  local submit_result
807
- submit_result=$(python3 "${HELPER_SCRIPT}" submit "${submit_args[@]}" 2>/dev/null) || {
1072
+ local submit_stderr
1073
+ submit_stderr=$(mktemp)
1074
+ submit_result=$(python3 "${HELPER_SCRIPT}" submit "${submit_args[@]}" 2>"${submit_stderr}") || {
808
1075
  echo "❌ Failed to submit customization job"
809
- echo " Ensure the SageMaker Python SDK is installed: pip install 'sagemaker>=2.232.0'"
1076
+ echo " Model ID used: ${RESOLVED_MODEL_ID}"
1077
+ echo ""
1078
+ # Show stderr from helper script
1079
+ if [ -s "${submit_stderr}" ]; then
1080
+ echo " Error output:"
1081
+ sed 's/^/ /' "${submit_stderr}"
1082
+ echo ""
1083
+ # Check for ResourceNotFound and suggest verification
1084
+ if grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found" "${submit_stderr}"; then
1085
+ echo " 💡 The model ID may not exist in the JumpStart Hub."
1086
+ echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
1087
+ echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
1088
+ echo ""
1089
+ fi
1090
+ fi
1091
+ # Show stdout error JSON if available
1092
+ if [ -n "${submit_result:-}" ]; then
1093
+ local err_msg
1094
+ err_msg=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',''))" 2>/dev/null) || err_msg=""
1095
+ if [ -n "${err_msg}" ]; then
1096
+ echo " SDK error: ${err_msg}"
1097
+ echo ""
1098
+ fi
1099
+ fi
1100
+ rm -f "${submit_stderr}"
810
1101
  exit 1
811
1102
  }
812
1103
 
1104
+ # Show any stderr warnings from helper script (non-fatal)
1105
+ if [ -s "${submit_stderr}" ]; then
1106
+ sed 's/^/ ⚠️ /' "${submit_stderr}"
1107
+ fi
1108
+ rm -f "${submit_stderr}"
1109
+
1110
+ # SDK may print status lines to stdout before our JSON (e.g., "Training Job Name: ...")
1111
+ # Extract only the JSON line (last line starting with '{')
1112
+ submit_result=$(echo "${submit_result}" | grep '^{' | tail -1)
1113
+
813
1114
  # Check for error in response
814
1115
  local has_error
815
1116
  has_error=$(echo "${submit_result}" | python3 -c "import sys,json; d=json.load(sys.stdin); print('yes' if 'error' in d else 'no')" 2>/dev/null) || has_error="yes"
@@ -818,6 +1119,14 @@ _submit_job() {
818
1119
  local error_msg
819
1120
  error_msg=$(echo "${submit_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','Unknown error'))" 2>/dev/null) || error_msg="Unknown error"
820
1121
  echo "❌ ${error_msg}"
1122
+ echo " Model ID used: ${RESOLVED_MODEL_ID}"
1123
+ # Check for ResourceNotFound in the error message
1124
+ if echo "${error_msg}" | grep -qi "ResourceNotFound\|ResourceNotFoundException\|not found"; then
1125
+ echo ""
1126
+ echo " 💡 The model ID may not exist in the JumpStart Hub."
1127
+ echo " Verify with: aws sagemaker list-hub-contents --hub-name SageMakerPublicHub \\"
1128
+ echo " --hub-content-type Model --name-contains \"${RESOLVED_MODEL_ID}\" --region ${AWS_REGION}"
1129
+ fi
821
1130
  exit 1
822
1131
  fi
823
1132
 
@@ -1084,6 +1393,12 @@ if [ "${ARG_STATUS}" = true ]; then
1084
1393
  _show_status
1085
1394
  fi
1086
1395
 
1396
+ # Handle --discover flag (before requiring --technique and --dataset)
1397
+ if [ "${ARG_DISCOVER}" = true ]; then
1398
+ _run_discover "${ARG_DISCOVER_FILTER}"
1399
+ exit 0
1400
+ fi
1401
+
1087
1402
  # Validate required arguments for job submission
1088
1403
  if [ -z "${ARG_TECHNIQUE}" ]; then
1089
1404
  echo "❌ --technique is required"
@@ -1099,11 +1414,14 @@ if [ -z "${ARG_DATASET}" ]; then
1099
1414
  exit 1
1100
1415
  fi
1101
1416
 
1102
- # Check runtime support
1103
- if [ "${TUNE_SUPPORTED:-}" = "false" ]; then
1104
- echo "⚠️ Managed customization is not supported for the configured model."
1105
- echo " Checking catalog for current support..."
1417
+ # Golden-path gating — check TUNE_SUPPORTED before any model resolution
1418
+ if [ "${TUNE_SUPPORTED:-}" != "true" ]; then
1106
1419
  echo ""
1420
+ echo "❌ Managed fine-tuning is not available for this model family."
1421
+ echo ""
1422
+ echo " Use ./do/train for custom fine-tuning."
1423
+ echo ""
1424
+ exit 1
1107
1425
  fi
1108
1426
 
1109
1427
  # Validate Python availability
@@ -1117,7 +1435,8 @@ fi
1117
1435
  echo "🔧 SageMaker AI Managed Model Customization"
1118
1436
  echo ""
1119
1437
 
1120
- _validate_model
1438
+ _check_catalog_staleness
1439
+ _resolve_tune_model
1121
1440
  _validate_technique
1122
1441
  _validate_training_type
1123
1442
  _validate_dataset