@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
@@ -18,6 +18,11 @@ set -o pipefail
18
18
  # ── Source project configuration ──────────────────────────────────────────────
19
19
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
20
20
  source "${SCRIPT_DIR}/config"
21
+ source "${SCRIPT_DIR}/lib/profile.sh"
22
+
23
+ # ── Profile-resolved variables (env var > profile > default) ──────────────────
24
+ ADAPTER_S3_BUCKET="${ADAPTER_S3_BUCKET:-mlcc-adapters-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
25
+
21
26
  source "${SCRIPT_DIR}/lib/wait.sh"
22
27
 
23
28
  # ── Usage ─────────────────────────────────────────────────────────────────────
@@ -523,6 +528,155 @@ _adapter_add() {
523
528
  echo "📦 Using latest tune adapter output: ${weights_uri}"
524
529
  fi
525
530
  echo ""
531
+
532
+ # ── Package tune artifacts as tar.gz if needed ────────────────────
533
+ # Tune output is an S3 path that may be:
534
+ # 1. Already a tar.gz file (s3://...adapter.tar.gz) → use directly
535
+ # 2. An S3 directory prefix containing adapter files → download, validate, package, upload
536
+ if echo "${weights_uri}" | grep -qE '\.tar\.gz$'; then
537
+ echo "✅ Tune output is already a tar.gz archive."
538
+
539
+ # Validate adapter_config.json exists in the tar.gz
540
+ echo "🔍 Validating adapter_config.json in archive..."
541
+ local tar_validate_dir="/tmp/adapter_tar_validate_$$"
542
+ mkdir -p "${tar_validate_dir}"
543
+
544
+ if aws s3 cp "${weights_uri}" "${tar_validate_dir}/adapter.tar.gz" --region "${AWS_REGION}" --quiet 2>/dev/null; then
545
+ if ! tar -tzf "${tar_validate_dir}/adapter.tar.gz" 2>/dev/null | grep -q 'adapter_config\.json'; then
546
+ echo "❌ adapter_config.json not found in tar.gz archive."
547
+ echo ""
548
+ echo " Path: ${weights_uri}"
549
+ echo ""
550
+ echo " The archive does not appear to contain a valid"
551
+ echo " PEFT/LoRA adapter. A valid adapter must include:"
552
+ echo " • adapter_config.json"
553
+ echo " • adapter_model.safetensors (or adapter_model.bin)"
554
+ echo ""
555
+ echo " Check that the tune job completed successfully:"
556
+ echo " ./do/tune status"
557
+ rm -rf "${tar_validate_dir}"
558
+ exit 1
559
+ fi
560
+ echo " ✅ adapter_config.json found in archive"
561
+ else
562
+ echo " ⚠️ Could not download archive for validation. Proceeding anyway..."
563
+ fi
564
+
565
+ rm -rf "${tar_validate_dir}"
566
+ echo " Using directly without re-packaging."
567
+ else
568
+ echo "📦 Tune output is a directory — packaging as tar.gz..."
569
+ local tune_tmp_dir="/tmp/adapter_tune_package_$$"
570
+ mkdir -p "${tune_tmp_dir}/adapter_files"
571
+
572
+ # Normalize S3 prefix (ensure trailing slash for directory listing)
573
+ local s3_prefix="${weights_uri}"
574
+ if [[ "${s3_prefix}" != */ ]]; then
575
+ s3_prefix="${s3_prefix}/"
576
+ fi
577
+
578
+ # Download all adapter files from S3 directory
579
+ echo " Downloading adapter artifacts from: ${s3_prefix}"
580
+ if ! aws s3 cp "${s3_prefix}" "${tune_tmp_dir}/adapter_files/" --recursive --region "${AWS_REGION}" 2>/dev/null; then
581
+ echo "❌ Failed to download adapter artifacts from S3."
582
+ echo ""
583
+ echo " Path: ${s3_prefix}"
584
+ echo " Check that:"
585
+ echo " • The S3 path exists and contains adapter files"
586
+ echo " • Your IAM credentials have s3:GetObject and s3:ListBucket permission"
587
+ rm -rf "${tune_tmp_dir}"
588
+ exit 1
589
+ fi
590
+
591
+ # Validate adapter_config.json exists
592
+ if [ ! -f "${tune_tmp_dir}/adapter_files/adapter_config.json" ]; then
593
+ echo "❌ adapter_config.json not found in tune output."
594
+ echo ""
595
+ echo " Path: ${s3_prefix}"
596
+ echo ""
597
+ echo " The tune output does not appear to contain a valid"
598
+ echo " PEFT/LoRA adapter. A valid adapter must include:"
599
+ echo " • adapter_config.json"
600
+ echo " • adapter_model.safetensors (or adapter_model.bin)"
601
+ echo ""
602
+ echo " Check that the tune job completed successfully:"
603
+ echo " ./do/tune status"
604
+ rm -rf "${tune_tmp_dir}"
605
+ exit 1
606
+ fi
607
+
608
+ echo " ✅ adapter_config.json found"
609
+
610
+ # Optional: check base_model_name_or_path matches MODEL_NAME
611
+ if [ -n "${MODEL_NAME:-}" ]; then
612
+ local adapter_base_model=""
613
+ if command -v jq &>/dev/null; then
614
+ adapter_base_model=$(jq -r '.base_model_name_or_path // empty' "${tune_tmp_dir}/adapter_files/adapter_config.json" 2>/dev/null)
615
+ else
616
+ adapter_base_model=$(grep -o '"base_model_name_or_path"[[:space:]]*:[[:space:]]*"[^"]*"' "${tune_tmp_dir}/adapter_files/adapter_config.json" 2>/dev/null | sed 's/.*"base_model_name_or_path"[[:space:]]*:[[:space:]]*"//' | sed 's/"$//')
617
+ fi
618
+
619
+ if [ -n "${adapter_base_model}" ] && [ "${adapter_base_model}" != "${MODEL_NAME}" ]; then
620
+ echo " ⚠️ Adapter was trained on '${adapter_base_model}' but base model is '${MODEL_NAME}'. Adapter may not work correctly."
621
+ fi
622
+ fi
623
+
624
+ # Flatten: move any nested files to root level and remove subdirectories
625
+ find "${tune_tmp_dir}/adapter_files" -mindepth 2 -type f -exec mv {} "${tune_tmp_dir}/adapter_files/" \; 2>/dev/null || true
626
+ find "${tune_tmp_dir}/adapter_files" -mindepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
627
+
628
+ # Create flat tar.gz archive
629
+ echo " Creating adapter.tar.gz..."
630
+ if ! tar -czf "${tune_tmp_dir}/adapter.tar.gz" -C "${tune_tmp_dir}/adapter_files" . 2>/dev/null; then
631
+ echo "❌ Failed to create adapter.tar.gz from tune output."
632
+ rm -rf "${tune_tmp_dir}"
633
+ exit 1
634
+ fi
635
+
636
+ local tar_size
637
+ tar_size=$(du -h "${tune_tmp_dir}/adapter.tar.gz" | cut -f1)
638
+ echo " Archive size: ${tar_size}"
639
+
640
+ # Resolve S3 bucket for upload
641
+ local s3_bucket=""
642
+ if [ -n "${ADAPTER_S3_BUCKET:-}" ]; then
643
+ s3_bucket="${ADAPTER_S3_BUCKET}"
644
+ else
645
+ local account_id
646
+ account_id=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "")
647
+ if [ -z "${account_id}" ]; then
648
+ echo "❌ Could not determine AWS account ID."
649
+ echo " Ensure AWS credentials are configured."
650
+ rm -rf "${tune_tmp_dir}"
651
+ exit 1
652
+ fi
653
+ s3_bucket="mlcc-adapters-${account_id}-${AWS_REGION}"
654
+ fi
655
+
656
+ # Upload tar.gz to S3
657
+ local s3_tar_path="s3://${s3_bucket}/adapters/${PROJECT_NAME}/${adapter_name}/adapter.tar.gz"
658
+ echo " ☁️ Uploading to S3: ${s3_tar_path}"
659
+
660
+ if ! aws s3 cp "${tune_tmp_dir}/adapter.tar.gz" "${s3_tar_path}" --region "${AWS_REGION}"; then
661
+ echo "❌ Failed to upload adapter tar.gz to S3."
662
+ echo ""
663
+ echo " Check that:"
664
+ echo " • The S3 bucket '${s3_bucket}' exists"
665
+ echo " • Your IAM credentials have s3:PutObject permission"
666
+ echo " • Run bootstrap if the bucket doesn't exist: ./do/bootstrap"
667
+ rm -rf "${tune_tmp_dir}"
668
+ exit 1
669
+ fi
670
+
671
+ echo " ✅ Uploaded to S3: ${s3_tar_path}"
672
+
673
+ # Clean up temp directory
674
+ rm -rf "${tune_tmp_dir}"
675
+
676
+ # Update weights_uri to point to the uploaded tar.gz
677
+ weights_uri="${s3_tar_path}"
678
+ fi
679
+ echo ""
526
680
  fi
527
681
 
528
682
  # ── Validate HF repo ID format (if --from-hub) ───────────────────────