npm - @aws/ml-container-creator - Versions diffs - 0.13.3 → 0.13.5 - Mend

@aws/ml-container-creator 0.13.3 → 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

package/README.md +23 -5
package/infra/ci-harness/package-lock.json +1 -5
package/package.json +5 -3
package/pyproject.toml +21 -0
package/requirements.txt +19 -0
package/servers/instance-sizer/lib/model-resolver.js +127 -185
package/servers/instance-sizer/lib/vram-estimator.js +86 -0
package/servers/lib/catalogs/instances.json +0 -27
package/src/app.js +2 -0
package/src/lib/bootstrap-command-handler.js +35 -25
package/src/lib/generated/cli-options.js +1 -1
package/src/lib/generated/parameter-matrix.js +1 -1
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/prompt-runner.js +14 -31
package/templates/IAM_PERMISSIONS.md +64 -13
package/templates/do/.adapter_helper.py +451 -0
package/templates/do/.benchmark_writer.py +13 -0
package/templates/do/.stage_helper.py +419 -0
package/templates/do/.tune_helper.py +218 -67
package/templates/do/README.md +50 -604
package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
package/templates/do/adapter +109 -4
package/templates/do/benchmark +150 -12
package/templates/do/build +2 -5
package/templates/do/clean.d/async-inference.ejs +2 -5
package/templates/do/clean.d/batch-transform.ejs +2 -5
package/templates/do/clean.d/hyperpod-eks.ejs +2 -5
package/templates/do/clean.d/managed-inference.ejs +2 -5
package/templates/do/config +4 -0
package/templates/do/deploy.d/async-inference.ejs +6 -9
package/templates/do/deploy.d/batch-transform.ejs +4 -7
package/templates/do/deploy.d/hyperpod-eks.ejs +1 -4
package/templates/do/deploy.d/managed-inference.ejs +15 -6
package/templates/do/lib/profile.sh +24 -15
package/templates/do/push +2 -5
package/templates/do/register +2 -5
package/templates/do/stage +114 -292
package/templates/do/submit +1 -4
package/templates/do/tune +64 -10
package/templates/MIGRATION.md +0 -488
package/templates/TEMPLATE_SYSTEM.md +0 -243

package/templates/do/tune CHANGED

@@ -16,10 +16,7 @@ source "${SCRIPT_DIR}/config"
 source "${SCRIPT_DIR}/lib/profile.sh"
 # ── Profile-resolved variables (env var > profile > default) ──────────────────
-# Disable unbound-variable checking for associative array access (bash 3.2 compat)
-set +u
-TUNE_S3_BUCKET="${TUNE_S3_BUCKET:-mlcc-tune-${_PROFILE[accountId]:-unknown}-${_PROFILE[awsRegion]:-us-east-1}}"
-set -u
+TUNE_S3_BUCKET="${TUNE_S3_BUCKET:-mlcc-tune-${_PROFILE_accountId:-unknown}-${_PROFILE_awsRegion:-us-east-1}}"
 # ── Constants ─────────────────────────────────────────────────────────────────
 CATALOG_FILE="${SCRIPT_DIR}/.tune_catalog.json"
@@ -51,6 +48,7 @@ ARG_NO_STALE_WARNING=false
 ARG_DISCOVER=false
 ARG_DISCOVER_FILTER=""
 ARG_COLUMN_MAP=""
+ARG_TAKE=""
 ARG_ACCEPT_EULA=false
@@ -165,6 +163,12 @@ _parse_args() {
                     shift
                 fi
                 ;;
+            --take)
+                if [ -z "${2:-}" ]; then
+                    echo "❌ --take requires an integer value"
+                    exit 1
+                fi
+                ARG_TAKE="$2"; shift 2 ;;
             *)
                 echo "❌ Unknown option: $1"
                 echo "   Run ./do/tune --help for usage."
@@ -221,7 +225,8 @@ _show_help() {
     echo ""
     echo "Required:"
     echo "  --technique <t>       Customization technique: sft, dpo, rlaif, rlvr"
-    echo "  --dataset <source>    Dataset: s3://bucket/path.jsonl or hf://org/name[/split]"
+    echo "  --dataset <source>    Dataset: s3://bucket/path.jsonl or hf://org/name[/split][?file=pattern]"
+    echo "                        ⚠️  Quote the URI if it contains ? or * to prevent shell expansion"
     echo ""
     echo "Model selection:"
     echo "  --model <id>          JumpStart Hub content name to use for fine-tuning."
@@ -254,6 +259,13 @@ _show_help() {
     echo "  --no-wait             Submit and exit without polling for completion"
     echo "  --status              Show status of all tracked tune jobs"
     echo ""
+    echo "Dataset options:"
+    echo "  --column-map <map>    Rename columns (e.g., prompt=question,completion=answer)"
+    echo "  --take <n>            Take only the first N records from the dataset"
+    echo ""
+    echo "  Note: Always quote --dataset values containing ? or * characters."
+    echo "  Unquoted, bash may interpret ? as a glob and * as a wildcard expansion."
+    echo ""
     echo "Discovery and diagnostics:"
     echo "  --discover [filter]   Query JumpStart Hub for tune-eligible models."
     echo "                        Without a filter, shows models for the current family."
@@ -279,6 +291,12 @@ _show_help() {
     echo "  # Fine-tune a gated model (Meta Llama) — requires EULA acceptance:"
     echo "  ./do/tune --technique dpo --dataset hf://argilla/ultrafeedback-binarized-preferences-cleaned --accept-eula"
     echo ""
+    echo "  # Take only 500 records for a quick test run:"
+    echo "  ./do/tune --technique sft --dataset hf://timdettmers/openassistant-guanaco --take 500"
+    echo ""
+    echo "  # Filter files in a multi-file dataset (quote to prevent shell glob expansion):"
+    echo "  ./do/tune --technique sft --dataset \"hf://Open-Orca/OpenOrca?file=1M-GPT4\""
+    echo ""
     echo "  # Discover available models:"
     echo "  ./do/tune --discover                    # Models for current family"
     echo "  ./do/tune --discover qwen               # Filter by keyword"
@@ -332,13 +350,47 @@ _show_status() {
                 echo "     Elapsed: ${mins}m ${secs}s"
             fi
-            # Show output path if completed
+            # Show output path if completed — resolve if not yet set
             local output_var="TUNE_ADAPTER_PATH_$(echo "${technique}" | tr '[:lower:]' '[:upper:]')"
             local model_var="TUNE_MODEL_PATH_$(echo "${technique}" | tr '[:lower:]' '[:upper:]')"
             if [ -n "${!output_var:-}" ]; then
                 echo "     Output (adapter): ${!output_var}"
             elif [ -n "${!model_var:-}" ]; then
                 echo "     Output (model): ${!model_var}"
+            elif [ "${status}" = "Completed" ]; then
+                # Job is complete but output path not set — resolve now
+                echo "     🔄 Resolving artifacts..."
+                local training_type="${TUNE_TRAINING_TYPE:-lora}"
+                local resolve_result
+                resolve_result=$(python3 "${HELPER_SCRIPT}" resolve \
+                    --job-name "${job_name}" \
+                    --region "${AWS_REGION}" \
+                    --training-type "${training_type}" \
+                    --model-package-group "${PROJECT_NAME}-tune-models" 2>/dev/null) || resolve_result=""
+                if [ -n "${resolve_result}" ]; then
+                    local artifact_path
+                    artifact_path=$(echo "${resolve_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('artifact_path',''))" 2>/dev/null) || artifact_path=""
+                    local output_type
+                    output_type=$(echo "${resolve_result}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('output_type',''))" 2>/dev/null) || output_type=""
+                    if [ -n "${artifact_path}" ]; then
+                        local technique_upper
+                        technique_upper=$(echo "${technique}" | tr '[:lower:]' '[:upper:]')
+                        # Update config
+                        if [ "${output_type}" = "adapter" ]; then
+                            _update_config_var "TUNE_ADAPTER_PATH_${technique_upper}" "${artifact_path}"
+                            echo "     Output (adapter): ${artifact_path}"
+                        else
+                            _update_config_var "TUNE_MODEL_PATH_${technique_upper}" "${artifact_path}"
+                            echo "     Output (model): ${artifact_path}"
+                        fi
+                        _update_config_var "TUNE_OUTPUT_PATH_LATEST" "${artifact_path}"
+                        _update_config_var "TUNE_OUTPUT_TYPE_LATEST" "${output_type}"
+                        echo "     ✅ Updated do/config with output paths"
+                    fi
+                fi
             fi
             echo ""
         fi
@@ -829,6 +881,9 @@ _validate_dataset() {
             stage_args+=(--column-map "${ARG_COLUMN_MAP}")
         fi
         stage_args+=(--technique "${ARG_TECHNIQUE}")
+        if [ -n "${ARG_TAKE}" ]; then
+            stage_args+=(--take "${ARG_TAKE}")
+        fi
         if [ -n "${hf_file}" ]; then
             stage_args+=(--hf-file "${hf_file}")
         fi
@@ -862,7 +917,8 @@ _validate_dataset() {
     else
         echo "❌ Invalid dataset format: ${dataset}"
-        echo "   Expected: s3://bucket/path.jsonl or hf://org/name[/split]"
+        echo "   Expected: s3://bucket/path.jsonl or hf://org/name[/split][?file=pattern]"
+        echo "   Hint: Quote the value if it contains ? or * (e.g., \"hf://org/name?file=pattern\")"
         exit 1
     fi
 }
@@ -1232,9 +1288,7 @@ _handle_interrupt() {
     echo ""
     echo "⚠️  Interrupted — job continues running in background"
     echo "   Job: ${job_name}"
-    echo ""
-    echo "   Resume monitoring: ./do/tune --technique ${ARG_TECHNIQUE} --dataset ${ARG_DATASET}"
-    echo "   Check status:      ./do/tune --status"
+    echo "   Check status: ./do/tune --status"
     exit 130
 }

package/templates/MIGRATION.md DELETED

@@ -1,488 +0,0 @@
-# Migration Guide: Legacy Scripts to do-framework
-This guide helps you transition from the legacy `deploy/` scripts to the new do-framework commands.
-## Why Migrate?
-The do-framework provides:
-- **Standardization**: Consistent interface across all ML Container Creator projects
-- **Better Organization**: Clear separation of concerns with dedicated scripts
-- **Enhanced Features**: More granular control over build, push, deploy, test, and cleanup
-- **Community Standard**: Follows the widely-adopted do-framework conventions
-- **Improved Maintainability**: Centralized configuration in `do/config`
-## Quick Reference
-| Legacy Command | do-framework Command | Notes |
-|----------------|---------------------|-------|
-| `./deploy/build_and_push.sh` | `./do/build && ./do/push` | Now split into two commands |
-| `./deploy/deploy.sh <role>` | `./do/deploy <role>` | Same functionality |
-<% if (buildTarget === 'codebuild') { %>| `./deploy/submit_build.sh` | `./do/submit` | CodeBuild integration |
-<% } %>| N/A | `./do/run` | New: Run container locally |
-| N/A | `./do/test [endpoint]` | New: Test container or endpoint |
-| N/A | `./do/clean <target>` | New: Clean up resources |
-## Detailed Migration Steps
-### Step 1: Understand the New Structure
-The do-framework organizes scripts in the `do/` directory:
-```
-do/
-├── config      # Centralized configuration
-├── build       # Build Docker image
-├── push        # Push to ECR
-├── deploy      # Deploy to SageMaker
-├── run         # Run locally
-├── test        # Test container/endpoint
-├── clean       # Clean up resources
-<% if (buildTarget === 'codebuild') { %>├── submit      # Submit to CodeBuild
-<% } %>└── README.md   # Detailed documentation
-```
-### Step 2: Update Your Workflow
-#### Old Workflow
-```bash
-# Build and push
-./deploy/build_and_push.sh
-# Deploy
-./deploy/deploy.sh arn:aws:iam::123456789012:role/SageMakerRole
-```
-#### New Workflow
-```bash
-# Build
-./do/build
-# Test locally (optional but recommended)
-./do/run &
-./do/test
-# Push to ECR
-./do/push
-# Deploy to SageMaker
-./do/deploy arn:aws:iam::123456789012:role/SageMakerRole
-# Test the endpoint
-./do/test <%= projectName %>-endpoint
-```
-<% if (buildTarget === 'codebuild') { %>#### CodeBuild Workflow
-**Old**:
-```bash
-./deploy/submit_build.sh
-./deploy/deploy.sh <role-arn>
-```
-**New**:
-```bash
-./do/submit  # Builds and pushes via CodeBuild
-./do/deploy <role-arn>
-./do/test <%= projectName %>-endpoint
-```
-<% } %>### Step 3: Update Configuration
-#### Old: Hardcoded in Scripts
-Legacy scripts had configuration hardcoded or passed as arguments.
-#### New: Centralized in do/config
-All configuration is now in `do/config`:
-```bash
-# Edit do/config
-export PROJECT_NAME="<%= projectName %>"
-export AWS_REGION="<%= awsRegion %>"
-export INSTANCE_TYPE="<%= instanceType %>"
-export DEPLOYMENT_CONFIG="<%= deploymentConfig %>"
-```
-You can override these with environment variables:
-```bash
-AWS_REGION=us-west-2 ./do/push
-INSTANCE_TYPE=ml.m5.2xlarge ./do/deploy <role-arn>
-```
-### Step 4: Update CI/CD Pipelines
-#### Old Pipeline
-```yaml
-# .github/workflows/deploy.yml
-- name: Build and Push
-  run: ./deploy/build_and_push.sh
-- name: Deploy
-  run: ./deploy/deploy.sh ${{ secrets.SAGEMAKER_ROLE }}
-```
-#### New Pipeline
-```yaml
-# .github/workflows/deploy.yml
-- name: Build
-  run: ./do/build
-- name: Push
-  run: ./do/push
-- name: Deploy
-  run: ./do/deploy ${{ secrets.SAGEMAKER_ROLE }}
-- name: Test
-  run: ./do/test <%= projectName %>-endpoint
-```
-### Step 5: Update Documentation
-Update any project documentation that references the old scripts:
-**Find and replace**:
-- `./deploy/build_and_push.sh` → `./do/build && ./do/push`
-- `./deploy/deploy.sh` → `./do/deploy`
-<% if (buildTarget === 'codebuild') { %>- `./deploy/submit_build.sh` → `./do/submit`
-<% } %>
-## Command Mapping Details
-### Build and Push
-**Legacy**:
-```bash
-./deploy/build_and_push.sh
-```
-This single script built the Docker image and pushed it to ECR.
-**do-framework**:
-```bash
-./do/build  # Build Docker image
-./do/push   # Push to ECR
-```
-**Why the change?** Separating build and push allows you to:
-- Test the image locally before pushing
-- Build once and push to multiple registries
-- Skip pushing if you only need local testing
-**Benefits**:
-- Test locally with `./do/run` before pushing
-- More granular control over the workflow
-- Clearer error messages for each step
-### Deploy
-**Legacy**:
-```bash
-./deploy/deploy.sh arn:aws:iam::123456789012:role/SageMakerRole
-```
-**do-framework**:
-```bash
-./do/deploy arn:aws:iam::123456789012:role/SageMakerRole
-```
-**What's the same?**
-- Same command-line interface
-- Same functionality
-- Same SageMaker endpoint creation
-**What's different?**
-- Better error messages
-- Progress indicators
-- Automatic endpoint status polling
-- Displays test command when complete
-<% if (buildTarget === 'codebuild') { %>### CodeBuild Submit
-**Legacy**:
-```bash
-./deploy/submit_build.sh
-```
-**do-framework**:
-```bash
-./do/submit
-```
-**What's improved?**
-- Better build progress monitoring
-- Clearer error messages
-- Automatic ECR image URI display
-- Build log streaming
-<% } %>### New Commands
-The do-framework adds several new commands that weren't available with legacy scripts:
-#### Run Locally
-```bash
-./do/run
-```
-Starts the container locally on port 8080 for testing before deployment.
-**Use cases**:
-- Test model loading
-- Verify inference logic
-- Debug issues locally
-- Validate container configuration
-#### Test
-```bash
-# Test local container
-./do/test
-# Test SageMaker endpoint
-./do/test <%= projectName %>-endpoint
-```
-Sends health check and inference requests to validate functionality.
-**Use cases**:
-- Verify endpoints are working
-- Validate inference responses
-- Automated testing in CI/CD
-- Quick smoke tests
-#### Clean
-```bash
-# Remove local images
-./do/clean local
-# Remove ECR images
-./do/clean ecr
-# Delete SageMaker endpoint
-./do/clean endpoint
-# Clean everything
-./do/clean all
-```
-Manages cleanup of resources across different environments.
-**Use cases**:
-- Free up disk space
-- Remove old ECR images
-- Delete test endpoints
-- Complete project cleanup
-## Configuration Changes
-### Legacy Configuration
-Configuration was scattered across multiple scripts:
-```bash
-# In deploy/build_and_push.sh
-PROJECT_NAME="my-model"
-REGION="us-east-1"
-# In deploy/deploy.sh
-INSTANCE_TYPE="ml.m5.xlarge"
-```
-### do-framework Configuration
-All configuration is centralized in `do/config`:
-```bash
-# do/config
-export PROJECT_NAME="<%= projectName %>"
-export DEPLOYMENT_CONFIG="<%= deploymentConfig %>"
-export FRAMEWORK="<%= framework %>"
-export MODEL_SERVER="<%= modelServer %>"
-export AWS_REGION="<%= awsRegion %>"
-export INSTANCE_TYPE="<%= instanceType %>"
-export ECR_REPOSITORY_NAME="ml-container-creator"
-<% if (buildTarget === 'codebuild') { %>export BUILD_TARGET="codebuild"
-export CODEBUILD_COMPUTE_TYPE="<%= codebuildComputeType %>"
-<% } %><% if (framework === 'transformers') { %>export MODEL_NAME="<%= modelName %>"
-<% if (hfToken) { %>export HF_TOKEN="<%= hfToken %>"
-<% } %><% } %>
-```
-**Benefits**:
-- Single source of truth
-- Easy to override with environment variables
-- Clear documentation of all settings
-- Consistent across all scripts
-## Backward Compatibility
-The legacy scripts are still available in the `deploy/` directory for backward compatibility:
-```bash
-./deploy/build_and_push.sh  # Still works
-./deploy/deploy.sh          # Still works
-<% if (buildTarget === 'codebuild') { %>./deploy/submit_build.sh     # Still works
-<% } %>
-```
-**However**:
-- They display deprecation warnings
-- They forward to do-framework commands
-- They will be removed in a future version
-**Deprecation timeline**:
-- Current version: Legacy scripts work with warnings
-- Next major version: Legacy scripts may be removed
-- Recommendation: Migrate now to avoid future issues
-## Troubleshooting Migration
-### Issue: "Command not found"
-**Problem**: `./do/build: command not found`
-**Solution**: Ensure scripts are executable:
-```bash
-chmod +x do/*
-```
-The generator should set this automatically, but if you copied files manually, you may need to set permissions.
-### Issue: "Configuration variable not set"
-**Problem**: `PROJECT_NAME not set in do/config`
-**Solution**: Ensure `do/config` is properly sourced:
-```bash
-# Check if config exists
-cat do/config
-# Manually source to test
-source do/config
-echo $PROJECT_NAME
-```
-### Issue: "AWS credentials not configured"
-**Problem**: `AWS credentials not configured`
-**Solution**: Configure AWS CLI:
-```bash
-aws configure
-# Or set environment variables
-export AWS_ACCESS_KEY_ID=your-key
-export AWS_SECRET_ACCESS_KEY=your-secret
-```
-### Issue: "Docker permission denied"
-**Problem**: `permission denied while trying to connect to the Docker daemon`
-**Solution**: Add user to docker group:
-```bash
-sudo usermod -aG docker $USER
-# Log out and back in for changes to take effect
-```
-### Issue: Legacy scripts not working
-**Problem**: Legacy scripts fail after migration
-**Solution**:
-1. Check that do-framework scripts work: `./do/build`
-2. Verify do/config exists and is valid
-3. Check script permissions: `ls -la do/`
-4. Review deprecation warnings for guidance
-## FAQ
-### Q: Do I have to migrate immediately?
-**A**: No, legacy scripts still work. However, we recommend migrating to benefit from new features and avoid future compatibility issues.
-### Q: Can I use both legacy and do-framework commands?
-**A**: Yes, but it's not recommended. Choose one approach for consistency.
-### Q: Will my existing CI/CD pipelines break?
-**A**: No, legacy scripts still work. But you should update pipelines to use do-framework commands for better features and future compatibility.
-### Q: What if I have custom modifications to legacy scripts?
-**A**: Review your modifications and apply them to the appropriate do-framework scripts. The modular structure makes customization easier.
-### Q: Can I customize do-framework scripts?
-**A**: Yes! The scripts are designed to be customizable. Edit them as needed for your use case.
-### Q: Where can I find detailed documentation?
-**A**: See `do/README.md` for comprehensive documentation of all do-framework commands.
-### Q: What if I encounter issues during migration?
-**A**:
-1. Check this migration guide
-2. Review `do/README.md`
-3. Check CloudWatch logs for deployment issues
-4. Open an issue on the ML Container Creator repository
-## Benefits Summary
-### For Developers
-- **Clearer workflow**: Separate commands for each step
-- **Better testing**: Test locally before deploying
-- **Easier debugging**: Granular control over each phase
-- **Consistent interface**: Same commands across all projects
-### For Teams
-- **Standardization**: Everyone uses the same commands
-- **Better documentation**: Clear, comprehensive guides
-- **Easier onboarding**: New team members learn one system
-- **Community alignment**: Follows do-framework conventions
-### For CI/CD
-- **More control**: Fine-grained pipeline steps
-- **Better error handling**: Clear failure points
-- **Easier testing**: Test at each stage
-- **Improved monitoring**: Track each step separately
-## Next Steps
-1. **Read** `do/README.md` for detailed command documentation
-2. **Test** the new commands in a development environment
-3. **Update** your CI/CD pipelines
-4. **Update** your team documentation
-5. **Remove** references to legacy scripts from your workflows
-## Additional Resources
-- [do-framework Documentation](https://github.com/iankoulski/do-framework)
-- [ML Container Creator Documentation](https://github.com/yourusername/ml-container-creator)
-- [AWS SageMaker BYOC Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html)
-## Feedback
-We'd love to hear about your migration experience! If you encounter issues or have suggestions, please:
-1. Open an issue on the ML Container Creator repository
-2. Share your feedback with the team
-3. Contribute improvements to this guide
----
-**Last Updated**: <%= buildTimestamp %>
-**Generated by**: ML Container Creator v2.0 (do-framework integration)