npm - @aws/ml-container-creator - Versions diffs - 0.7.1 → 0.9.0 - Mend

@aws/ml-container-creator 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/LICENSE-THIRD-PARTY +50760 -16218
package/bin/cli.js +1 -1
package/infra/ci-harness/buildspec.yml +4 -0
package/package.json +3 -1
package/servers/lib/catalogs/instances.json +52 -1275
package/servers/lib/catalogs/model-servers.json +80 -0
package/servers/lib/catalogs/models.json +0 -132
package/servers/lib/catalogs/popular-diffusors.json +1 -110
package/servers/model-picker/index.js +27 -16
package/src/app.js +113 -23
package/src/lib/cli-handler.js +1 -1
package/src/lib/config-manager.js +39 -2
package/src/lib/cross-cutting-checker.js +146 -33
package/src/lib/deployment-config-resolver.js +10 -4
package/src/lib/e2e-bootstrap.js +227 -0
package/src/lib/e2e-catalog-validator.js +103 -0
package/src/lib/e2e-quota-validator.js +135 -0
package/src/lib/mcp-client.js +16 -1
package/src/lib/mcp-command-handler.js +10 -2
package/src/lib/prompt-runner.js +306 -24
package/src/lib/prompts.js +9 -3
package/src/lib/template-manager.js +10 -4
package/src/lib/train-config-parser.js +136 -0
package/src/lib/train-config-persistence.js +143 -0
package/src/lib/train-config-validator.js +112 -0
package/src/lib/train-feedback.js +46 -0
package/src/lib/train-idempotency.js +97 -0
package/src/lib/train-request-builder.js +120 -0
package/src/lib/tune-catalog-validator.js +5 -5
package/templates/code/serve +2 -2
package/templates/code/serving.properties +2 -2
package/templates/diffusors/serve +3 -3
package/templates/do/.train_build_request.py +141 -0
package/templates/do/.train_poll_parser.py +135 -0
package/templates/do/.train_status_parser.py +187 -0
package/templates/do/.tune_helper.py +2 -2
package/templates/do/lib/feedback.sh +41 -0
package/templates/do/register +8 -2
package/templates/do/test +5 -5
package/templates/do/train +786 -0
package/templates/do/training/config.yaml +140 -0
package/templates/do/training/train.py +463 -0
package/templates/do/tune +2 -2
package/templates/marketplace/config +118 -0
package/templates/marketplace/deploy +890 -0
package/templates/marketplace/test +453 -0

package/src/lib/tune-catalog-validator.js CHANGED Viewed

@@ -13,7 +13,7 @@
 /**
  * Look up a model entry in the catalog by model ID.
- * @param {string} modelId - The JumpStart model ID to look up
+ * @param {string} modelId - The model ID to look up
  * @param {Object} catalog - The tune catalog object with a `models` map
  * @returns {Object|null} The catalog entry for the model, or null if not found
  */
@@ -29,7 +29,7 @@ export function lookupModel(modelId, catalog) {
 /**
  * Check whether a model ID is present in the Supported Model Catalog.
- * @param {string} modelId - The JumpStart model ID to check
+ * @param {string} modelId - The model ID to check
  * @param {Object} catalog - The tune catalog object with a `models` map
  * @returns {boolean} True if the model is in the catalog
  */
@@ -41,7 +41,7 @@ export function isTuneSupported(modelId, catalog) {
  * Validate that a model ID exists in the catalog.
  * Returns a descriptive error when the model is not supported, including
  * the model name, supported families, and a reference to `do/train`.
- * @param {string} modelId - The JumpStart model ID to validate
+ * @param {string} modelId - The model ID to validate
  * @param {Object} catalog - The tune catalog object with a `models` map
  * @returns {{ valid: boolean, error?: string }}
  */
@@ -65,7 +65,7 @@ export function validateModel(modelId, catalog) {
  * Validate that a technique is supported for the given model.
  * Returns a descriptive error listing the supported techniques when
  * the requested technique is not available.
- * @param {string} modelId - The JumpStart model ID
+ * @param {string} modelId - The model ID
  * @param {string} technique - The technique to validate (e.g., 'sft', 'dpo')
  * @param {Object} catalog - The tune catalog object with a `models` map
  * @returns {{ valid: boolean, error?: string }}
@@ -92,7 +92,7 @@ export function validateTechnique(modelId, technique, catalog) {
  * Validate that a training type is supported for the given model and technique.
  * Returns a descriptive error listing the supported training types when
  * the requested type is not available.
- * @param {string} modelId - The JumpStart model ID
+ * @param {string} modelId - The model ID
  * @param {string} technique - The technique (e.g., 'sft', 'dpo')
  * @param {string} trainingType - The training type to validate (e.g., 'lora', 'full-rank')
  * @param {Object} catalog - The tune catalog object with a `models` map

package/templates/code/serve CHANGED Viewed

@@ -113,7 +113,7 @@ resolve_model() {
             echo "${!_MODEL_VAR}"
             return
             ;;
-        s3|jumpstart|jumpstart-hub|registry)
+        s3|registry)
             # Check for pre-mounted artifacts first
             if [ -d "$LOCAL_MODEL_PATH" ] && [ "$(ls -A $LOCAL_MODEL_PATH 2>/dev/null)" ]; then
                 echo "Using pre-mounted model artifacts at $LOCAL_MODEL_PATH" >&2
@@ -245,7 +245,7 @@ ARG_PREFIX="--"
 # Define environment variables to exclude (internal variables set by base images)
 <% if (modelServer === 'vllm') { %>
-EXCLUDE_VARS=("VLLM_USAGE_SOURCE")
+EXCLUDE_VARS=("VLLM_USAGE_SOURCE" "VLLM_ENABLE_CUDA_COMPATIBILITY")
 <% } else if (modelServer === 'sglang') { %>
 EXCLUDE_VARS=()
 <% } else if (modelServer === 'tensorrt-llm') { %>

package/templates/code/serving.properties CHANGED Viewed

@@ -15,7 +15,7 @@ option.model_id=<%= modelName %>
 option.model_id=<%= artifactUri %>
 <% } else { %>
 # Model will be loaded from /opt/ml/model at runtime
-# (JumpStart model without artifact URI — requires SageMaker ModelDataUrl)
+# (requires SageMaker ModelDataUrl or MODEL_ARTIFACT_URI)
 # option.model_id=/opt/ml/model
 <% } %>
@@ -71,7 +71,7 @@ option.model_id=<%= modelName %>
 option.model_id=<%= artifactUri %>
 <% } else { %>
 # Model will be loaded from /opt/ml/model at runtime
-# (JumpStart model without artifact URI — requires SageMaker ModelDataUrl)
+# (requires SageMaker ModelDataUrl or MODEL_ARTIFACT_URI)
 # option.model_id=/opt/ml/model
 <% } %>

package/templates/diffusors/serve CHANGED Viewed

@@ -9,10 +9,10 @@ echo "Starting vLLM-Omni server (diffusion model serving)"
 # Resolve model URI prefixes that engines cannot handle natively.
 # The generator's model-picker may store provider-specific URIs
-# (e.g. jumpstart://model-txt2img-stabilityai-stable-diffusion-v2-1-base)
-# as the model identifier. vLLM expects a HuggingFace repo ID or local path.
+# (e.g. registry://my-model-group/1) as the model identifier.
+# vLLM expects a HuggingFace repo ID or local path.
 _RAW_MODEL="${VLLM_MODEL:-}"
-if [[ "$_RAW_MODEL" == jumpstart://* ]] || [[ "$_RAW_MODEL" == jumpstart-hub://* ]] || [[ "$_RAW_MODEL" == registry://* ]]; then
+if [[ "$_RAW_MODEL" == registry://* ]]; then
     if [ -d /opt/ml/model ] && [ "$(ls -A /opt/ml/model 2>/dev/null)" ]; then
         echo "Resolved VLLM_MODEL='${_RAW_MODEL}' → /opt/ml/model (local artifacts found)"
         export VLLM_MODEL="/opt/ml/model"

package/templates/do/.train_build_request.py ADDED Viewed

@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Build the CreateTrainingJob JSON request for SageMaker.
+This helper is called by do/train to construct the full API request body.
+It handles conditional fields (spot training, metric definitions, environment,
+tags) and writes the result to a JSON file for use with:
+    aws sagemaker create-training-job --cli-input-json file://path.json
+"""
+import argparse
+import json
+import sys
def parse_args():
    """Read the request-builder options from the command line.

    Every option is mandatory and is returned as the raw string typed on the
    command line; no numeric or JSON conversion is done here.

    Returns:
        argparse.Namespace: one attribute per option (dashes become
        underscores, e.g. ``--job-name`` -> ``job_name``).
    """
    # (flag, help text) pairs, registered in this order so --help is unchanged.
    option_table = (
        ('--job-name', 'Training job name'),
        ('--role-arn', 'SageMaker execution role ARN'),
        ('--image', 'Training container image URI'),
        ('--instance-type', 'Instance type'),
        ('--instance-count', 'Instance count'),
        ('--volume-size', 'Volume size in GB'),
        ('--dataset', 'S3 URI for training dataset'),
        ('--output-path', 'S3 URI for output'),
        ('--max-runtime', 'Max runtime in seconds'),
        ('--hyperparams', 'Hyperparameters as JSON string'),
        ('--enable-spot', 'Enable spot training (true/false)'),
        ('--max-wait', 'Max wait time for spot in seconds'),
        ('--checkpoint-path', 'S3 checkpoint path'),
        ('--metric-definitions', 'Metric definitions as JSON array'),
        ('--environment', 'Environment variables as JSON object'),
        ('--tags', 'Tags as JSON object (key-value map)'),
        ('--output-file', 'Output file path for the JSON'),
    )
    cli = argparse.ArgumentParser(description='Build CreateTrainingJob request JSON')
    for flag, description in option_table:
        cli.add_argument(flag, required=True, help=description)
    return cli.parse_args()
def build_request(args):
    """Construct the CreateTrainingJob request dictionary.

    Args:
        args: Namespace produced by parse_args(). All attributes are strings;
            ``hyperparams``, ``metric_definitions``, ``environment`` and
            ``tags`` hold JSON text (an empty string means "not provided").

    Returns:
        dict: request body for ``aws sagemaker create-training-job
        --cli-input-json``. Optional sections (HyperParameters,
        EnableManagedSpotTraining, CheckpointConfig, MetricDefinitions,
        Environment, Tags) are only present when the matching input is
        non-empty.

    Raises:
        json.JSONDecodeError: a JSON option does not contain valid JSON.
        ValueError: a numeric option is not an integer, a JSON option has the
            wrong top-level type, or a metric definition lacks ``name`` or
            ``regex``.
    """
    json_kind = {dict: 'object', list: 'array'}

    def load_json(option, raw, expected, default):
        """Decode one JSON option; empty or falsy input yields ``default``."""
        value = json.loads(raw) if raw else default
        if not value:
            # Covers '', 'null', '{}' and '[]' — all mean "nothing to add".
            return default
        if not isinstance(value, expected):
            # Raise ValueError (not AttributeError later on) so the caller's
            # existing error handling reports a clean message.
            raise ValueError(
                f'--{option} must be a JSON {json_kind[expected]}, '
                f'got {type(value).__name__}'
            )
        return value

    # Parse JSON inputs
    hyperparams = load_json('hyperparams', args.hyperparams, dict, {})
    metric_definitions = load_json(
        'metric-definitions', args.metric_definitions, list, [])
    environment = load_json('environment', args.environment, dict, {})
    tags = load_json('tags', args.tags, dict, {})

    for position, metric in enumerate(metric_definitions):
        if (not isinstance(metric, dict)
                or 'name' not in metric or 'regex' not in metric):
            raise ValueError(
                f'--metric-definitions entry {position} must be an object '
                f"with 'name' and 'regex' keys"
            )

    # Base request structure
    request = {
        'TrainingJobName': args.job_name,
        'RoleArn': args.role_arn,
        'AlgorithmSpecification': {
            'TrainingImage': args.image,
            'TrainingInputMode': 'File'
        },
        'InputDataConfig': [
            {
                'ChannelName': 'training',
                'DataSource': {
                    'S3DataSource': {
                        'S3DataType': 'S3Prefix',
                        'S3Uri': args.dataset,
                        'S3DataDistributionType': 'FullyReplicated'
                    }
                }
            }
        ],
        'OutputDataConfig': {
            'S3OutputPath': args.output_path
        },
        'ResourceConfig': {
            'InstanceType': args.instance_type,
            'InstanceCount': int(args.instance_count),
            'VolumeSizeInGB': int(args.volume_size)
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': int(args.max_runtime)
        }
    }

    # Hyperparameters — ensure all values are strings (SageMaker requirement)
    if hyperparams:
        request['HyperParameters'] = {
            str(k): str(v) for k, v in hyperparams.items()
        }

    # Managed spot training. Accept 'true' in any letter case so that
    # 'True'/'TRUE' does not silently disable spot.
    if str(args.enable_spot).strip().lower() == 'true':
        request['EnableManagedSpotTraining'] = True
        request['StoppingCondition']['MaxWaitTimeInSeconds'] = int(args.max_wait)

    # Checkpoint configuration (for spot training resumption)
    if args.checkpoint_path:
        request['CheckpointConfig'] = {
            'S3Uri': args.checkpoint_path
        }

    # Metric definitions (custom CloudWatch metrics)
    if metric_definitions:
        request['AlgorithmSpecification']['MetricDefinitions'] = [
            {'Name': m['name'], 'Regex': m['regex']}
            for m in metric_definitions
        ]

    # Environment variables for the container
    if environment:
        request['Environment'] = environment

    # Tags — convert from {key: value} map to [{Key: k, Value: v}] array
    if tags:
        request['Tags'] = [
            {'Key': str(k), 'Value': str(v)}
            for k, v in tags.items()
        ]
    return request
def main():
    """Command-line entry point: build the request and save it as JSON.

    Prints a message to stderr and exits with status 1 when build_request()
    raises json.JSONDecodeError or ValueError, or when the output file cannot
    be written.
    """
    options = parse_args()

    try:
        payload = build_request(options)
    except (json.JSONDecodeError, ValueError) as exc:
        print(f'❌ Failed to build request: {exc}', file=sys.stderr)
        sys.exit(1)

    # Persist the request where the calling script expects to find it.
    try:
        with open(options.output_file, 'w') as handle:
            json.dump(payload, handle, indent=2)
    except OSError as exc:  # IOError is an alias of OSError in Python 3
        print(f'❌ Failed to write request file: {exc}', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

package/templates/do/.train_poll_parser.py ADDED Viewed

@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Parse DescribeTrainingJob JSON for the polling loop in do/train.
+Reads JSON from stdin and outputs structured key=value lines for bash consumption:
+  STATUS=<TrainingJobStatus>
+  SECONDARY=<SecondaryStatus>
+  FAILURE_REASON=<FailureReason or empty>
+  DISPLAY=<formatted single-line status display>
+This keeps the bash poll loop simple while handling JSON parsing in Python.
+"""
+import json
+import sys
+from datetime import datetime, timezone
def format_duration(seconds):
    """Render a number of seconds as 'Xh Ym Zs', 'Ym Zs' or 'Zs'.

    Leading zero units are omitted; fractions of a second are truncated.
    Returns 'N/A' when ``seconds`` is None or negative.
    """
    if seconds is None or seconds < 0:
        return 'N/A'

    whole_hours, remainder = divmod(seconds, 3600)
    whole_minutes, leftover = divmod(remainder, 60)
    h, m, s = int(whole_hours), int(whole_minutes), int(leftover)

    if h > 0:
        return f'{h}h {m}m {s}s'
    if m > 0:
        return f'{m}m {s}s'
    return f'{s}s'
def parse_iso_time(time_str):
    """Convert a DescribeTrainingJob timestamp into a timezone-aware datetime.

    Args:
        time_str: an ISO 8601 string (a trailing 'Z' is read as UTC) or a
            numeric Unix epoch value in seconds.
            NOTE(review): numeric timestamps are presumably what the AWS CLI
            emits when its timestamp format is "wire" — confirm against the
            CLI version used by do/train.

    Returns:
        datetime | None: always timezone-aware; a timestamp without an offset
        is assumed to be UTC. None when the input is empty or cannot be
        parsed.
    """
    if not time_str:
        return None
    try:
        if isinstance(time_str, (int, float)) and not isinstance(time_str, bool):
            return datetime.fromtimestamp(time_str, tz=timezone.utc)
        text = str(time_str).strip()
        # Only a trailing 'Z' is a UTC designator; datetime.fromisoformat()
        # rejects it on Python < 3.11, so rewrite it as an explicit offset.
        if text.endswith(('Z', 'z')):
            text = text[:-1] + '+00:00'
        parsed = datetime.fromisoformat(text)
    except (ValueError, TypeError, OverflowError, OSError):
        return None
    if parsed.tzinfo is None:
        # A naive value cannot be subtracted from an aware "now" (TypeError).
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def calculate_elapsed(start_time_str):
    """Return the seconds between ``start_time_str`` and the current UTC time.

    The timestamp is parsed with parse_iso_time(); None is returned when that
    yields nothing. The result is clamped so it is never negative.
    """
    started_at = parse_iso_time(start_time_str)
    if not started_at:
        return None
    delta = datetime.now(timezone.utc) - started_at
    return max(0, delta.total_seconds())
def format_metrics(final_metrics):
    """Join FinalMetricDataList entries into one 'name=value, ...' string.

    Float values get 6, 4 or 2 decimals depending on magnitude (below 0.001,
    below 1, otherwise); any other value is rendered as-is. A missing
    'MetricName' becomes 'unknown' and a missing 'Value' becomes 0.
    Returns '' when the list is empty or None.
    """
    def render(value):
        if not isinstance(value, float):
            return f'{value}'
        magnitude = abs(value)
        if magnitude < 0.001:
            return f'{value:.6f}'
        if magnitude < 1:
            return f'{value:.4f}'
        return f'{value:.2f}'

    if not final_metrics:
        return ''
    return ', '.join(
        f"{entry.get('MetricName', 'unknown')}={render(entry.get('Value', 0))}"
        for entry in final_metrics
    )
# Icon shown in front of each TrainingJobStatus value; statuses missing from
# this table are handled by the caller's own fallback.
STATUS_EMOJI = dict(
    InProgress='🔄',
    Completed='✅',
    Failed='❌',
    Stopping='⏸️',
    Stopped='⏹️',
)
def main():
    """Parse DescribeTrainingJob JSON from stdin and print KEY=value lines.

    Prints exactly four lines to stdout, in this order: STATUS, SECONDARY,
    FAILURE_REASON and DISPLAY. Line breaks inside any value are replaced by
    single spaces so that every key stays on one line.

    Exits with status 1 (message on stderr) when stdin is not valid JSON or
    when the top-level JSON value is not an object.
    """
    def single_line(value):
        # FailureReason is free-form text; a raw newline in it would add
        # extra lines to the KEY=value output that the caller reads.
        return ' '.join(str(value).splitlines())

    try:
        job_data = json.load(sys.stdin)
    except json.JSONDecodeError as e:
        print(f'Error parsing JSON: {e}', file=sys.stderr)
        sys.exit(1)
    if not isinstance(job_data, dict):
        # Valid JSON that is not an object would otherwise fail on .get()
        # with an AttributeError traceback.
        print(
            f'Error parsing JSON: expected an object, got {type(job_data).__name__}',
            file=sys.stderr,
        )
        sys.exit(1)

    status = job_data.get('TrainingJobStatus', 'Unknown')
    secondary_status = job_data.get('SecondaryStatus', '')
    failure_reason = job_data.get('FailureReason', '')
    training_start = job_data.get('TrainingStartTime', '')
    final_metrics = job_data.get('FinalMetricDataList', [])

    # Calculate elapsed time
    elapsed_str = ''
    if training_start:
        elapsed = calculate_elapsed(training_start)
        if elapsed is not None:
            elapsed_str = format_duration(elapsed)

    # Format metrics
    metrics_str = format_metrics(final_metrics)

    # Build display line
    emoji = STATUS_EMOJI.get(status, '❓')
    display_parts = [f'   {emoji} {status}']
    if secondary_status:
        display_parts.append(f'| {secondary_status}')
    if elapsed_str:
        display_parts.append(f'| elapsed: {elapsed_str}')
    if metrics_str:
        display_parts.append(f'| {metrics_str}')
    display_line = ' '.join(display_parts)

    # Output structured lines for bash — one physical line per key.
    print(f'STATUS={single_line(status)}')
    print(f'SECONDARY={single_line(secondary_status)}')
    print(f'FAILURE_REASON={single_line(failure_reason)}')
    print(f'DISPLAY={single_line(display_line)}')


if __name__ == '__main__':
    main()

package/templates/do/.train_status_parser.py ADDED Viewed

@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Parse DescribeTrainingJob JSON response and display formatted status.
+This helper is called by do/train --status to parse the AWS CLI JSON output
+from DescribeTrainingJob and display a user-friendly status summary.
+"""
+import json
+import sys
+import time
+from datetime import datetime, timezone
# Icon shown in front of each TrainingJobStatus value.
STATUS_EMOJI = dict(
    InProgress='🔄',
    Completed='✅',
    Failed='❌',
    Stopping='⏸️',
    Stopped='⏹️',
)

# Human-readable explanation for each SecondaryStatus value that gets one.
SECONDARY_DESCRIPTIONS = dict(
    Starting='Preparing training instance',
    LaunchingMLInstances='Launching ML instances',
    PreparingTrainingStack='Preparing training stack',
    Downloading='Downloading training data',
    DownloadingTrainingImage='Downloading training image',
    Training='Training in progress',
    Uploading='Uploading model artifacts',
    Completed='Training completed',
    MaxRuntimeExceeded='Max runtime exceeded',
    Stopped='Training stopped',
    MaxWaitTimeExceeded='Max wait time exceeded (spot)',
    Interrupted='Spot instance interrupted',
)
def format_duration(seconds):
    """Render a number of seconds as 'Xh Ym Zs', 'Ym Zs' or 'Zs'.

    Leading zero units are omitted; fractions of a second are truncated.
    Returns 'N/A' when ``seconds`` is None or negative.
    """
    if seconds is None or seconds < 0:
        return 'N/A'

    whole_hours, remainder = divmod(seconds, 3600)
    whole_minutes, leftover = divmod(remainder, 60)
    h, m, s = int(whole_hours), int(whole_minutes), int(leftover)

    if h > 0:
        return f'{h}h {m}m {s}s'
    if m > 0:
        return f'{m}m {s}s'
    return f'{s}s'
def parse_iso_time(time_str):
    """Convert a DescribeTrainingJob timestamp into a timezone-aware datetime.

    Args:
        time_str: an ISO 8601 string (a trailing 'Z' is read as UTC) or a
            numeric Unix epoch value in seconds.
            NOTE(review): numeric timestamps are presumably what the AWS CLI
            emits when its timestamp format is "wire" — confirm against the
            CLI version used by do/train.

    Returns:
        datetime | None: always timezone-aware; a timestamp without an offset
        is assumed to be UTC. None when the input is empty or cannot be
        parsed.
    """
    if not time_str:
        return None
    try:
        if isinstance(time_str, (int, float)) and not isinstance(time_str, bool):
            return datetime.fromtimestamp(time_str, tz=timezone.utc)
        text = str(time_str).strip()
        # Only a trailing 'Z' is a UTC designator; datetime.fromisoformat()
        # rejects it on Python < 3.11, so rewrite it as an explicit offset.
        if text.endswith(('Z', 'z')):
            text = text[:-1] + '+00:00'
        parsed = datetime.fromisoformat(text)
    except (ValueError, TypeError, OverflowError, OSError):
        return None
    if parsed.tzinfo is None:
        # A naive value cannot be subtracted from an aware "now" (TypeError).
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def calculate_elapsed(start_time_str):
    """Return the seconds between ``start_time_str`` and the current UTC time.

    The timestamp is parsed with parse_iso_time(); None is returned when that
    yields nothing. The result is clamped so it is never negative.
    """
    started_at = parse_iso_time(start_time_str)
    if not started_at:
        return None
    delta = datetime.now(timezone.utc) - started_at
    return max(0, delta.total_seconds())
def display_status(job_data):
    """Print a human-readable summary of a DescribeTrainingJob response.

    Args:
        job_data: the decoded DescribeTrainingJob JSON object (a dict).

    The summary goes to stdout, framed by a blank line before and after, and
    contains (when the data is present): status, secondary status, elapsed or
    total training time, instance details, spot savings for completed spot
    jobs, final metrics, the artifact/output location for completed jobs, the
    failure reason for failed jobs, and a hint when a spot instance was
    interrupted.
    """
    # Pull everything out first so malformed input fails before any output.
    status = job_data.get('TrainingJobStatus', 'Unknown')
    phase = job_data.get('SecondaryStatus', '')
    reason = job_data.get('FailureReason', '')
    started = job_data.get('TrainingStartTime', '')
    billable = job_data.get('BillableTimeInSeconds')
    trained = job_data.get('TrainingTimeInSeconds')
    metrics = job_data.get('FinalMetricDataList', [])
    output_uri = job_data.get('OutputDataConfig', {}).get('S3OutputPath', '')
    artifact_uri = job_data.get('ModelArtifacts', {}).get('S3ModelArtifacts', '')
    resources = job_data.get('ResourceConfig', {})
    machine = resources.get('InstanceType', '')
    machine_count = resources.get('InstanceCount', 1)
    uses_spot = job_data.get('EnableManagedSpotTraining', False)
    finished = status == 'Completed'

    print('')
    print(f'   {STATUS_EMOJI.get(status, "❓")} Status: {status}')

    # Secondary status, with its description when one is known
    if phase:
        explanation = SECONDARY_DESCRIPTIONS.get(phase, '')
        suffix = f' ({explanation})' if explanation else ''
        print(f'   📍 Phase:  {phase}{suffix}')

    # Running jobs show time since start; others show the reported total
    if status == 'InProgress' and started:
        running_for = calculate_elapsed(started)
        if running_for is not None:
            print(f'   ⏱️  Elapsed: {format_duration(running_for)}')
    elif trained is not None:
        print(f'   ⏱️  Training time: {format_duration(trained)}')

    # Instance type, count (only when more than one) and spot marker
    if machine:
        pieces = [f'{machine}']
        if machine_count and machine_count > 1:
            pieces.append(f' x {machine_count}')
        if uses_spot:
            pieces.append(' (spot)')
        print(f'   🖥️  Instance: {"".join(pieces)}')

    # Spot savings: training time minus billable time, completed jobs only
    if finished and uses_spot and billable is not None and trained is not None:
        saved = trained - billable
        if trained > 0:
            saved_pct = (saved / trained) * 100
            print(f'   💰 Spot savings: {format_duration(saved)} saved ({saved_pct:.0f}% discount)')
            print(f'      Billable: {format_duration(billable)} / Total: {format_duration(trained)}')

    # Final metrics; floats get more decimals the smaller they are
    if metrics:
        print('   📈 Metrics:')
        for entry in metrics:
            label = entry.get('MetricName', 'unknown')
            reading = entry.get('Value', 0)
            if not isinstance(reading, float):
                shown = f'{reading}'
            elif abs(reading) < 0.001:
                shown = f'{reading:.6f}'
            elif abs(reading) < 1:
                shown = f'{reading:.4f}'
            else:
                shown = f'{reading:.2f}'
            print(f'      {label}: {shown}')

    # Where the results landed (model artifacts preferred over output path)
    if finished:
        if artifact_uri:
            print(f'   📦 Artifacts: {artifact_uri}')
        elif output_uri:
            print(f'   📦 Output: {output_uri}')

    # Failure reason plus how to retry
    if status == 'Failed' and reason:
        print(f'   💥 Reason: {reason}')
        print('')
        print('   To start a new job: ./do/train --force')

    # Spot interruption guidance
    if phase == 'Interrupted':
        print('')
        print('   ℹ️  Spot instance was interrupted. The job will automatically')
        print('      resume from the last checkpoint. Re-run ./do/train to poll.')

    print('')
def main():
    """Entry point: decode the JSON on stdin and print its status summary.

    Exits with status 1, after a message on stderr, when stdin does not hold
    valid JSON.
    """
    try:
        response = json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        print(f'❌ Failed to parse DescribeTrainingJob response: {exc}', file=sys.stderr)
        sys.exit(1)
    display_status(response)


if __name__ == '__main__':
    main()

package/templates/do/.tune_helper.py CHANGED Viewed

@@ -176,7 +176,7 @@ def cmd_submit(args):
             )
         elif "ValidationException" in error_msg and "license" in error_msg.lower():
             _error_exit(
-                f"Model license not accepted. Accept the license in JumpStart before "
+                f"Model license not accepted. Accept the model license before "
                 f"using this model for customization. Details: {error_msg}"
             )
         else:
@@ -660,7 +660,7 @@ def main():
     # ── submit ────────────────────────────────────────────────────────────────
     submit_parser = subparsers.add_parser("submit", help="Submit a customization job")
-    submit_parser.add_argument("--model-id", required=True, help="JumpStart model ID")
+    submit_parser.add_argument("--model-id", required=True, help="Model ID")
     submit_parser.add_argument("--technique", required=True,
                                choices=["sft", "dpo", "rlaif", "rlvr"],
                                help="Customization technique")

package/templates/do/lib/feedback.sh ADDED Viewed

@@ -0,0 +1,41 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Shared helper: post-completion feedback loop for training and tuning jobs.
+# Sourced by do/tune and do/train — prints artifact locations and deployment suggestions.
# print_completion_feedback OUTPUT_PATH OUTPUT_TYPE JOB_NAME [MODEL_PACKAGE_ARN]
#
#   Prints a completion summary: the job name, where the artifacts are, the
#   model package ARN when one is given, and follow-up deployment commands
#   chosen by OUTPUT_TYPE.
#
#   Arguments:
#     $1 - output_path:       S3 URI to the output artifacts
#     $2 - output_type:       "adapter" or "full-model" (any other value
#                             prints no deployment suggestions)
#     $3 - job_name:          Job name for reference
#     $4 - model_package_arn: (optional) Model package ARN if registered
print_completion_feedback() {
    local output_path="$1"
    local output_type="$2"
    local job_name="$3"
    local model_package_arn="${4:-}"

    echo ""
    echo "✅ Training complete: ${job_name}"
    echo ""
    echo "   Artifacts: ${output_path}"
    if [[ -n "${model_package_arn}" ]]; then
        echo "   Model Package: ${model_package_arn}"
    fi
    echo ""
    echo "   Next steps:"
    case "${output_type}" in
        adapter)
            echo "     • Deploy as LoRA adapter:  ./do/adapter add my-adapter --weights ${output_path}"
            echo "     • (Requires running endpoint with LoRA enabled)"
            ;;
        full-model)
            echo "     • Deploy as new IC:        ./do/add-ic my-model --model-data ${output_path}"
            echo "     • Replace current base:    ./do/deploy --force-ic --model-data ${output_path}"
            ;;
    esac
    echo ""
}

package/templates/do/register CHANGED Viewed

@@ -191,8 +191,14 @@ fi
 # ============================================================
 # DEPLOYMENT_CONFIG format: <architecture>-<backend> (e.g., transformers-vllm, http-flask, triton-fil)
-ARCHITECTURE="${DEPLOYMENT_CONFIG%%-*}"
-BACKEND="${DEPLOYMENT_CONFIG#*-}"
+# Special case: marketplace has no backend
+if [ "${DEPLOYMENT_CONFIG}" = "marketplace" ]; then
+    ARCHITECTURE="marketplace"
+    BACKEND=""
+else
+    ARCHITECTURE="${DEPLOYMENT_CONFIG%%-*}"
+    BACKEND="${DEPLOYMENT_CONFIG#*-}"
+fi
 echo "📋 Registering deployment to registry"
 echo "   Project: ${PROJECT_NAME}"