npm - @aws/ml-container-creator - Versions diffs - 0.10.0 → 0.12.1 - Mend

@aws/ml-container-creator 0.10.0 → 0.12.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (90)

package/LICENSE-THIRD-PARTY +9304 -0
package/bin/cli.js +2 -0
package/config/bootstrap-e2e-stack.json +341 -0
package/config/bootstrap-stack.json +40 -3
package/config/parameter-schema-v2.json +33 -22
package/config/tune-catalog.json +1781 -0
package/infra/ci-harness/buildspec.yml +1 -0
package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
package/package.json +53 -67
package/servers/base-image-picker/index.js +121 -121
package/servers/e2e-status/index.js +297 -0
package/servers/e2e-status/manifest.json +14 -0
package/servers/e2e-status/package.json +15 -0
package/servers/endpoint-picker/LICENSE +202 -0
package/servers/endpoint-picker/index.js +536 -0
package/servers/endpoint-picker/manifest.json +14 -0
package/servers/endpoint-picker/package.json +18 -0
package/servers/hyperpod-cluster-picker/index.js +125 -125
package/servers/instance-sizer/index.js +166 -153
package/servers/instance-sizer/lib/instance-ranker.js +120 -76
package/servers/instance-sizer/lib/model-resolver.js +61 -61
package/servers/instance-sizer/lib/quota-resolver.js +113 -113
package/servers/instance-sizer/lib/vram-estimator.js +31 -31
package/servers/lib/bedrock-client.js +38 -38
package/servers/lib/catalogs/instances.json +27 -0
package/servers/lib/catalogs/model-servers.json +201 -3
package/servers/lib/custom-validators.js +13 -13
package/servers/lib/dynamic-resolver.js +4 -4
package/servers/marketplace-picker/index.js +342 -0
package/servers/marketplace-picker/manifest.json +14 -0
package/servers/marketplace-picker/package.json +18 -0
package/servers/model-picker/index.js +382 -382
package/servers/region-picker/index.js +56 -56
package/servers/workload-picker/LICENSE +202 -0
package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
package/servers/workload-picker/index.js +171 -0
package/servers/workload-picker/manifest.json +16 -0
package/servers/workload-picker/package.json +16 -0
package/src/app.js +12 -3
package/src/lib/bootstrap-command-handler.js +609 -15
package/src/lib/bootstrap-config.js +36 -0
package/src/lib/bootstrap-profile-manager.js +48 -41
package/src/lib/ci-register-helpers.js +74 -0
package/src/lib/config-loader.js +3 -0
package/src/lib/config-manager.js +7 -0
package/src/lib/config-validator.js +1 -1
package/src/lib/cuda-resolver.js +17 -8
package/src/lib/generated/cli-options.js +319 -314
package/src/lib/generated/parameter-matrix.js +672 -661
package/src/lib/generated/validation-rules.js +76 -72
package/src/lib/path-prover-brain.js +664 -0
package/src/lib/prompts/infrastructure-prompts.js +2 -2
package/src/lib/prompts/model-prompts.js +6 -0
package/src/lib/prompts/project-prompts.js +12 -0
package/src/lib/secrets-prompt-runner.js +4 -0
package/src/lib/template-manager.js +1 -1
package/src/lib/template-variable-resolver.js +87 -1
package/src/lib/tune-catalog-validator.js +37 -4
package/templates/Dockerfile +9 -0
package/templates/code/adapter_sidecar.py +444 -0
package/templates/code/serve +6 -0
package/templates/code/serve.d/vllm.ejs +1 -1
package/templates/do/.benchmark_writer.py +1476 -0
package/templates/do/.tune_helper.py +982 -57
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/adapter +154 -0
package/templates/do/benchmark +639 -85
package/templates/do/build +5 -0
package/templates/do/clean.d/async-inference.ejs +5 -0
package/templates/do/clean.d/batch-transform.ejs +5 -0
package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
package/templates/do/clean.d/managed-inference.ejs +5 -0
package/templates/do/config +115 -45
package/templates/do/deploy.d/async-inference.ejs +30 -3
package/templates/do/deploy.d/batch-transform.ejs +29 -3
package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
package/templates/do/deploy.d/managed-inference.ejs +216 -14
package/templates/do/lib/endpoint-config.sh +1 -1
package/templates/do/lib/profile.sh +44 -0
package/templates/do/optimize +106 -37
package/templates/do/push +5 -0
package/templates/do/register +94 -0
package/templates/do/stage +567 -0
package/templates/do/submit +7 -0
package/templates/do/test +14 -0
package/templates/do/tune +382 -59
package/templates/do/validate +44 -4

package/templates/do/.tune_helper.py CHANGED

@@ -10,30 +10,44 @@ Subcommands:
     resolve  - Resolve output artifact path from job
     stage-hf - Download HF dataset to S3
     validate - Validate dataset format against schema
+    discover - Discover tune-eligible models from JumpStart Hub
 All output is JSON on stdout for bash consumption.
 """
 import argparse
+import fnmatch
 import json
 import os
+import re
 import sys
 import time
+import warnings
+# Suppress noisy dependency version warnings from requests/urllib3
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", message=".*urllib3.*")
+warnings.filterwarnings("ignore", message=".*charset_normalizer.*")
 # ── Inline dependency check ───────────────────────────────────────────────────
-MIN_SAGEMAKER_VERSION = "2.232.0"
+MIN_SAGEMAKER_VERSION = "3.0"
+_GLOB_METACHAR_RE = re.compile(r'[*?\[]')
 def _check_sagemaker_sdk():
     """Verify sagemaker SDK is installed with minimum version."""
     try:
         import sagemaker  # noqa: F401
+        # SDK v3 removed __version__; use importlib.metadata instead
+        from importlib.metadata import version as pkg_version
         from packaging.version import Version
-        if Version(sagemaker.__version__) < Version(MIN_SAGEMAKER_VERSION):
+        installed = pkg_version("sagemaker")
+        if Version(installed) < Version(MIN_SAGEMAKER_VERSION):
             _error_exit(
-                f"sagemaker SDK version {sagemaker.__version__} is below minimum "
+                f"sagemaker SDK version {installed} is below minimum "
                 f"required version {MIN_SAGEMAKER_VERSION}. "
-                f"Please upgrade: pip install 'sagemaker>={MIN_SAGEMAKER_VERSION}'"
+                f"Please upgrade: pip install --upgrade 'sagemaker>={MIN_SAGEMAKER_VERSION}'"
             )
     except ImportError:
         _error_exit(
@@ -65,11 +79,37 @@ def cmd_submit(args):
     Returns: {"job_name": str, "job_arn": str, "mlflow_url": str|None}
     """
+    # Suppress SDK rich logging that pollutes stdout (we only want JSON output)
+    import logging
+    logging.disable(logging.CRITICAL)
+    os.environ["SAGEMAKER_LOG_LEVEL"] = "CRITICAL"
+    # Ensure region is set before ANY sagemaker import (v3 creates boto3 clients at import time)
+    region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
+    if region:
+        os.environ["AWS_DEFAULT_REGION"] = region
+        os.environ.setdefault("AWS_REGION", region)
     _check_sagemaker_sdk()
-    from sagemaker.modules.train.sft_trainer import SFTTrainer
-    from sagemaker.modules.train.dpo_trainer import DPOTrainer
-    from sagemaker.modules.train.common import TrainingType
+    # SDK v3 moved trainers from sagemaker.modules.train → sagemaker.train
+    # Note: catch Exception (not just ImportError) because SDK v3 AIRHub
+    # creates boto3 clients at class-definition time, which can raise
+    # NoRegionError if AWS_DEFAULT_REGION is not set despite our best efforts.
+    try:
+        from sagemaker.train.sft_trainer import SFTTrainer
+        from sagemaker.train.dpo_trainer import DPOTrainer
+        from sagemaker.train.common import TrainingType
+    except Exception:
+        try:
+            from sagemaker.modules.train.sft_trainer import SFTTrainer
+            from sagemaker.modules.train.dpo_trainer import DPOTrainer
+            from sagemaker.modules.train.common import TrainingType
+        except Exception:
+            _error_exit(
+                "SFTTrainer not found. Requires sagemaker>=3.0. "
+                "Install: pip install --upgrade 'sagemaker>=3.0'"
+            )
     # Technique → Trainer class mapping
     TRAINER_MAP = {
@@ -88,63 +128,164 @@ def cmd_submit(args):
     # Resolve training type
     training_type_map = {
         "lora": TrainingType.LORA,
-        "full-rank": TrainingType.FULL_RANK,
+        "full-rank": getattr(TrainingType, 'FULL_RANK', None) or getattr(TrainingType, 'FULL', None),
     }
     training_type = training_type_map.get(args.training_type)
     if not training_type:
         _error_exit(f"Unsupported training type: {args.training_type}")
     # Build hyperparameters dict from optional overrides
+    # Map CLI flag names to SDK v3 fine-tuning option names
     hyperparameters = {}
     if args.epochs is not None:
-        hyperparameters["epochs"] = args.epochs
+        hyperparameters["max_epochs"] = args.epochs
     if args.learning_rate is not None:
         hyperparameters["learning_rate"] = args.learning_rate
     if args.max_seq_length is not None:
-        hyperparameters["max_seq_length"] = args.max_seq_length
+        hyperparameters["dataset_max_len"] = args.max_seq_length
     if args.lora_rank is not None:
         hyperparameters["lora_rank"] = args.lora_rank
     if args.lora_alpha is not None:
         hyperparameters["lora_alpha"] = args.lora_alpha
     if args.batch_size is not None:
-        hyperparameters["batch_size"] = args.batch_size
-    # Build trainer kwargs
-    trainer_kwargs = {
-        "model_id": args.model_id,
-        "training_type": training_type,
-        "train_data_uri": args.dataset_s3_uri,
-        "output_path": f"s3://{args.output_bucket}/{args.project_name}/tune/{technique}/",
-        "role": args.role_arn,
-        "job_name": args.job_name,
-    }
+        hyperparameters["global_batch_size"] = args.batch_size
-    # Add model package group for artifact registration
-    if args.model_package_group:
-        trainer_kwargs["model_package_group_name"] = args.model_package_group
+    # Build trainer kwargs — API differs between SDK v2 and v3
+    output_path = f"s3://{args.output_bucket}/{args.project_name}/tune/{technique}/"
-    # Add hyperparameters if any were specified
-    if hyperparameters:
-        trainer_kwargs["hyperparameters"] = hyperparameters
+    # Detect SDK version to use appropriate API
+    sdk_v3 = hasattr(trainer_cls, 'role')  # v3 trainers have role as a settable attribute
-    # Add evaluator config for RLVR/RLAIF techniques
-    if technique in ("rlvr", "rlaif"):
-        if args.reward_function:
-            trainer_kwargs["evaluator_config"] = {
-                "reward_function_arn": args.reward_function
+    try:
+        if sdk_v3:
+            # SDK v3 API: positional model, keyword training_dataset, s3_output_path
+            trainer_kwargs = {
+                "model": args.model_id,
+                "training_type": training_type,
+                "training_dataset": args.dataset_s3_uri,
+                "s3_output_path": output_path,
             }
-        elif args.reward_prompt:
-            trainer_kwargs["evaluator_config"] = {
-                "reward_prompt_s3_uri": args.reward_prompt
+            # Accept EULA for gated models (e.g., Meta Llama)
+            # SDK v3.12+ accepts accept_eula as a constructor parameter
+            if args.accept_eula:
+                trainer_kwargs["accept_eula"] = True
+            # Resolve model package group — create if it doesn't exist
+            mpg_name = args.model_package_group or f"{args.project_name}-tune-models"
+            try:
+                import boto3 as _boto3
+                _sm = _boto3.client("sagemaker", region_name=args.region or os.environ.get("AWS_REGION", "us-west-2"))
+                _sm.describe_model_package_group(ModelPackageGroupName=mpg_name)
+            except Exception as _mpg_err:
+                if "does not exist" in str(_mpg_err) or "ValidationException" in str(_mpg_err):
+                    try:
+                        _sm.create_model_package_group(
+                            ModelPackageGroupName=mpg_name,
+                            ModelPackageGroupDescription=f"Fine-tuned models for {args.project_name}",
+                        )
+                    except Exception:
+                        pass  # May already exist or lack permissions — let the trainer handle it
+            trainer_kwargs["model_package_group"] = mpg_name
+            trainer = trainer_cls(**trainer_kwargs)
+            trainer.role = args.role_arn
+            trainer.base_job_name = args.job_name
+            if hyperparameters:
+                # SDK v3 expects hyperparameters with a .to_dict() method
+                # Wrap our plain dict to satisfy the interface
+                hp_obj = trainer.hyperparameters
+                if hp_obj is not None and hasattr(hp_obj, '__dict__'):
+                    for k, v in hyperparameters.items():
+                        setattr(hp_obj, k, v)
+                else:
+                    # Fallback: create a simple wrapper
+                    class _HyperParams:
+                        def __init__(self, d):
+                            self._data = d
+                            for k, v in d.items():
+                                setattr(self, k, v)
+                        def to_dict(self):
+                            return {k: v for k, v in self._data.items() if v is not None}
+                    trainer.hyperparameters = _HyperParams(hyperparameters)
+            # Use MLCC-owned MLflow app if available (avoids permission issues with Studio apps)
+            mlflow_arn = os.environ.get('MLFLOW_APP_ARN', '')
+            if mlflow_arn:
+                trainer.mlflow_resource_arn = mlflow_arn
+            # Suppress SDK print() output (e.g., "Training Job Name: ...")
+            # that pollutes stdout and breaks JSON parsing by the shell script
+            import io as _io
+            _orig_stdout = sys.stdout
+            sys.stdout = _io.StringIO()
+            try:
+                trainer.train(training_dataset=args.dataset_s3_uri, wait=False)
+            finally:
+                sys.stdout = _orig_stdout
+        else:
+            # SDK v2 API: model_id, train_data_uri, output_path, role, job_name
+            trainer_kwargs = {
+                "model_id": args.model_id,
+                "training_type": training_type,
+                "train_data_uri": args.dataset_s3_uri,
+                "output_path": output_path,
+                "role": args.role_arn,
+                "job_name": args.job_name,
             }
-    try:
-        trainer = trainer_cls(**trainer_kwargs)
-        trainer.train(wait=False)
+            if args.model_package_group:
+                trainer_kwargs["model_package_group_name"] = args.model_package_group
+            if hyperparameters:
+                trainer_kwargs["hyperparameters"] = hyperparameters
+            # Add evaluator config for RLVR/RLAIF techniques
+            if technique in ("rlvr", "rlaif"):
+                if args.reward_function:
+                    trainer_kwargs["evaluator_config"] = {"reward_function_arn": args.reward_function}
+                elif args.reward_prompt:
+                    trainer_kwargs["evaluator_config"] = {"reward_prompt_s3_uri": args.reward_prompt}
+            # Accept EULA for gated models (e.g., Meta Llama)
+            if args.accept_eula:
+                trainer_kwargs["accept_eula"] = True
+            trainer = trainer_cls(**trainer_kwargs)
+            # Suppress SDK print() output that pollutes stdout
+            import io as _io
+            _orig_stdout = sys.stdout
+            sys.stdout = _io.StringIO()
+            try:
+                trainer.train(wait=False)
+            finally:
+                sys.stdout = _orig_stdout
         # Extract job info from the trainer
-        job_name = trainer.training_job_name
+        job_name = getattr(trainer, 'training_job_name', None) or getattr(trainer, 'base_job_name', None)
         job_arn = getattr(trainer, "training_job_arn", None)
+        latest_job = getattr(trainer, 'latest_training_job', None)
+        if latest_job:
+            job_name = job_name or getattr(latest_job, 'name', None) or getattr(latest_job, 'job_name', None)
+            job_arn = job_arn or getattr(latest_job, 'arn', None)
+        # If we still don't have the actual job name (SDK appends suffix),
+        # query ListTrainingJobs to find it by our base_job_name prefix
+        if not job_name or job_name == args.job_name:
+            import boto3 as _boto3
+            _sm = _boto3.client("sagemaker", region_name=args.region or os.environ.get("AWS_REGION", "us-west-2"))
+            try:
+                # Brief delay to allow job to register
+                time.sleep(2)
+                list_resp = _sm.list_training_jobs(
+                    NameContains=args.job_name,
+                    SortBy="CreationTime",
+                    SortOrder="Descending",
+                    MaxResults=1,
+                )
+                summaries = list_resp.get("TrainingJobSummaries", [])
+                if summaries:
+                    job_name = summaries[0]["TrainingJobName"]
+                    job_arn = summaries[0].get("TrainingJobArn", job_arn)
+            except Exception:
+                pass  # Fall back to whatever we have
         # Attempt to get MLflow URL if available
         mlflow_url = None
@@ -154,7 +295,7 @@ def cmd_submit(args):
             pass
         _output({
-            "job_name": job_name,
+            "job_name": job_name or args.job_name,
             "job_arn": job_arn or "",
             "mlflow_url": mlflow_url,
             "model_package_group": args.model_package_group or "",
@@ -176,8 +317,15 @@ def cmd_submit(args):
             )
         elif "ValidationException" in error_msg and "license" in error_msg.lower():
             _error_exit(
-                f"Model license not accepted. Accept the model license before "
-                f"using this model for customization. Details: {error_msg}"
+                f"Model requires EULA acceptance. Re-run with --accept-eula flag: "
+                f"./do/tune --technique {technique} --accept-eula ... "
+                f"Details: {error_msg}"
+            )
+        elif "ValidationException" in error_msg and "eula" in error_msg.lower():
+            _error_exit(
+                f"Model requires EULA acceptance. Re-run with --accept-eula flag: "
+                f"./do/tune --technique {technique} --accept-eula ... "
+                f"Details: {error_msg}"
             )
         else:
             _error_exit(f"Failed to submit training job: {error_msg}")
@@ -189,6 +337,9 @@ def cmd_submit(args):
 def cmd_status(args):
     """Query job status via DescribeTrainingJob.
+    Falls back to ListTrainingJobs with name-contains if exact name not found
+    (SDK v3 appends a timestamp suffix to the base job name).
     Returns: {"status": str, "failure_reason": str|None,
               "metrics": dict|None, "elapsed_seconds": int}
     """
@@ -196,16 +347,36 @@ def cmd_status(args):
     client = boto3.client("sagemaker", region_name=args.region)
+    # Try exact name first
+    response = None
     try:
         response = client.describe_training_job(TrainingJobName=args.job_name)
     except client.exceptions.ClientError as e:
         error_code = e.response["Error"]["Code"]
-        if error_code == "ValidationException":
-            _error_exit(f"Training job not found: {args.job_name}")
-        _error_exit(f"Failed to describe training job: {e}")
+        if error_code != "ValidationException":
+            _error_exit(f"Failed to describe training job: {e}")
+        # Job not found by exact name — try name-contains search
     except Exception as e:
         _error_exit(f"Failed to describe training job: {e}")
+    # Fallback: search by name prefix (SDK appends timestamp suffix)
+    if response is None:
+        try:
+            list_response = client.list_training_jobs(
+                NameContains=args.job_name,
+                SortBy="CreationTime",
+                SortOrder="Descending",
+                MaxResults=1,
+            )
+            summaries = list_response.get("TrainingJobSummaries", [])
+            if summaries:
+                actual_name = summaries[0]["TrainingJobName"]
+                response = client.describe_training_job(TrainingJobName=actual_name)
+            else:
+                _error_exit(f"Training job not found: {args.job_name}")
+        except Exception as e:
+            _error_exit(f"Failed to find training job: {e}")
     status = response.get("TrainingJobStatus", "Unknown")
     failure_reason = response.get("FailureReason")
@@ -278,6 +449,14 @@ def cmd_resolve(args):
     # Determine output type from training type
     output_type = "adapter" if args.training_type == "lora" else "full-model"
+    # For LoRA adapters, the actual adapter files are in checkpoints/hf/ subdirectory
+    # The S3ModelArtifacts path points to the top-level output directory
+    if output_type == "adapter":
+        # Ensure trailing slash for directory path
+        if not artifact_path.endswith("/"):
+            artifact_path += "/"
+        artifact_path += "checkpoints/hf/"
     # Try to find model package ARN if a model package group was used
     model_package_arn = None
     if args.model_package_group:
@@ -306,6 +485,332 @@ def cmd_resolve(args):
 # ── Subcommand: stage-hf ─────────────────────────────────────────────────────
+def _get_required_columns(technique):
+    """Return the required column names for a given technique."""
+    schemas = {
+        "sft": ["prompt", "completion"],
+        "dpo": ["prompt", "chosen", "rejected"],
+        "rlaif": ["prompt"],  # prompt is an array of messages
+        "rlvr": ["prompt"],   # prompt is an array of messages
+    }
+    return schemas.get(technique, ["prompt", "completion"])
+def _suggest_column_map(detected_columns, required_columns):
+    """Suggest a --column-map based on common column name patterns."""
+    # Common aliases for each required field
+    aliases = {
+        "prompt": ["question", "instruction", "input", "query", "text", "context", "user", "human"],
+        "completion": ["answer", "output", "response", "assistant", "target", "label", "reply"],
+        "chosen": ["chosen", "preferred", "good", "positive", "accepted"],
+        "rejected": ["rejected", "dispreferred", "bad", "negative", "refused"],
+    }
+    suggestions = {}
+    for req_col in required_columns:
+        if req_col in detected_columns:
+            continue  # Already present
+        # Check aliases
+        for alias in aliases.get(req_col, []):
+            if alias in detected_columns:
+                suggestions[req_col] = alias
+                break
+    if not suggestions:
+        return None
+    # Format as --column-map string
+    mapping_str = ",".join(f"{k}={v}" for k, v in suggestions.items())
+    return mapping_str
+def _parse_column_map(column_map_str):
+    """Parse a column map string like 'prompt=question,completion=answer' into a dict."""
+    if not column_map_str:
+        return {}
+    mapping = {}
+    for pair in column_map_str.split(","):
+        pair = pair.strip()
+        if "=" not in pair:
+            continue
+        target, source = pair.split("=", 1)
+        mapping[target.strip()] = source.strip()
+    return mapping
+def _apply_column_map(record, column_map):
+    """Apply column mapping to a record: rename source columns to target names."""
+    if not column_map:
+        return record
+    mapped = dict(record)
+    for target, source in column_map.items():
+        if source in mapped and target not in mapped:
+            mapped[target] = mapped.pop(source)
+    return mapped
+def _detect_chat_columns(record, required_columns, schema_types):
+    """Detect which required columns contain chat-format data.
+    Only inspects columns whose schema type is "string". Columns with
+    "array" type (RLAIF/RLVR) are excluded from detection entirely.
+    Args:
+        record: The first record (dict) after column mapping
+        required_columns: List of required column names for the technique
+        schema_types: Dict mapping column name -> expected type from schema
+    Returns:
+        dict: Maps column_name -> detection_result where detection_result is:
+              {"type": "single_dict"} or
+              {"type": "message_list", "strategy": "extract"|"same_role"|"multi_role", "count": int}
+              Only columns detected as chat-format are included.
+    """
+    results = {}
+    for column in required_columns:
+        # Only inspect columns whose schema type is "string"
+        if schema_types.get(column) != "string":
+            continue
+        # Skip if column is not present in the record
+        if column not in record:
+            continue
+        value = record[column]
+        # Check for Single_Message_Dict: dict with both "role" and "content" keys
+        if isinstance(value, dict) and "role" in value and "content" in value:
+            results[column] = {"type": "single_dict"}
+            continue
+        # Check for Message_List: non-empty list whose first element is a dict
+        # with both "role" and "content" keys
+        if isinstance(value, list) and len(value) > 0:
+            first_element = value[0]
+            if isinstance(first_element, dict) and "role" in first_element and "content" in first_element:
+                count = len(value)
+                if count == 1:
+                    strategy = "extract"
+                elif all(
+                    isinstance(elem, dict) and elem.get("role") == first_element["role"]
+                    for elem in value
+                ):
+                    strategy = "same_role"
+                else:
+                    strategy = "multi_role"
+                results[column] = {"type": "message_list", "strategy": strategy, "count": count}
+                continue
+    return results
+def _flatten_value(value, detection_result):
+    """Flatten a chat-format column value to a plain string.
+    Args:
+        value: The column value (dict, list, string, or other)
+        detection_result: The detection metadata for this column
+    Returns:
+        str: The flattened string value
+    Raises:
+        ValueError: If the value cannot be converted at all (str() also fails)
+    """
+    import json
+    # Edge case: string pass-through
+    if isinstance(value, str):
+        return value
+    # Edge case: None → ""
+    if value is None:
+        return ""
+    # Edge case: empty list → ""
+    if isinstance(value, list) and len(value) == 0:
+        return ""
+    det_type = detection_result.get("type")
+    if det_type == "single_dict":
+        if isinstance(value, dict):
+            role = value.get("role", "")
+            if "content" in value:
+                content = value["content"]
+                if isinstance(content, str):
+                    return content
+                # Non-string content: format as "role: json_content"
+                return f"{role}: {json.dumps(content)}"
+            else:
+                # No content key: format as "role: remaining_values"
+                remaining = {k: v for k, v in value.items() if k != "role"}
+                return f"{role}: {json.dumps(remaining)}"
+    elif det_type == "message_list":
+        strategy = detection_result.get("strategy")
+        if isinstance(value, list) and len(value) > 0:
+            if strategy == "extract":
+                # Extract single element's content
+                elem = value[0]
+                if isinstance(elem, dict):
+                    content = elem.get("content")
+                    if content is None:
+                        return ""
+                    if isinstance(content, str):
+                        return content
+                    return f"{elem.get('role', '')}: {json.dumps(content)}"
+                return ""
+            elif strategy == "same_role":
+                # Join all content fields with newline
+                parts = []
+                for elem in value:
+                    if isinstance(elem, dict):
+                        content = elem.get("content")
+                        if content is None or content == "":
+                            parts.append("")
+                        elif isinstance(content, str):
+                            parts.append(content)
+                        else:
+                            parts.append(json.dumps(content))
+                    else:
+                        parts.append("")
+                return "\n".join(parts)
+            elif strategy == "multi_role":
+                # Format as "role: content" per line
+                lines = []
+                for elem in value:
+                    if isinstance(elem, dict):
+                        role = elem.get("role", "")
+                        content = elem.get("content")
+                        if content is None:
+                            content = ""
+                        elif not isinstance(content, str):
+                            content = json.dumps(content)
+                        lines.append(f"{role}: {content}")
+                    else:
+                        lines.append("")
+                return "\n".join(lines)
+    # Fallback for unexpected types: int/bool → str()
+    try:
+        return str(value)
+    except Exception as e:
+        raise ValueError(f"Cannot convert value to string: {e}")
+def _flatten_record(record, chat_columns):
+    """Apply flattening to all chat-format columns in a record.
+    Args:
+        record: The mapped record dict
+        chat_columns: Detection results from _detect_chat_columns
+    Returns:
+        dict: The record with chat-format columns replaced by flat strings
+    """
+    flattened = dict(record)
+    for column_name, detection_result in chat_columns.items():
+        if column_name in flattened:
+            flattened[column_name] = _flatten_value(flattened[column_name], detection_result)
+    return flattened
+def _log_flatten_info(chat_columns, no_transform):
+    """Log auto-flatten detection and strategy information.
+    Logs regardless of --no-transform state (per requirement 6.3/6.4).
+    When --no-transform is active, detection still runs for logging purposes.
+    All output goes to stderr to avoid polluting stdout JSON output.
+    Args:
+        chat_columns: Detection results dict (from _detect_chat_columns)
+        no_transform: Whether --no-transform flag is active
+    """
+    for column_name, detection_result in chat_columns.items():
+        print(f"\u2139\ufe0f  Auto-converted column '{column_name}' from chat-format to string", file=sys.stderr)
+        det_type = detection_result.get("type")
+        if det_type == "single_dict":
+            print("    Format: extracted content field", file=sys.stderr)
+        elif det_type == "message_list":
+            strategy = detection_result.get("strategy")
+            count = detection_result.get("count", 0)
+            if strategy == "multi_role":
+                print(f"    Format: role: content (multi-turn, {count} messages)", file=sys.stderr)
+            elif strategy == "same_role":
+                print(f"    Format: newline-joined content ({count} messages, same role)", file=sys.stderr)
+            elif strategy == "extract":
+                print("    Format: extracted content field", file=sys.stderr)
+def _get_schema_types(technique):
+    """Return a dict mapping column names to their expected types for a technique.
+    Args:
+        technique: One of 'sft', 'dpo', 'rlaif', 'rlvr'
+    Returns:
+        dict: Maps column_name -> expected type ("string" or "array")
+    """
+    schemas = {
+        "sft": {"prompt": "string", "completion": "string"},
+        "dpo": {"prompt": "string", "chosen": "string", "rejected": "string"},
+        "rlaif": {"prompt": "array"},
+        "rlvr": {"prompt": "array"},
+    }
+    return schemas.get(technique, {"prompt": "string", "completion": "string"})
+def _validate_dataset_columns(first_record, technique, column_map_str, dataset_id):
+    """Validate that the first record has required columns after mapping.
+    Returns (mapped_record, column_map_dict) on success.
+    Calls _error_exit with helpful suggestions on failure.
+    """
+    column_map = _parse_column_map(column_map_str)
+    mapped = _apply_column_map(first_record, column_map)
+    required = _get_required_columns(technique)
+    detected = list(first_record.keys())
+    missing = [col for col in required if col not in mapped]
+    if not missing:
+        return mapped, column_map
+    # Build helpful error message
+    lines = [
+        f"Dataset columns don't match {technique.upper()} requirements.",
+        f"",
+        f"   Required columns: {', '.join(required)}",
+        f"   Detected columns: {', '.join(detected)}",
+        f"   Missing: {', '.join(missing)}",
+    ]
+    # Suggest a column map
+    suggestion = _suggest_column_map(detected, required)
+    if suggestion:
+        lines.append(f"")
+        lines.append(f"   💡 Suggested fix:")
+        lines.append(f"      ./do/tune --technique {technique} --dataset hf://{dataset_id} --column-map {suggestion}")
+    else:
+        lines.append(f"")
+        lines.append(f"   💡 Use --column-map to rename columns:")
+        example_map = ",".join(f"{r}=<your_column>" for r in missing)
+        lines.append(f"      ./do/tune --technique {technique} --dataset hf://{dataset_id} --column-map {example_map}")
+    lines.append(f"")
+    lines.append(f"   First record sample:")
+    # Show truncated first record
+    for k, v in list(first_record.items())[:5]:
+        val_str = str(v)[:80] + ("..." if len(str(v)) > 80 else "")
+        lines.append(f"      {k}: {val_str}")
+    _error_exit("\n".join(lines))
 def cmd_stage_hf(args):
     """Download HF dataset to S3 using huggingface_hub.
@@ -313,6 +818,9 @@ def cmd_stage_hf(args):
     Returns: {"s3_uri": str, "num_records": int}
     """
+    # Suppress HF Hub progress bars — they pollute stdout which must be clean JSON
+    os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
     try:
         from huggingface_hub import hf_hub_download, HfApi
     except ImportError:
@@ -352,12 +860,28 @@ def cmd_stage_hf(args):
                 f"Available files: {', '.join(repo_files[:20])}"
             )
+        # Apply file filter if --hf-file is provided
+        hf_file_pattern = getattr(args, 'hf_file', None)
+        if hf_file_pattern:
+            data_files = _filter_data_files(data_files, hf_file_pattern)
         # Download and upload to S3
         s3_client = boto3.client("s3", region_name=args.region)
         s3_prefix = f"{args.project_name}/datasets/{org}/{name}/{split}"
         num_records = 0
         with tempfile.TemporaryDirectory() as tmpdir:
+            # Schema divergence check (skip for single file)
+            if len(data_files) > 1:
+                column_map = _parse_column_map(getattr(args, 'column_map', None))
+                technique = getattr(args, 'technique', 'sft')
+                no_transform = getattr(args, 'no_transform', False)
+                file_records = _inspect_file_schemas(
+                    data_files, dataset_id, hf_token, tmpdir,
+                    column_map, technique, no_transform
+                )
+                _check_schema_divergence(file_records, dataset_id, technique)
             for data_file in data_files:
                 local_path = hf_hub_download(
                     repo_id=dataset_id,
@@ -367,17 +891,162 @@ def cmd_stage_hf(args):
                     local_dir=tmpdir,
                 )
-                # Count records (lines for JSONL)
-                with open(local_path, "r") as f:
-                    for line in f:
-                        if line.strip():
-                            num_records += 1
-                # Upload to S3
-                s3_key = f"{s3_prefix}/{os.path.basename(data_file)}"
-                s3_client.upload_file(local_path, args.output_bucket, s3_key)
-        s3_uri = f"s3://{args.output_bucket}/{s3_prefix}/{os.path.basename(data_files[0])}"
+                # Handle Parquet files: convert to JSONL for SageMaker compatibility
+                if data_file.endswith(".parquet"):
+                    try:
+                        import pyarrow.parquet as pq
+                        import json as json_mod
+                        table = pq.read_table(local_path)
+                        jsonl_filename = os.path.splitext(os.path.basename(data_file))[0] + ".jsonl"
+                        jsonl_path = os.path.join(tmpdir, jsonl_filename)
+                        # Parse column map and validate against first record
+                        column_map = _parse_column_map(getattr(args, 'column_map', None))
+                        technique = getattr(args, 'technique', 'sft')
+                        no_transform = getattr(args, 'no_transform', False)
+                        batches = table.to_batches(max_chunksize=1)
+                        first_record = batches[0].to_pylist()[0] if batches else {}
+                        _validate_dataset_columns(first_record, technique, getattr(args, 'column_map', None), f"{org}/{name}")
+                        # Apply column map to first record for detection
+                        mapped_first = _apply_column_map(first_record, column_map)
+                        required_columns = _get_required_columns(technique)
+                        schema_types = _get_schema_types(technique)
+                        # Detect chat-format columns on first record
+                        chat_columns = _detect_chat_columns(mapped_first, required_columns, schema_types)
+                        # Log detection results if any chat columns found
+                        if chat_columns:
+                            _log_flatten_info(chat_columns, no_transform)
+                        # If --no-transform is active and chat-format detected, halt with error
+                        if no_transform and chat_columns:
+                            col_name = next(iter(chat_columns))
+                            det = chat_columns[col_name]
+                            det_type = det.get("type")
+                            strategy = det.get("strategy", "")
+                            if det_type == "single_dict":
+                                strategy_desc = "single message dict with role+content"
+                            elif strategy == "extract":
+                                strategy_desc = "message list (single element)"
+                            elif strategy == "same_role":
+                                strategy_desc = f"message list ({det.get('count', 0)} messages, same role)"
+                            elif strategy == "multi_role":
+                                strategy_desc = f"message list (multi-turn, {det.get('count', 0)} messages)"
+                            else:
+                                strategy_desc = det_type
+                            _error_exit(
+                                f"Column '{col_name}' contains chat-format data (detected: {det_type}) but --no-transform is active.\n\n"
+                                f"   Remove --no-transform to enable automatic conversion:\n"
+                                f"      ./do/tune --technique {technique} --dataset hf://{org}/{name} [--column-map ...]\n\n"
+                                f"   Detected format: {strategy_desc}"
+                            )
+                        with open(jsonl_path, "w", encoding="utf-8") as out_f:
+                            for batch in table.to_batches():
+                                for row in batch.to_pylist():
+                                    mapped_row = _apply_column_map(row, column_map)
+                                    if chat_columns and not no_transform:
+                                        mapped_row = _flatten_record(mapped_row, chat_columns)
+                                    out_f.write(json_mod.dumps(mapped_row, ensure_ascii=False) + "\n")
+                                    num_records += 1
+                        # Upload converted JSONL
+                        s3_key = f"{s3_prefix}/{jsonl_filename}"
+                        s3_client.upload_file(jsonl_path, args.output_bucket, s3_key)
+                    except ImportError:
+                        _error_exit(
+                            "Dataset is in Parquet format but pyarrow is not installed. "
+                            "Please install: pip install pyarrow"
+                        )
+                else:
+                    # JSONL file — validate columns and apply mapping
+                    import json as json_mod
+                    column_map = _parse_column_map(getattr(args, 'column_map', None))
+                    technique = getattr(args, 'technique', 'sft')
+                    no_transform = getattr(args, 'no_transform', False)
+                    # Read first line to validate
+                    chat_columns = {}
+                    with open(local_path, "r", encoding="utf-8", errors="replace") as f:
+                        first_line = f.readline().strip()
+                        if first_line:
+                            first_record = json_mod.loads(first_line)
+                            _validate_dataset_columns(first_record, technique, getattr(args, 'column_map', None), f"{org}/{name}")
+                            # Apply column map to first record for detection
+                            mapped_first = _apply_column_map(first_record, column_map)
+                            required_columns = _get_required_columns(technique)
+                            schema_types = _get_schema_types(technique)
+                            # Detect chat-format columns on first record
+                            chat_columns = _detect_chat_columns(mapped_first, required_columns, schema_types)
+                            # Log detection results if any chat columns found
+                            if chat_columns:
+                                _log_flatten_info(chat_columns, no_transform)
+                            # If --no-transform is active and chat-format detected, halt with error
+                            if no_transform and chat_columns:
+                                col_name = next(iter(chat_columns))
+                                det = chat_columns[col_name]
+                                det_type = det.get("type")
+                                strategy = det.get("strategy", "")
+                                if det_type == "single_dict":
+                                    strategy_desc = "single message dict with role+content"
+                                elif strategy == "extract":
+                                    strategy_desc = "message list (single element)"
+                                elif strategy == "same_role":
+                                    strategy_desc = f"message list ({det.get('count', 0)} messages, same role)"
+                                elif strategy == "multi_role":
+                                    strategy_desc = f"message list (multi-turn, {det.get('count', 0)} messages)"
+                                else:
+                                    strategy_desc = det_type
+                                _error_exit(
+                                    f"Column '{col_name}' contains chat-format data (detected: {det_type}) but --no-transform is active.\n\n"
+                                    f"   Remove --no-transform to enable automatic conversion:\n"
+                                    f"      ./do/tune --technique {technique} --dataset hf://{org}/{name} [--column-map ...]\n\n"
+                                    f"   Detected format: {strategy_desc}"
+                                )
+                    # Rewrite the file with mapped (and optionally flattened) columns
+                    should_flatten = bool(chat_columns) and not no_transform
+                    if column_map or should_flatten:
+                        mapped_path = local_path + ".mapped"
+                        with open(local_path, "r", encoding="utf-8", errors="replace") as f_in, \
+                             open(mapped_path, "w", encoding="utf-8") as f_out:
+                            for line in f_in:
+                                line = line.strip()
+                                if not line:
+                                    continue
+                                record = json_mod.loads(line)
+                                mapped_record = _apply_column_map(record, column_map)
+                                if should_flatten:
+                                    mapped_record = _flatten_record(mapped_record, chat_columns)
+                                f_out.write(json_mod.dumps(mapped_record, ensure_ascii=False) + "\n")
+                                num_records += 1
+                        local_path = mapped_path
+                    else:
+                        # Count records
+                        with open(local_path, "r", encoding="utf-8", errors="replace") as f:
+                            for line in f:
+                                if line.strip():
+                                    num_records += 1
+                    # Upload to S3
+                    s3_key = f"{s3_prefix}/{os.path.basename(data_file)}"
+                    s3_client.upload_file(local_path, args.output_bucket, s3_key)
+        # Use the first file's name for the S3 URI (JSONL extension for Parquet conversions)
+        first_file = data_files[0]
+        if first_file.endswith(".parquet"):
+            output_filename = os.path.splitext(os.path.basename(first_file))[0] + ".jsonl"
+        else:
+            output_filename = os.path.basename(first_file)
+        s3_uri = f"s3://{args.output_bucket}/{s3_prefix}/{output_filename}"
         _output({
             "s3_uri": s3_uri,
@@ -475,9 +1144,194 @@ def _find_data_files(repo_files, split):
     if data_jsonl:
         return sorted(data_jsonl)
+    # Final fallback: any JSONL/JSON file in the repo root (single-file datasets)
+    root_data = [f for f in repo_files if "/" not in f and (f.endswith(".jsonl") or f.endswith(".json")) and not f.startswith(".")]
+    if root_data:
+        return sorted(root_data)
     return []
+def _is_glob_pattern(pattern):
+    """Tell whether ``pattern`` contains a glob metacharacter (*, ?, [) per _GLOB_METACHAR_RE."""
+    found = _GLOB_METACHAR_RE.search(pattern)
+    return found is not None
+def _filter_data_files(data_files, pattern):
+    """Narrow a list of data files down to those selected by ``pattern``.
+    An empty or None pattern disables filtering and returns the input list.
+    A pattern containing glob metacharacters (*, ?, [) is matched with
+    fnmatch against each full relative path; any other pattern is treated
+    as a substring of the file's basename.
+    Args:
+        data_files: List of file paths from _find_data_files
+        pattern: The filter pattern string
+    Returns:
+        list: The file paths selected by the pattern, in input order
+    Raises:
+        SystemExit: via _error_exit if nothing matches (the message lists
+                    the available files)
+    """
+    if not pattern:
+        return data_files
+    if _is_glob_pattern(pattern):
+        def _selected(path):
+            return fnmatch.fnmatch(path, pattern)
+    else:
+        def _selected(path):
+            return pattern in os.path.basename(path)
+    matched = [path for path in data_files if _selected(path)]
+    if not matched:
+        # Show every candidate so the user can correct the pattern
+        bullets = [f"  • {path}" for path in data_files]
+        file_list = "\n".join(bullets)
+        _error_exit(
+            f"No files matched pattern '{pattern}'.\n\n"
+            f"Available files:\n{file_list}"
+        )
+    return matched
+def _inspect_file_schemas(data_files, dataset_id, hf_token, tmpdir,
+                          column_map, technique, no_transform):
+    """Inspect first record of each file to extract effective column sets.
+    Downloads each file into ``tmpdir``, reads only its first record, applies
+    the column map and (unless ``no_transform`` is set) chat-format
+    flattening, then records the resulting column names.
+    Args:
+        data_files: List of file paths to inspect
+        dataset_id: HF dataset identifier for downloads
+        hf_token: Authentication token
+        tmpdir: Temporary directory for downloads
+        column_map: Parsed column mapping dict
+        technique: Technique name for schema types
+        no_transform: Whether --no-transform is active
+    Returns:
+        list: [(filename, set_of_column_names), ...] for each file, in input
+              order. A file without any record is inspected as an empty
+              record ({}).
+    Raises:
+        SystemExit: via _error_exit if a Parquet file is encountered and
+                    pyarrow cannot be imported
+    """
+    from huggingface_hub import hf_hub_download
+    required_columns = _get_required_columns(technique)
+    schema_types = _get_schema_types(technique)
+    results = []
+    for data_file in data_files:
+        local_path = hf_hub_download(
+            repo_id=dataset_id,
+            filename=data_file,
+            repo_type="dataset",
+            token=hf_token,
+            local_dir=tmpdir,
+        )
+        first_record = {}
+        if data_file.endswith(".parquet"):
+            try:
+                import pyarrow.parquet as pq
+                # Stream a single one-row batch instead of loading the whole
+                # table and splitting it into one batch per row. The handle is
+                # opened here so it is closed deterministically before tmpdir
+                # is cleaned up.
+                with open(local_path, "rb") as parquet_fh:
+                    batch_iter = pq.ParquetFile(parquet_fh).iter_batches(batch_size=1)
+                    first_batch = next(batch_iter, None)
+                    rows = first_batch.to_pylist() if first_batch is not None else []
+                # Guard against a file (or first batch) with zero rows
+                if rows:
+                    first_record = rows[0]
+            except ImportError:
+                _error_exit(
+                    "Dataset is in Parquet format but pyarrow is not installed. "
+                    "Please install: pip install pyarrow"
+                )
+        else:
+            import json as json_mod
+            # Only the first line is read; it is parsed as one JSON record
+            with open(local_path, "r", encoding="utf-8", errors="replace") as f:
+                first_line = f.readline().strip()
+                if first_line:
+                    first_record = json_mod.loads(first_line)
+        # Apply column mapping
+        mapped_record = _apply_column_map(first_record, column_map)
+        # Apply flattening if --no-transform is not active
+        if not no_transform:
+            chat_columns = _detect_chat_columns(mapped_record, required_columns, schema_types)
+            if chat_columns:
+                mapped_record = _flatten_record(mapped_record, chat_columns)
+        results.append((data_file, set(mapped_record.keys())))
+    return results
+def _check_schema_divergence(file_records, dataset_id, technique):
+    """Abort when the inspected files do not all share one effective column set.
+    Args:
+        file_records: List of (filename, first_record_columns) tuples where
+                      first_record_columns is the set of column names after
+                      column-map and flattening
+        dataset_id: The dataset identifier (for error messages)
+        technique: The technique name (for error messages)
+    Returns:
+        None when there are no records or every column set equals the
+        first file's column set
+    Raises:
+        SystemExit: via _error_exit with per-file column listing and
+                    ?file= remediation suggestion if schemas differ
+    """
+    if not file_records:
+        return None
+    # The first file acts as the reference schema for all the others
+    reference_file, reference_columns = file_records[0][0], file_records[0][1]
+    diverged = any(columns != reference_columns for _, columns in file_records)
+    if not diverged:
+        return None
+    # One section per file listing its columns in sorted order
+    file_listing = "\n\n".join([
+        f"  \U0001f4c4 {filename}\n"
+        f"     Columns: {', '.join(sorted(columns))}"
+        for filename, columns in file_records
+    ])
+    # Suggest a pattern built from the reference file's basename: its first
+    # run of digits if it has one, otherwise the whole extension-less name
+    import re as _re
+    stem = os.path.splitext(os.path.basename(reference_file))[0]
+    digits = _re.search(r'\d+', stem)
+    token = digits.group() if digits else stem
+    pattern_suggestion = f"*{token}*"
+    available_files = "\n".join(
+        [f"     \u2022 {filename}" for filename, _ in file_records]
+    )
+    message = (
+        f"Schema divergence detected in dataset {dataset_id}.\n"
+        f"Files have different columns after applying column-map and transforms:\n\n"
+        f"{file_listing}\n\n"
+        f"\U0001f4a1 Use ?file=<pattern> to select compatible files:\n"
+        f"   ./do/tune --technique {technique} --dataset hf://{dataset_id}?file={pattern_suggestion}\n\n"
+        f"   Available files:\n{available_files}"
+    )
+    _error_exit(message)
 # ── Subcommand: validate ──────────────────────────────────────────────────────
@@ -648,6 +1502,53 @@ def _build_expected_format(schema):
     return "Each line must be a JSON object with: {" + ", ".join(fields) + "}"
+# ── Subcommand: discover ──────────────────────────────────────────────────────
+def cmd_discover(args):
+    """Query JumpStart Hub for tune-eligible models matching a family.
+    The Hub content name filter comes from --filter when it is given;
+    otherwise it is looked up from --family in FAMILY_PREFIX_MAP. The
+    region falls back to the AWS_REGION environment variable and then to
+    us-east-1. Only hub contents whose status is 'Available' are counted.
+    Returns: {"models": [str], "count": int}
+    "models" holds at most the first five matches; "count" is the total
+    number of matches.
+    Calls _error_exit when no filter can be determined or the Hub query fails.
+    """
+    import boto3
+    region = args.region or os.environ.get('AWS_REGION', 'us-east-1')
+    family = args.family or ""
+    # Map family names to Hub content name prefixes
+    FAMILY_PREFIX_MAP = {
+        "qwen-2.5": "huggingface-llm-qwen2-5",
+        "qwen-3": "huggingface-reasoning-qwen3",
+        "llama-3": "meta-textgeneration-llama-3",
+        "deepseek-r1": "deepseek-llm-r1-distill",
+        "gpt-oss": "openai-reasoning-gpt-oss",
+    }
+    # An explicit --filter takes precedence over the family mapping, as the
+    # --filter help text states ("overrides family mapping").
+    prefix = args.filter or FAMILY_PREFIX_MAP.get(family, "")
+    if not prefix:
+        if family:
+            # A family was supplied but is not in the mapping: say so instead
+            # of claiming that nothing was provided
+            known_families = ", ".join(sorted(FAMILY_PREFIX_MAP))
+            _error_exit(
+                f"Unknown model family '{family}' and no filter provided. "
+                f"Known families: {known_families}"
+            )
+        else:
+            _error_exit("No family or filter provided for discovery")
+    try:
+        client = boto3.client("sagemaker", region_name=region)
+        models = []
+        paginator = client.get_paginator('list_hub_contents')
+        pages = paginator.paginate(
+            HubName="SageMakerPublicHub",
+            HubContentType="Model",
+            NameContains=prefix,
+            MaxResults=20
+        )
+        for page in pages:
+            for item in page.get('HubContentSummaries', []):
+                if item.get('HubContentStatus') == 'Available':
+                    models.append(item['HubContentName'])
+        # Report only the first five names but the full match count
+        _output({"models": models[:5], "count": len(models)})
+    except Exception as e:
+        _error_exit(f"Hub discovery failed: {e}")
 # ── CLI argument parsing ──────────────────────────────────────────────────────
@@ -661,6 +1562,8 @@ def main():
     # ── submit ────────────────────────────────────────────────────────────────
     submit_parser = subparsers.add_parser("submit", help="Submit a customization job")
     submit_parser.add_argument("--model-id", required=True, help="Model ID")
+    submit_parser.add_argument("--region", default=None,
+                               help="AWS region (defaults to AWS_REGION env var)")
     submit_parser.add_argument("--technique", required=True,
                                choices=["sft", "dpo", "rlaif", "rlvr"],
                                help="Customization technique")
@@ -695,6 +1598,8 @@ def main():
                                help="Lambda ARN for reward function (RLVR)")
     submit_parser.add_argument("--reward-prompt", default=None,
                                help="S3 URI for reward prompt (RLAIF)")
+    submit_parser.add_argument("--accept-eula", action="store_true", default=False,
+                               help="Accept model EULA for gated models (e.g., Llama)")
     # ── status ────────────────────────────────────────────────────────────────
     status_parser = subparsers.add_parser("status", help="Get job status and metrics")
@@ -725,6 +1630,8 @@ def main():
                                  help="Hugging Face dataset name")
     stage_hf_parser.add_argument("--hf-split", default="train",
                                  help="Dataset split (default: train)")
+    stage_hf_parser.add_argument("--hf-file", default=None,
+                                 help="File filter pattern (glob or substring)")
     stage_hf_parser.add_argument("--output-bucket", required=True,
                                  help="S3 bucket for staged dataset")
     stage_hf_parser.add_argument("--project-name", required=True,
@@ -733,6 +1640,13 @@ def main():
                                  help="AWS region")
     stage_hf_parser.add_argument("--hf-secret-name", default=None,
                                  help="Secrets Manager secret name for HF token")
+    stage_hf_parser.add_argument("--column-map", default=None,
+                                 help="Column mapping (e.g., prompt=question,completion=answer)")
+    stage_hf_parser.add_argument("--technique", default="sft",
+                                 choices=["sft", "dpo", "rlaif", "rlvr"],
+                                 help="Customization technique (determines required columns)")
+    stage_hf_parser.add_argument("--no-transform", action="store_true", default=False,
+                                 help="Disable automatic chat-format flattening")
     # ── validate ──────────────────────────────────────────────────────────────
     validate_parser = subparsers.add_parser("validate",
@@ -742,6 +1656,16 @@ def main():
     validate_parser.add_argument("--file", default="-",
                                  help="Path to dataset file (default: stdin)")
+    # ── discover ──────────────────────────────────────────────────────────────
+    discover_parser = subparsers.add_parser("discover",
+                                            help="Discover tune-eligible models from JumpStart Hub")
+    discover_parser.add_argument("--family", default="",
+                                 help="Model family name (e.g., qwen-3, llama-3, deepseek-r1)")
+    discover_parser.add_argument("--filter", default="",
+                                 help="Hub content name prefix filter (overrides family mapping)")
+    discover_parser.add_argument("--region", default="",
+                                 help="AWS region (default: AWS_REGION env or us-east-1)")
     # ── Parse and dispatch ────────────────────────────────────────────────────
     args = parser.parse_args()
@@ -755,6 +1679,7 @@ def main():
         "resolve": cmd_resolve,
         "stage-hf": cmd_stage_hf,
         "validate": cmd_validate,
+        "discover": cmd_discover,
     }
     handler = command_map.get(args.command)