npm - @aws/ml-container-creator - Versions diffs - 0.13.4 → 0.15.0 - Mend

@aws/ml-container-creator 0.13.4 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

package/README.md +23 -5
package/config/parameter-schema-v2.json +32 -4
package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
package/infra/ci-harness/package-lock.json +122 -116
package/infra/ci-harness/package.json +1 -1
package/package.json +5 -3
package/pyproject.toml +21 -0
package/requirements.txt +19 -0
package/servers/instance-sizer/index.js +72 -4
package/servers/instance-sizer/lib/model-resolver.js +28 -2
package/src/app.js +17 -0
package/src/lib/bootstrap-command-handler.js +33 -23
package/src/lib/config-loader.js +18 -0
package/src/lib/config-manager.js +6 -1
package/src/lib/dataset-slug.js +152 -0
package/src/lib/generated/cli-options.js +9 -3
package/src/lib/generated/parameter-matrix.js +14 -3
package/src/lib/generated/validation-rules.js +1 -1
package/src/lib/mcp-query-runner.js +6 -0
package/src/lib/prompt-runner.js +5 -0
package/src/lib/prompts/feature-prompts.js +1 -1
package/src/lib/template-manager.js +0 -7
package/src/lib/template-variable-resolver.js +51 -1
package/src/lib/tune-config-state.js +14 -1
package/templates/do/.adapter_helper.py +451 -0
package/templates/do/.benchmark_writer.py +22 -0
package/templates/do/.register_helper.py +1163 -0
package/templates/do/.stage_helper.py +419 -0
package/templates/do/.tune_helper.py +379 -65
package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
package/templates/do/adapter +427 -27
package/templates/do/add-ic +85 -3
package/templates/do/benchmark +173 -15
package/templates/do/config +24 -0
package/templates/do/lib/inference-component.sh +56 -3
package/templates/do/lib/profile.sh +5 -0
package/templates/do/register +552 -6
package/templates/do/stage +91 -272
package/templates/do/test +12 -2
package/templates/do/tune +264 -12

package/templates/do/.tune_helper.py CHANGED

@@ -29,6 +29,12 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", message=".*urllib3.*")
 warnings.filterwarnings("ignore", message=".*charset_normalizer.*")
+# Suppress ALL logging to prevent sagemaker-core/rich from writing to stdout.
+# This script outputs JSON on stdout — any other stdout output corrupts parsing.
+import logging as _logging
+_logging.disable(_logging.CRITICAL)
+os.environ.setdefault("SAGEMAKER_LOG_LEVEL", "CRITICAL")
 # ── Inline dependency check ───────────────────────────────────────────────────
 MIN_SAGEMAKER_VERSION = "3.0"
@@ -71,6 +77,164 @@ def _output(data):
     sys.exit(0)
+def _sanitize_for_json(value):
+    """Convert sagemaker-core Unassigned sentinel values to None for JSON serialization.
+    sagemaker-core uses an 'Unassigned' type instead of None for unset fields.
+    This function converts any non-standard types to JSON-safe values.
+    """
+    if value is None:
+        return None
+    # Check for Unassigned type from sagemaker-core
+    type_name = type(value).__name__
+    if type_name == "Unassigned" or type_name == "UnassignedValue":
+        return None
+    if isinstance(value, (str, int, float, bool)):
+        return value
+    if isinstance(value, dict):
+        return {k: _sanitize_for_json(v) for k, v in value.items()}
+    if isinstance(value, list):
+        return [_sanitize_for_json(v) for v in value]
+    # For other types, try str conversion as fallback
+    try:
+        # Check if it's JSON serializable as-is
+        import json as _json
+        _json.dumps(value)
+        return value
+    except (TypeError, ValueError):
+        return str(value) if value else None
+# ── Registry resolution helpers ───────────────────────────────────────────────
+def _resolve_dataset_name(dataset_name):
+    """Resolve a registered dataset name to S3 URI (or ARN) via .register_helper.py.
+    Calls the resolve-dataset subcommand of .register_helper.py and returns
+    the resolved value. If the response contains an 'arn' field (Backlog #023,
+    AI Registry mode), returns the ARN for use with SFTTrainer(training_dataset=arn).
+    Otherwise returns the S3 URI for backward compatibility.
+    """
+    import subprocess
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    helper_path = os.path.join(script_dir, ".register_helper.py")
+    if not os.path.exists(helper_path):
+        _error_exit(
+            f"Cannot resolve dataset '{dataset_name}': .register_helper.py not found. "
+            f"Register datasets first with: ./do/register --dataset"
+        )
+    try:
+        result = subprocess.run(
+            ["python3", helper_path, "resolve-dataset", "--name", dataset_name],
+            capture_output=True, text=True, timeout=30
+        )
+    except subprocess.TimeoutExpired:
+        _error_exit(f"Timeout resolving dataset '{dataset_name}' from registry")
+    except Exception as e:
+        _error_exit(f"Failed to resolve dataset '{dataset_name}': {e}")
+    if result.returncode != 0:
+        _error_exit(
+            f"Dataset '{dataset_name}' not found in registry. "
+            f"Register it first: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
+        )
+    # Parse JSON output from resolve-dataset
+    try:
+        output = json.loads(result.stdout.strip())
+    except (json.JSONDecodeError, ValueError):
+        _error_exit(
+            f"Failed to parse registry response for dataset '{dataset_name}'. "
+            f"Raw output: {result.stdout[:200]}"
+        )
+    if "error" in output:
+        _error_exit(
+            f"Dataset '{dataset_name}' not found in registry: {output['error']}. "
+            f"Register it first: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
+        )
+    # Prefer ARN if available (Backlog #023 — AI Registry mode)
+    # When arn is present, use it directly with SFTTrainer(training_dataset=arn)
+    arn = output.get("arn")
+    if arn:
+        return arn
+    # Fallback: use S3 URI
+    s3_uri = output.get("s3_uri", "")
+    if not s3_uri:
+        _error_exit(
+            f"Dataset '{dataset_name}' resolved but has no S3 URI or ARN. "
+            f"Re-register with: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
+        )
+    return s3_uri
+def _resolve_evaluator_name(evaluator_name):
+    """Resolve a registered evaluator name to type and ARN/URI via .register_helper.py.
+    Returns (evaluator_type, arn_or_uri) tuple.
+    evaluator_type is "lambda" for RLVR or "model" for RLAIF.
+    """
+    import subprocess
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    helper_path = os.path.join(script_dir, ".register_helper.py")
+    if not os.path.exists(helper_path):
+        _error_exit(
+            f"Cannot resolve evaluator '{evaluator_name}': .register_helper.py not found. "
+            f"Register evaluators first with: ./do/register --evaluator"
+        )
+    try:
+        result = subprocess.run(
+            ["python3", helper_path, "resolve-evaluator", "--name", evaluator_name],
+            capture_output=True, text=True, timeout=30
+        )
+    except subprocess.TimeoutExpired:
+        _error_exit(f"Timeout resolving evaluator '{evaluator_name}' from registry")
+    except Exception as e:
+        _error_exit(f"Failed to resolve evaluator '{evaluator_name}': {e}")
+    if result.returncode != 0:
+        _error_exit(
+            f"Evaluator '{evaluator_name}' not found in registry. "
+            f"Register it first: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
+        )
+    # Parse JSON output from resolve-evaluator
+    try:
+        output = json.loads(result.stdout.strip())
+    except (json.JSONDecodeError, ValueError):
+        _error_exit(
+            f"Failed to parse registry response for evaluator '{evaluator_name}'. "
+            f"Raw output: {result.stdout[:200]}"
+        )
+    if "error" in output:
+        _error_exit(
+            f"Evaluator '{evaluator_name}' not found in registry: {output['error']}. "
+            f"Register it first: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
+        )
+    ev_type = output.get("type", "")
+    arn_or_uri = output.get("arn_or_uri", "")
+    if not arn_or_uri:
+        _error_exit(
+            f"Evaluator '{evaluator_name}' resolved but has no ARN/URI. "
+            f"Re-register with: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
+        )
+    return ev_type, arn_or_uri
 # ── Subcommand: submit ────────────────────────────────────────────────────────
@@ -90,6 +254,26 @@ def cmd_submit(args):
         os.environ["AWS_DEFAULT_REGION"] = region
         os.environ.setdefault("AWS_REGION", region)
+    # ── Resolve --dataset-name from registry (AC-2b.4) ────────────────────────
+    # --dataset-s3-uri wins if both are provided (backward compatible override)
+    if not args.dataset_s3_uri and args.dataset_name:
+        resolved_uri = _resolve_dataset_name(args.dataset_name)
+        args.dataset_s3_uri = resolved_uri
+    elif not args.dataset_s3_uri and not args.dataset_name:
+        _error_exit(
+            "Either --dataset-s3-uri or --dataset-name is required. "
+            "Provide an S3 URI directly or a registered dataset name."
+        )
+    # ── Resolve --evaluator-name from registry (AC-2c.3, AC-2c.4) ────────────
+    # --reward-function / --reward-prompt win if provided (backward compatible override)
+    if args.evaluator_name and not args.reward_function and not args.reward_prompt:
+        ev_type, ev_arn_or_uri = _resolve_evaluator_name(args.evaluator_name)
+        if ev_type == "lambda":
+            args.reward_function = ev_arn_or_uri
+        else:
+            args.reward_prompt = ev_arn_or_uri
     _check_sagemaker_sdk()
     # SDK v3 moved trainers from sagemaker.modules.train → sagemaker.train
@@ -171,20 +355,25 @@ def cmd_submit(args):
                 trainer_kwargs["accept_eula"] = True
             # Resolve model package group — create if it doesn't exist
+            # Using sagemaker-core ModelPackageGroup.create() per SDK v3 policy
             mpg_name = args.model_package_group or f"{args.project_name}-tune-models"
             try:
-                import boto3 as _boto3
-                _sm = _boto3.client("sagemaker", region_name=args.region or os.environ.get("AWS_REGION", "us-west-2"))
-                _sm.describe_model_package_group(ModelPackageGroupName=mpg_name)
-            except Exception as _mpg_err:
-                if "does not exist" in str(_mpg_err) or "ValidationException" in str(_mpg_err):
-                    try:
-                        _sm.create_model_package_group(
-                            ModelPackageGroupName=mpg_name,
-                            ModelPackageGroupDescription=f"Fine-tuned models for {args.project_name}",
-                        )
-                    except Exception:
-                        pass  # May already exist or lack permissions — let the trainer handle it
+                from sagemaker.core.resources import ModelPackageGroup
+                from botocore.exceptions import ClientError as _ClientError
+                try:
+                    ModelPackageGroup.get(model_package_group_name=mpg_name)
+                except (_ClientError, Exception) as _mpg_err:
+                    if "does not exist" in str(_mpg_err) or "ValidationException" in str(_mpg_err):
+                        try:
+                            ModelPackageGroup.create(
+                                model_package_group_name=mpg_name,
+                                model_package_group_description=f"Fine-tuned models for {args.project_name}",
+                            )
+                        except Exception:
+                            pass  # May already exist or lack permissions — let the trainer handle it
+            except ImportError:
+                # sagemaker-core not available — skip MPG creation, let trainer handle it
+                pass
             trainer_kwargs["model_package_group"] = mpg_name
             trainer = trainer_cls(**trainer_kwargs)
@@ -267,7 +456,9 @@ def cmd_submit(args):
             job_arn = job_arn or getattr(latest_job, 'arn', None)
         # If we still don't have the actual job name (SDK appends suffix),
-        # query ListTrainingJobs to find it by our base_job_name prefix
+        # query ListTrainingJobs to find it by our base_job_name prefix.
+        # Note: list_training_jobs with NameContains filter is not available
+        # via sagemaker-core resource API, so boto3 is retained here.
         if not job_name or job_name == args.job_name:
             import boto3 as _boto3
             _sm = _boto3.client("sagemaker", region_name=args.region or os.environ.get("AWS_REGION", "us-west-2"))
@@ -335,23 +526,28 @@ def cmd_submit(args):
 def cmd_status(args):
-    """Query job status via DescribeTrainingJob.
+    """Query job status via sagemaker-core TrainingJob.get().
-    Falls back to ListTrainingJobs with name-contains if exact name not found
+    Falls back to boto3 ListTrainingJobs with name-contains if exact name not found
     (SDK v3 appends a timestamp suffix to the base job name).
     Returns: {"status": str, "failure_reason": str|None,
               "metrics": dict|None, "elapsed_seconds": int}
     """
-    import boto3
+    # Set region before any sagemaker import (creates boto3 clients at import time)
+    region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
+    if region:
+        os.environ['AWS_DEFAULT_REGION'] = region
+        os.environ.setdefault('AWS_REGION', region)
-    client = boto3.client("sagemaker", region_name=args.region)
+    from sagemaker.core.resources import TrainingJob
+    from botocore.exceptions import ClientError
-    # Try exact name first
-    response = None
+    # Try exact name first via sagemaker-core
+    job = None
     try:
-        response = client.describe_training_job(TrainingJobName=args.job_name)
-    except client.exceptions.ClientError as e:
+        job = TrainingJob.get(training_job_name=args.job_name)
+    except ClientError as e:
         error_code = e.response["Error"]["Code"]
         if error_code != "ValidationException":
             _error_exit(f"Failed to describe training job: {e}")
@@ -360,8 +556,13 @@ def cmd_status(args):
         _error_exit(f"Failed to describe training job: {e}")
     # Fallback: search by name prefix (SDK appends timestamp suffix)
-    if response is None:
+    # Note: TrainingJob.get_all() with name_contains is not available in
+    # sagemaker-core for list operations, so we use boto3 list_training_jobs
+    # to find the actual name, then call TrainingJob.get() with it.
+    if job is None:
         try:
+            import boto3
+            client = boto3.client("sagemaker", region_name=args.region)
             list_response = client.list_training_jobs(
                 NameContains=args.job_name,
                 SortBy="CreationTime",
@@ -371,18 +572,26 @@ def cmd_status(args):
             summaries = list_response.get("TrainingJobSummaries", [])
             if summaries:
                 actual_name = summaries[0]["TrainingJobName"]
-                response = client.describe_training_job(TrainingJobName=actual_name)
+                job = TrainingJob.get(training_job_name=actual_name)
             else:
                 _error_exit(f"Training job not found: {args.job_name}")
         except Exception as e:
             _error_exit(f"Failed to find training job: {e}")
-    status = response.get("TrainingJobStatus", "Unknown")
-    failure_reason = response.get("FailureReason")
+    # Read status attributes directly from the TrainingJob resource object.
+    # sagemaker-core returns status values in the same casing as the API
+    # (e.g., "InProgress", "Completed", "Failed", "Stopped").
+    status = getattr(job, "training_job_status", "Unknown") or "Unknown"
+    failure_reason = getattr(job, "failure_reason", None)
     # Calculate elapsed time
-    start_time = response.get("TrainingStartTime")
-    end_time = response.get("TrainingEndTime")
+    start_time = getattr(job, "training_start_time", None)
+    end_time = getattr(job, "training_end_time", None)
+    # Convert Unassigned sentinel to None
+    if start_time and type(start_time).__name__ in ("Unassigned", "UnassignedValue"):
+        start_time = None
+    if end_time and type(end_time).__name__ in ("Unassigned", "UnassignedValue"):
+        end_time = None
     elapsed_seconds = 0
     if start_time:
@@ -393,24 +602,30 @@ def cmd_status(args):
     # Extract final metrics if available
     metrics = None
-    final_metrics = response.get("FinalMetricDataList")
+    final_metrics = getattr(job, "final_metric_data_list", None)
+    if final_metrics and type(final_metrics).__name__ in ("Unassigned", "UnassignedValue"):
+        final_metrics = None
     if final_metrics:
         metrics = {}
         for metric in final_metrics:
-            metrics[metric["MetricName"]] = metric["Value"]
+            # sagemaker-core returns metrics as objects with snake_case attributes
+            metric_name = getattr(metric, "metric_name", None) or metric.get("MetricName", "")
+            metric_value = getattr(metric, "value", None) or metric.get("Value", 0)
+            metrics[metric_name] = metric_value
     # Get output path if completed
     output_path = None
     if status == "Completed":
-        model_artifacts = response.get("ModelArtifacts", {})
-        output_path = model_artifacts.get("S3ModelArtifacts")
+        model_artifacts = getattr(job, "model_artifacts", None)
+        if model_artifacts:
+            output_path = getattr(model_artifacts, "s3_model_artifacts", None)
     _output({
-        "status": status,
-        "failure_reason": failure_reason,
-        "metrics": metrics,
+        "status": _sanitize_for_json(status),
+        "failure_reason": _sanitize_for_json(failure_reason),
+        "metrics": _sanitize_for_json(metrics),
         "elapsed_seconds": elapsed_seconds,
-        "output_path": output_path,
+        "output_path": _sanitize_for_json(output_path),
     })
@@ -420,28 +635,37 @@ def cmd_status(args):
 def cmd_resolve(args):
     """Resolve artifact path within S3 output directory.
+    Uses sagemaker-core TrainingJob.get() to read model_artifacts and
+    output_data_config. Uses ModelPackage for model package lookup.
     Returns: {"artifact_path": str, "model_package_arn": str|None,
               "output_type": str}
     """
-    import boto3
+    # Set region before any sagemaker import (creates boto3 clients at import time)
+    region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
+    if region:
+        os.environ['AWS_DEFAULT_REGION'] = region
+        os.environ.setdefault('AWS_REGION', region)
-    client = boto3.client("sagemaker", region_name=args.region)
+    from sagemaker.core.resources import TrainingJob
     try:
-        response = client.describe_training_job(TrainingJobName=args.job_name)
+        job = TrainingJob.get(training_job_name=args.job_name)
     except Exception as e:
         _error_exit(f"Failed to describe training job: {e}")
-    status = response.get("TrainingJobStatus")
+    status = getattr(job, "training_job_status", None)
     if status != "Completed":
         _error_exit(
             f"Cannot resolve artifacts for job in status: {status}. "
             f"Job must be Completed."
         )
-    # Get the S3 model artifacts path
-    model_artifacts = response.get("ModelArtifacts", {})
-    artifact_path = model_artifacts.get("S3ModelArtifacts", "")
+    # Get the S3 model artifacts path from TrainingJob resource
+    model_artifacts = getattr(job, "model_artifacts", None)
+    artifact_path = ""
+    if model_artifacts:
+        artifact_path = getattr(model_artifacts, "s3_model_artifacts", "") or ""
     if not artifact_path:
         _error_exit("No model artifacts found in training job output.")
@@ -461,6 +685,9 @@ def cmd_resolve(args):
     model_package_arn = None
     if args.model_package_group:
         try:
+            # Use boto3 for list_model_packages since sagemaker-core ModelPackage
+            # doesn't have a direct list-by-group method with sort/limit
+            import boto3
             mp_client = boto3.client("sagemaker", region_name=args.region)
             packages = mp_client.list_model_packages(
                 ModelPackageGroupName=args.model_package_group,
@@ -476,8 +703,8 @@ def cmd_resolve(args):
             pass
     _output({
-        "artifact_path": artifact_path,
-        "model_package_arn": model_package_arn,
+        "artifact_path": _sanitize_for_json(artifact_path),
+        "model_package_arn": _sanitize_for_json(model_package_arn),
         "output_type": output_type,
     })
@@ -765,11 +992,12 @@ def _get_schema_types(technique):
     return schemas.get(technique, {"prompt": "string", "completion": "string"})
-def _validate_dataset_columns(first_record, technique, column_map_str, dataset_id):
+def _validate_dataset_columns(first_record, technique, column_map_str, dataset_id, take=None):
     """Validate that the first record has required columns after mapping.
     Returns (mapped_record, column_map_dict) on success.
     Calls _error_exit with helpful suggestions on failure.
+    If take is provided, includes --take N in the suggested command.
     """
     column_map = _parse_column_map(column_map_str)
     mapped = _apply_column_map(first_record, column_map)
@@ -794,12 +1022,14 @@ def _validate_dataset_columns(first_record, technique, column_map_str, dataset_i
     if suggestion:
         lines.append(f"")
         lines.append(f"   💡 Suggested fix:")
-        lines.append(f"      ./do/tune --technique {technique} --dataset hf://{dataset_id} --column-map {suggestion}")
+        take_suffix = f" --take {take}" if take else ""
+        lines.append(f"      ./do/tune --technique {technique} --dataset hf://{dataset_id} --column-map {suggestion}{take_suffix}")
     else:
         lines.append(f"")
         lines.append(f"   💡 Use --column-map to rename columns:")
         example_map = ",".join(f"{r}=<your_column>" for r in missing)
-        lines.append(f"      ./do/tune --technique {technique} --dataset hf://{dataset_id} --column-map {example_map}")
+        take_suffix = f" --take {take}" if take else ""
+        lines.append(f"      ./do/tune --technique {technique} --dataset hf://{dataset_id} --column-map {example_map}{take_suffix}")
     lines.append(f"")
     lines.append(f"   First record sample:")
@@ -811,6 +1041,16 @@ def _validate_dataset_columns(first_record, technique, column_map_str, dataset_i
     _error_exit("\n".join(lines))
+def _check_empty_fields(record, required_columns):
+    """Return list of required column names that are empty/blank in this record."""
+    empty = []
+    for col in required_columns:
+        value = record.get(col, "")
+        if value is None or (isinstance(value, str) and not value.strip()):
+            empty.append(col)
+    return empty
 def cmd_stage_hf(args):
     """Download HF dataset to S3 using huggingface_hub.
@@ -854,21 +1094,35 @@ def cmd_stage_hf(args):
         # Find the appropriate data file for the split
         data_files = _find_data_files(repo_files, split)
+        # Apply file filter if --hf-file is provided
+        hf_file_pattern = getattr(args, 'hf_file', None)
+        if not data_files and hf_file_pattern:
+            # Split-based lookup found nothing, but user specified a file filter.
+            # Fall back to filtering directly from all data files in the repo.
+            all_data_files = [
+                f for f in repo_files
+                if f.endswith(('.parquet', '.jsonl', '.json'))
+                and not f.startswith('.')
+            ]
+            if all_data_files:
+                data_files = _filter_data_files(all_data_files, hf_file_pattern)
+        elif hf_file_pattern and data_files:
+            # Normal case: apply file filter to split-matched results
+            data_files = _filter_data_files(data_files, hf_file_pattern)
         if not data_files:
             _error_exit(
                 f"No data files found for split '{split}' in dataset {dataset_id}. "
                 f"Available files: {', '.join(repo_files[:20])}"
             )
-        # Apply file filter if --hf-file is provided
-        hf_file_pattern = getattr(args, 'hf_file', None)
-        if hf_file_pattern:
-            data_files = _filter_data_files(data_files, hf_file_pattern)
         # Download and upload to S3
         s3_client = boto3.client("s3", region_name=args.region)
         s3_prefix = f"{args.project_name}/datasets/{org}/{name}/{split}"
         num_records = 0
+        empty_field_counts = {}  # Track empty required fields: {field_name: count}
         with tempfile.TemporaryDirectory() as tmpdir:
             # Schema divergence check (skip for single file)
@@ -907,7 +1161,7 @@ def cmd_stage_hf(args):
                         no_transform = getattr(args, 'no_transform', False)
                         batches = table.to_batches(max_chunksize=1)
                         first_record = batches[0].to_pylist()[0] if batches else {}
-                        _validate_dataset_columns(first_record, technique, getattr(args, 'column_map', None), f"{org}/{name}")
+                        _validate_dataset_columns(first_record, technique, getattr(args, 'column_map', None), f"{org}/{name}", take=getattr(args, 'take', None))
                         # Apply column map to first record for detection
                         mapped_first = _apply_column_map(first_record, column_map)
@@ -944,16 +1198,31 @@ def cmd_stage_hf(args):
                                 f"   Detected format: {strategy_desc}"
                             )
+                        take_limit = getattr(args, 'take', None)
                         with open(jsonl_path, "w", encoding="utf-8") as out_f:
                             for batch in table.to_batches():
                                 for row in batch.to_pylist():
+                                    if take_limit and num_records >= take_limit:
+                                        break
                                     mapped_row = _apply_column_map(row, column_map)
                                     if chat_columns and not no_transform:
                                         mapped_row = _flatten_record(mapped_row, chat_columns)
+                                    # Track empty required fields
+                                    for col in _check_empty_fields(mapped_row, required_columns):
+                                        empty_field_counts[col] = empty_field_counts.get(col, 0) + 1
                                     out_f.write(json_mod.dumps(mapped_row, ensure_ascii=False) + "\n")
                                     num_records += 1
+                                if take_limit and num_records >= take_limit:
+                                    break
                         # Upload converted JSONL
+                        # Verify file has content before uploading
+                        file_size = os.path.getsize(jsonl_path)
+                        if file_size == 0:
+                            _error_exit(
+                                f"Converted JSONL file is empty (0 bytes) after processing "
+                                f"{num_records} records. This is a bug — please report it."
+                            )
                         s3_key = f"{s3_prefix}/{jsonl_filename}"
                         s3_client.upload_file(jsonl_path, args.output_bucket, s3_key)
@@ -975,7 +1244,7 @@ def cmd_stage_hf(args):
                         first_line = f.readline().strip()
                         if first_line:
                             first_record = json_mod.loads(first_line)
-                            _validate_dataset_columns(first_record, technique, getattr(args, 'column_map', None), f"{org}/{name}")
+                            _validate_dataset_columns(first_record, technique, getattr(args, 'column_map', None), f"{org}/{name}", take=getattr(args, 'take', None))
                             # Apply column map to first record for detection
                             mapped_first = _apply_column_map(first_record, column_map)
@@ -1014,11 +1283,14 @@ def cmd_stage_hf(args):
                     # Rewrite the file with mapped (and optionally flattened) columns
                     should_flatten = bool(chat_columns) and not no_transform
-                    if column_map or should_flatten:
+                    take_limit = getattr(args, 'take', None)
+                    if column_map or should_flatten or take_limit:
                         mapped_path = local_path + ".mapped"
                         with open(local_path, "r", encoding="utf-8", errors="replace") as f_in, \
                              open(mapped_path, "w", encoding="utf-8") as f_out:
                             for line in f_in:
+                                if take_limit and num_records >= take_limit:
+                                    break
                                 line = line.strip()
                                 if not line:
                                     continue
@@ -1026,15 +1298,32 @@ def cmd_stage_hf(args):
                                 mapped_record = _apply_column_map(record, column_map)
                                 if should_flatten:
                                     mapped_record = _flatten_record(mapped_record, chat_columns)
+                                # Track empty required fields
+                                for col in _check_empty_fields(mapped_record, _get_required_columns(technique)):
+                                    empty_field_counts[col] = empty_field_counts.get(col, 0) + 1
                                 f_out.write(json_mod.dumps(mapped_record, ensure_ascii=False) + "\n")
                                 num_records += 1
                         local_path = mapped_path
                     else:
-                        # Count records
-                        with open(local_path, "r", encoding="utf-8", errors="replace") as f:
-                            for line in f:
-                                if line.strip():
-                                    num_records += 1
+                        # Count records (and truncate if --take specified)
+                        take_limit = getattr(args, 'take', None)
+                        if take_limit:
+                            # Need to rewrite the file truncated
+                            mapped_path = local_path + ".mapped"
+                            with open(local_path, "r", encoding="utf-8", errors="replace") as f_in, \
+                                 open(mapped_path, "w", encoding="utf-8") as f_out:
+                                for line in f_in:
+                                    if num_records >= take_limit:
+                                        break
+                                    if line.strip():
+                                        f_out.write(line)
+                                        num_records += 1
+                            local_path = mapped_path
+                        else:
+                            with open(local_path, "r", encoding="utf-8", errors="replace") as f:
+                                for line in f:
+                                    if line.strip():
+                                        num_records += 1
                     # Upload to S3
                     s3_key = f"{s3_prefix}/{os.path.basename(data_file)}"
@@ -1048,6 +1337,19 @@ def cmd_stage_hf(args):
             output_filename = os.path.basename(first_file)
         s3_uri = f"s3://{args.output_bucket}/{s3_prefix}/{output_filename}"
+        # Warn if required columns have many empty values
+        if num_records > 0 and empty_field_counts:
+            for field, count in empty_field_counts.items():
+                pct = (count / num_records) * 100
+                if pct > 30:
+                    print(
+                        f"\u26a0\ufe0f  Warning: {pct:.0f}% of records ({count}/{num_records}) "
+                        f"have empty '{field}' after column mapping.\n"
+                        f"   SageMaker may reject these as invalid samples.\n"
+                        f"   Consider using a different --column-map or dataset.",
+                        file=sys.stderr,
+                    )
         _output({
             "s3_uri": s3_uri,
             "num_records": num_records,
@@ -1124,12 +1426,12 @@ def _find_data_files(repo_files, split):
         if pattern in repo_files:
             return [pattern]
-    # Prefix match for sharded files
-    matches = []
+    # Prefix match for sharded files (deduplicate via set)
+    matches = set()
     for f in repo_files:
         for pattern in patterns[4:]:
             if pattern in f:
-                matches.append(f)
+                matches.add(f)
     if matches:
         return sorted(matches)
@@ -1508,6 +1810,10 @@ def _build_expected_format(schema):
 def cmd_discover(args):
     """Query JumpStart Hub for tune-eligible models matching a family.
+    NOTE: This subcommand intentionally stays on boto3.client('sagemaker')
+    because list_hub_contents / Hub API is NOT available in sagemaker-core.
+    This is a documented exception per the SDK v3 migration policy.
     Returns: {"models": [str], "count": int}
     """
     region = args.region or os.environ.get('AWS_REGION', 'us-east-1')
@@ -1532,6 +1838,8 @@ def cmd_discover(args):
         _error_exit("Hub discovery failed: boto3 is not installed. Install with: pip install boto3")
     try:
+        # Documented exception: Hub API (list_hub_contents) is not available in
+        # sagemaker-core, so we retain boto3.client('sagemaker') here.
         client = boto3.client("sagemaker", region_name=region)
         models = []
         paginator = client.get_paginator('list_hub_contents')
@@ -1573,8 +1881,10 @@ def main():
     submit_parser.add_argument("--training-type", required=True,
                                choices=["lora", "full-rank"],
                                help="Training type (lora or full-rank)")
-    submit_parser.add_argument("--dataset-s3-uri", required=True,
-                               help="S3 URI of the training dataset")
+    submit_parser.add_argument("--dataset-s3-uri", required=False, default=None,
+                               help="S3 URI of the training dataset (direct override)")
+    submit_parser.add_argument("--dataset-name", default=None,
+                               help="Registered dataset name to resolve from registry")
     submit_parser.add_argument("--output-bucket", required=True,
                                help="S3 bucket for output artifacts")
     submit_parser.add_argument("--role-arn", required=True,
@@ -1601,6 +1911,8 @@ def main():
                                help="Lambda ARN for reward function (RLVR)")
     submit_parser.add_argument("--reward-prompt", default=None,
                                help="S3 URI for reward prompt (RLAIF)")
+    submit_parser.add_argument("--evaluator-name", default=None,
+                               help="Registered evaluator name to resolve from registry")
     submit_parser.add_argument("--accept-eula", action="store_true", default=False,
                                help="Accept model EULA for gated models (e.g., Llama)")
@@ -1650,6 +1962,8 @@ def main():
                                  help="Customization technique (determines required columns)")
     stage_hf_parser.add_argument("--no-transform", action="store_true", default=False,
                                  help="Disable automatic chat-format flattening")
+    stage_hf_parser.add_argument("--take", type=int, default=None,
+                                 help="Take only the first N records from the dataset")
     # ── validate ──────────────────────────────────────────────────────────────
     validate_parser = subparsers.add_parser("validate",