workbench 0.8.205-py3-none-any.whl → 0.8.212-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. workbench/algorithms/models/noise_model.py +388 -0
  2. workbench/api/endpoint.py +3 -6
  3. workbench/api/feature_set.py +1 -1
  4. workbench/api/model.py +5 -11
  5. workbench/cached/cached_model.py +4 -4
  6. workbench/core/artifacts/endpoint_core.py +57 -145
  7. workbench/core/artifacts/model_core.py +21 -19
  8. workbench/core/transforms/features_to_model/features_to_model.py +2 -2
  9. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
  10. workbench/model_script_utils/model_script_utils.py +335 -0
  11. workbench/model_script_utils/pytorch_utils.py +395 -0
  12. workbench/model_script_utils/uq_harness.py +278 -0
  13. workbench/model_scripts/chemprop/chemprop.template +289 -666
  14. workbench/model_scripts/chemprop/generated_model_script.py +292 -669
  15. workbench/model_scripts/chemprop/model_script_utils.py +335 -0
  16. workbench/model_scripts/chemprop/requirements.txt +2 -10
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
  18. workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
  19. workbench/model_scripts/pytorch_model/pytorch.template +350 -607
  20. workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
  21. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  22. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  23. workbench/model_scripts/script_generation.py +2 -5
  24. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  25. workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
  26. workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  28. workbench/model_scripts/xgb_model/xgb_model.template +344 -407
  29. workbench/scripts/training_test.py +85 -0
  30. workbench/utils/chemprop_utils.py +18 -656
  31. workbench/utils/metrics_utils.py +172 -0
  32. workbench/utils/model_utils.py +104 -47
  33. workbench/utils/pytorch_utils.py +32 -472
  34. workbench/utils/xgboost_local_crossfold.py +267 -0
  35. workbench/utils/xgboost_model_utils.py +49 -356
  36. workbench/web_interface/components/plugins/model_details.py +30 -68
  37. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/METADATA +5 -5
  38. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/RECORD +42 -31
  39. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/entry_points.txt +1 -0
  40. workbench/model_scripts/uq_models/mapie.template +0 -605
  41. workbench/model_scripts/uq_models/requirements.txt +0 -1
  42. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/WHEEL +0 -0
  43. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/licenses/LICENSE +0 -0
  44. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/top_level.txt +0 -0
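The bulk of this release is a rewrite of the model script templates: the pytorch-tabular dependency is dropped in favor of a hand-rolled TabularMLP plus shared helper modules (model_script_utils, pytorch_utils, uq_harness), and the MAPIE-based uq_models template is removed in favor of the shared uq_harness. The hunks below appear to come from workbench/model_scripts/pytorch_model/pytorch.template (the {{...}} template placeholders are unfilled). The training loop keeps the same K-fold pattern throughout: one model per fold, out-of-fold predictions for validation metrics, ensemble mean/std at inference. A minimal sketch of that pattern, with Ridge standing in for the per-fold network (illustrative only, not a Workbench API):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=42)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
ensemble_models = []
oof_predictions = np.full(len(y), np.nan)

for train_idx, val_idx in kfold.split(X):
    model = Ridge().fit(X[train_idx], y[train_idx])
    ensemble_models.append(model)
    # Each row is scored by the one model that never saw it (out-of-fold)
    oof_predictions[val_idx] = model.predict(X[val_idx])

# At inference, every fold model scores every row: the mean becomes
# "prediction" and the std becomes "prediction_std" (ensemble disagreement)
stacked = np.stack([m.predict(X) for m in ensemble_models], axis=0)
prediction = stacked.mean(axis=0)
prediction_std = stacked.std(axis=0)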
@@ -1,39 +1,76 @@
-# Imports for PyTorch Tabular Model
+# PyTorch Tabular Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Categorical feature embedding via TabularMLP
+# - Compressed feature decompression
+
+import argparse
+import json
 import os
+
 import awswrangler as wr
+import joblib
 import numpy as np
-
-# PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
-# Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
-os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
-from pytorch_tabular import TabularModel
-from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
-from pytorch_tabular.models import CategoryEmbeddingModelConfig
-
-# Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    median_absolute_error,
-    r2_score,
-    root_mean_squared_error,
-    precision_recall_fscore_support,
-    confusion_matrix,
-)
-from scipy.stats import spearmanr
-
-# Classification Encoder
+import pandas as pd
+import torch
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
 from sklearn.preprocessing import LabelEncoder
 
-# Scikit Learn Imports
-from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
+# Enable Tensor Core optimization for GPUs that support it
+torch.set_float32_matmul_precision("medium")
+
+from model_script_utils import (
+    check_dataframe,
+    compute_classification_metrics,
+    compute_regression_metrics,
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+    print_classification_metrics,
+    print_confusion_matrix,
+    print_regression_metrics,
+)
+from pytorch_utils import (
+    FeatureScaler,
+    create_model,
+    load_model,
+    predict,
+    prepare_data,
+    save_model,
+    train_model,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
+    save_uq_models,
+    train_uq_models,
+)
 
-from io import StringIO
-import json
-import argparse
-import joblib
-import pandas as pd
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,
+    "max_epochs": 200,
+    "early_stopping_patience": 20,
+    "batch_size": 128,
+    # Model architecture
+    "layers": "256-128-64",
+    "learning_rate": 1e-3,
+    "dropout": 0.1,
+    "use_batch_norm": True,
+    # Random seed
+    "seed": 42,
+}
 
-# Template Parameters
+# Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target": "{{target_column}}",
@@ -45,373 +82,167 @@ TEMPLATE_PARAMS = {
 }
 
 
-def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-    """
-    Check if the provided dataframe is empty and raise an exception if it is.
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
-
-
-def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
-    """
-    Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
-    Args:
-        df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (list[str]): List of class labels
-
-    Returns:
-        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-    """
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
-
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
-
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    return df
-
-
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
-
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
-
-    # Rename the DataFrame columns to match the model features
-    return df.rename(columns=rename_dict)
-
-
-def convert_categorical_types(
-    df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
-) -> tuple[pd.DataFrame, dict[str, list[str]]]:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
-
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If None or empty,
-                                            we're in training mode. If populated, we're in
-                                            inference mode.
-
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    if category_mappings is None:
-        category_mappings = {}
-
-    # Training mode
-    if not category_mappings:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()
-
-    # Inference mode
-    else:
-        for col, categories in category_mappings.items():
-            if col in df.columns:
-                print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)
-
-    return df, category_mappings
-
-
-def decompress_features(
-    df: pd.DataFrame, features: list[str], compressed_features: list[str]
-) -> tuple[pd.DataFrame, list[str]]:
-    """Prepare features for the model
-
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (list[str]): Full list of feature names
-        compressed_features (list[str]): List of feature names to decompress (bitstrings)
-
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        list[str]: Updated list of feature names after decompression
-
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-
-    # Make a copy to avoid mutating the original list
-    decompressed_features = features.copy()
-
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in decompressed_features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-        # Add to features list
-        decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
-        df = df.drop(columns=[feature])
-        df = pd.concat([df, new_df], axis=1)
-
-    return df, decompressed_features
-
-
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
 def model_fn(model_dir: str) -> dict:
-    """Load the PyTorch Tabular ensemble models from the specified directory.
-
-    Args:
-        model_dir: Directory containing the saved model(s)
-
-    Returns:
-        Dictionary with ensemble models and metadata
-    """
-    import torch
-    from functools import partial
-
-    # Load ensemble metadata if present
-    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
-    if os.path.exists(ensemble_metadata_path):
-        ensemble_metadata = joblib.load(ensemble_metadata_path)
-        n_ensemble = ensemble_metadata["n_ensemble"]
+    """Load TabularMLP ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(metadata_path):
+        metadata = joblib.load(metadata_path)
+        n_ensemble = metadata["n_ensemble"]
     else:
         n_ensemble = 1
 
-    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
-    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
-    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
-    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
-    # which internally calls torch.load without map_location
-    original_torch_load = torch.load
-    torch.load = partial(original_torch_load, map_location=map_location)
-
-    # Save current working directory
-    original_cwd = os.getcwd()
+    # Load ensemble models
     ensemble_models = []
-
-    try:
-        # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir("/tmp")
-
-        for ens_idx in range(n_ensemble):
-            # Try numbered model path first, fall back to legacy path
-            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
-            if not os.path.exists(model_path):
-                model_path = os.path.join(model_dir, "tabular_model")
-            model = TabularModel.load_model(model_path, map_location=map_location)
-            ensemble_models.append(model)
-
-    finally:
-        # Restore torch.load and working directory
-        torch.load = original_torch_load
-        os.chdir(original_cwd)
-
-    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
-
-
-def input_fn(input_data, content_type: str) -> pd.DataFrame:
-    """Parse input data and return a DataFrame."""
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
-
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
-
-    if "text/csv" in content_type:
-        return pd.read_csv(StringIO(input_data))
-    elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-    else:
-        raise ValueError(f"{content_type} not supported!")
-
-
-def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
-    """Supports both CSV and JSON output formats."""
-    if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)
-        return csv_output, "text/csv"
-    elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
-    else:
-        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"model_{i}")
+        model = load_model(model_path, device=device)
+        ensemble_models.append(model)
+
+    print(f"Loaded {len(ensemble_models)} model(s)")
+
+    # Load feature scaler
+    scaler = FeatureScaler.load(os.path.join(model_dir, "scaler.joblib"))
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "scaler": scaler,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
 
 
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
 def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model ensemble.
-
-    Args:
-        df (pd.DataFrame): The input DataFrame
-        model_dict: Dictionary containing ensemble models and metadata
-
-    Returns:
-        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
-    """
+    """Make predictions with TabularMLP ensemble."""
     model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
 
-    # Extract ensemble models
+    # Load artifacts
     ensemble_models = model_dict["ensemble_models"]
-    n_ensemble = model_dict["n_ensemble"]
+    scaler = model_dict["scaler"]
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
 
-    # Grab our feature columns (from training)
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    with open(os.path.join(model_dir, "category_mappings.json")) as f:
+        category_mappings = json.load(f)
+    with open(os.path.join(model_dir, "feature_metadata.json")) as f:
+        feature_metadata = json.load(f)
 
-    # Load the category mappings (from training)
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
+    continuous_cols = feature_metadata["continuous_cols"]
+    categorical_cols = feature_metadata["categorical_cols"]
 
-    # Load our Label Encoder if we have one
     label_encoder = None
-    label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
-    if os.path.exists(label_encoder_path):
-        label_encoder = joblib.load(label_encoder_path)
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
 
-    # Match features in a case-insensitive manner
-    matched_df = match_features_case_insensitive(df, features)
+    print(f"Model Features: {features}")
 
-    # Detect categorical types in the incoming DataFrame
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
 
-    # If we have compressed features, decompress them
     if compressed_features:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)
 
-    # Track rows with missing features
+    # Track missing features
    missing_mask = matched_df[features].isna().any(axis=1)
    if missing_mask.any():
-        print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
+        print(f"Warning: {missing_mask.sum()} rows have missing features")
 
-    # Initialize prediction columns
+    # Initialize output columns
     df["prediction"] = np.nan
     if model_type in ["regressor", "uq_regressor"]:
         df["prediction_std"] = np.nan
 
-    # Only predict on complete rows
-    complete_df = matched_df[~missing_mask]
+    complete_df = matched_df[~missing_mask].copy()
     if len(complete_df) == 0:
         print("Warning: No complete rows to predict on")
         return df
 
-    # pytorch-tabular returns predictions using f"{target}_prediction" column
-    target = TEMPLATE_PARAMS["target"]
-    prediction_column = f"{target}_prediction"
-
-    # Collect predictions from all ensemble members
-    all_ensemble_preds = []
-    all_ensemble_probs = []
-
-    for ens_idx, ens_model in enumerate(ensemble_models):
-        result = ens_model.predict(complete_df[features])
-
-        if prediction_column in result.columns:
-            ens_preds = result[prediction_column].values
-        else:
-            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-
-        all_ensemble_preds.append(ens_preds)
+    # Prepare data for inference (with standardization)
+    x_cont, x_cat, _, _, _ = prepare_data(
+        complete_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+    )
 
-        # For classification, collect probabilities
-        if label_encoder is not None:
-            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
-            if prob_cols:
-                all_ensemble_probs.append(result[prob_cols].values)
+    # Collect ensemble predictions
+    all_preds = []
+    for model in ensemble_models:
+        preds = predict(model, x_cont, x_cat)
+        all_preds.append(preds)
 
-    # Stack and compute mean/std (std is 0 for single model)
-    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    # Aggregate predictions
+    ensemble_preds = np.stack(all_preds, axis=0)
     preds = np.mean(ensemble_preds, axis=0)
-    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+    preds_std = np.std(ensemble_preds, axis=0)
 
-    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
+    print(f"Inference complete: {len(preds)} predictions, {len(ensemble_models)} ensemble members")
 
-    # Handle classification vs regression
     if label_encoder is not None:
-        # For classification, average probabilities then take argmax
-        if all_ensemble_probs:
-            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
-            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
-            class_preds = np.argmax(avg_probs, axis=1)
-            predictions = label_encoder.inverse_transform(class_preds)
-
-            # Build full proba Series with None for missing rows
-            all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
-            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
-            df["pred_proba"] = all_proba
-
-            # Expand the pred_proba column into separate columns for each class
-            df = expand_proba_column(df, label_encoder.classes_)
-        else:
-            # No probabilities, use averaged predictions
-            predictions = label_encoder.inverse_transform(preds.astype(int))
+        # Classification: average probabilities, then argmax
+        avg_probs = preds  # Already softmax output
+        class_preds = np.argmax(avg_probs, axis=1)
+        predictions = label_encoder.inverse_transform(class_preds)
+
+        all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+        all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
+        df["pred_proba"] = all_proba
+        df = expand_proba_column(df, label_encoder.classes_)
     else:
-        # Regression (includes uq_regressor)
-        predictions = preds
-        df.loc[~missing_mask, "prediction_std"] = preds_std
+        # Regression
+        predictions = preds.flatten()
+        df.loc[~missing_mask, "prediction_std"] = preds_std.flatten()
+
+        # Add UQ intervals if available
+        if uq_models and uq_metadata:
+            X_complete = complete_df[features]
+            df_complete = df.loc[~missing_mask].copy()
+            df_complete["prediction"] = predictions  # Set prediction before compute_confidence
+            df_complete = predict_intervals(df_complete, X_complete, uq_models, uq_metadata)
+            df_complete = compute_confidence(df_complete, uq_metadata["median_interval_width"], "q_10", "q_90")
+            # Copy UQ columns back to main dataframe
+            for col in df_complete.columns:
+                if col.startswith("q_") or col == "confidence":
+                    df.loc[~missing_mask, col] = df_complete[col].values
 
-    # Set predictions only for complete rows
     df.loc[~missing_mask, "prediction"] = predictions
-
     return df
 
 
+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-    """The main function is for training the PyTorch Tabular model"""
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()
 
-    # Harness Template Parameters
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
@@ -419,341 +250,253 @@ if __name__ == "__main__":
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
 
-    # Read the training data into DataFrames
-    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Print out some info about the dataframe
-    print(f"All Data Shape: {all_df.shape}")
-    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
-    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")
 
-    # Drop any rows with missing feature values
-    initial_row_count = all_df.shape[0]
+    # Drop rows with missing features
+    initial_count = len(all_df)
     all_df = all_df.dropna(subset=features)
-    dropped_rows = initial_row_count - all_df.shape[0]
-    if dropped_rows > 0:
-        print(f"Dropped {dropped_rows} rows due to missing feature values.")
+    if len(all_df) < initial_count:
+        print(f"Dropped {initial_count - len(all_df)} rows with missing features")
 
-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {str(features)}")
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")
 
-    # Convert any features that might be categorical to 'category' type
+    # -------------------------------------------------------------------------
+    # Preprocessing
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)
 
-    # Print out some info about the dataframe
-    print(f"All Data Shape: {all_df.shape}")
-    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
-    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
-
-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}...")
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)
 
-    # Determine categorical and continuous columns
-    categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
-    continuous_cols = [col for col in features if col not in categorical_cols]
-    print(f"Categorical columns: {categorical_cols}")
-    print(f"Continuous columns: {continuous_cols}")
-
-    # Cast continuous columns to float
+    # Determine categorical vs continuous columns
+    categorical_cols = [c for c in features if all_df[c].dtype.name == "category"]
+    continuous_cols = [c for c in features if c not in categorical_cols]
    all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
+    print(f"Categorical: {categorical_cols}")
+    print(f"Continuous: {len(continuous_cols)} columns")
 
-    # Choose the 'task' based on model type and set up the label encoder if needed
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
+    n_outputs = 1
    if model_type == "classifier":
-        task = "classification"
-        # Encode the target column on full dataset for consistent encoding
        label_encoder = LabelEncoder()
        all_df[target] = label_encoder.fit_transform(all_df[target])
-        num_classes = len(label_encoder.classes_)
-    else:
-        task = "regression"
-        label_encoder = None
-        num_classes = None
+        n_outputs = len(label_encoder.classes_)
+        print(f"Class labels: {label_encoder.classes_.tolist()}")
 
-    # Use any hyperparameters to set up both the trainer and model configurations
-    print(f"Hyperparameters: {hyperparameters}")
-    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    task = "classification" if model_type == "classifier" else "regression"
+    hidden_layers = [int(x) for x in hyperparameters["layers"].split("-")]
 
-    # =========================================================================
-    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
-    # =========================================================================
-    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+    # Get categorical cardinalities
+    categorical_cardinalities = [len(category_mappings.get(col, {})) for col in categorical_cols]
 
-    # Create fold splits
    if n_folds == 1:
-        # Single fold: use train/val split from "training" column or random split
        if "training" in all_df.columns:
-            print("Found training column, splitting data based on training column")
+            print("Using 'training' column for train/val split")
            train_idx = np.where(all_df["training"])[0]
            val_idx = np.where(~all_df["training"])[0]
        else:
-            print("WARNING: No training column found, splitting data with random 80/20 split")
-            indices = np.arange(len(all_df))
-            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            train_idx, val_idx = train_test_split(np.arange(len(all_df)), test_size=0.2, random_state=42)
        folds = [(train_idx, val_idx)]
    else:
-        # K-Fold CV
        if model_type == "classifier":
            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = all_df[target]
+            folds = list(kfold.split(all_df, all_df[target]))
        else:
            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = None
-        folds = list(kfold.split(all_df, split_target))
-
-    # Initialize storage for out-of-fold predictions
-    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
-    if model_type == "classifier" and num_classes and num_classes > 1:
-        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
-    else:
-        oof_proba = None
+            folds = list(kfold.split(all_df))
 
-    ensemble_models = []
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
 
-    # Set up PyTorch Tabular data configuration (shared across folds)
-    data_config = DataConfig(
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
+    # Fit scaler on all training data (used across all folds)
+    scaler = FeatureScaler()
+    scaler.fit(all_df, continuous_cols)
+    print(f"Fitted scaler on {len(continuous_cols)} continuous features")
 
-    # Model config defaults
-    model_defaults = {
-        "layers": "256-128-64",
-        "activation": "LeakyReLU",
-        "learning_rate": 1e-3,
-        "dropout": 0.1,
-        "use_batch_norm": True,
-        "initialization": "kaiming",
-    }
-    # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
-    for key, value in model_overrides.items():
-        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
-    model_params = {**model_defaults, **model_overrides}
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
 
-    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
-    optimizer_config = OptimizerConfig()
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    oof_predictions = np.full((len(all_df), n_outputs), np.nan, dtype=np.float64)
 
+    ensemble_models = []
    for fold_idx, (train_idx, val_idx) in enumerate(folds):
        print(f"\n{'='*50}")
-        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
        print(f"{'='*50}")
 
-        # Split data for this fold
        df_train = all_df.iloc[train_idx].reset_index(drop=True)
        df_val = all_df.iloc[val_idx].reset_index(drop=True)
 
-        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
-
-        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
-        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
-        batch_size = min(128, max(32, len(df_train) // 16))
-        if len(df_train) % batch_size == 1:
-            batch_size += 1  # Adjust to avoid last batch of size 1
-        trainer_defaults = {
-            "auto_lr_find": False,
-            "batch_size": batch_size,
-            "max_epochs": 200,
-            "min_epochs": 10,
-            "early_stopping": "valid_loss",
-            "early_stopping_patience": 20,
-            "checkpoints": "valid_loss",
-            "accelerator": "auto",
-            "progress_bar": "none",
-            "gradient_clip_val": 1.0,
-            "seed": 42 + fold_idx,
-        }
-
-        # Override defaults with training_config if present
-        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
-        if fold_idx == 0:  # Only print overrides once
-            for key, value in training_overrides.items():
-                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
-        trainer_params = {**trainer_defaults, **training_overrides}
-        trainer_config = TrainerConfig(**trainer_params)
-
-        # Create and train the TabularModel for this fold
-        tabular_model = TabularModel(
-            data_config=data_config,
-            model_config=model_config,
-            optimizer_config=optimizer_config,
-            trainer_config=trainer_config,
+        # Prepare data (using pre-fitted scaler)
+        train_x_cont, train_x_cat, train_y, _, _ = prepare_data(
+            df_train, continuous_cols, categorical_cols, target, category_mappings, scaler=scaler
+        )
+        val_x_cont, val_x_cat, val_y, _, _ = prepare_data(
+            df_val, continuous_cols, categorical_cols, target, category_mappings, scaler=scaler
        )
-        tabular_model.fit(train=df_train, validation=df_val)
-        ensemble_models.append(tabular_model)
 
-        # Make out-of-fold predictions
-        result = tabular_model.predict(df_val, include_input_features=False)
-        fold_preds = result[f"{target}_prediction"].values
+        # Create model
+        torch.manual_seed(hyperparameters["seed"] + fold_idx)
+        model = create_model(
+            n_continuous=len(continuous_cols),
+            categorical_cardinalities=categorical_cardinalities,
+            hidden_layers=hidden_layers,
+            n_outputs=n_outputs,
+            task=task,
+            dropout=hyperparameters["dropout"],
+            use_batch_norm=hyperparameters["use_batch_norm"],
+        )
 
-        # Store out-of-fold predictions
-        if model_type == "classifier":
-            oof_predictions[val_idx] = fold_preds.astype(int)
-            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
-            if prob_cols and oof_proba is not None:
-                oof_proba[val_idx] = result[prob_cols].values
-        else:
-            oof_predictions[val_idx] = fold_preds.flatten()
+        # Train
+        model, history = train_model(
+            model,
+            train_x_cont, train_x_cat, train_y,
+            val_x_cont, val_x_cat, val_y,
+            task=task,
+            max_epochs=hyperparameters["max_epochs"],
+            patience=hyperparameters["early_stopping_patience"],
+            batch_size=hyperparameters["batch_size"],
+            learning_rate=hyperparameters["learning_rate"],
+            device=device,
+        )
+        ensemble_models.append(model)
 
-        print(f"Fold {fold_idx + 1} complete!")
+        # Out-of-fold predictions
+        fold_preds = predict(model, val_x_cont, val_x_cat)
+        oof_predictions[val_idx] = fold_preds
 
    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
 
-    # Use out-of-fold predictions for metrics
-    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
    if n_folds == 1:
-        val_mask = ~np.isnan(oof_predictions)
-        preds = oof_predictions[val_mask]
+        val_mask = ~np.isnan(oof_predictions[:, 0])
        df_val = all_df[val_mask].copy()
-        if oof_proba is not None:
-            oof_proba = oof_proba[val_mask]
+        predictions = oof_predictions[val_mask]
    else:
-        preds = oof_predictions
        df_val = all_df.copy()
+        predictions = oof_predictions
 
-    # Compute prediction_std by running all ensemble models on validation data
-    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
-    preds_std = None
-    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
-        print("Computing prediction_std from ensemble predictions on validation data...")
-        all_ensemble_preds_for_std = []
-        for ens_model in ensemble_models:
-            result = ens_model.predict(df_val[features], include_input_features=False)
-            ens_preds = result[f"{target}_prediction"].values.flatten()
-            all_ensemble_preds_for_std.append(ens_preds)
-
-        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
-        preds_std = np.std(ensemble_preds_stacked, axis=0)
-        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
+    # Decode labels for classification
+    if model_type == "classifier":
+        class_preds = np.argmax(predictions, axis=1)
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(class_preds)
+        df_val["pred_proba"] = [p.tolist() for p in predictions]
+        df_val = expand_proba_column(df_val, label_encoder.classes_)
+    else:
+        df_val["prediction"] = predictions.flatten()
+
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values
 
    if model_type == "classifier":
-        # Get probabilities for classification
-        if oof_proba is not None:
-            df_val = df_val.copy()
-            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
-            df_val = expand_proba_column(df_val, label_encoder.classes_)
-
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(df_val[target])
-        preds_decoded = label_encoder.inverse_transform(preds.astype(int))
+        score_df = compute_classification_metrics(y_true, y_pred, label_encoder.classes_, target)
+        print_classification_metrics(score_df, target, label_encoder.classes_)
+        print_confusion_matrix(y_true, y_pred, label_encoder.classes_)
    else:
-        y_validate = df_val[target].values
-        preds_decoded = preds
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            # Re-run inference with all models to get std
+            x_cont, x_cat, _, _, _ = prepare_data(
+                df_val, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+            )
+            all_preds = [predict(m, x_cont, x_cat).flatten() for m in ensemble_models]
+            df_val["prediction_std"] = np.std(np.stack(all_preds), axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
 
-    # Save predictions to S3
-    df_val = df_val.copy()
-    df_val["prediction"] = preds_decoded
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
 
-    # Build output columns - include id_column if it exists
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
    output_columns = []
    if id_column in df_val.columns:
        output_columns.append(id_column)
    output_columns += [target, "prediction"]
 
-    # Add prediction_std for regression models (always present, 0 for single model)
-    if model_type in ["regressor", "uq_regressor"]:
-        if preds_std is not None:
-            df_val["prediction_std"] = preds_std
-        else:
-            df_val["prediction_std"] = 0.0
+    if model_type != "classifier":
        output_columns.append("prediction_std")
-        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
+
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]
+
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
+
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    model_config = {
+        "n_continuous": len(continuous_cols),
+        "categorical_cardinalities": categorical_cardinalities,
+        "hidden_layers": hidden_layers,
+        "n_outputs": n_outputs,
+        "task": task,
+        "dropout": hyperparameters["dropout"],
+        "use_batch_norm": hyperparameters["use_batch_norm"],
+    }
 
-    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-    wr.s3.to_csv(
-        df_val[output_columns],
-        path=f"{model_metrics_s3_path}/validation_predictions.csv",
-        index=False,
-    )
+    for idx, m in enumerate(ensemble_models):
+        save_model(m, os.path.join(args.model_dir, f"model_{idx}"), model_config)
+    print(f"Saved {len(ensemble_models)} model(s)")
 
-    # Report Performance Metrics
-    if model_type == "classifier":
-        # Get the label names and their integer mapping
-        label_names = label_encoder.classes_
-
-        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
-
-        # Put the scores into a dataframe
-        score_df = pd.DataFrame(
-            {
-                target: label_names,
-                "precision": scores[0],
-                "recall": scores[1],
-                "f1": scores[2],
-                "support": scores[3],
-            }
-        )
+    joblib.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
 
-        # Output metrics per class
-        metrics = ["precision", "recall", "f1", "support"]
-        for t in label_names:
-            for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
-                print(f"Metrics:{t}:{m} {value}")
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)
 
-        # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
-        for i, row_name in enumerate(label_names):
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)
 
-    else:
-        # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds_decoded)
-        mae = mean_absolute_error(y_validate, preds_decoded)
-        medae = median_absolute_error(y_validate, preds_decoded)
-        r2 = r2_score(y_validate, preds_decoded)
-        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
-        support = len(df_val)
-        print(f"rmse: {rmse:.3f}")
-        print(f"mae: {mae:.3f}")
-        print(f"medae: {medae:.3f}")
-        print(f"r2: {r2:.3f}")
-        print(f"spearmanr: {spearman_corr:.3f}")
-        print(f"support: {support}")
-
-    # Save ensemble models
-    for model_idx, ens_model in enumerate(ensemble_models):
-        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
-        ens_model.save_model(model_path)
-        print(f"Saved model {model_idx + 1} to {model_path}")
-
-    # Save ensemble metadata
-    n_ensemble = len(ensemble_models)
-    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
-    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
-    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+    with open(os.path.join(args.model_dir, "feature_metadata.json"), "w") as f:
+        json.dump({"continuous_cols": continuous_cols, "categorical_cols": categorical_cols}, f)
+
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)
+
+    scaler.save(os.path.join(args.model_dir, "scaler.joblib"))
 
    if label_encoder:
        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
-    # Save the features (this will validate input during predictions)
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)
 
-    # Save the category mappings
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")