PyPI - workbench - Versions diffs - 0.8.201__py3-none-any.whl → 0.8.204__py3-none-any.whl - Mend

workbench 0.8.201__py3-none-any.whl → 0.8.204__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

workbench/model_scripts/pytorch_model/generated_model_script.py CHANGED Viewed

@@ -13,17 +13,19 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
 # Model Performance Scores
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 # Classification Encoder
 from sklearn.preprocessing import LabelEncoder
 # Scikit Learn Imports
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 from io import StringIO
 import json
@@ -33,13 +35,13 @@ import pandas as pd
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "regressor",
-    "target": "caco_2_efflux",
-    "features": ['smr_vsa3', 'xch_6dv', 'tpsa', 'numhdonors', 'vsa_estate2', 'fr_imidazole', 'mollogp', 'molmr', 'vsa_estate3', 'bcut2d_mwhi', 'slogp_vsa1', 'peoe_vsa6', 'peoe_vsa9', 'fr_halogen', 'vsa_estate10', 'fr_pyridine', 'minabsestateindex', 'bcut2d_mwlow', 'peoe_vsa2', 'bcut2d_logplow', 'vsa_estate7', 'peoe_vsa1', 'bcut2d_mrlow', 'slogp_vsa4', 'peoe_vsa7', 'peoe_vsa8', 'bcut2d_logphi', 'estate_vsa8', 'sv', 'fpdensitymorgan1', 'peoe_vsa3', 'slogp_vsa2', 'vsa_estate8', 'hallkieralpha', 'chi2v', 'axp_3dv', 'estate_vsa7', 'peoe_vsa11', 'xc_3d', 'vsa_estate4', 'vsa_estate9', 'bcut2d_chghi', 'vsa_estate5', 'c1sp2', 'slogp_vsa6', 'axp_3d', 'axp_2dv', 'bcut2d_mrhi', 'xc_3dv', 'xpc_4d', 'chi3n', 'xp_3dv', 'kappa3', 'vsa_estate6', 'minestateindex', 'mp', 'avgipc', 'axp_5dv', 'heavyatommolwt', 'maxpartialcharge', 'peoe_vsa10', 'kappa1', 'nocount', 'xp_6d', 'xpc_4dv', 'axp_4dv', 'sz', 'axp_4d', 'xc_5d', 'qed', 'xch_7dv', 'axp_1dv', 'chi0v', 'smr_vsa5', 'maxabspartialcharge', 'minpartialcharge', 'estate_vsa5', 'fpdensitymorgan3', 'fpdensitymorgan2', 'labuteasa', 'fr_methoxy', 'kappa2', 'sps', 'mi', 'bcut2d_chglo', 'vsa_estate1', 'fr_nh2', 'axp_1d', 'maxestateindex', 'estate_vsa3', 'bertzct', 'phi', 'fractioncsp3', 'xc_5dv', 'smr_vsa6', 'estate_vsa6', 'fr_ar_coo', 'axp_7dv', 'slogp_vsa8', 'estate_vsa2', 'axp_2d', 'balabanj', 'mz', 'estate_vsa4', 'smr_vsa7', 'slogp_vsa5', 'chi0n', 'xp_5d', 'xch_7d', 'chi4n', 'smr_vsa10', 'numheterocycles', 'chi1v', 'axp_5d', 'axp_0dv', 'fr_al_oh', 'chi2n', 'fr_nh0', 'chi1n', 'xp_7dv', 'xpc_5d', 'ringcount', 'xpc_5dv', 'peoe_vsa12', 'xpc_6dv', 'xp_6dv', 'mse', 'sp', 'xpc_6d', 'chi4v', 'fr_aniline', 'c2sp3', 'peoe_vsa4', 'numheteroatoms', 'smr_vsa9', 'xp_7d', 'axp_6d', 'molwt', 'numrotatablebonds', 'smr_vsa4', 'stereo_complexity', 'axp_7d', 'slogp_vsa3', 'spe', 'num_r_centers', 'si', 'axp_0d', 'xp_2d', 'xp_4d', 'c2sp2', 'fr_aryl_methyl', 'mpe', 'xch_6d', 'axp_6dv', 'numsaturatedcarbocycles', 'fr_para_hydroxylation', 'estate_vsa10', 
'estate_vsa1', 'hybratio', 'numhacceptors', 'naromatom', 'chi1', 'fr_urea', 'xp_3d', 'smr_vsa1', 'num_s_centers', 'xch_5dv', 'c3sp2', 'mare', 'xp_5dv', 'fr_al_oh_notert', 'estate_vsa9', 'fr_piperdine', 'numunspecifiedatomstereocenters', 'chi3v', 'c3sp3', 'chi0', 'numsaturatedheterocycles', 'xp_4dv', 'fr_amide', 'fr_nhpyrrole', 'mv', 'fr_ar_n', 'xc_4dv', 'fr_morpholine', 'fr_ndealkylation2', 'xch_3d', 'xch_4d', 'slogp_vsa10', 'fr_ar_oh', 'fr_benzene', 'fr_nh1', 'c1sp1', 'sse', 'num_defined_stereocenters', 'xch_4dv', 'peoe_vsa14', 'xch_5d', 'fr_hoccn', 'fr_nitrile', 'mm', 'fr_priamide', 'xc_6dv', 'num_unspecified_stereocenters', 'fr_ether', 'fr_piperzine', 'fr_bicyclic', 'fr_term_acetylene'],
+    "model_type": "uq_regressor",
+    "target": "mppb",
+    "features": ['mollogp', 'mi', 'fr_benzene', 'smr_vsa3', 'fr_halogen', 'c2sp2', 'peoe_vsa6', 'bcut2d_mwhi', 'vsa_estate1', 'mv', 'numaromaticcarbocycles', 'vsa_estate5', 'fr_nh0', 'mm', 'smr_vsa7', 'tpsa', 'c1sp2', 'mz', 'vsa_estate2', 'peoe_vsa7', 'vsa_estate10', 'vsa_estate7', 'vsa_estate6', 'smr_vsa10', 'slogp_vsa2', 'bcut2d_logphi', 'naromatom', 'axp_2dv', 'bcut2d_mrhi', 'vsa_estate8', 'slogp_vsa3', 'vsa_estate4', 'xpc_6dv', 'slogp_vsa12', 'peoe_vsa9', 'mp', 'slogp_vsa1', 'peoe_vsa1', 'xch_5dv', 'qed', 'vsa_estate3', 'fpdensitymorgan3', 'axp_2d', 'axp_0d', 'mse', 'numhacceptors', 'bertzct', 'estate_vsa8', 'minestateindex', 'estate_vsa3', 'fpdensitymorgan2', 'smr_vsa6', 'peoe_vsa8', 'slogp_vsa6', 'xp_5dv', 'hallkieralpha', 'avgipc', 'fr_arn', 'xp_7d', 'mare', 'xp_6d', 'bcut2d_mrlow', 'estate_vsa4', 'bcut2d_logplow', 'peoe_vsa10', 'maxabspartialcharge', 'peoe_vsa3', 'bcut2d_mwlow', 'axp_7d', 'minpartialcharge', 'xpc_4d', 'axp_1d', 'estate_vsa9', 'vsa_estate9', 'estate_vsa7', 'maxestateindex', 'estate_vsa6', 'smr_vsa1', 'xpc_6d', 'xch_7d', 'xc_5d', 'phi', 'axp_0dv', 'axp_3dv', 'mpe', 'xc_3d', 'xch_5d', 'xc_5dv', 'xch_6d', 'chi4n', 'axp_7dv', 'slogp_vsa5', 'axp_1dv', 'xch_6dv', 'minabsestateindex', 'numrotatablebonds', 'peoe_vsa2', 'estate_vsa2', 'slogp_vsa8', 'bcut2d_chglo', 'xch_7dv', 'kappa2', 'axp_4dv', 'xc_3dv', 'kappa1', 'nbase', 'xpc_5dv', 'maxpartialcharge', 'bcut2d_chghi', 'axp_5d', 'balabanj', 'xpc_5d', 'fpdensitymorgan1', 'xp_5d', 'smr_vsa5', 'axp_4d', 'kappa3', 'fr_morpholine', 'estate_vsa5', 'chi2n', 'labuteasa', 'axp_5dv', 'molwt', 'smr_vsa9', 'maxabsestateindex', 'xp_7dv', 'fr_bicyclic', 'numaliphaticheterocycles', 'axp_6dv', 'slogp_vsa4', 'axp_3d', 'xp_6dv', 'nocount', 'axp_6d', 'fr_aniline', 'xpc_4dv', 'xp_1d', 'c3sp2', 'numheterocycles', 'nhohcount', 'molmr', 'numaromaticheterocycles', 'chi0', 'minabspartialcharge', 'fr_ar_n', 'xp_3d', 'chi2v', 'fr_ether', 'chi1v', 'chi1', 'xp_2d', 'xp_4dv', 'xp_4d', 'chi4v', 'fr_pyridine', 'smr_vsa4', 'sps', 
'chi3n', 'heavyatommolwt', 'slogp_vsa11', 'fr_aryl_methyl', 'si', 'fractioncsp3', 'sse', 'fr_para_hydroxylation', 'slogp_vsa10', 'c1sp3', 'exactmolwt', 'numsaturatedheterocycles', 'chi1n', 'chi0n', 'fcsp3'],
+    "id_column": "molecule_name",
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/caco2-efflux-ref-pytorch/training",
-    "train_all_data": True,
-    "hyperparameters": {},
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/mppb-reg-pytorch/training",
+    "hyperparameters": {'n_folds': 5},
 }
@@ -204,36 +206,57 @@ def decompress_features(
     return df, decompressed_features
-def model_fn(model_dir: str) -> TabularModel:
-    """Load the PyTorch Tabular model from the specified directory.
+def model_fn(model_dir: str) -> dict:
+    """Load the PyTorch Tabular ensemble models from the specified directory.
     Args:
-        model_dir: Directory containing the saved model
+        model_dir: Directory containing the saved model(s)
     Returns:
-        Loaded TabularModel instance
+        Dictionary with ensemble models and metadata
     """
+    import torch
+    from functools import partial
+    # Load ensemble metadata if present
+    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(ensemble_metadata_path):
+        ensemble_metadata = joblib.load(ensemble_metadata_path)
+        n_ensemble = ensemble_metadata["n_ensemble"]
+    else:
+        n_ensemble = 1
+    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
+    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
+    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
+    # which internally calls torch.load without map_location
+    original_torch_load = torch.load
+    torch.load = partial(original_torch_load, map_location=map_location)
     # Save current working directory
     original_cwd = os.getcwd()
+    ensemble_models = []
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
         os.chdir("/tmp")
-        # Remove callbacks.sav if it exists - it's not needed for inference and causes
-        # GPU->CPU loading issues (joblib.load doesn't support map_location)
-        model_path = os.path.join(model_dir, "tabular_model")
-        callbacks_path = os.path.join(model_path, "callbacks.sav")
-        if os.path.exists(callbacks_path):
-            os.remove(callbacks_path)
-        # Load the model (map_location="cpu" ensures GPU-trained models work on CPU endpoints)
-        model = TabularModel.load_model(model_path, map_location="cpu")
+        for ens_idx in range(n_ensemble):
+            # Try numbered model path first, fall back to legacy path
+            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
+            if not os.path.exists(model_path):
+                model_path = os.path.join(model_dir, "tabular_model")
+            model = TabularModel.load_model(model_path, map_location=map_location)
+            ensemble_models.append(model)
     finally:
-        # Restore the original working directory
+        # Restore torch.load and working directory
+        torch.load = original_torch_load
         os.chdir(original_cwd)
-    return model
+    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
 def input_fn(input_data, content_type: str) -> pd.DataFrame:
@@ -264,18 +287,23 @@ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model ensemble.
     Args:
         df (pd.DataFrame): The input DataFrame
-        model: The TabularModel use for predictions
+        model_dict: Dictionary containing ensemble models and metadata
     Returns:
-        pd.DataFrame: The DataFrame with the predictions added
+        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
     """
+    model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    # Extract ensemble models
+    ensemble_models = model_dict["ensemble_models"]
+    n_ensemble = model_dict["n_ensemble"]
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -308,8 +336,10 @@ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
     if missing_mask.any():
         print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
-    # Initialize prediction column with NaN
+    # Initialize prediction columns
     df["prediction"] = np.nan
+    if model_type in ["regressor", "uq_regressor"]:
+        df["prediction_std"] = np.nan
     # Only predict on complete rows
     complete_df = matched_df[~missing_mask]
@@ -317,37 +347,63 @@ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
         print("Warning: No complete rows to predict on")
         return df
-    # Make predictions using the TabularModel
-    result = model.predict(complete_df[features])
     # pytorch-tabular returns predictions using f"{target}_prediction" column
     target = TEMPLATE_PARAMS["target"]
     prediction_column = f"{target}_prediction"
-    if prediction_column in result.columns:
-        predictions = result[prediction_column].values
-    else:
-        raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-    # If we have a label encoder, decode the predictions
-    if label_encoder:
-        predictions = label_encoder.inverse_transform(predictions.astype(int))
+    # Collect predictions from all ensemble members
+    all_ensemble_preds = []
+    all_ensemble_probs = []
-    # Set predictions only for complete rows
-    df.loc[~missing_mask, "prediction"] = predictions
+    for ens_idx, ens_model in enumerate(ensemble_models):
+        result = ens_model.predict(complete_df[features])
+        if prediction_column in result.columns:
+            ens_preds = result[prediction_column].values
+        else:
+            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-    # For classification, get probabilities
+        all_ensemble_preds.append(ens_preds)
+        # For classification, collect probabilities
+        if label_encoder is not None:
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols:
+                all_ensemble_probs.append(result[prob_cols].values)
+    # Stack and compute mean/std (std is 0 for single model)
+    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    preds = np.mean(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
+    # Handle classification vs regression
     if label_encoder is not None:
-        prob_cols = [col for col in result.columns if col.endswith("_probability")]
-        if prob_cols:
-            probs = result[prob_cols].values
+        # For classification, average probabilities then take argmax
+        if all_ensemble_probs:
+            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
+            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
+            class_preds = np.argmax(avg_probs, axis=1)
+            predictions = label_encoder.inverse_transform(class_preds)
             # Build full proba Series with None for missing rows
             all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
-            all_proba.loc[~missing_mask] = [p.tolist() for p in probs]
+            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
             df["pred_proba"] = all_proba
             # Expand the pred_proba column into separate columns for each class
             df = expand_proba_column(df, label_encoder.classes_)
+        else:
+            # No probabilities, use averaged predictions
+            predictions = label_encoder.inverse_transform(preds.astype(int))
+    else:
+        # Regression (includes uq_regressor)
+        predictions = preds
+        df.loc[~missing_mask, "prediction_std"] = preds_std
+    # Set predictions only for complete rows
+    df.loc[~missing_mask, "prediction"] = predictions
     return df
@@ -359,12 +415,11 @@ if __name__ == "__main__":
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2
     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -423,72 +478,71 @@ if __name__ == "__main__":
     # Cast continuous columns to float
     all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]].copy()
-        df_val = all_df[~all_df["training"]].copy()
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-    # Set up PyTorch Tabular configuration
-    data_config = DataConfig(
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
-    # Choose the 'task' based on model type also set up the label encoder if needed
+    # Choose the 'task' based on model type and set up the label encoder if needed
     if model_type == "classifier":
         task = "classification"
-        # Encode the target column
+        # Encode the target column on full dataset for consistent encoding
         label_encoder = LabelEncoder()
-        df_train[target] = label_encoder.fit_transform(df_train[target])
-        df_val[target] = label_encoder.transform(df_val[target])
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        num_classes = len(label_encoder.classes_)
     else:
         task = "regression"
         label_encoder = None
+        num_classes = None
     # Use any hyperparameters to set up both the trainer and model configurations
     print(f"Hyperparameters: {hyperparameters}")
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    # =========================================================================
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
+    # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+    # Create fold splits
+    if n_folds == 1:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
+            print("Found training column, splitting data based on training column")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+    # Initialize storage for out-of-fold predictions
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None
-    # Set up PyTorch Tabular configuration with defaults
-    trainer_defaults = {
-        "auto_lr_find": False,
-        "batch_size": min(128, max(32, len(df_train) // 16)),
-        "max_epochs": 100,
-        "min_epochs": 10,
-        "early_stopping": "valid_loss",
-        "early_stopping_patience": 10,
-        "checkpoints": "valid_loss",
-        "accelerator": "auto",
-        "progress_bar": "none",
-        "gradient_clip_val": 1.0,
-    }
+    ensemble_models = []
-    # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
-    for key, value in training_overrides.items():
-        print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
-    trainer_params = {**trainer_defaults, **training_overrides}
-    trainer_config = TrainerConfig(**trainer_params)
+    # Set up PyTorch Tabular data configuration (shared across folds)
+    data_config = DataConfig(
+        target=[target],
+        continuous_cols=continuous_cols,
+        categorical_cols=categorical_cols,
+    )
     # Model config defaults
     model_defaults = {
         "layers": "256-128-64",
         "activation": "LeakyReLU",
         "learning_rate": 1e-3,
-        "dropout": 0.3,
+        "dropout": 0.1,
         "use_batch_norm": True,
         "initialization": "kaiming",
     }
@@ -498,41 +552,107 @@ if __name__ == "__main__":
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
     model_params = {**model_defaults, **model_overrides}
-    # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()
-    #####################################
-    # Create and train the TabularModel #
-    #####################################
-    tabular_model = TabularModel(
-        data_config=data_config,
-        model_config=model_config,
-        optimizer_config=optimizer_config,
-        trainer_config=trainer_config,
-    )
-    tabular_model.fit(train=df_train, validation=df_val)
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
+        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
+        batch_size = min(128, max(32, len(df_train) // 16))
+        if len(df_train) % batch_size == 1:
+            batch_size += 1  # Adjust to avoid last batch of size 1
+        trainer_defaults = {
+            "auto_lr_find": False,
+            "batch_size": batch_size,
+            "max_epochs": 200,
+            "min_epochs": 10,
+            "early_stopping": "valid_loss",
+            "early_stopping_patience": 20,
+            "checkpoints": "valid_loss",
+            "accelerator": "auto",
+            "progress_bar": "none",
+            "gradient_clip_val": 1.0,
+            "seed": 42 + fold_idx,
+        }
+        # Override defaults with training_config if present
+        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
+        if fold_idx == 0:  # Only print overrides once
+            for key, value in training_overrides.items():
+                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
+        trainer_params = {**trainer_defaults, **training_overrides}
+        trainer_config = TrainerConfig(**trainer_params)
+        # Create and train the TabularModel for this fold
+        tabular_model = TabularModel(
+            data_config=data_config,
+            model_config=model_config,
+            optimizer_config=optimizer_config,
+            trainer_config=trainer_config,
+        )
+        tabular_model.fit(train=df_train, validation=df_val)
+        ensemble_models.append(tabular_model)
+        # Make out-of-fold predictions
+        result = tabular_model.predict(df_val, include_input_features=False)
+        fold_preds = result[f"{target}_prediction"].values
+        # Store out-of-fold predictions
+        if model_type == "classifier":
+            oof_predictions[val_idx] = fold_preds.astype(int)
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols and oof_proba is not None:
+                oof_proba[val_idx] = result[prob_cols].values
+        else:
+            oof_predictions[val_idx] = fold_preds.flatten()
-    # Make Predictions on the Validation Set
-    print("Making Predictions on Validation Set...")
-    result = tabular_model.predict(df_val, include_input_features=False)
+        print(f"Fold {fold_idx + 1} complete!")
-    # pytorch-tabular returns predictions using f"{target}_prediction" column
-    preds = result[f"{target}_prediction"].values
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        val_mask = ~np.isnan(oof_predictions)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        preds = oof_predictions
+        df_val = all_df.copy()
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        all_ensemble_preds_for_std = []
+        for ens_model in ensemble_models:
+            result = ens_model.predict(df_val[features], include_input_features=False)
+            ens_preds = result[f"{target}_prediction"].values.flatten()
+            all_ensemble_preds_for_std.append(ens_preds)
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
     if model_type == "classifier":
         # Get probabilities for classification
-        print("Processing Probabilities...")
-        prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
-        if prob_cols:
-            probs = result[prob_cols].values
-            df_val = df_val.copy()  # Avoid SettingWithCopyWarning
-            df_val["pred_proba"] = [p.tolist() for p in probs]
-            # Expand the pred_proba column into separate columns for each class
-            print(df_val.columns.tolist())
+        if oof_proba is not None:
+            df_val = df_val.copy()
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_encoder.classes_)
-            print(df_val.columns.tolist())
         # Decode the target and prediction labels
         y_validate = label_encoder.inverse_transform(df_val[target])
@@ -544,7 +664,22 @@ if __name__ == "__main__":
     # Save predictions to S3
     df_val = df_val.copy()
     df_val["prediction"] = preds_decoded
-    output_columns = [target, "prediction"]
+    # Build output columns - include id_column if it exists
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
+    # Add prediction_std for regression models (always present, 0 for single model)
+    if model_type in ["regressor", "uq_regressor"]:
+        if preds_std is not None:
+            df_val["prediction_std"] = preds_std
+        else:
+            df_val["prediction_std"] = 0.0
+        output_columns.append("prediction_std")
+        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
     wr.s3.to_csv(
         df_val[output_columns],
@@ -589,14 +724,29 @@ if __name__ == "__main__":
         # Calculate various model performance metrics (regression)
         rmse = root_mean_squared_error(y_validate, preds_decoded)
         mae = mean_absolute_error(y_validate, preds_decoded)
+        medae = median_absolute_error(y_validate, preds_decoded)
         r2 = r2_score(y_validate, preds_decoded)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
+        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
+        support = len(df_val)
+        print(f"rmse: {rmse:.3f}")
+        print(f"mae: {mae:.3f}")
+        print(f"medae: {medae:.3f}")
+        print(f"r2: {r2:.3f}")
+        print(f"spearmanr: {spearman_corr:.3f}")
+        print(f"support: {support}")
+    # Save ensemble models
+    for model_idx, ens_model in enumerate(ensemble_models):
+        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
+        ens_model.save_model(model_path)
+        print(f"Saved model {model_idx + 1} to {model_path}")
+    # Save ensemble metadata
+    n_ensemble = len(ensemble_models)
+    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
+    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
-    # Save the model to the standard place/name
-    tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

workbench 0.8.201__py3-none-any.whl → 0.8.204__py3-none-any.whl

workbench 0.8.201__py3-none-any.whl → 0.8.204__py3-none-any.whl