PyPI - workbench - Versions diffs - 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl - Mend

workbench 0.8.198-py3-none-any.whl → 0.8.203-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

workbench/algorithms/dataframe/proximity.py +11 -4
workbench/api/__init__.py +2 -1
workbench/api/df_store.py +17 -108
workbench/api/feature_set.py +48 -11
workbench/api/model.py +1 -1
workbench/api/parameter_store.py +3 -52
workbench/core/artifacts/__init__.py +11 -2
workbench/core/artifacts/artifact.py +5 -5
workbench/core/artifacts/df_store_core.py +114 -0
workbench/core/artifacts/endpoint_core.py +261 -78
workbench/core/artifacts/feature_set_core.py +69 -1
workbench/core/artifacts/model_core.py +48 -14
workbench/core/artifacts/parameter_store_core.py +98 -0
workbench/core/transforms/features_to_model/features_to_model.py +50 -33
workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
workbench/core/views/view.py +2 -2
workbench/model_scripts/chemprop/chemprop.template +933 -0
workbench/model_scripts/chemprop/generated_model_script.py +933 -0
workbench/model_scripts/chemprop/requirements.txt +11 -0
workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
workbench/model_scripts/pytorch_model/pytorch.template +362 -170
workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
workbench/model_scripts/script_generation.py +10 -7
workbench/model_scripts/uq_models/generated_model_script.py +43 -27
workbench/model_scripts/uq_models/mapie.template +40 -24
workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
workbench/model_scripts/xgb_model/xgb_model.template +36 -7
workbench/repl/workbench_shell.py +14 -5
workbench/resources/open_source_api.key +1 -1
workbench/scripts/endpoint_test.py +162 -0
workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
workbench/utils/chemprop_utils.py +761 -0
workbench/utils/pytorch_utils.py +527 -0
workbench/utils/xgboost_model_utils.py +10 -5
workbench/web_interface/components/model_plot.py +7 -1
{workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
{workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
{workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
{workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
{workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0

workbench/model_scripts/pytorch_model/generated_model_script.py CHANGED Viewed

@@ -13,39 +13,38 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
 # Model Performance Scores
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 # Classification Encoder
 from sklearn.preprocessing import LabelEncoder
 # Scikit Learn Imports
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 from io import StringIO
 import json
 import argparse
 import joblib
-import os
 import pandas as pd
-from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "regressor",
-    "target": "udm_asy_res_efflux_ratio",
-    "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 
'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 
'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+    "model_type": "uq_regressor",
+    "target": "mppb",
+    "features": ['mollogp', 'mi', 'fr_benzene', 'smr_vsa3', 'fr_halogen', 'c2sp2', 'peoe_vsa6', 'bcut2d_mwhi', 'vsa_estate1', 'mv', 'numaromaticcarbocycles', 'vsa_estate5', 'fr_nh0', 'mm', 'smr_vsa7', 'tpsa', 'c1sp2', 'mz', 'vsa_estate2', 'peoe_vsa7', 'vsa_estate10', 'vsa_estate7', 'vsa_estate6', 'smr_vsa10', 'slogp_vsa2', 'bcut2d_logphi', 'naromatom', 'axp_2dv', 'bcut2d_mrhi', 'vsa_estate8', 'slogp_vsa3', 'vsa_estate4', 'xpc_6dv', 'slogp_vsa12', 'peoe_vsa9', 'mp', 'slogp_vsa1', 'peoe_vsa1', 'xch_5dv', 'qed', 'vsa_estate3', 'fpdensitymorgan3', 'axp_2d', 'axp_0d', 'mse', 'numhacceptors', 'bertzct', 'estate_vsa8', 'minestateindex', 'estate_vsa3', 'fpdensitymorgan2', 'smr_vsa6', 'peoe_vsa8', 'slogp_vsa6', 'xp_5dv', 'hallkieralpha', 'avgipc', 'fr_arn', 'xp_7d', 'mare', 'xp_6d', 'bcut2d_mrlow', 'estate_vsa4', 'bcut2d_logplow', 'peoe_vsa10', 'maxabspartialcharge', 'peoe_vsa3', 'bcut2d_mwlow', 'axp_7d', 'minpartialcharge', 'xpc_4d', 'axp_1d', 'estate_vsa9', 'vsa_estate9', 'estate_vsa7', 'maxestateindex', 'estate_vsa6', 'smr_vsa1', 'xpc_6d', 'xch_7d', 'xc_5d', 'phi', 'axp_0dv', 'axp_3dv', 'mpe', 'xc_3d', 'xch_5d', 'xc_5dv', 'xch_6d', 'chi4n', 'axp_7dv', 'slogp_vsa5', 'axp_1dv', 'xch_6dv', 'minabsestateindex', 'numrotatablebonds', 'peoe_vsa2', 'estate_vsa2', 'slogp_vsa8', 'bcut2d_chglo', 'xch_7dv', 'kappa2', 'axp_4dv', 'xc_3dv', 'kappa1', 'nbase', 'xpc_5dv', 'maxpartialcharge', 'bcut2d_chghi', 'axp_5d', 'balabanj', 'xpc_5d', 'fpdensitymorgan1', 'xp_5d', 'smr_vsa5', 'axp_4d', 'kappa3', 'fr_morpholine', 'estate_vsa5', 'chi2n', 'labuteasa', 'axp_5dv', 'molwt', 'smr_vsa9', 'maxabsestateindex', 'xp_7dv', 'fr_bicyclic', 'numaliphaticheterocycles', 'axp_6dv', 'slogp_vsa4', 'axp_3d', 'xp_6dv', 'nocount', 'axp_6d', 'fr_aniline', 'xpc_4dv', 'xp_1d', 'c3sp2', 'numheterocycles', 'nhohcount', 'molmr', 'numaromaticheterocycles', 'chi0', 'minabspartialcharge', 'fr_ar_n', 'xp_3d', 'chi2v', 'fr_ether', 'chi1v', 'chi1', 'xp_2d', 'xp_4dv', 'xp_4d', 'chi4v', 'fr_pyridine', 'smr_vsa4', 'sps', 
'chi3n', 'heavyatommolwt', 'slogp_vsa11', 'fr_aryl_methyl', 'si', 'fractioncsp3', 'sse', 'fr_para_hydroxylation', 'slogp_vsa10', 'c1sp3', 'exactmolwt', 'numsaturatedheterocycles', 'chi1n', 'chi0n', 'fcsp3'],
+    "id_column": "molecule_name",
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
-    "train_all_data": False,
-    "hyperparameters": {},
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/mppb-reg-pytorch/training",
+    "hyperparameters": {'n_folds': 5},
 }
-# Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
     Check if the provided dataframe is empty and raise an exception if it is.
@@ -60,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         raise ValueError(msg)
-def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
+def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
     """
     Expands a column in a DataFrame containing a list of probabilities into separate columns.
     Args:
         df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (List[str]): List of class labels
+        class_labels (list[str]): List of class labels
     Returns:
         pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
     """
-    # Sanity check
     proba_column = "pred_proba"
     if proba_column not in df.columns:
         raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -89,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     # Concatenate the new columns with the original DataFrame
     df = pd.concat([df, proba_df], axis=1)
-    print(df)
     return df
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
@@ -118,55 +114,60 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)
-def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+def convert_categorical_types(
+    df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
+) -> tuple[pd.DataFrame, dict[str, list[str]]]:
     """
     Converts appropriate columns to categorical type with consistent mappings.
     Args:
         df (pd.DataFrame): The DataFrame to process.
         features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-                                            training mode. If populated, we're in inference mode.
+        category_mappings (dict, optional): Existing category mappings. If None or empty,
+                                            we're in training mode. If populated, we're in
+                                            inference mode.
     Returns:
         tuple: (processed DataFrame, category mappings dictionary)
     """
+    if category_mappings is None:
+        category_mappings = {}
     # Training mode
-    if category_mappings == {}:
+    if not category_mappings:
         for col in df.select_dtypes(include=["object", "string"]):
             if col in features and df[col].nunique() < 20:
                 print(f"Training mode: Converting {col} to category")
                 df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+                category_mappings[col] = df[col].cat.categories.tolist()
     # Inference mode
     else:
         for col, categories in category_mappings.items():
             if col in df.columns:
                 print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+                df[col] = pd.Categorical(df[col], categories=categories)
     return df, category_mappings
 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
-) -> Tuple[pd.DataFrame, List[str]]:
+    df: pd.DataFrame, features: list[str], compressed_features: list[str]
+) -> tuple[pd.DataFrame, list[str]]:
     """Prepare features for the model
     Args:
         df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+        features (list[str]): Full list of feature names
+        compressed_features (list[str]): List of feature names to decompress (bitstrings)
     Returns:
         pd.DataFrame: DataFrame with the decompressed features
-        List[str]: Updated list of feature names after decompression
+        list[str]: Updated list of feature names after decompression
     Raises:
         ValueError: If any missing values are found in the specified features
     """
     # Check for any missing values in the required features
     missing_counts = df[features].isna().sum()
     if missing_counts.any():
@@ -176,10 +177,11 @@ def decompress_features(
             "WARNING: You might want to remove/replace all NaN values before processing."
         )
-    # Decompress the specified compressed features
-    decompressed_features = features
+    # Make a copy to avoid mutating the original list
+    decompressed_features = features.copy()
     for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in features):
+        if (feature not in df.columns) or (feature not in decompressed_features):
             print(f"Feature '{feature}' not in the features list, skipping decompression.")
             continue
@@ -204,26 +206,60 @@ def decompress_features(
     return df, decompressed_features
-def model_fn(model_dir):
+def model_fn(model_dir: str) -> dict:
+    """Load the PyTorch Tabular ensemble models from the specified directory.
+    Args:
+        model_dir: Directory containing the saved model(s)
+    Returns:
+        Dictionary with ensemble models and metadata
+    """
+    import torch
+    from functools import partial
+    # Load ensemble metadata if present
+    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(ensemble_metadata_path):
+        ensemble_metadata = joblib.load(ensemble_metadata_path)
+        n_ensemble = ensemble_metadata["n_ensemble"]
+    else:
+        n_ensemble = 1
+    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
+    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
+    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
+    # which internally calls torch.load without map_location
+    original_torch_load = torch.load
+    torch.load = partial(original_torch_load, map_location=map_location)
     # Save current working directory
     original_cwd = os.getcwd()
+    ensemble_models = []
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
         os.chdir("/tmp")
-        # Load the model
-        model_path = os.path.join(model_dir, "tabular_model")
-        model = TabularModel.load_model(model_path)
+        for ens_idx in range(n_ensemble):
+            # Try numbered model path first, fall back to legacy path
+            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
+            if not os.path.exists(model_path):
+                model_path = os.path.join(model_dir, "tabular_model")
+            model = TabularModel.load_model(model_path, map_location=map_location)
+            ensemble_models.append(model)
-    # Restore the original working directory
     finally:
+        # Restore torch.load and working directory
+        torch.load = original_torch_load
         os.chdir(original_cwd)
-    return model
+    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
-def input_fn(input_data, content_type):
+def input_fn(input_data, content_type: str) -> pd.DataFrame:
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
@@ -240,29 +276,34 @@ def input_fn(input_data, content_type):
         raise ValueError(f"{content_type} not supported!")
-def output_fn(output_df, accept_type):
+def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
     elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+        return output_df.to_json(orient="records"), "application/json"
     else:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-def predict_fn(df, model) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model ensemble.
     Args:
         df (pd.DataFrame): The input DataFrame
-        model: The TabularModel use for predictions
+        model_dict: Dictionary containing ensemble models and metadata
     Returns:
-        pd.DataFrame: The DataFrame with the predictions added
+        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
     """
+    model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    # Extract ensemble models
+    ensemble_models = model_dict["ensemble_models"]
+    n_ensemble = model_dict["n_ensemble"]
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -275,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
     # Load our Label Encoder if we have one
     label_encoder = None
-    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+    label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(label_encoder_path):
+        label_encoder = joblib.load(label_encoder_path)
-    # We're going match features in a case-insensitive manner, accounting for all the permutations
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    # Match features in a case-insensitive manner
     matched_df = match_features_case_insensitive(df, features)
     # Detect categorical types in the incoming DataFrame
@@ -291,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)
-    # Make predictions using the TabularModel
-    result = model.predict(matched_df[features])
+    # Track rows with missing features
+    missing_mask = matched_df[features].isna().any(axis=1)
+    if missing_mask.any():
+        print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
+    # Initialize prediction columns
+    df["prediction"] = np.nan
+    if model_type in ["regressor", "uq_regressor"]:
+        df["prediction_std"] = np.nan
+    # Only predict on complete rows
+    complete_df = matched_df[~missing_mask]
+    if len(complete_df) == 0:
+        print("Warning: No complete rows to predict on")
+        return df
     # pytorch-tabular returns predictions using f"{target}_prediction" column
-    # and classification probabilities in columns ending with "_probability"
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     prediction_column = f"{target}_prediction"
-    if prediction_column in result.columns:
-        predictions = result[prediction_column].values
-    else:
-        raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-    # If we have a label encoder, decode the predictions
-    if label_encoder:
-        predictions = label_encoder.inverse_transform(predictions.astype(int))
+    # Collect predictions from all ensemble members
+    all_ensemble_preds = []
+    all_ensemble_probs = []
+    for ens_idx, ens_model in enumerate(ensemble_models):
+        result = ens_model.predict(complete_df[features])
+        if prediction_column in result.columns:
+            ens_preds = result[prediction_column].values
+        else:
+            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
+        all_ensemble_preds.append(ens_preds)
-    # Set the predictions on the DataFrame
-    df["prediction"] = predictions
+        # For classification, collect probabilities
+        if label_encoder is not None:
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols:
+                all_ensemble_probs.append(result[prob_cols].values)
-    # For classification, get probabilities
+    # Stack and compute mean/std (std is 0 for single model)
+    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    preds = np.mean(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
+    # Handle classification vs regression
     if label_encoder is not None:
-        prob_cols = [col for col in result.columns if col.endswith("_probability")]
-        if prob_cols:
-            probs = result[prob_cols].values
-            df["pred_proba"] = [p.tolist() for p in probs]
+        # For classification, average probabilities then take argmax
+        if all_ensemble_probs:
+            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
+            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
+            class_preds = np.argmax(avg_probs, axis=1)
+            predictions = label_encoder.inverse_transform(class_preds)
+            # Build full proba Series with None for missing rows
+            all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
+            df["pred_proba"] = all_proba
             # Expand the pred_proba column into separate columns for each class
             df = expand_proba_column(df, label_encoder.classes_)
+        else:
+            # No probabilities, use averaged predictions
+            predictions = label_encoder.inverse_transform(preds.astype(int))
+    else:
+        # Regression (includes uq_regressor)
+        predictions = preds
+        df.loc[~missing_mask, "prediction_std"] = preds_std
+    # Set predictions only for complete rows
+    df.loc[~missing_mask, "prediction"] = predictions
-    # All done, return the DataFrame with new columns for the predictions
     return df
@@ -331,12 +415,11 @@ if __name__ == "__main__":
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2
     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -354,9 +437,21 @@ if __name__ == "__main__":
     # Combine files and read them all into a single pandas dataframe
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+    # Print out some info about the dataframe
+    print(f"All Data Shape: {all_df.shape}")
+    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
     # Check if the dataframe is empty
     check_dataframe(all_df, "training_df")
+    # Drop any rows with missing feature values
+    initial_row_count = all_df.shape[0]
+    all_df = all_df.dropna(subset=features)
+    dropped_rows = initial_row_count - all_df.shape[0]
+    if dropped_rows > 0:
+        print(f"Dropped {dropped_rows} rows due to missing feature values.")
     # Features/Target output
     print(f"Target: {target}")
     print(f"Features: {str(features)}")
@@ -364,82 +459,88 @@ if __name__ == "__main__":
     # Convert any features that might be categorical to 'category' type
     all_df, category_mappings = convert_categorical_types(all_df, features)
+    # Print out some info about the dataframe
+    print(f"All Data Shape: {all_df.shape}")
+    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
     # If we have compressed features, decompress them
     if compressed_features:
         print(f"Decompressing features {compressed_features}...")
         all_df, features = decompress_features(all_df, features, compressed_features)
-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
     # Determine categorical and continuous columns
-    categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
+    categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
     continuous_cols = [col for col in features if col not in categorical_cols]
     print(f"Categorical columns: {categorical_cols}")
     print(f"Continuous columns: {continuous_cols}")
-    # Set up PyTorch Tabular configuration
-    data_config = DataConfig(
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
+    # Cast continuous columns to float
+    all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
-    # Choose the 'task' based on model type also set up the label encoder if needed
+    # Choose the 'task' based on model type and set up the label encoder if needed
     if model_type == "classifier":
         task = "classification"
-        # Encode the target column
+        # Encode the target column on full dataset for consistent encoding
         label_encoder = LabelEncoder()
-        df_train[target] = label_encoder.fit_transform(df_train[target])
-        df_val[target] = label_encoder.transform(df_val[target])
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        num_classes = len(label_encoder.classes_)
     else:
         task = "regression"
         label_encoder = None
+        num_classes = None
     # Use any hyperparameters to set up both the trainer and model configurations
     print(f"Hyperparameters: {hyperparameters}")
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    # =========================================================================
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
+    # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+    # Create fold splits
+    if n_folds == 1:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
+            print("Found training column, splitting data based on training column")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+    # Initialize storage for out-of-fold predictions
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None
-    # Set up PyTorch Tabular configuration with defaults
-    trainer_defaults = {
-        "auto_lr_find": True,
-        "batch_size": min(1024, max(32, len(df_train) // 4)),
-        "max_epochs": 100,
-        "early_stopping": "valid_loss",
-        "early_stopping_patience": 15,
-        "checkpoints": "valid_loss",
-        "accelerator": "auto",
-        "progress_bar": "none",
-        "gradient_clip_val": 1.0,
-    }
+    ensemble_models = []
-    # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
-    # Print overwrites
-    for key, value in training_overrides.items():
-        print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
-    trainer_params = {**trainer_defaults, **training_overrides}
-    trainer_config = TrainerConfig(**trainer_params)
+    # Set up PyTorch Tabular data configuration (shared across folds)
+    data_config = DataConfig(
+        target=[target],
+        continuous_cols=continuous_cols,
+        categorical_cols=categorical_cols,
+    )
     # Model config defaults
     model_defaults = {
-        "layers": "1024-512-512",
-        "activation": "ReLU",
+        "layers": "256-128-64",
+        "activation": "LeakyReLU",
         "learning_rate": 1e-3,
         "dropout": 0.1,
         "use_batch_norm": True,
@@ -447,63 +548,139 @@ if __name__ == "__main__":
     }
     # Override defaults with model_config if present
     model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
-    # Print overwrites
     for key, value in model_overrides.items():
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
     model_params = {**model_defaults, **model_overrides}
-    # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
-    # Works effectively for both regression and classification as the foundational
-    # architecture in PyTorch Tabular
     model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()
-    #####################################
-    # Create and train the TabularModel #
-    #####################################
-    tabular_model = TabularModel(
-        data_config=data_config,
-        model_config=model_config,
-        optimizer_config=optimizer_config,
-        trainer_config=trainer_config,
-    )
-    tabular_model.fit(train=df_train, validation=df_val)
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
+        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
+        batch_size = min(128, max(32, len(df_train) // 16))
+        if len(df_train) % batch_size == 1:
+            batch_size += 1  # Adjust to avoid last batch of size 1
+        trainer_defaults = {
+            "auto_lr_find": False,
+            "batch_size": batch_size,
+            "max_epochs": 200,
+            "min_epochs": 10,
+            "early_stopping": "valid_loss",
+            "early_stopping_patience": 20,
+            "checkpoints": "valid_loss",
+            "accelerator": "auto",
+            "progress_bar": "none",
+            "gradient_clip_val": 1.0,
+            "seed": 42 + fold_idx,
+        }
+        # Override defaults with training_config if present
+        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
+        if fold_idx == 0:  # Only print overrides once
+            for key, value in training_overrides.items():
+                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
+        trainer_params = {**trainer_defaults, **training_overrides}
+        trainer_config = TrainerConfig(**trainer_params)
+        # Create and train the TabularModel for this fold
+        tabular_model = TabularModel(
+            data_config=data_config,
+            model_config=model_config,
+            optimizer_config=optimizer_config,
+            trainer_config=trainer_config,
+        )
+        tabular_model.fit(train=df_train, validation=df_val)
+        ensemble_models.append(tabular_model)
+        # Make out-of-fold predictions
+        result = tabular_model.predict(df_val, include_input_features=False)
+        fold_preds = result[f"{target}_prediction"].values
+        # Store out-of-fold predictions
+        if model_type == "classifier":
+            oof_predictions[val_idx] = fold_preds.astype(int)
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols and oof_proba is not None:
+                oof_proba[val_idx] = result[prob_cols].values
+        else:
+            oof_predictions[val_idx] = fold_preds.flatten()
-    # Make Predictions on the Validation Set
-    print("Making Predictions on Validation Set...")
-    result = tabular_model.predict(df_val, include_input_features=False)
+        print(f"Fold {fold_idx + 1} complete!")
-    # pytorch-tabular returns predictions using f"{target}_prediction" column
-    # and classification probabilities in columns ending with "_probability"
-    if model_type == "classifier":
-        preds = result[f"{target}_prediction"].values
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        val_mask = ~np.isnan(oof_predictions)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
     else:
-        # Regression: use the target column name
-        preds = result[f"{target}_prediction"].values
+        preds = oof_predictions
+        df_val = all_df.copy()
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        all_ensemble_preds_for_std = []
+        for ens_model in ensemble_models:
+            result = ens_model.predict(df_val[features], include_input_features=False)
+            ens_preds = result[f"{target}_prediction"].values.flatten()
+            all_ensemble_preds_for_std.append(ens_preds)
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
     if model_type == "classifier":
         # Get probabilities for classification
-        print("Processing Probabilities...")
-        prob_cols = [col for col in result.columns if col.endswith("_probability")]
-        if prob_cols:
-            probs = result[prob_cols].values
-            df_val["pred_proba"] = [p.tolist() for p in probs]
-            # Expand the pred_proba column into separate columns for each class
-            print(df_val.columns)
+        if oof_proba is not None:
+            df_val = df_val.copy()
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_encoder.classes_)
-            print(df_val.columns)
         # Decode the target and prediction labels
         y_validate = label_encoder.inverse_transform(df_val[target])
-        preds = label_encoder.inverse_transform(preds.astype(int))
+        preds_decoded = label_encoder.inverse_transform(preds.astype(int))
     else:
         y_validate = df_val[target].values
+        preds_decoded = preds
+    # Save predictions to S3
+    df_val = df_val.copy()
+    df_val["prediction"] = preds_decoded
+    # Build output columns - include id_column if it exists
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
+    # Add prediction_std for regression models (always present, 0 for single model)
+    if model_type in ["regressor", "uq_regressor"]:
+        if preds_std is not None:
+            df_val["prediction_std"] = preds_std
+        else:
+            df_val["prediction_std"] = 0.0
+        output_columns.append("prediction_std")
+        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
-    # Save predictions to S3 (just the target, prediction, and '_probability' columns)
-    df_val["prediction"] = preds
-    output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_probability")]
+    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
     wr.s3.to_csv(
         df_val[output_columns],
         path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -516,7 +693,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
+        scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -529,7 +706,7 @@ if __name__ == "__main__":
             }
         )
-        # We need to get creative with the Classification Metrics
+        # Output metrics per class
         metrics = ["precision", "recall", "f1", "support"]
         for t in label_names:
             for m in metrics:
@@ -537,7 +714,7 @@ if __name__ == "__main__":
                 print(f"Metrics:{t}:{m} {value}")
         # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
+        conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
         for i, row_name in enumerate(label_names):
             for j, col_name in enumerate(label_names):
                 value = conf_mtx[i, j]
@@ -545,22 +722,37 @@ if __name__ == "__main__":
     else:
         # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds)
-        mae = mean_absolute_error(y_validate, preds)
-        r2 = r2_score(y_validate, preds)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-    # Save the model to the standard place/name
-    tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
+        rmse = root_mean_squared_error(y_validate, preds_decoded)
+        mae = mean_absolute_error(y_validate, preds_decoded)
+        medae = median_absolute_error(y_validate, preds_decoded)
+        r2 = r2_score(y_validate, preds_decoded)
+        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
+        support = len(df_val)
+        print(f"rmse: {rmse:.3f}")
+        print(f"mae: {mae:.3f}")
+        print(f"medae: {medae:.3f}")
+        print(f"r2: {r2:.3f}")
+        print(f"spearmanr: {spearman_corr:.3f}")
+        print(f"support: {support}")
+    # Save ensemble models
+    for model_idx, ens_model in enumerate(ensemble_models):
+        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
+        ens_model.save_model(model_path)
+        print(f"Saved model {model_idx + 1} to {model_path}")
+    # Save ensemble metadata
+    n_ensemble = len(ensemble_models)
+    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
+    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
     # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
+        json.dump(orig_features, fp)
     # Save the category mappings
     with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:

workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl

workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl