PyPI - workbench - Versions diffs - 0.8.157__py3-none-any.whl → 0.8.159__py3-none-any.whl - Mend

workbench 0.8.157__py3-none-any.whl → 0.8.159__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

workbench/model_scripts/{custom_models/nn_models → pytorch_model}/generated_model_script.py RENAMED Viewed

@@ -1,16 +1,14 @@
 # Imports for PyTorch Tabular Model
-import torch
-from pytorch_tabular import TabularModel
-from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
-from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
+import os
 import awswrangler as wr
 import numpy as np
-# PyTorch 2.6 compatibility: pytorch-tabular saves complex objects, not just tensors
+# PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
 # Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
-import os
 os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
+from pytorch_tabular import TabularModel
+from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
+from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig
 # Model Performance Scores
 from sklearn.metrics import (
@@ -37,11 +35,11 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "regressor",
-    "target_column": "solubility",
+    "model_type": "classifier",
+    "target_column": "solubility_class",
     "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-reg/training",
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-class/training",
     "train_all_data": False
 }
@@ -150,7 +148,9 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings
-def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model
     Args:
@@ -203,6 +203,135 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
     return df, decompressed_features
+def model_fn(model_dir):
+    """Deserialize and return fitted PyTorch Tabular model"""
+    #
+    os.environ['TEMP'] = '/tmp'
+    model_path = os.path.join(model_dir, "tabular_model")
+    model = TabularModel.load_model(model_path)
+    return model
+def model_fn(model_dir):
+    # Save current working directory
+    original_cwd = os.getcwd()
+    try:
+        # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
+        os.chdir('/tmp')
+        # Load the model
+        model_path = os.path.join(model_dir, "tabular_model")
+        model = TabularModel.load_model(model_path)
+    # Restore the original working directory
+    finally:
+        os.chdir(original_cwd)
+    return model
+def input_fn(input_data, content_type):
+    """Parse input data and return a DataFrame."""
+    if not input_data:
+        raise ValueError("Empty input data is not supported!")
+    # Decode bytes to string if necessary
+    if isinstance(input_data, bytes):
+        input_data = input_data.decode("utf-8")
+    if "text/csv" in content_type:
+        return pd.read_csv(StringIO(input_data))
+    elif "application/json" in content_type:
+        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
+    else:
+        raise ValueError(f"{content_type} not supported!")
+def output_fn(output_df, accept_type):
+    """Supports both CSV and JSON output formats."""
+    if "text/csv" in accept_type:
+        csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
+        return csv_output, "text/csv"
+    elif "application/json" in accept_type:
+        return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+    else:
+        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+def predict_fn(df, model) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model
+    Args:
+        df (pd.DataFrame): The input DataFrame
+        model: The TabularModel use for predictions
+    Returns:
+        pd.DataFrame: The DataFrame with the predictions added
+    """
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    # Grab our feature columns (from training)
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+        features = json.load(fp)
+    print(f"Model Features: {features}")
+    # Load the category mappings (from training)
+    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
+        category_mappings = json.load(fp)
+    # Load our Label Encoder if we have one
+    label_encoder = None
+    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
+        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+    # We're going match features in a case-insensitive manner, accounting for all the permutations
+    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
+    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    matched_df = match_features_case_insensitive(df, features)
+    # Detect categorical types in the incoming DataFrame
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
+    # Make predictions using the TabularModel
+    result = model.predict(matched_df[features])
+    # pytorch-tabular returns predictions using f"{target}_prediction" column
+    # and classification probabilities in columns ending with "_probability"
+    target = TEMPLATE_PARAMS["target_column"]
+    prediction_column = f"{target}_prediction"
+    if prediction_column in result.columns:
+        predictions = result[prediction_column].values
+    else:
+        raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
+    # If we have a label encoder, decode the predictions
+    if label_encoder:
+        predictions = label_encoder.inverse_transform(predictions.astype(int))
+    # Set the predictions on the DataFrame
+    df["prediction"] = predictions
+    # For classification, get probabilities
+    if label_encoder is not None:
+        prob_cols = [col for col in result.columns if col.endswith("_probability")]
+        if prob_cols:
+            probs = result[prob_cols].values
+            df["pred_proba"] = [p.tolist() for p in probs]
+            # Expand the pred_proba column into separate columns for each class
+            df = expand_proba_column(df, label_encoder.classes_)
+    # All done, return the DataFrame with new columns for the predictions
+    return df
 if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""
@@ -265,14 +394,12 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
     # Determine categorical and continuous columns
-    categorical_cols = [col for col in features if df_train[col].dtype.name == 'category']
+    categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
     continuous_cols = [col for col in features if col not in categorical_cols]
     print(f"Categorical columns: {categorical_cols}")
@@ -287,37 +414,39 @@ if __name__ == "__main__":
     trainer_config = TrainerConfig(
         auto_lr_find=True,
-        batch_size=1024,
+        batch_size=min(1024, len(df_train) // 4),
         max_epochs=100,
         early_stopping="valid_loss",
-        early_stopping_patience=20,
+        early_stopping_patience=15,
+        checkpoints="valid_loss",
+        accelerator="auto",
         progress_bar="none",
+        gradient_clip_val=1.0,
     )
     optimizer_config = OptimizerConfig()
     # Choose model configuration based on model type
     if model_type == "classifier":
-        # Use TabNet for classification
-        model_config = TabNetModelConfig(
-            task="classification",
-            learning_rate=1e-3,
-        )
+        task = "classification"
         # Encode the target column
         label_encoder = LabelEncoder()
         df_train[target] = label_encoder.fit_transform(df_train[target])
         df_val[target] = label_encoder.transform(df_val[target])
     else:
-        # Use CategoryEmbedding for regression
-        model_config = CategoryEmbeddingModelConfig(
-            task="regression",
-            layers="1024-512-512",
-            activation="ReLU",
-            learning_rate=1e-3,
-        )
-        label_encoder = None  # We don't need this for regression
+        task = "regression"
+        label_encoder = None
+    # Use CategoryEmbedding for both regression and classification tasks
+    model_config = CategoryEmbeddingModelConfig(
+        task=task,
+        layers="1024-512-512",
+        activation="ReLU",
+        learning_rate=1e-3,
+        dropout=0.1,
+        use_batch_norm=True,
+        initialization="kaiming",
+    )
     # Create and train the TabularModel
     tabular_model = TabularModel(
@@ -332,16 +461,15 @@ if __name__ == "__main__":
     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    result = tabular_model.predict(df_val)
-    print(f"Result Columns: {result.columns.tolist()}")
+    result = tabular_model.predict(df_val, include_input_features=False)
-    # For regression: pytorch-tabular returns predictions using the target column name
-    # For classification: pytorch-tabular returns predictions using "prediction" column
+    # pytorch-tabular returns predictions using f"{target}_prediction" column
+    # and classification probabilities in columns ending with "_probability"
     if model_type == "classifier":
-        preds = result["prediction"].values
+        preds = result[f"{target}_prediction"].values
     else:
         # Regression: use the target column name
-        preds = result[target].values
+        preds = result[f"{target}_prediction"].values
     if model_type == "classifier":
         # Get probabilities for classification
@@ -362,10 +490,10 @@ if __name__ == "__main__":
     else:
         y_validate = df_val[target].values
-    # Save predictions to S3 (just the target, prediction, and '_proba' columns)
+    # Save predictions to S3 (just the target, prediction, and '_probability' columns)
     df_val["prediction"] = preds
     output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+    output_columns += [col for col in df_val.columns if col.endswith("_probability")]
     wr.s3.to_csv(
         df_val[output_columns],
         path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -378,9 +506,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -428,116 +554,4 @@ if __name__ == "__main__":
     # Save the category mappings
     with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
-def model_fn(model_dir):
-    """Deserialize and return fitted PyTorch Tabular model"""
-    model_path = os.path.join(model_dir, "tabular_model")
-    model = TabularModel.load_model(model_path)
-    return model
-def input_fn(input_data, content_type):
-    """Parse input data and return a DataFrame."""
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
-    if "text/csv" in content_type:
-        return pd.read_csv(StringIO(input_data))
-    elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-    else:
-        raise ValueError(f"{content_type} not supported!")
-def output_fn(output_df, accept_type):
-    """Supports both CSV and JSON output formats."""
-    if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-        return csv_output, "text/csv"
-    elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
-    else:
-        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-def predict_fn(df, model) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model
-    Args:
-        df (pd.DataFrame): The input DataFrame
-        model: The TabularModel use for predictions
-    Returns:
-        pd.DataFrame: The DataFrame with the predictions added
-    """
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
-    # Grab our feature columns (from training)
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
-    # Load the category mappings (from training)
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
-    # Load our Label Encoder if we have one
-    label_encoder = None
-    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
-    # We're going match features in a case-insensitive manner, accounting for all the permutations
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-    matched_df = match_features_case_insensitive(df, features)
-    # Detect categorical types in the incoming DataFrame
-    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
-    # If we have compressed features, decompress them
-    if compressed_features:
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
-    # Make predictions using the TabularModel
-    result = model.predict(matched_df)
-    # Extract predictions based on model type
-    # For regression: pytorch-tabular uses target column name
-    # For classification: pytorch-tabular uses "prediction" column
-    if "prediction" in result.columns:
-        predictions = result["prediction"].values
-    else:
-        # For regression, find the new column (not in original dataframe)
-        pred_cols = [col for col in result.columns if col not in matched_df.columns]
-        if pred_cols:
-            predictions = result[pred_cols[0]].values
-        else:
-            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-    # If we have a label encoder, decode the predictions
-    if label_encoder:
-        predictions = label_encoder.inverse_transform(predictions.astype(int))
-    # Set the predictions on the DataFrame
-    df["prediction"] = predictions
-    # For classification, get probabilities
-    if label_encoder is not None:
-        prob_cols = [col for col in result.columns if col.endswith("_probability")]
-        if prob_cols:
-            probs = result[prob_cols].values
-            df["pred_proba"] = [p.tolist() for p in probs]
-            # Expand the pred_proba column into separate columns for each class
-            df = expand_proba_column(df, label_encoder.classes_)
-    # All done, return the DataFrame with new columns for the predictions
-    return df
+        json.dump(category_mappings, fp)

workbench 0.8.157__py3-none-any.whl → 0.8.159__py3-none-any.whl

workbench 0.8.157__py3-none-any.whl → 0.8.159__py3-none-any.whl