workbench 0.8.178__py3-none-any.whl → 0.8.179__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench might be problematic.

Files changed (25)
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +32 -2
  4. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  5. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  6. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  7. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  8. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
  9. workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
  10. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  11. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  12. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  13. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  14. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  15. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  16. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  17. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  18. workbench/utils/model_utils.py +2 -1
  19. workbench/utils/xgboost_model_utils.py +160 -137
  20. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
  21. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/RECORD +25 -25
  22. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
  23. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
  24. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
  25. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -25,7 +21,8 @@ TEMPLATE_PARAMS = {
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }
 
 
@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
 
 
 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features
 
@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -227,7 +219,8 @@ if __name__ == "__main__":
 
     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(enable_categorical=True)
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
    xgb_model.fit(X_train, y_train)
 
     # Evaluate XGBoost performance
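The new `hyperparameters` entry in `TEMPLATE_PARAMS` is rendered into the generated script and unpacked straight into the `XGBRegressor` constructor, so user-supplied settings override XGBoost's defaults while an empty dict keeps the old behavior. A minimal sketch of what the rendered code amounts to; the parameter values here are illustrative, not workbench defaults:

```python
from xgboost import XGBRegressor

# Hypothetical rendered value of the "{{hyperparameters}}" placeholder
# (illustrative only; the real dict comes from the workbench model definition)
hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1}

# Supplied keys override XGBoost defaults; {} reproduces the previous
# XGBRegressor(enable_categorical=True) behavior
xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
```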
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
 
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )
 
         # Conformalize the model
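Because the quantile estimators are handed over already fitted (`prefit=True`), MAPIE only calibrates the conformal intervals on held-out data. A rough, self-contained sketch of that flow on toy data, assuming MAPIE's v1-style `conformalize`/`predict_interval` interface and an estimator order of lower, upper, median; treat this as an illustration rather than the template's exact code:

```python
import lightgbm as lgb
import numpy as np
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.model_selection import train_test_split

# Toy data (illustrative)
rng = np.random.default_rng(42)
X = rng.normal(size=(500, 4))
y = 2.0 * X[:, 0] + rng.normal(scale=0.5, size=500)
X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3, random_state=42)

confidence_level = 0.95
alpha = 1 - confidence_level

# Pre-fit lower, upper, and median quantile estimators (order assumed)
quantile_estimators = [
    lgb.LGBMRegressor(objective="quantile", alpha=q, verbose=-1).fit(X_train, y_train)
    for q in (alpha / 2, 1 - alpha / 2, 0.5)
]

# prefit=True: MAPIE skips fitting and only conformalizes on held-out data
mapie_model = ConformalizedQuantileRegressor(
    quantile_estimators, confidence_level=confidence_level, prefit=True
)
mapie_model.conformalize(X_cal, y_cal)

# Point predictions plus conformalized lower/upper bounds
y_pred, y_intervals = mapie_model.predict_interval(X_cal)
```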
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }
 
 
@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=['category']).columns:
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """
 
+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:
 
     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
 
     # Get features for prediction
     X = matched_df[model_features]
@@ -475,7 +466,7 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]
 
-    # Calculate a psueduo-standard deviation from the 68% interval width
+    # Calculate a pseudo-standard deviation from the 68% interval width
     df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0
 
     # Reorder the quantile columns for easier reading
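The pseudo-standard deviation relies on the fact that, for roughly Gaussian errors, the central 68% interval spans about one standard deviation on each side of the mean, so half of the q_16-to-q_84 width approximates sigma. A quick check of that assumption:

```python
from scipy.stats import norm

# Half-width of the central 68% interval of a standard normal is ~1.0
half_width = (norm.ppf(0.84) - norm.ppf(0.16)) / 2.0
print(round(half_width, 3))  # 0.994
```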
@@ -484,7 +475,19 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df[other_cols + quantile_cols]
 
     # Adjust the outer quantiles to ensure they encompass the prediction
-    df["q_025"] = np.minimum(df["q_025"], df["prediction"])
-    df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])
 
     return df
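With `outlier_stretch` enabled, every lower quantile is pulled down to the point prediction and every upper quantile pushed up, so no interval excludes its own prediction. The ten clamps could equally be expressed as a loop over the quantile columns; a hypothetical, behavior-equivalent helper for illustration:

```python
import numpy as np
import pandas as pd

LOWER_COLS = ["q_025", "q_05", "q_10", "q_16", "q_25"]
UPPER_COLS = ["q_75", "q_84", "q_90", "q_95", "q_975"]


def stretch_intervals(df: pd.DataFrame) -> pd.DataFrame:
    """Widen each quantile column so every interval covers the point prediction."""
    for col in LOWER_COLS:
        df[col] = np.minimum(df[col], df["prediction"])
    for col in UPPER_COLS:
        df[col] = np.maximum(df[col], df["prediction"])
    return df
```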
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -24,7 +20,6 @@ from typing import List, Tuple
 from proximity import Proximity
 
 
-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
@@ -32,7 +27,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }
 
 
@@ -183,11 +178,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -222,9 +213,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -289,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)
 
-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}
 
 
 def input_fn(input_data, content_type):
@@ -353,8 +338,8 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params
 
     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation
 
     # Add 95% prediction intervals using ppf (percent point function)
     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
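Both UQ scripts pull the per-row mean and scale out of the NGBoost predictive distribution via `.params` and then use its `ppf` for interval endpoints, as the hunks above and below show. A small self-contained sketch of that pattern on toy data (column names chosen to match the templates):

```python
import numpy as np
import pandas as pd
from ngboost import NGBRegressor

# Toy data (illustrative)
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 3))
y = X[:, 0] + rng.normal(scale=0.3, size=300)

ngb_model = NGBRegressor(verbose=False).fit(X, y)

# pred_dist returns a distribution object; .params holds per-row loc/scale arrays
y_dists = ngb_model.pred_dist(X[:5])
df = pd.DataFrame(X[:5], columns=["f0", "f1", "f2"])
df["prediction"] = y_dists.params["loc"]        # mean
df["prediction_std"] = y_dists.params["scale"]  # standard deviation
df["q_025"] = y_dists.ppf(0.025)                # 2.5th percentile
df["q_975"] = y_dists.ppf(0.975)                # 97.5th percentile
print(df)
```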
@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 
@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -212,8 +205,8 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params
 
     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation
 
     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)
 
+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()
 
-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
@@ -36,12 +36,12 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "target_column": "{{target_column}}",
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
     "train_all_data": "{{train_all_data}}",
-    "hyperparameters": "{{hyperparameters}}"
+    "hyperparameters": "{{hyperparameters}}",
 }
 
 
@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
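The `match_features_case_insensitive` helper only appears in fragments across these hunks; a condensed sketch of what it does, reconstructed from those fragments (the case-insensitive fallback branch is inferred, not copied verbatim):

```python
import pandas as pd


def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
    """Rename DataFrame columns to the model's feature names, matching case-insensitively."""
    df_columns_lower = {col.lower(): col for col in df.columns}
    rename_dict = {}
    missing = []
    for feature in model_features:
        if feature in df.columns:
            continue  # Exact match
        if feature.lower() in df_columns_lower:
            rename_dict[df_columns_lower[feature.lower()]] = feature  # Case-insensitive match
        else:
            missing.append(feature)
    if missing:
        raise ValueError(f"Features not found: {missing}")
    # Rename the DataFrame columns to match the model features
    return df.rename(columns=rename_dict)
```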
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -210,7 +210,7 @@ def model_fn(model_dir):
     original_cwd = os.getcwd()
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir('/tmp')
+        os.chdir("/tmp")
 
         # Load the model
         model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@ if __name__ == "__main__":
     }
 
     # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get('training_config', {}).items()
-                          if k in trainer_defaults}
+    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
     # Print overwrites
     for key, value in training_overrides.items():
         print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@ if __name__ == "__main__":
         "initialization": "kaiming",
     }
     # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get('model_config', {}).items()
-                       if k in model_defaults}
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
     # Print overwrites
     for key, value in model_overrides.items():
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
@@ -461,10 +455,7 @@ if __name__ == "__main__":
     # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     # Works effectively for both regression and classification as the foundational
     # architecture in PyTorch Tabular
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        **model_params
-    )
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()
 
     #####################################
@@ -4,11 +4,7 @@ import awswrangler as wr
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -22,9 +18,10 @@ TEMPLATE_PARAMS = {
     "target_column": "{{target_column}}",
     "features": "{{feature_list}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -64,6 +61,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
+
 if __name__ == "__main__":
     """The main function is for training the XGBoost Quantile Regression models"""
 
@@ -86,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":
 
     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])
 
     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
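The scikit-learn template wraps the estimator in a `Pipeline` with a `StandardScaler` when the configured model class benefits from standardized inputs, so the same scaling is applied at both fit and predict time. A minimal sketch of that wrapping; the model class and the `needs_standardization` decision here are illustrative:

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

model = SVR()  # illustrative model_class; the template instantiates the configured one

# Illustrative flag; the template derives this from the configured model class
needs_standardization = True
if needs_standardization:
    # The scaler lives inside the pipeline, so predict() reuses the fitted scaling
    model = Pipeline([("scaler", StandardScaler()), ("model", model)])
```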
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)
 
+
 #
 # Inference Section
 #