workbench 0.8.178__py3-none-any.whl → 0.8.180__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (26)
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +32 -2
  4. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  5. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  6. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  7. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  8. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
  9. workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
  10. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  11. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  12. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  13. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  14. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  15. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  16. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  17. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  18. workbench/utils/model_utils.py +2 -1
  19. workbench/utils/shap_utils.py +10 -2
  20. workbench/utils/xgboost_model_utils.py +160 -137
  21. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/METADATA +1 -1
  22. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/RECORD +26 -26
  23. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/WHEEL +0 -0
  24. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/entry_points.txt +0 -0
  25. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/licenses/LICENSE +0 -0
  26. {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/top_level.txt +0 -0
workbench/api/endpoint.py CHANGED
@@ -4,6 +4,7 @@ Endpoints can be viewed in the AWS Sagemaker interfaces or in the Workbench
  Dashboard UI, which provides additional model details and performance metrics"""

  import pandas as pd
+ from typing import Tuple

  # Workbench Imports
  from workbench.core.artifacts.endpoint_core import EndpointCore
@@ -70,14 +71,14 @@ class Endpoint(EndpointCore):
  """
  return super().fast_inference(eval_df, threads=threads)

- def cross_fold_inference(self, nfolds: int = 5) -> dict:
+ def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
  """Run cross-fold inference (only works for XGBoost models)

  Args:
  nfolds (int): The number of folds to use for cross-validation (default: 5)

  Returns:
- dict: A dictionary with fold results
+ Tuple(dict, pd.DataFrame): A tuple containing a dictionary of metrics and a DataFrame with predictions
  """
  return super().cross_fold_inference(nfolds)


workbench/core/artifacts/endpoint_core.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
  import numpy as np
  from io import StringIO
  import awswrangler as wr
- from typing import Union, Optional
+ from typing import Union, Optional, Tuple
  import hashlib

  # Model Performance Scores
@@ -436,24 +436,24 @@ class EndpointCore(Artifact):
  # Return the prediction DataFrame
  return prediction_df

- def cross_fold_inference(self, nfolds: int = 5) -> dict:
+ def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
  """Run cross-fold inference (only works for XGBoost models)

  Args:
  nfolds (int): Number of folds to use for cross-fold (default: 5)

  Returns:
- dict: Dictionary with the cross-fold inference results
+ Tuple[dict, pd.DataFrame]: Tuple of (cross_fold_metrics, out_of_fold_df)
  """

  # Grab our model
  model = ModelCore(self.model_name)

  # Compute CrossFold Metrics
- cross_fold_metrics = cross_fold_inference(model, nfolds=nfolds)
+ cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
  if cross_fold_metrics:
  self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", cross_fold_metrics)
- return cross_fold_metrics
+ return cross_fold_metrics, out_of_fold_df

  def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
  """Run inference on the Endpoint using the provided DataFrame

workbench/core/artifacts/feature_set_core.py CHANGED
@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
  from workbench.core.artifacts.data_source_factory import DataSourceFactory
  from workbench.core.artifacts.athena_source import AthenaSource

- from typing import TYPE_CHECKING, Optional
+ from typing import TYPE_CHECKING, Optional, List, Union

  from workbench.utils.aws_utils import aws_throttle

@@ -514,7 +514,7 @@ class FeatureSetCore(Artifact):

  Args:
  filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
- If None or empty string, will reset to default training view with no filter
+ If None or empty string, will reset to training view with no filter
  (default: None)
  """
  from workbench.core.views import TrainingView
@@ -528,6 +528,29 @@
  self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
  )

+ def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
+ """Exclude a list of IDs from the training view
+
+ Args:
+ ids (List[Union[str, int]],): List of IDs to exclude from training
+ column_name (Optional[str]): Column name to filter on.
+ If None, uses self.id_column (default: None)
+ """
+ # Use the default id_column if not specified
+ column = column_name or self.id_column
+
+ # Handle empty list case
+ if not ids:
+ self.log.warning("No IDs provided to exclude")
+ return
+
+ # Build the filter expression with proper SQL quoting
+ quoted_ids = ", ".join([repr(id) for id in ids])
+ filter_expression = f"{column} NOT IN ({quoted_ids})"
+
+ # Apply the filter
+ self.set_training_filter(filter_expression)
+
  @classmethod
  def delete_views(cls, table: str, database: str):
  """Delete any views associated with this FeatureSet
@@ -769,6 +792,13 @@ if __name__ == "__main__":
  print(f"Training Data: {training_data.shape}")
  print(training_data)

+ # Test excluding ids from training
+ print("Excluding ids from training...")
+ my_features.exclude_ids_from_training([1, 2, 3, 4, 5])
+ training_data = my_features.get_training_data()
+ print(f"Training Data: {training_data.shape}")
+ print(training_data)
+
  # Now delete the AWS artifacts associated with this Feature Set
  # print("Deleting Workbench Feature Set...")
  # my_features.delete()
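A usage sketch for the new exclude_ids_from_training helper, which simply builds a "NOT IN (...)" filter and hands it to set_training_filter. The FeatureSet name, constructor call, and IDs below are illustrative assumptions, not taken from the package:

    from workbench.core.artifacts.feature_set_core import FeatureSetCore

    fs = FeatureSetCore("test_features")           # assumes the usual name-based constructor
    fs.exclude_ids_from_training([1, 2, 3, 4, 5])  # filters on fs.id_column by default
    fs.exclude_ids_from_training(["id_42", "id_43"], column_name="compound_id")  # or another column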

workbench/model_scripts/custom_models/proximity/feature_space_proximity.template CHANGED
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
  "id_column": "{{id_column}}",
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "track_columns": "{{track_columns}}"
+ "track_columns": "{{track_columns}}",
  }

  from io import StringIO
@@ -73,10 +73,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -88,6 +85,7 @@
  # Now serialize the model
  model.serialize(args.model_dir)

+
  # Model loading and prediction functions
  def model_fn(model_dir):


workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template CHANGED
@@ -14,7 +14,7 @@ import pandas as pd
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }


@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -109,8 +106,10 @@
  # Create and train the Regression/Confidence model
  # model = BayesianRidge()
  model = BayesianRidge(
- alpha_1=1e-6, alpha_2=1e-6, # Noise precision
- lambda_1=1e-6, lambda_2=1e-6, # Weight precision
+ alpha_1=1e-6,
+ alpha_2=1e-6, # Noise precision
+ lambda_1=1e-6,
+ lambda_2=1e-6, # Weight precision
  fit_intercept=True,
  )


workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template CHANGED
@@ -4,11 +4,7 @@ import awswrangler as wr
  import numpy as np

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
  from sklearn.model_selection import KFold
  from scipy.optimize import minimize

@@ -23,7 +19,7 @@ TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
  "train_all_data": "{{train_all_data}}",
- "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+ "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  }


@@ -47,7 +43,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +86,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -172,16 +165,14 @@
  cv_residuals = np.array(cv_residuals)
  cv_uncertainties = np.array(cv_uncertainties)

-
  # Optimize calibration parameters: σ_cal = a * σ_uc + b
  def neg_log_likelihood(params):
  a, b = params
  sigma_cal = a * cv_uncertainties + b
  sigma_cal = np.maximum(sigma_cal, 1e-8) # Prevent division by zero
- return np.sum(0.5 * np.log(2 * np.pi * sigma_cal ** 2) + 0.5 * (cv_residuals ** 2) / (sigma_cal ** 2))
+ return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))

-
- result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+ result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
  cal_a, cal_b = result.x

  print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +196,9 @@
  result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)

  # Compute uncalibrated uncertainty
- result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(axis=1)
+ result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+ axis=1
+ )

  # Apply calibration to uncertainty
  result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -352,4 +345,4 @@ def predict_fn(df, models) -> pd.DataFrame:
  df = df.reindex(sorted(df.columns), axis=1)

  # All done, return the DataFrame
- return df
+ return df

workbench/model_scripts/custom_models/uq_models/gaussian_process.template CHANGED
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

  from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")

@@ -76,10 +76,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -112,10 +109,7 @@
  )

  # Create a Pipeline with StandardScaler
- model = Pipeline([
- ("scaler", StandardScaler()),
- ("model", model)
- ])
+ model = Pipeline([("scaler", StandardScaler()), ("model", model)])

  # Prepare features and targets for training
  X_train = df_train[features]

workbench/model_scripts/custom_models/uq_models/generated_model_script.py CHANGED
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
@@ -22,10 +18,11 @@ from typing import List, Tuple

  # Template Placeholders
  TEMPLATE_PARAMS = {
- "target": "logs",
- "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+ "target": "solubility",
+ "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
  "compressed_features": [],
- "train_all_data": True
+ "train_all_data": False,
+ "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
  }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


  def decompress_features(
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
  ) -> Tuple[pd.DataFrame, List[str]]:
  """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@

  # Train XGBoost for point predictions
  print("\nTraining XGBoost for point predictions...")
- xgb_model = XGBRegressor(enable_categorical=True)
+ print(f" Hyperparameters: {hyperparameters}")
+ xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
  xgb_model.fit(X_train, y_train)

  # Evaluate XGBoost performance
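The hyperparameters now travel through TEMPLATE_PARAMS and are splatted straight into XGBRegressor. A standalone sketch of that pass-through using the values shown above (training data omitted):

    from xgboost import XGBRegressor

    hyperparameters = {
        "n_estimators": 200, "max_depth": 6, "learning_rate": 0.05,
        "subsample": 0.7, "colsample_bytree": 0.3, "colsample_bylevel": 0.5,
        "min_child_weight": 5, "gamma": 0.2, "reg_alpha": 0.5,
        "reg_lambda": 2.0, "scale_pos_weight": 1,
    }
    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
    # xgb_model.fit(X_train, y_train)  # X_train / y_train are built earlier in the script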
@@ -272,7 +265,7 @@ if __name__ == "__main__":
  colsample_bytree=0.8,
  random_state=42,
  verbose=-1,
- force_col_wise=True
+ force_col_wise=True,
  )
  est.fit(X_train, y_train)
  quantile_estimators.append(est)
@@ -280,9 +273,7 @@
  # Create MAPIE CQR model for this confidence level
  print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
  mapie_model = ConformalizedQuantileRegressor(
- quantile_estimators,
- confidence_level=confidence_level,
- prefit=True
+ quantile_estimators, confidence_level=confidence_level, prefit=True
  )

  # Conformalize the model
@@ -337,8 +328,8 @@
  "xgb_rmse": float(xgb_rmse),
  "xgb_mae": float(xgb_mae),
  "xgb_r2": float(xgb_r2),
- "n_validation": len(df_val)
- }
+ "n_validation": len(df_val),
+ },
  }
  with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
  json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
  "xgb_model": xgb_model,
  "mapie_models": mapie_models,
  "confidence_levels": config["confidence_levels"],
- "category_mappings": category_mappings
+ "category_mappings": category_mappings,
  }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
  """Supports both CSV and JSON output formats."""
  if "text/csv" in accept_type:
  # Convert categorical columns to string to avoid fillna issues
- for col in output_df.select_dtypes(include=['category']).columns:
+ for col in output_df.select_dtypes(include=["category"]).columns:
  output_df[col] = output_df[col].astype(str)
  csv_output = output_df.fillna("N/A").to_csv(index=False)
  return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
  pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
  """

+ # Flag for outlier stretch adjustment for the prediction intervals
+ # if the predicted values are outside the intervals
+ outlier_stretch = False
+
  # Grab our feature columns (from training)
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
  with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

  # Apply categorical mappings if they exist
  if models.get("category_mappings"):
- matched_df, _ = convert_categorical_types(
- matched_df,
- model_features,
- models["category_mappings"]
- )
+ matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

  # Get features for prediction
  X = matched_df[model_features]
@@ -475,7 +466,7 @@ def predict_fn(df, models) -> pd.DataFrame:
  # Add median (q_50) from XGBoost prediction
  df["q_50"] = df["prediction"]

- # Calculate a psueduo-standard deviation from the 68% interval width
+ # Calculate a pseudo-standard deviation from the 68% interval width
  df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

  # Reorder the quantile columns for easier reading
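The prediction_std heuristic above works because, for a roughly Gaussian predictive distribution, the 16th to 84th percentile band spans about plus or minus one sigma, so half its width recovers sigma. A quick sanity check (purely illustrative):

    from scipy.stats import norm

    mu, sigma = 0.0, 2.0
    q_16, q_84 = norm.ppf([0.16, 0.84], loc=mu, scale=sigma)
    print((q_84 - q_16) / 2.0)  # ~1.99, close to the true sigma of 2.0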
@@ -484,7 +475,19 @@
  df = df[other_cols + quantile_cols]

  # Adjust the outer quantiles to ensure they encompass the prediction
- df["q_025"] = np.minimum(df["q_025"], df["prediction"])
- df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+ if outlier_stretch:
+ # Lower intervals adjustments
+ df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+ df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+ df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+ df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+ df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+ # Upper intervals adjustments
+ df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+ df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+ df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+ df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+ df["q_975"] = np.maximum(df["q_975"], df["prediction"])

  return df