workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (27):
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +67 -8
  4. workbench/core/views/training_view.py +38 -48
  5. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  6. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  7. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  8. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  9. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
  10. workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
  11. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  12. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  13. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  14. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  15. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  16. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  17. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  18. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  19. workbench/scripts/ml_pipeline_sqs.py +14 -2
  20. workbench/utils/model_utils.py +12 -2
  21. workbench/utils/xgboost_model_utils.py +161 -138
  22. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
  23. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
  24. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
  25. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
  26. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
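The most consequential change across the model-script templates below is a new `hyperparameters` entry in `TEMPLATE_PARAMS`, which each training script now unpacks directly into the `XGBRegressor` constructor. A minimal sketch of that pattern, with a hypothetical rendered dict standing in for the values Workbench injects when it renders a template:

```python
from xgboost import XGBRegressor

# Hypothetical rendered values; in the real scripts this dict is filled in
# when Workbench renders the template (see the diffs below).
TEMPLATE_PARAMS = {
    "hyperparameters": {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05},
}

hyperparameters = TEMPLATE_PARAMS["hyperparameters"]

# enable_categorical stays hard-coded; everything else now comes from the template
xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
```

The other recurring edits are cosmetic reformatting (joined multi-line calls, trailing commas, double-quoted strings) plus a new 68% confidence interval and an optional outlier-stretch step in the MAPIE-based scripts.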
workbench/model_scripts/custom_models/uq_models/generated_model_script.py

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -22,10 +18,11 @@ from typing import List, Tuple

 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "target": "udm_asy_res_value",
-    "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+    "target": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
     "compressed_features": [],
-    "train_all_data": True
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@ if __name__ == "__main__":

     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(enable_categorical=True)
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)

     # Evaluate XGBoost performance
@@ -242,7 +235,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")

     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

     # Store MAPIE models for each confidence level
     mapie_models = {}
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )

         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=['category']).columns:
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

     # Get features for prediction
     X = matched_df[model_features]
@@ -459,6 +450,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +466,28 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate uncertainty metrics based on 95% interval
-    interval_width = df["q_975"] - df["q_025"]
-    df["prediction_std"] = interval_width / 3.92
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    # Uncertainty score
-    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-    # Confidence bands
-    df["confidence_band"] = pd.cut(
-        df["uncertainty_score"],
-        bins=[0, 0.5, 1.0, 2.0, np.inf],
-        labels=["high", "medium", "low", "very_low"]
-    )
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])

     return df

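Both MAPIE scripts swap the old interval-width uncertainty metrics for a pseudo-standard deviation taken from the new 68% interval: under a roughly normal error distribution the central 68% interval spans about ±1 standard deviation, so half its width is a σ estimate. A small sketch of the calculation and the optional outlier stretch on a toy DataFrame (column names follow the diff above; the data is made up):

```python
import numpy as np
import pandas as pd

# Toy rows with 16th/84th percentile bounds; the second prediction falls below its interval
df = pd.DataFrame({
    "prediction": [1.0, 2.5],
    "q_16": [0.6, 2.7],
    "q_84": [1.4, 3.3],
})

# Half of the 68% interval width ~ one standard deviation under a normal error model
df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0  # 0.4 and 0.3

# Optional "outlier stretch": widen the bounds so they always contain the prediction
outlier_stretch = True
if outlier_stretch:
    df["q_16"] = np.minimum(df["q_16"], df["prediction"])  # second row stretches down to 2.5
    df["q_84"] = np.maximum(df["q_84"], df["prediction"])
```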
workbench/model_scripts/custom_models/uq_models/mapie.template

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -25,7 +21,8 @@ TEMPLATE_PARAMS = {
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@ if __name__ == "__main__":

     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(enable_categorical=True)
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)

     # Evaluate XGBoost performance
@@ -242,7 +235,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")

     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

     # Store MAPIE models for each confidence level
     mapie_models = {}
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )

         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=['category']).columns:
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

     # Get features for prediction
     X = matched_df[model_features]
@@ -459,6 +450,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +466,28 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate uncertainty metrics based on 50% interval
-    interval_width = df["q_75"] - df["q_25"]
-    df["prediction_std"] = interval_width / 1.348
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    # Uncertainty score
-    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-    # Confidence bands
-    df["confidence_band"] = pd.cut(
-        df["uncertainty_score"],
-        bins=[0, 0.5, 1.0, 2.0, np.inf],
-        labels=["high", "medium", "low", "very_low"]
-    )
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])

     return df

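With 0.68 added to `confidence_levels`, each conformalized quantile regressor now maps to a fixed pair of quantile columns in `predict_fn`. The mapping implied by the if/elif chain above, written out as a plain lookup (a sketch, not code from the package):

```python
# Confidence level -> (lower, upper) quantile columns written by predict_fn
CI_COLUMNS = {
    0.50: ("q_25", "q_75"),
    0.68: ("q_16", "q_84"),  # new in 0.8.179
    0.80: ("q_10", "q_90"),
    0.90: ("q_05", "q_95"),
    0.95: ("q_025", "q_975"),
}

for conf_level, (lower, upper) in CI_COLUMNS.items():
    print(f"{conf_level:.2f} interval -> {lower} / {upper}")
```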
workbench/model_scripts/custom_models/uq_models/meta_uq.template

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -24,7 +20,6 @@ from typing import List, Tuple
 from proximity import Proximity


-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
@@ -32,7 +27,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }


@@ -183,11 +178,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -222,9 +213,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -289,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)

-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


 def input_fn(input_data, content_type):
@@ -353,8 +338,8 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ

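The loc/scale lines that change here (single quotes to double quotes) are the templates' whole NGBoost uncertainty path: the predictive distribution's parameters supply the mean and standard deviation, and its `ppf` supplies interval bounds. A self-contained sketch on synthetic data, assuming only the public NGBoost API used in the diff:

```python
import numpy as np
import pandas as pd
from ngboost import NGBRegressor

# Small synthetic regression problem (made up, just to exercise the API)
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))
y = 2.0 * X[:, 0] + rng.normal(scale=0.5, size=200)

ngb_model = NGBRegressor(n_estimators=100, verbose=False).fit(X, y)

y_dists = ngb_model.pred_dist(X)
dist_params = y_dists.params

df = pd.DataFrame(X, columns=["f0", "f1", "f2"])
df["prediction"] = dist_params["loc"]        # mean of the predictive distribution
df["prediction_std"] = dist_params["scale"]  # its standard deviation
df["q_025"] = y_dists.ppf(0.025)             # lower bound of a 95% interval
df["q_975"] = y_dists.ppf(0.975)             # upper bound
```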
workbench/model_scripts/custom_models/uq_models/ngboost.template

@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -212,8 +205,8 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile

workbench/model_scripts/ensemble_xgb/ensemble_xgb.template

@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }

 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)

+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()

-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")

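The ensemble template's validation step, which this release only touches for whitespace, computes per-row residuals and pushes them to S3 with awswrangler. A hedged sketch of that flow; the bucket path and file name here are placeholders, not the package's actual values:

```python
import awswrangler as wr
import pandas as pd

# Placeholder for TEMPLATE_PARAMS["model_metrics_s3_path"] after rendering
model_metrics_s3_path = "s3://example-bucket/models/example/metrics"

result_df = pd.DataFrame({"target": [1.2, 0.8], "prediction": [1.0, 0.9]})
result_df["residual"] = result_df["target"] - result_df["prediction"]
result_df["residual_abs"] = result_df["residual"].abs()

# awswrangler writes the CSV straight to S3 (hypothetical object name)
wr.s3.to_csv(result_df, f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
```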