workbench 0.8.173__py3-none-any.whl → 0.8.175__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -2,6 +2,7 @@
 
 import logging
 import re
+import time
 from datetime import datetime
 from typing import Tuple
 import pandas as pd
@@ -14,6 +15,9 @@ from workbench.core.artifacts.endpoint_core import EndpointCore
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
 from workbench.utils.monitor_utils import process_data_capture
 
+# Setup logging
+log = logging.getLogger("workbench")
+
 
 class DataCaptureCore:
     """Manages data capture configuration and retrieval for SageMaker endpoints"""
@@ -203,7 +207,7 @@ class DataCaptureCore:
         modes = [opt.get("CaptureMode") for opt in capture_options]
         return ["REQUEST" if m == "Input" else "RESPONSE" for m in modes if m]
 
-    def get_captured_data(self, from_date=None, add_timestamp=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    def get_captured_data(self, from_date: str = None, add_timestamp: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Read and process captured data from S3.
 
@@ -226,29 +230,65 @@
             files = [f for f in files if self._file_date_filter(f, from_date_obj)]
             self.log.info(f"Processing {len(files)} files from {from_date} onwards.")
         else:
-            self.log.info(f"Processing all {len(files)} files.")
+            self.log.info(f"Processing all {len(files)} files...")
+
+        # Check if any files remain after filtering
+        if not files:
+            self.log.info("No files to process after date filtering.")
+            return pd.DataFrame(), pd.DataFrame()
+
+        # Sort files by name (assumed to include timestamp)
         files.sort()
 
-        # Process files
-        all_input_dfs, all_output_dfs = [], []
-        for file_path in files:
+        # Get all timestamps in one batch if needed
+        timestamps = {}
+        if add_timestamp:
+            # Batch describe operation - much more efficient than per-file calls
+            timestamps = wr.s3.describe_objects(path=files)
+
+        # Process files using concurrent.futures
+        start_time = time.time()
+
+        def process_single_file(file_path):
+            """Process a single file and return input/output DataFrames."""
             try:
+                log.debug(f"Processing file: {file_path}...")
                 df = wr.s3.read_json(path=file_path, lines=True)
                 if not df.empty:
                     input_df, output_df = process_data_capture(df)
-                    if add_timestamp:
-                        timestamp = wr.s3.describe_objects(path=file_path)[file_path]["LastModified"]
-                        output_df["timestamp"] = timestamp
-                    all_input_dfs.append(input_df)
-                    all_output_dfs.append(output_df)
+                    if add_timestamp and file_path in timestamps:
+                        output_df["timestamp"] = timestamps[file_path]["LastModified"]
+                    return input_df, output_df
+                return pd.DataFrame(), pd.DataFrame()
             except Exception as e:
                 self.log.warning(f"Error processing {file_path}: {e}")
+                return pd.DataFrame(), pd.DataFrame()
+
+        # Use ThreadPoolExecutor for I/O-bound operations
+        from concurrent.futures import ThreadPoolExecutor
+
+        max_workers = min(32, len(files))  # Cap at 32 threads or number of files
+
+        all_input_dfs, all_output_dfs = [], []
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(process_single_file, file_path) for file_path in files]
+            for future in futures:
+                input_df, output_df = future.result()
+                if not input_df.empty:
+                    all_input_dfs.append(input_df)
+                if not output_df.empty:
+                    all_output_dfs.append(output_df)
 
         if not all_input_dfs:
             self.log.warning("No valid data was processed.")
             return pd.DataFrame(), pd.DataFrame()
 
-        return pd.concat(all_input_dfs, ignore_index=True), pd.concat(all_output_dfs, ignore_index=True)
+        input_df = pd.concat(all_input_dfs, ignore_index=True)
+        output_df = pd.concat(all_output_dfs, ignore_index=True)
+
+        elapsed_time = time.time() - start_time
+        self.log.info(f"Processed {len(files)} files in {elapsed_time:.2f} seconds.")
+        return input_df, output_df
 
     def _file_date_filter(self, file_path, from_date_obj):
         """Extract date from S3 path and compare with from_date."""
@@ -304,7 +344,7 @@ if __name__ == "__main__":
     # print(pred_df.head())
 
     # Check that data capture is working
-    input_df, output_df = dc.get_captured_data()
+    input_df, output_df = dc.get_captured_data(from_date="2025-09-01")
     if input_df.empty and output_df.empty:
         print("No data capture files found, for a new endpoint it may take a few minutes to start capturing data")
     else:
@@ -1,7 +1,7 @@
-# Model: NGBoost Regressor with Distribution output
-from ngboost import NGBRegressor
-from ngboost.distns import Cauchy, T
-from xgboost import XGBRegressor # Point Estimator
+# Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+from mapie.regression import ConformalizedQuantileRegressor
+from lightgbm import LGBMRegressor
+from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
@@ -20,19 +20,12 @@ import numpy as np
 import pandas as pd
 from typing import List, Tuple
 
-# Local Imports
-from proximity import Proximity
-
-
-
 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "id_column": "udm_mol_id",
     "target": "udm_asy_res_value",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'chiral_centers', 'r_cnt', 's_cnt', 'db_stereo', 'e_cnt', 'z_cnt', 'chiral_fp', 'db_fp'],
+ "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
-    "train_all_data": False,
-    "track_columns": "udm_asy_res_value"
+    "train_all_data": True
 }
 
 
@@ -108,7 +101,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
 
 
 def decompress_features(
-        df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features
 
@@ -164,13 +157,11 @@ def decompress_features(
 
 if __name__ == "__main__":
     # Template Parameters
-    id_column = TEMPLATE_PARAMS["id_column"]
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-    track_columns = TEMPLATE_PARAMS["track_columns"] # Can be None
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -228,78 +219,167 @@ if __name__ == "__main__":
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
-    # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
-    xgb_model = XGBRegressor()
-    ngb_model = NGBRegressor() # Dist=Cauchy) Seems to give HUGE prediction intervals
-    ngb_model = NGBRegressor(
-        Dist=T,
-        learning_rate=0.005,
-        minibatch_frac=0.1, # Very small batches
-        col_sample=0.8 # This parameter DOES exist
-    ) # Testing this out
-    print("NGBoost using T distribution for uncertainty quantification")
-
     # Prepare features and targets for training
     X_train = df_train[features]
     X_validate = df_val[features]
     y_train = df_train[target]
     y_validate = df_val[target]
 
-    # Train both models using the training data
+    # Train XGBoost for point predictions
+    print("\nTraining XGBoost for point predictions...")
+    xgb_model = XGBRegressor(enable_categorical=True)
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
-
-    # Make Predictions on the Validation Set
-    print(f"Making Predictions on Validation Set...")
-    preds = xgb_model.predict(X_validate)
-
-    # Calculate various model performance metrics (regression)
-    rmse = root_mean_squared_error(y_validate, preds)
-    mae = mean_absolute_error(y_validate, preds)
-    r2 = r2_score(y_validate, preds)
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
+
+    # Evaluate XGBoost performance
+    y_pred_xgb = xgb_model.predict(X_validate)
+    xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+    xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+    xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+    print(f"\nXGBoost Point Prediction Performance:")
+    print(f"RMSE: {xgb_rmse:.3f}")
+    print(f"MAE: {xgb_mae:.3f}")
+    print(f"R2: {xgb_r2:.3f}")
+
+    # Define confidence levels we want to model
+    confidence_levels = [0.50, 0.80, 0.90, 0.95] # 50%, 80%, 90%, 95% confidence intervals
+
+    # Store MAPIE models for each confidence level
+    mapie_models = {}
+
+    # Train models for each confidence level
+    for confidence_level in confidence_levels:
+        alpha = 1 - confidence_level
+        lower_q = alpha / 2
+        upper_q = 1 - alpha / 2
+
+        print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+        print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+        # Train three models for this confidence level
+        quantile_estimators = []
+        for q in [lower_q, upper_q, 0.5]:
+            print(f" Training model for quantile {q:.3f}...")
+            est = LGBMRegressor(
+                objective="quantile",
+                alpha=q,
+                n_estimators=1000,
+                max_depth=6,
+                learning_rate=0.01,
+                num_leaves=31,
+                min_child_samples=20,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                random_state=42,
+                verbose=-1,
+                force_col_wise=True
+            )
+            est.fit(X_train, y_train)
+            quantile_estimators.append(est)
+
+        # Create MAPIE CQR model for this confidence level
+        print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+        mapie_model = ConformalizedQuantileRegressor(
+            quantile_estimators,
+            confidence_level=confidence_level,
+            prefit=True
+        )
+
+        # Conformalize the model
+        print(f" Conformalizing with validation data...")
+        mapie_model.conformalize(X_validate, y_validate)
+
+        # Store the model
+        mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+        # Validate coverage for this confidence level
+        y_pred, y_pis = mapie_model.predict_interval(X_validate)
+        coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+        print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+    print(f"\nOverall Model Performance Summary:")
+    print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+    print(f"XGBoost MAE: {xgb_mae:.3f}")
+    print(f"XGBoost R2: {xgb_r2:.3f}")
     print(f"NumRows: {len(df_val)}")
 
+    # Analyze interval widths across confidence levels
+    print(f"\nInterval Width Analysis:")
+    for conf_level in confidence_levels:
+        model = mapie_models[f"mapie_{conf_level:.2f}"]
+        _, y_pis = model.predict_interval(X_validate)
+        widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+        print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
     # Save the trained XGBoost model
     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
 
-    # Save the trained NGBoost model
-    joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
+    # Save all MAPIE models
+    for model_name, model in mapie_models.items():
+        joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
 
-    # Save the features (this will validate input during predictions)
+    # Save the feature list
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp) # We save the original features, not the decompressed ones
-
-    # Now the Proximity model
-    model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
+        json.dump(features, fp)
+
+    # Save category mappings if any
+    if category_mappings:
+        with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+            json.dump(category_mappings, fp)
+
+    # Save model configuration
+    model_config = {
+        "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+        "confidence_levels": confidence_levels,
+        "n_features": len(features),
+        "target": target,
+        "validation_metrics": {
+            "xgb_rmse": float(xgb_rmse),
+            "xgb_mae": float(xgb_mae),
+            "xgb_r2": float(xgb_r2),
+            "n_validation": len(df_val)
+        }
+    }
+    with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+        json.dump(model_config, fp, indent=2)
 
-    # Now serialize the model
-    model.serialize(args.model_dir)
+    print(f"\nModel training complete!")
+    print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
 
 
 #
 # Inference Section
 #
 def model_fn(model_dir) -> dict:
-    """Load and return XGBoost, NGBoost, and Prox Model from model directory."""
+    """Load XGBoost and all MAPIE models from the specified directory."""
+
+    # Load model configuration to know which models to load
+    with open(os.path.join(model_dir, "model_config.json")) as fp:
+        config = json.load(fp)
 
     # Load XGBoost regressor
     xgb_path = os.path.join(model_dir, "xgb_model.json")
     xgb_model = XGBRegressor(enable_categorical=True)
     xgb_model.load_model(xgb_path)
 
-    # Load NGBoost regressor
-    ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))
+    # Load all MAPIE models
+    mapie_models = {}
+    for conf_level in config["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
 
-    # Deserialize the proximity model
-    prox_model = Proximity.deserialize(model_dir)
+    # Load category mappings if they exist
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as fp:
+            category_mappings = json.load(fp)
 
     return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
+        "xgb_model": xgb_model,
+        "mapie_models": mapie_models,
+        "confidence_levels": config["confidence_levels"],
+        "category_mappings": category_mappings
    }
 
 
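Note: the training section above replaces the single NGBoost distributional model with one MAPIE conformalized quantile regressor (CQR) per confidence level, each backed by three pre-fit LightGBM quantile models (lower, upper, median). The condensed sketch below mirrors the calls made in the diff (`ConformalizedQuantileRegressor(..., prefit=True)`, `conformalize()`, `predict_interval()`) on synthetic data; it assumes a MAPIE release that exposes this v1-style API, plus LightGBM and scikit-learn installed.

```python
# Condensed sketch of the CQR flow above, on synthetic data.
import numpy as np
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=2000, n_features=10, noise=10.0, random_state=0)
X_train, X_conf, y_train, y_conf = train_test_split(X, y, test_size=0.25, random_state=0)

confidence_level = 0.90
alpha = 1 - confidence_level

# Three pre-fit quantile models: lower, upper, and median
estimators = []
for q in [alpha / 2, 1 - alpha / 2, 0.5]:
    est = LGBMRegressor(objective="quantile", alpha=q, verbose=-1)
    est.fit(X_train, y_train)
    estimators.append(est)

# Wrap the pre-fit models, then conformalize on held-out data
cqr = ConformalizedQuantileRegressor(estimators, confidence_level=confidence_level, prefit=True)
cqr.conformalize(X_conf, y_conf)

# y_pis has shape (n_samples, 2, 1): conformalized lower/upper bounds
y_pred, y_pis = cqr.predict_interval(X_conf)
coverage = np.mean((y_conf >= y_pis[:, 0, 0]) & (y_conf <= y_pis[:, 1, 0]))
print(f"Empirical coverage at {confidence_level:.0%}: {coverage:.1%}")
```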
@@ -315,7 +395,7 @@ def input_fn(input_data, content_type):
     if "text/csv" in content_type:
         return pd.read_csv(StringIO(input_data))
     elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
+        return pd.DataFrame(json.loads(input_data))
     else:
         raise ValueError(f"{content_type} not supported!")
 
@@ -323,23 +403,26 @@ def input_fn(input_data, content_type):
 def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
+        # Convert categorical columns to string to avoid fillna issues
+        for col in output_df.select_dtypes(include=['category']).columns:
+            output_df[col] = output_df[col].astype(str)
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
     elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
+        return output_df.to_json(orient="records"), "application/json"
     else:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
 
 
 def predict_fn(df, models) -> pd.DataFrame:
-    """Make Predictions with our XGB Quantile Regression Model
+    """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
 
     Args:
         df (pd.DataFrame): The input DataFrame
-        models (dict): The dictionary of models to use for predictions
+        models (dict): Dictionary containing XGBoost and MAPIE models
 
     Returns:
-        pd.DataFrame: The DataFrame with the predictions added
+        pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """
 
     # Grab our feature columns (from training)
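Note: the CSV branch of `output_fn()` now casts `category` columns to strings before `fillna("N/A")`. The reason is that pandas refuses to fill a Categorical with a value that is not one of its declared categories; a small reproduction of that behavior follows (the cast to `object` here is just to keep the missing values fillable in the demo, while the template itself casts to `str`).

```python
# Why the cast is needed: filling a Categorical with an unseen value raises.
import pandas as pd

s = pd.Series(pd.Categorical(["high", None, "low"]))
try:
    s.fillna("N/A")  # "N/A" is not one of the declared categories
except (TypeError, ValueError) as e:
    print(f"fillna on a categorical column failed: {e}")

# Casting away the categorical dtype first makes fillna (and to_csv) safe
print(s.astype(object).fillna("N/A").to_list())  # ['high', 'N/A', 'low']
```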
@@ -350,44 +433,62 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Match features in a case-insensitive manner
     matched_df = match_features_case_insensitive(df, model_features)
 
-    # Use XGBoost for point predictions
-    df["prediction"] = models["xgboost"].predict(matched_df[model_features])
-
-    # NGBoost predict returns distribution objects
-    y_dists = models["ngboost"].pred_dist(matched_df[model_features])
-
-    # Extract parameters from distribution
-    dist_params = y_dists.params
-
-    # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc'] # mean
-    df["prediction_std"] = dist_params['scale'] # standard deviation
-
-    # Add 95% prediction intervals using ppf (percent point function)
-    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
-    # so we need to adjust the bounds to include the point prediction
-    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
-    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
-
-    # Add 90% prediction intervals
-    df["q_05"] = y_dists.ppf(0.05) # 5th percentile
-    df["q_95"] = y_dists.ppf(0.95) # 95th percentile
-
-    # Add 80% prediction intervals
-    df["q_10"] = y_dists.ppf(0.10) # 10th percentile
-    df["q_90"] = y_dists.ppf(0.90) # 90th percentile
+    # Apply categorical mappings if they exist
+    if models.get("category_mappings"):
+        matched_df, _ = convert_categorical_types(
+            matched_df,
+            model_features,
+            models["category_mappings"]
+        )
 
-    # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+    # Get features for prediction
+    X = matched_df[model_features]
+
+    # Get XGBoost point predictions
+    df["prediction"] = models["xgb_model"].predict(X)
+
+    # Get predictions from each MAPIE model for conformalized intervals
+    for conf_level in models["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        model = models["mapie_models"][model_name]
+
+        # Get conformalized predictions
+        y_pred, y_pis = model.predict_interval(X)
+
+        # Map confidence levels to quantile names
+        if conf_level == 0.50: # 50% CI
+            df["q_25"] = y_pis[:, 0, 0]
+            df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.80: # 80% CI
+            df["q_10"] = y_pis[:, 0, 0]
+            df["q_90"] = y_pis[:, 1, 0]
+        elif conf_level == 0.90: # 90% CI
+            df["q_05"] = y_pis[:, 0, 0]
+            df["q_95"] = y_pis[:, 1, 0]
+        elif conf_level == 0.95: # 95% CI
+            df["q_025"] = y_pis[:, 0, 0]
+            df["q_975"] = y_pis[:, 1, 0]
+
+    # Add median (q_50) from XGBoost prediction
+    df["q_50"] = df["prediction"]
+
+    # Calculate uncertainty metrics based on 95% interval
+    interval_width = df["q_975"] - df["q_025"]
+    df["prediction_std"] = interval_width / 3.92
 
     # Reorder the quantile columns for easier reading
     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]
 
-    # Compute Nearest neighbors with Proximity model
-    models["proximity"].neighbors(df)
+    # Uncertainty score
+    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
+
+    # Confidence bands
+    df["confidence_band"] = pd.cut(
+        df["uncertainty_score"],
+        bins=[0, 0.5, 1.0, 2.0, np.inf],
+        labels=["high", "medium", "low", "very_low"]
+    )
 
-    # Return the modified DataFrame
     return df
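Note: with NGBoost gone, `prediction_std` is now back-calculated from the conformalized 95% interval. Under a normal approximation the central 95% interval spans about 2 × 1.96 ≈ 3.92 standard deviations, so `width / 3.92` is a rough sigma estimate, and `uncertainty_score` is the interval width relative to the magnitude of the prediction. A small sketch of those derived columns, assuming the `q_025`, `q_975`, and `prediction` columns already exist:

```python
# Sketch of the derived uncertainty columns.
import numpy as np
import pandas as pd

df = pd.DataFrame({"prediction": [5.0, 0.2], "q_025": [3.0, -1.0], "q_975": [7.0, 1.4]})

width = df["q_975"] - df["q_025"]

# Normal approximation: a 95% interval is ~3.92 sigma wide (2 * 1.96)
df["prediction_std"] = width / 3.92

# Relative width: the small epsilon keeps near-zero predictions from dividing by zero
df["uncertainty_score"] = width / (np.abs(df["prediction"]) + 1e-6)

# Coarse bands: lower score -> tighter interval -> higher confidence
df["confidence_band"] = pd.cut(
    df["uncertainty_score"],
    bins=[0, 0.5, 1.0, 2.0, np.inf],
    labels=["high", "medium", "low", "very_low"],
)
print(df)
```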
@@ -227,15 +227,7 @@ if __name__ == "__main__":
 
     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(
-        n_estimators=1000,
-        max_depth=6,
-        learning_rate=0.01,
-        subsample=0.8,
-        colsample_bytree=0.8,
-        random_state=42,
-        verbosity=0
-    )
+    xgb_model = XGBRegressor(enable_categorical=True)
     xgb_model.fit(X_train, y_train)
 
     # Evaluate XGBoost performance
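Note: both templates now construct the point model as `XGBRegressor(enable_categorical=True)` with default boosting parameters. The flag only takes effect when the training DataFrame actually carries pandas `category` dtype columns; a minimal sketch (the explicit `tree_method="hist"` reflects XGBoost's requirement that categorical support use histogram-based training, which recent releases default to anyway):

```python
# enable_categorical=True only matters for pandas 'category' dtype columns.
import pandas as pd
from xgboost import XGBRegressor

df = pd.DataFrame(
    {
        "size": pd.Categorical(["s", "m", "l", "m", "s", "l"]),
        "value": [1.0, 2.0, 3.0, 2.5, 0.5, 3.5],
    }
)
y = [10.0, 20.0, 30.0, 25.0, 5.0, 35.0]

model = XGBRegressor(enable_categorical=True, tree_method="hist")
model.fit(df, y)
print(model.predict(df.head(2)))
```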
@@ -76,55 +76,44 @@ def process_data_capture(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     Returns:
         tuple[DataFrame, DataFrame]: Input and output DataFrames.
     """
+
+    def parse_endpoint_data(data: dict) -> pd.DataFrame:
+        """Parse endpoint data based on encoding type."""
+        encoding = data["encoding"].upper()
+
+        if encoding == "CSV":
+            return pd.read_csv(StringIO(data["data"]))
+        elif encoding == "JSON":
+            json_data = json.loads(data["data"])
+            if isinstance(json_data, dict):
+                return pd.DataFrame({k: [v] if not isinstance(v, list) else v for k, v in json_data.items()})
+            else:
+                return pd.DataFrame(json_data)
+        else:
+            return None # Unknown encoding
+
     input_dfs = []
     output_dfs = []
 
-    for idx, row in df.iterrows():
+    # Use itertuples() instead of iterrows() for better performance
+    for row in df.itertuples(index=True):
         try:
-            capture_data = row["captureData"]
+            capture_data = row.captureData
 
             # Process input data if present
             if "endpointInput" in capture_data:
-                input_data = capture_data["endpointInput"]
-                encoding = input_data["encoding"].upper()
-
-                if encoding == "CSV":
-                    input_df = pd.read_csv(StringIO(input_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(input_data["data"])
-                    if isinstance(json_data, dict):
-                        input_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        input_df = pd.DataFrame(json_data)
-                else:
-                    continue # Skip unknown encodings
-
-                input_dfs.append(input_df)
+                input_df = parse_endpoint_data(capture_data["endpointInput"])
+                if input_df is not None:
+                    input_dfs.append(input_df)
 
             # Process output data if present
             if "endpointOutput" in capture_data:
-                output_data = capture_data["endpointOutput"]
-                encoding = output_data["encoding"].upper()
-
-                if encoding == "CSV":
-                    output_df = pd.read_csv(StringIO(output_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(output_data["data"])
-                    if isinstance(json_data, dict):
-                        output_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        output_df = pd.DataFrame(json_data)
-                else:
-                    continue # Skip unknown encodings
-
-                output_dfs.append(output_df)
+                output_df = parse_endpoint_data(capture_data["endpointOutput"])
+                if output_df is not None:
+                    output_dfs.append(output_df)
 
         except Exception as e:
-            log.debug(f"Row {idx}: Failed to process row: {e}")
+            log.debug(f"Row {row.Index}: Failed to process row: {e}")
             continue
 
     # Combine and return results
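Note: `process_data_capture()` now funnels both `endpointInput` and `endpointOutput` payloads through a single `parse_endpoint_data()` helper and iterates with `itertuples()` (namedtuple rows, `row.Index` for the index) instead of `iterrows()`. Below is a standalone sketch of the refactored parsing on simplified capture records; the record layout here is reduced to just the fields the helper touches, not the full SageMaker data-capture schema.

```python
# Standalone sketch of the refactored parsing on simplified capture records.
import json
from io import StringIO
from typing import Optional

import pandas as pd


def parse_endpoint_data(data: dict) -> Optional[pd.DataFrame]:
    """Decode one endpointInput/endpointOutput payload by its declared encoding."""
    encoding = data["encoding"].upper()
    if encoding == "CSV":
        return pd.read_csv(StringIO(data["data"]))
    if encoding == "JSON":
        payload = json.loads(data["data"])
        if isinstance(payload, dict):
            return pd.DataFrame({k: v if isinstance(v, list) else [v] for k, v in payload.items()})
        return pd.DataFrame(payload)
    return None  # unknown encoding


records = pd.DataFrame(
    {
        "captureData": [
            {"endpointInput": {"encoding": "CSV", "data": "x,y\n1,2\n3,4\n"}},
            {"endpointOutput": {"encoding": "JSON", "data": '{"prediction": [0.1, 0.9]}'}},
        ]
    }
)

# itertuples() yields namedtuples: attribute access for columns, row.Index for the index
for row in records.itertuples(index=True):
    capture = row.captureData
    if "endpointInput" in capture:
        print(f"row {row.Index} input:\n{parse_endpoint_data(capture['endpointInput'])}")
    if "endpointOutput" in capture:
        print(f"row {row.Index} output:\n{parse_endpoint_data(capture['endpointOutput'])}")
```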
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.173
+Version: 0.8.175
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT
@@ -51,7 +51,7 @@ workbench/core/artifacts/__init__.py,sha256=ps7rA_rbWnDbvWbg4kvu--IKMY8WmbPRyv4S
 workbench/core/artifacts/artifact.py,sha256=AtTw8wfMd-fi7cHJHsBAXHUk53kRW_6lyBwwsIbHw54,17750
 workbench/core/artifacts/athena_source.py,sha256=RNmCe7s6uH4gVHpcdJcL84aSbF5Q1ahJBLLGwHYRXEU,26081
 workbench/core/artifacts/cached_artifact_mixin.py,sha256=ngqFLZ4cQx_TFouXZgXZQsv_7W6XCvxVGXXSfzzaft8,3775
-workbench/core/artifacts/data_capture_core.py,sha256=VJL5AcXOx8PxY1Urw0AFm-czqvs55cDiwH_ZTcr2LS0,13207
+workbench/core/artifacts/data_capture_core.py,sha256=q8f79rRTYiZ7T4IQRWXl8ZvPpcvZyNxYERwvo8o0OQc,14858
 workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
 workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
 workbench/core/artifacts/endpoint_core.py,sha256=lwgiz0jttW8C4YqcKaA8nf231WI3kol-nLnKcAbFJko,49049
@@ -140,8 +140,8 @@ workbench/model_scripts/custom_models/uq_models/Readme.md,sha256=UVpL-lvtTrLqwBe
 workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=U4LIlpp8Rbu3apyzPR7-55lvlutpTsCro_PUvQ5pklY,6457
 workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=0IJnSBACQ556ldEiPqR7yPCOOLJs1hQhHmPBvB2d9tY,13491
 workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256=QbDUfkiPCwJ-c-4Twgu4utZuYZaAyeW_3T1IP-_tutw,6683
-workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=yJOL29TWtIAPbhuqK1m9w-MfWq0MVfJcI412VVgDO04,17583
-workbench/model_scripts/custom_models/uq_models/mapie.template,sha256=D94Y3U7IruGQlu9m6gXyLRjm502qZafYrwhEM9GP6oE,18337
+workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=AcLf-vXOmn_vpTeiKpNKCW_dRhR8Co1sMFC84EPT4IE,22392
+workbench/model_scripts/custom_models/uq_models/mapie.template,sha256=VkFM0eZM2d-hzDbngk9s08DD5vn2nQRD4coCUfj36Fk,18181
 workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=eawh0Fp3DhbdCXzWN6KloczT5ZS_ou4ayW65yUTTE4o,14109
 workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=9-O6P-SW50ul5Wl6es2DMWXSbrwOg7HWsdc8Qdln0MM,8278
 workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
@@ -221,7 +221,7 @@ workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYw
 workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
 workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
 workbench/utils/model_utils.py,sha256=JeEztmFyDJ7yqRozDX0L6apuhLgKx1sgNlO5duB73qc,11938
-workbench/utils/monitor_utils.py,sha256=LbfZImf4tHqYz9J8NnW_ggZP45Has_4QwXHQ-Wi3sLw,8381
+workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
 workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
 workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
 workbench/utils/pipeline_utils.py,sha256=yzR5tgAzz6zNqvxzZR6YqsbS7r3QDKzBXozaM_ADXlc,2171
@@ -288,9 +288,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
 workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
 workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
 workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
-workbench-0.8.173.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
-workbench-0.8.173.dist-info/METADATA,sha256=b1gas8B3zXhFnVPVFB8vLCeqoeb8brx4rdMXRus-YJo,9210
-workbench-0.8.173.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-workbench-0.8.173.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
-workbench-0.8.173.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
-workbench-0.8.173.dist-info/RECORD,,
+workbench-0.8.175.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+workbench-0.8.175.dist-info/METADATA,sha256=hAjhM-oXEqxffYyDwawIsSdTv3iKsRs5_OiZw1sv2RQ,9210
+workbench-0.8.175.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+workbench-0.8.175.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+workbench-0.8.175.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+workbench-0.8.175.dist-info/RECORD,,