workbench 0.8.170__py3-none-any.whl → 0.8.172__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of workbench might be problematic.

Files changed (27)
  1. workbench/api/feature_set.py +4 -4
  2. workbench/core/artifacts/artifact.py +11 -3
  3. workbench/core/artifacts/model_core.py +37 -14
  4. workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
  5. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  6. workbench/core/transforms/features_to_model/features_to_model.py +4 -4
  7. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +319 -210
  8. workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
  9. workbench/model_scripts/custom_models/uq_models/meta_uq.template +154 -41
  10. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -2
  11. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  12. workbench/model_scripts/script_generation.py +5 -0
  13. workbench/model_scripts/xgb_model/generated_model_script.py +11 -11
  14. workbench/model_scripts/xgb_model/xgb_model.template +7 -7
  15. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +1 -1
  16. workbench/scripts/ml_pipeline_sqs.py +139 -0
  17. workbench/utils/model_utils.py +13 -1
  18. workbench/utils/workbench_sqs.py +1 -1
  19. workbench/utils/xgboost_model_utils.py +1 -0
  20. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  21. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/METADATA +1 -1
  22. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/RECORD +26 -25
  23. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/entry_points.txt +2 -1
  24. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  25. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/WHEEL +0 -0
  26. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
- # Model: NGBoost Regressor with Distribution output
- from ngboost import NGBRegressor
- from xgboost import XGBRegressor # Base Estimator
+ # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+ from mapie.regression import ConformalizedQuantileRegressor
+ from lightgbm import LGBMRegressor
+ from xgboost import XGBRegressor
  from sklearn.model_selection import train_test_split
- import numpy as np

  # Model Performance Scores
  from sklearn.metrics import (
@@ -16,20 +16,16 @@ import json
  import argparse
  import joblib
  import os
+ import numpy as np
  import pandas as pd
-
- # Local Imports
- from proximity import Proximity
-
-
+ from typing import List, Tuple

  # Template Placeholders
  TEMPLATE_PARAMS = {
- "id_column": "id",
- "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
- "target": "solubility",
- "train_all_data": True,
- "track_columns": ['solubility']
+ "target": "udm_asy_res_value",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'chiral_centers', 'r_cnt', 's_cnt', 'db_stereo', 'e_cnt', 'z_cnt', 'chiral_fp', 'db_fp'],
+ "compressed_features": [],
+ "train_all_data": True
  }


@@ -73,138 +69,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  return df.rename(columns=rename_dict)


- def distance_weighted_calibrated_intervals(
- df_pred: pd.DataFrame,
- prox_df: pd.DataFrame,
- calibration_strength: float = 0.7,
- distance_decay: float = 3.0,
- ) -> pd.DataFrame:
- """
- Calibrate intervals using distance-weighted neighbor quantiles.
- Uses all 10 neighbors with distance-based weighting.
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
  """
- id_column = TEMPLATE_PARAMS["id_column"]
- target_column = TEMPLATE_PARAMS["target"]
-
- # Distance-weighted neighbor statistics
- def weighted_quantile(values, weights, q):
- """Calculate weighted quantile"""
- if len(values) == 0:
- return np.nan
- sorted_indices = np.argsort(values)
- sorted_values = values[sorted_indices]
- sorted_weights = weights[sorted_indices]
- cumsum = np.cumsum(sorted_weights)
- cutoff = q * cumsum[-1]
- return np.interp(cutoff, cumsum, sorted_values)
-
- # Calculate distance weights (closer neighbors get more weight)
- prox_df = prox_df.copy()
- prox_df['weight'] = 1 / (1 + prox_df['distance'] ** distance_decay)
-
- # Get weighted quantiles and statistics for each ID
- neighbor_stats = []
- for id_val, group in prox_df.groupby(id_column):
- values = group[target_column].values
- weights = group['weight'].values
-
- # Normalize weights
- weights = weights / weights.sum()
-
- stats = {
- id_column: id_val,
- 'local_q025': weighted_quantile(values, weights, 0.025),
- 'local_q25': weighted_quantile(values, weights, 0.25),
- 'local_q75': weighted_quantile(values, weights, 0.75),
- 'local_q975': weighted_quantile(values, weights, 0.975),
- 'local_median': weighted_quantile(values, weights, 0.5),
- 'local_std': np.sqrt(np.average((values - np.average(values, weights=weights)) ** 2, weights=weights)),
- 'avg_distance': group['distance'].mean(),
- 'min_distance': group['distance'].min(),
- 'max_distance': group['distance'].max(),
- }
- neighbor_stats.append(stats)
+ Converts appropriate columns to categorical type with consistent mappings.

- neighbor_df = pd.DataFrame(neighbor_stats)
- out = df_pred.merge(neighbor_df, on=id_column, how='left')
-
- # Model disagreement score (normalized by prediction std)
- model_disagreement = (out["prediction"] - out["prediction_uq"]).abs()
- disagreement_score = (model_disagreement / out["prediction_std"]).clip(0, 2)
+ Args:
+ df (pd.DataFrame): The DataFrame to process.
+ features (list): List of feature names to consider for conversion.
+ category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+ training mode. If populated, we're in inference mode.

- # Local confidence based on:
- # 1. How close the neighbors are (closer = more confident)
- # 2. How much local variance there is (less variance = more confident)
- max_reasonable_distance = out['max_distance'].quantile(0.8) # 80th percentile as reference
- distance_confidence = (1 - (out['avg_distance'] / max_reasonable_distance)).clip(0.1, 1.0)
+ Returns:
+ tuple: (processed DataFrame, category mappings dictionary)
+ """
+ # Training mode
+ if category_mappings == {}:
+ for col in df.select_dtypes(include=["object", "string"]):
+ if col in features and df[col].nunique() < 20:
+ print(f"Training mode: Converting {col} to category")
+ df[col] = df[col].astype("category")
+ category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
+
+ # Inference mode
+ else:
+ for col, categories in category_mappings.items():
+ if col in df.columns:
+ print(f"Inference mode: Applying categorical mapping for {col}")
+ df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping

- variance_confidence = (out["prediction_std"] / out["local_std"]).clip(0.5, 2.0)
- local_confidence = distance_confidence * variance_confidence.clip(0.5, 1.5)
+ return df, category_mappings

- # Calibration weight: higher when models disagree and we have good local data
- calibration_weight = (
- calibration_strength *
- local_confidence * # Weight by local data quality
- disagreement_score.clip(0.3, 1.0) # More calibration when models disagree
- )

- # Consensus prediction (slight preference for NGBoost since it provides intervals)
- consensus_pred = 0.65 * out["prediction_uq"] + 0.35 * out["prediction"]
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features

- # Re-center local intervals around consensus prediction
- local_center_offset = consensus_pred - out["local_median"]
+ Args:
+ df (pd.DataFrame): The features DataFrame
+ features (List[str]): Full list of feature names
+ compressed_features (List[str]): List of feature names to decompress (bitstrings)

- # Apply calibration to each quantile
- quantile_pairs = [
- ("q_025", "local_q025"),
- ("q_25", "local_q25"),
- ("q_75", "local_q75"),
- ("q_975", "local_q975")
- ]
+ Returns:
+ pd.DataFrame: DataFrame with the decompressed features
+ List[str]: Updated list of feature names after decompression

- for model_q, local_q in quantile_pairs:
- # Adjust local quantiles to be centered around consensus
- adjusted_local_q = out[local_q] + local_center_offset
+ Raises:
+ ValueError: If any missing values are found in the specified features
+ """

- # Blend model and local intervals
- out[model_q] = (
- (1 - calibration_weight) * out[model_q] +
- calibration_weight * adjusted_local_q
+ # Check for any missing values in the required features
+ missing_counts = df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(
+ f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+ "WARNING: You might want to remove/replace all NaN values before processing."
  )

- # Ensure proper interval ordering and bounds using pandas
- out["q_025"] = pd.concat([out["q_025"], consensus_pred], axis=1).min(axis=1)
- out["q_975"] = pd.concat([out["q_975"], consensus_pred], axis=1).max(axis=1)
- out["q_25"] = pd.concat([out["q_25"], out["q_75"]], axis=1).min(axis=1)
+ # Decompress the specified compressed features
+ decompressed_features = features.copy()
+ for feature in compressed_features:
+ if (feature not in df.columns) or (feature not in features):
+ print(f"Feature '{feature}' not in the features list, skipping decompression.")
+ continue

- # Optional: Add some interval expansion when neighbors are very far
- # (indicates we're in a sparse region of feature space)
- sparse_region_mask = out['min_distance'] > out['min_distance'].quantile(0.9)
- expansion_factor = 1 + 0.2 * sparse_region_mask # 20% expansion in sparse regions
+ # Remove the feature from the list of features to avoid duplication
+ decompressed_features.remove(feature)

- for q in ["q_025", "q_25", "q_75", "q_975"]:
- interval_width = out[q] - consensus_pred
- out[q] = consensus_pred + interval_width * expansion_factor
+ # Handle all compressed features as bitstrings
+ bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+ prefix = feature[:3]

- # Clean up temporary columns
- cleanup_cols = [col for col in out.columns if col.startswith("local_")] + \
- ['avg_distance', 'min_distance', 'max_distance']
+ # Create all new columns at once - avoids fragmentation
+ new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+ new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)

- return out.drop(columns=cleanup_cols)
+ # Add to features list
+ decompressed_features.extend(new_col_names)
+
+ # Drop original column and concatenate new ones
+ df = df.drop(columns=[feature])
+ df = pd.concat([df, new_df], axis=1)
+
+ return df, decompressed_features


- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
  if __name__ == "__main__":
  # Template Parameters
- id_column = TEMPLATE_PARAMS["id_column"]
- features = TEMPLATE_PARAMS["features"]
  target = TEMPLATE_PARAMS["target"]
+ features = TEMPLATE_PARAMS["features"]
+ orig_features = features.copy()
+ compressed_features = TEMPLATE_PARAMS["compressed_features"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
- track_columns = TEMPLATE_PARAMS["track_columns"] # Can be None
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -216,102 +173,221 @@ if __name__ == "__main__":
  )
  args = parser.parse_args()

- # Load training data from the specified directory
+ # Read the training data into DataFrames
  training_files = [
  os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
+ for file in os.listdir(args.train)
+ if file.endswith(".csv")
  ]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
- df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+ all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+ # Check if the dataframe is empty
+ check_dataframe(all_df, "training_df")
+
+ # Features/Target output
+ print(f"Target: {target}")
+ print(f"Features: {str(features)}")

- # Check if the DataFrame is empty
- check_dataframe(df, "training_df")
+ # Convert any features that might be categorical to 'category' type
+ all_df, category_mappings = convert_categorical_types(all_df, features)

- # Training data split logic
+ # If we have compressed features, decompress them
+ if compressed_features:
+ print(f"Decompressing features {compressed_features}...")
+ all_df, features = decompress_features(all_df, features, compressed_features)
+
+ # Do we want to train on all the data?
  if train_all_data:
- # Use all data for both training and validation
- print("Training on all data...")
- df_train = df.copy()
- df_val = df.copy()
- elif "training" in df.columns:
- # Split data based on a 'training' column if it exists
- print("Splitting data based on 'training' column...")
- df_train = df[df["training"]].copy()
- df_val = df[~df["training"]].copy()
+ print("Training on ALL of the data")
+ df_train = all_df.copy()
+ df_val = all_df.copy()
+
+ # Does the dataframe have a training column?
+ elif "training" in all_df.columns:
+ print("Found training column, splitting data based on training column")
+ df_train = all_df[all_df["training"]]
+ df_val = all_df[~all_df["training"]]
  else:
- # Perform a random split if no 'training' column is found
- print("Splitting data randomly...")
- df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
-
- # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
- xgb_model = XGBRegressor()
- ngb_model = NGBRegressor()
+ # Just do a random training Split
+ print("WARNING: No training column found, splitting data with random state=42")
+ df_train, df_val = train_test_split(
+ all_df, test_size=validation_split, random_state=42
+ )
+ print(f"FIT/TRAIN: {df_train.shape}")
+ print(f"VALIDATION: {df_val.shape}")

  # Prepare features and targets for training
  X_train = df_train[features]
- X_val = df_val[features]
+ X_validate = df_val[features]
  y_train = df_train[target]
- y_val = df_val[target]
+ y_validate = df_val[target]

- # Train both models using the training data
+ # Train XGBoost for point predictions
+ print("\nTraining XGBoost for point predictions...")
+ xgb_model = XGBRegressor(
+ n_estimators=1000,
+ max_depth=6,
+ learning_rate=0.01,
+ subsample=0.8,
+ colsample_bytree=0.8,
+ random_state=42,
+ verbosity=0
+ )
  xgb_model.fit(X_train, y_train)
- ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)

- # Make Predictions on the Validation Set
- print(f"Making Predictions on Validation Set...")
- y_validate = df_val[target]
- X_validate = df_val[features]
- preds = xgb_model.predict(X_validate)
-
- # Calculate various model performance metrics (regression)
- rmse = root_mean_squared_error(y_validate, preds)
- mae = mean_absolute_error(y_validate, preds)
- r2 = r2_score(y_validate, preds)
- print(f"RMSE: {rmse:.3f}")
- print(f"MAE: {mae:.3f}")
- print(f"R2: {r2:.3f}")
+ # Evaluate XGBoost performance
+ y_pred_xgb = xgb_model.predict(X_validate)
+ xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+ xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+ xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+ print(f"\nXGBoost Point Prediction Performance:")
+ print(f"RMSE: {xgb_rmse:.3f}")
+ print(f"MAE: {xgb_mae:.3f}")
+ print(f"R2: {xgb_r2:.3f}")
+
+ # Define confidence levels we want to model
+ confidence_levels = [0.50, 0.80, 0.90, 0.95] # 50%, 80%, 90%, 95% confidence intervals
+
+ # Store MAPIE models for each confidence level
+ mapie_models = {}
+
+ # Train models for each confidence level
+ for confidence_level in confidence_levels:
+ alpha = 1 - confidence_level
+ lower_q = alpha / 2
+ upper_q = 1 - alpha / 2
+
+ print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+ print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+ # Train three models for this confidence level
+ quantile_estimators = []
+ for q in [lower_q, upper_q, 0.5]:
+ print(f" Training model for quantile {q:.3f}...")
+ est = LGBMRegressor(
+ objective="quantile",
+ alpha=q,
+ n_estimators=1000,
+ max_depth=6,
+ learning_rate=0.01,
+ num_leaves=31,
+ min_child_samples=20,
+ subsample=0.8,
+ colsample_bytree=0.8,
+ random_state=42,
+ verbose=-1,
+ force_col_wise=True
+ )
+ est.fit(X_train, y_train)
+ quantile_estimators.append(est)
+
+ # Create MAPIE CQR model for this confidence level
+ print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+ mapie_model = ConformalizedQuantileRegressor(
+ quantile_estimators,
+ confidence_level=confidence_level,
+ prefit=True
+ )
+
+ # Conformalize the model
+ print(f" Conformalizing with validation data...")
+ mapie_model.conformalize(X_validate, y_validate)
+
+ # Store the model
+ mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+ # Validate coverage for this confidence level
+ y_pred, y_pis = mapie_model.predict_interval(X_validate)
+ coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+ print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+ print(f"\nOverall Model Performance Summary:")
+ print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+ print(f"XGBoost MAE: {xgb_mae:.3f}")
+ print(f"XGBoost R2: {xgb_r2:.3f}")
  print(f"NumRows: {len(df_val)}")

+ # Analyze interval widths across confidence levels
+ print(f"\nInterval Width Analysis:")
+ for conf_level in confidence_levels:
+ model = mapie_models[f"mapie_{conf_level:.2f}"]
+ _, y_pis = model.predict_interval(X_validate)
+ widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+ print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
  # Save the trained XGBoost model
  xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))

- # Save the trained NGBoost model
- joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
+ # Save all MAPIE models
+ for model_name, model in mapie_models.items():
+ joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))

- # Save the feature list to validate input during predictions
+ # Save the feature list
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
  json.dump(features, fp)

- # Now the Proximity model
- model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
+ # Save category mappings if any
+ if category_mappings:
+ with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+ json.dump(category_mappings, fp)
+
+ # Save model configuration
+ model_config = {
+ "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+ "confidence_levels": confidence_levels,
+ "n_features": len(features),
+ "target": target,
+ "validation_metrics": {
+ "xgb_rmse": float(xgb_rmse),
+ "xgb_mae": float(xgb_mae),
+ "xgb_r2": float(xgb_r2),
+ "n_validation": len(df_val)
+ }
+ }
+ with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+ json.dump(model_config, fp, indent=2)

- # Now serialize the model
- model.serialize(args.model_dir)
+ print(f"\nModel training complete!")
+ print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")


  #
  # Inference Section
  #
  def model_fn(model_dir) -> dict:
- """Load and return XGBoost and NGBoost regressors from model directory."""
+ """Load XGBoost and all MAPIE models from the specified directory."""
+
+ # Load model configuration to know which models to load
+ with open(os.path.join(model_dir, "model_config.json")) as fp:
+ config = json.load(fp)

  # Load XGBoost regressor
  xgb_path = os.path.join(model_dir, "xgb_model.json")
  xgb_model = XGBRegressor(enable_categorical=True)
  xgb_model.load_model(xgb_path)

- # Load NGBoost regressor
- ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))
+ # Load all MAPIE models
+ mapie_models = {}
+ for conf_level in config["confidence_levels"]:
+ model_name = f"mapie_{conf_level:.2f}"
+ mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))

- # Deserialize the proximity model
- prox_model = Proximity.deserialize(model_dir)
+ # Load category mappings if they exist
+ category_mappings = {}
+ category_path = os.path.join(model_dir, "category_mappings.json")
+ if os.path.exists(category_path):
+ with open(category_path) as fp:
+ category_mappings = json.load(fp)

  return {
- "xgboost": xgb_model,
- "ngboost": ngb_model,
- "proximity": prox_model
+ "xgb_model": xgb_model,
+ "mapie_models": mapie_models,
+ "confidence_levels": config["confidence_levels"],
+ "category_mappings": category_mappings
  }


@@ -327,7 +403,7 @@ def input_fn(input_data, content_type):
  if "text/csv" in content_type:
  return pd.read_csv(StringIO(input_data))
  elif "application/json" in content_type:
- return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
+ return pd.DataFrame(json.loads(input_data))
  else:
  raise ValueError(f"{content_type} not supported!")

@@ -335,23 +411,26 @@ def input_fn(input_data, content_type):
  def output_fn(output_df, accept_type):
  """Supports both CSV and JSON output formats."""
  if "text/csv" in accept_type:
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
+ # Convert categorical columns to string to avoid fillna issues
+ for col in output_df.select_dtypes(include=['category']).columns:
+ output_df[col] = output_df[col].astype(str)
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
  return csv_output, "text/csv"
  elif "application/json" in accept_type:
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
+ return output_df.to_json(orient="records"), "application/json"
  else:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


  def predict_fn(df, models) -> pd.DataFrame:
- """Make Predictions with our XGB Quantile Regression Model
+ """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals

  Args:
  df (pd.DataFrame): The input DataFrame
- models (dict): The dictionary of models to use for predictions
+ models (dict): Dictionary containing XGBoost and MAPIE models

  Returns:
- pd.DataFrame: The DataFrame with the predictions added
+ pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
  """

  # Grab our feature columns (from training)
@@ -362,32 +441,62 @@ def predict_fn(df, models) -> pd.DataFrame:
  # Match features in a case-insensitive manner
  matched_df = match_features_case_insensitive(df, model_features)

- # Use XGBoost for point predictions
- df["prediction"] = models["xgboost"].predict(matched_df[model_features])
-
- # NGBoost predict returns distribution objects
- y_dists = models["ngboost"].pred_dist(matched_df[model_features])
-
- # Extract parameters from distribution
- dist_params = y_dists.params
-
- # Extract mean and std from distribution parameters
- df["prediction_uq"] = dist_params['loc'] # mean
- df["prediction_std"] = dist_params['scale'] # standard deviation
-
- # Add 95% prediction intervals using ppf (percent point function)
- df["q_025"] = y_dists.ppf(0.025) # 2.5th percentile
- df["q_975"] = y_dists.ppf(0.975) # 97.5th percentile
-
- # Add 50% prediction intervals
- df["q_25"] = y_dists.ppf(0.25) # 25th percentile
- df["q_75"] = y_dists.ppf(0.75) # 75th percentile
-
- # Compute Nearest neighbors with Proximity model
- prox_df = models["proximity"].neighbors(df)
+ # Apply categorical mappings if they exist
+ if models.get("category_mappings"):
+ matched_df, _ = convert_categorical_types(
+ matched_df,
+ model_features,
+ models["category_mappings"]
+ )

- # Shrink prediction intervals based on KNN variance
- df = distance_weighted_calibrated_intervals(df, prox_df)
+ # Get features for prediction
+ X = matched_df[model_features]
+
+ # Get XGBoost point predictions
+ df["prediction"] = models["xgb_model"].predict(X)
+
+ # Get predictions from each MAPIE model for conformalized intervals
+ for conf_level in models["confidence_levels"]:
+ model_name = f"mapie_{conf_level:.2f}"
+ model = models["mapie_models"][model_name]
+
+ # Get conformalized predictions
+ y_pred, y_pis = model.predict_interval(X)
+
+ # Map confidence levels to quantile names
+ if conf_level == 0.50: # 50% CI
+ df["q_25"] = y_pis[:, 0, 0]
+ df["q_75"] = y_pis[:, 1, 0]
+ elif conf_level == 0.80: # 80% CI
+ df["q_10"] = y_pis[:, 0, 0]
+ df["q_90"] = y_pis[:, 1, 0]
+ elif conf_level == 0.90: # 90% CI
+ df["q_05"] = y_pis[:, 0, 0]
+ df["q_95"] = y_pis[:, 1, 0]
+ elif conf_level == 0.95: # 95% CI
+ df["q_025"] = y_pis[:, 0, 0]
+ df["q_975"] = y_pis[:, 1, 0]
+
+ # Add median (q_50) from XGBoost prediction
+ df["q_50"] = df["prediction"]
+
+ # Calculate uncertainty metrics based on 95% interval
+ interval_width = df["q_975"] - df["q_025"]
+ df["prediction_std"] = interval_width / 3.92
+
+ # Reorder the quantile columns for easier reading
+ quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+ other_cols = [col for col in df.columns if col not in quantile_cols]
+ df = df[other_cols + quantile_cols]
+
+ # Uncertainty score
+ df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
+
+ # Confidence bands
+ df["confidence_band"] = pd.cut(
+ df["uncertainty_score"],
+ bins=[0, 0.5, 1.0, 2.0, np.inf],
+ labels=["high", "medium", "low", "very_low"]
+ )

- # Return the modified DataFrame
  return df
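
Editor's note on the pattern introduced by this release: the new template pairs a plain XGBoost regressor for point predictions with, per confidence level, three LightGBM quantile models that MAPIE's ConformalizedQuantileRegressor conformalizes on held-out data. The sketch below is not code from the package; it is a minimal, self-contained illustration that mirrors the calls visible in the diff (LGBMRegressor with objective="quantile", ConformalizedQuantileRegressor(..., prefit=True), conformalize(), predict_interval()). The toy dataset, the single 0.95 confidence level, and the small estimator sizes are illustrative assumptions. The derived prediction_std divides the 95% interval width by 3.92, i.e. 2 x 1.96, the two-sided z-value of a normal 95% interval.

# Minimal sketch of the XGBoost + MAPIE CQR pattern used in the new template (assumed toy data).
import numpy as np
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Synthetic regression data stands in for the FeatureSet used by the real pipeline.
X, y = make_regression(n_samples=2000, n_features=10, noise=10.0, random_state=42)
X_train, X_conf, y_train, y_conf = train_test_split(X, y, test_size=0.2, random_state=42)

# Point predictions come from a plain XGBoost regressor (smaller than the template's settings).
xgb_model = XGBRegressor(n_estimators=200, max_depth=6, random_state=42, verbosity=0)
xgb_model.fit(X_train, y_train)

# Three prefit LightGBM quantile models (lower, upper, median) for one 95% interval.
confidence_level = 0.95
alpha = 1 - confidence_level
quantile_estimators = [
    LGBMRegressor(objective="quantile", alpha=q, n_estimators=200,
                  random_state=42, verbose=-1).fit(X_train, y_train)
    for q in [alpha / 2, 1 - alpha / 2, 0.5]
]

# Conformalize on held-out data, then predict intervals, mirroring the template's calls.
mapie_model = ConformalizedQuantileRegressor(
    quantile_estimators, confidence_level=confidence_level, prefit=True
)
mapie_model.conformalize(X_conf, y_conf)
y_pred, y_pis = mapie_model.predict_interval(X_conf)

point = xgb_model.predict(X_conf)            # point estimate (q_50 in the template)
width = y_pis[:, 1, 0] - y_pis[:, 0, 0]      # conformalized 95% interval width
prediction_std = width / 3.92                # 3.92 = 2 * 1.96 (normal 95% z-value)
coverage = np.mean((y_conf >= y_pis[:, 0, 0]) & (y_conf <= y_pis[:, 1, 0]))
print(f"Empirical coverage: {coverage:.3f}, mean interval width: {width.mean():.2f}")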