workbench 0.8.192__py3-none-any.whl → 0.8.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +212 -234
  4. workbench/algorithms/graph/light/proximity_graph.py +8 -7
  5. workbench/api/endpoint.py +2 -3
  6. workbench/api/model.py +2 -5
  7. workbench/core/artifacts/endpoint_core.py +25 -16
  8. workbench/core/artifacts/feature_set_core.py +126 -4
  9. workbench/core/artifacts/model_core.py +37 -55
  10. workbench/core/transforms/features_to_model/features_to_model.py +3 -3
  11. workbench/core/views/training_view.py +75 -0
  12. workbench/core/views/view.py +1 -1
  13. workbench/model_scripts/custom_models/proximity/proximity.py +212 -234
  14. workbench/model_scripts/custom_models/uq_models/proximity.py +212 -234
  15. workbench/model_scripts/pytorch_model/generated_model_script.py +567 -0
  16. workbench/model_scripts/uq_models/generated_model_script.py +589 -0
  17. workbench/model_scripts/uq_models/mapie.template +103 -6
  18. workbench/model_scripts/xgb_model/generated_model_script.py +468 -0
  19. workbench/repl/workbench_shell.py +3 -3
  20. workbench/utils/model_utils.py +25 -10
  21. workbench/utils/xgboost_model_utils.py +117 -47
  22. workbench/web_interface/components/model_plot.py +7 -1
  23. workbench/web_interface/components/plugin_unit_test.py +5 -2
  24. workbench/web_interface/components/plugins/model_details.py +9 -7
  25. {workbench-0.8.192.dist-info → workbench-0.8.197.dist-info}/METADATA +23 -2
  26. {workbench-0.8.192.dist-info → workbench-0.8.197.dist-info}/RECORD +30 -27
  27. {workbench-0.8.192.dist-info → workbench-0.8.197.dist-info}/licenses/LICENSE +1 -1
  28. {workbench-0.8.192.dist-info → workbench-0.8.197.dist-info}/WHEEL +0 -0
  29. {workbench-0.8.192.dist-info → workbench-0.8.197.dist-info}/entry_points.txt +0 -0
  30. {workbench-0.8.192.dist-info → workbench-0.8.197.dist-info}/top_level.txt +0 -0
workbench/model_scripts/uq_models/generated_model_script.py
@@ -0,0 +1,589 @@
+ # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+ from mapie.regression import ConformalizedQuantileRegressor
+ from lightgbm import LGBMRegressor
+ from xgboost import XGBRegressor
+ from sklearn.model_selection import train_test_split
+
+ # Model Performance Scores
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
+
+ from io import StringIO
+ import json
+ import argparse
+ import joblib
+ import os
+ import numpy as np
+ import pandas as pd
+ from typing import List, Tuple, Optional, Dict
+
+ # Template Placeholders
+ TEMPLATE_PARAMS = {
+     "target": "udm_asy_res_efflux_ratio",
+ "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo', 'tertiary_amine_count', 'type_i_pattern_count', 'type_ii_pattern_count', 'aromatic_interaction_score', 'molecular_axis_length', 'molecular_asymmetry', 'molecular_volume_3d', 'radius_of_gyration', 'asphericity'],
+     "compressed_features": [],
+     "train_all_data": True,
+     "hyperparameters": {},
+ }
+
+
+ def compute_confidence(
+     df: pd.DataFrame,
+     median_interval_width: float,
+     lower_q: str = "q_10",
+     upper_q: str = "q_90",
+     alpha: float = 1.0,
+     beta: float = 1.0,
+ ) -> pd.DataFrame:
+     """
+     Compute confidence scores (0.0 to 1.0) based on prediction interval width
+     and distance from median using exponential decay.
+
+     Args:
+         df: DataFrame with 'prediction', 'q_50', and quantile columns
+         median_interval_width: Pre-computed median interval width from training data
+         lower_q: Lower quantile column name (default: 'q_10')
+         upper_q: Upper quantile column name (default: 'q_90')
+         alpha: Weight for interval width term (default: 1.0)
+         beta: Weight for distance from median term (default: 1.0)
+
+     Returns:
+         DataFrame with added 'confidence' column
+     """
+     # Interval width
+     interval_width = (df[upper_q] - df[lower_q]).abs()
+
+     # Distance from median, normalized by interval width
+     distance_from_median = (df['prediction'] - df['q_50']).abs()
+     normalized_distance = distance_from_median / (interval_width + 1e-6)
+
+     # Cap the distance penalty at 1.0
+     normalized_distance = np.minimum(normalized_distance, 1.0)
+
+     # Confidence using exponential decay
+     interval_term = interval_width / median_interval_width
+     df['confidence'] = np.exp(-(alpha * interval_term + beta * normalized_distance))
+
+     return df
+
+
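As a quick illustrative check of the scoring above (toy numbers, not from the package; it assumes compute_confidence from this script is already in scope), a wide interval with the prediction sitting on the median decays to roughly exp(-1) ≈ 0.37, while a narrow interval with only a small offset from the median scores higher:

    import pandas as pd

    # Toy rows: row 1 has a wide q_10..q_90 interval centered on the prediction,
    # row 2 has a narrow interval with the prediction slightly off the median.
    toy = pd.DataFrame({
        "prediction": [1.0, 1.0],
        "q_50": [1.0, 0.9],
        "q_10": [0.5, 0.9],
        "q_90": [1.5, 1.1],
    })
    # median_interval_width=1.0 is an assumed training-set statistic for this sketch
    toy = compute_confidence(toy, median_interval_width=1.0)
    print(toy["confidence"].round(2).tolist())  # approximately [0.37, 0.5]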
+ # Function to check if dataframe is empty
+ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+     """
+     Check if the provided dataframe is empty and raise an exception if it is.
+
+     Args:
+         df (pd.DataFrame): DataFrame to check
+         df_name (str): Name of the DataFrame
+     """
+     if df.empty:
+         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+         print(msg)
+         raise ValueError(msg)
+
+
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+     """
+     Matches and renames DataFrame columns to match model feature names (case-insensitive).
+     Prioritizes exact matches, then case-insensitive matches.
+
+     Raises ValueError if any model features cannot be matched.
+     """
+     df_columns_lower = {col.lower(): col for col in df.columns}
+     rename_dict = {}
+     missing = []
+     for feature in model_features:
+         if feature in df.columns:
+             continue  # Exact match
+         elif feature.lower() in df_columns_lower:
+             rename_dict[df_columns_lower[feature.lower()]] = feature
+         else:
+             missing.append(feature)
+
+     if missing:
+         raise ValueError(f"Features not found: {missing}")
+
+     # Rename the DataFrame columns to match the model features
+     return df.rename(columns=rename_dict)
+
+
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
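The decompression helper above is easiest to see on a tiny example (hypothetical column names and values; in this particular generated script compressed_features is empty, so the loop never runs, and decompress_features is assumed to be in scope). A bitstring column is expanded into one uint8 column per bit, prefixed with the first three letters of the original column name:

    import pandas as pd

    toy = pd.DataFrame({"fingerprint": ["1010", "0110"], "mollogp": [2.1, 3.4]})
    toy, feats = decompress_features(toy, ["fingerprint", "mollogp"], ["fingerprint"])
    print(feats)                 # ['mollogp', 'fin_0', 'fin_1', 'fin_2', 'fin_3']
    print(toy.columns.tolist())  # ['mollogp', 'fin_0', 'fin_1', 'fin_2', 'fin_3']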
+ if __name__ == "__main__":
+     # Template Parameters
+     target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
+     validation_split = 0.2
+
+     # Script arguments for input/output directories
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+     parser.add_argument(
+         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+     )
+     args = parser.parse_args()
+
+     # Read the training data into DataFrames
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+     print(f"Training Files: {training_files}")
+
+     # Combine files and read them all into a single pandas dataframe
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")
+
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)
+
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
+     if train_all_data:
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
+     else:
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")
+
+     # Prepare features and targets for training
+     X_train = df_train[features]
+     X_validate = df_val[features]
+     y_train = df_train[target]
+     y_validate = df_val[target]
+
+     # Train XGBoost for point predictions
+     print("\nTraining XGBoost for point predictions...")
+     print(f" Hyperparameters: {hyperparameters}")
+     xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
+     xgb_model.fit(X_train, y_train)
+
+     # Evaluate XGBoost performance
+     y_pred_xgb = xgb_model.predict(X_validate)
+     xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+     xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+     xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+     print(f"\nXGBoost Point Prediction Performance:")
+     print(f"RMSE: {xgb_rmse:.3f}")
+     print(f"MAE: {xgb_mae:.3f}")
+     print(f"R2: {xgb_r2:.3f}")
+
+     # Define confidence levels we want to model
+     confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals
+
+     # Store MAPIE models for each confidence level
+     mapie_models = {}
+
+     # Train models for each confidence level
+     for confidence_level in confidence_levels:
+         alpha = 1 - confidence_level
+         lower_q = alpha / 2
+         upper_q = 1 - alpha / 2
+
+         print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+         print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+         # Train three models for this confidence level
+         quantile_estimators = []
+         for q in [lower_q, upper_q, 0.5]:
+             print(f" Training model for quantile {q:.3f}...")
+             est = LGBMRegressor(
+                 objective="quantile",
+                 alpha=q,
+                 n_estimators=1000,
+                 max_depth=6,
+                 learning_rate=0.01,
+                 num_leaves=31,
+                 min_child_samples=20,
+                 subsample=0.8,
+                 colsample_bytree=0.8,
+                 random_state=42,
+                 verbose=-1,
+                 force_col_wise=True,
+             )
+             est.fit(X_train, y_train)
+             quantile_estimators.append(est)
+
+         # Create MAPIE CQR model for this confidence level
+         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+         mapie_model = ConformalizedQuantileRegressor(
+             quantile_estimators, confidence_level=confidence_level, prefit=True
+         )
+
+         # Conformalize the model
+         print(f" Conformalizing with validation data...")
+         mapie_model.conformalize(X_validate, y_validate)
+
+         # Store the model
+         mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+         # Validate coverage for this confidence level
+         y_pred, y_pis = mapie_model.predict_interval(X_validate)
+         coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+         print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
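A note on the indexing used throughout the rest of the script: each ConformalizedQuantileRegressor is built for a single confidence level, so the y_pis array returned by predict_interval carries one (lower, upper) pair per row, read as y_pis[:, 0, 0] and y_pis[:, 1, 0]. A minimal sketch of that convention, assuming a fitted and conformalized mapie_model and the X_validate frame from above:

    y_pred, y_pis = mapie_model.predict_interval(X_validate)
    lower, upper = y_pis[:, 0, 0], y_pis[:, 1, 0]  # interval bounds for this model's single confidence level
    assert y_pis.shape[1] == 2                     # axis 1 holds the lower/upper pair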
+     print(f"\nOverall Model Performance Summary:")
+     print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+     print(f"XGBoost MAE: {xgb_mae:.3f}")
+     print(f"XGBoost R2: {xgb_r2:.3f}")
+     print(f"NumRows: {len(df_val)}")
+
+     # Analyze interval widths across confidence levels
+     print(f"\nInterval Width Analysis:")
+     for conf_level in confidence_levels:
+         model = mapie_models[f"mapie_{conf_level:.2f}"]
+         _, y_pis = model.predict_interval(X_validate)
+         widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+         print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
+     # Compute normalization statistics for confidence calculation
+     print(f"\nComputing normalization statistics for confidence scores...")
+
+     # Create a temporary validation dataframe with predictions
+     temp_val_df = df_val.copy()
+     temp_val_df["prediction"] = xgb_model.predict(X_validate)
+
+     # Add all quantile predictions
+     for conf_level in confidence_levels:
+         model_name = f"mapie_{conf_level:.2f}"
+         model = mapie_models[model_name]
+         y_pred, y_pis = model.predict_interval(X_validate)
+
+         if conf_level == 0.50:
+             temp_val_df["q_25"] = y_pis[:, 0, 0]
+             temp_val_df["q_75"] = y_pis[:, 1, 0]
+             # y_pred is the median prediction
+             temp_val_df["q_50"] = y_pred
+         elif conf_level == 0.68:
+             temp_val_df["q_16"] = y_pis[:, 0, 0]
+             temp_val_df["q_84"] = y_pis[:, 1, 0]
+         elif conf_level == 0.80:
+             temp_val_df["q_10"] = y_pis[:, 0, 0]
+             temp_val_df["q_90"] = y_pis[:, 1, 0]
+         elif conf_level == 0.90:
+             temp_val_df["q_05"] = y_pis[:, 0, 0]
+             temp_val_df["q_95"] = y_pis[:, 1, 0]
+         elif conf_level == 0.95:
+             temp_val_df["q_025"] = y_pis[:, 0, 0]
+             temp_val_df["q_975"] = y_pis[:, 1, 0]
+
+     # Compute normalization stats using q_10 and q_90 (default range)
+     interval_width = (temp_val_df["q_90"] - temp_val_df["q_10"]).abs()
+     median_interval_width = float(interval_width.median())
+     print(f" Median interval width (q_10-q_90): {median_interval_width:.6f}")
+
+     # Save median interval width for confidence calculation
+     with open(os.path.join(args.model_dir, "median_interval_width.json"), "w") as fp:
+         json.dump(median_interval_width, fp)
+
+     # Save the trained XGBoost model
+     joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+     # Save all MAPIE models
+     for model_name, model in mapie_models.items():
+         joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
+
+     # Save the feature list
+     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+         json.dump(features, fp)
+
+     # Save category mappings if any
+     if category_mappings:
+         with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+             json.dump(category_mappings, fp)
+
+     # Save model configuration
+     model_config = {
+         "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+         "confidence_levels": confidence_levels,
+         "n_features": len(features),
+         "target": target,
+         "validation_metrics": {
+             "xgb_rmse": float(xgb_rmse),
+             "xgb_mae": float(xgb_mae),
+             "xgb_r2": float(xgb_r2),
+             "n_validation": len(df_val),
+         },
+     }
+     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+         json.dump(model_config, fp, indent=2)
+
+     print(f"\nModel training complete!")
+     print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
+
+
+ #
+ # Inference Section
+ #
+ def model_fn(model_dir) -> dict:
+     """Load XGBoost and all MAPIE models from the specified directory."""
+
+     # Load model configuration to know which models to load
+     with open(os.path.join(model_dir, "model_config.json")) as fp:
+         config = json.load(fp)
+
+     # Load XGBoost regressor
+     xgb_path = os.path.join(model_dir, "xgb_model.joblib")
+     xgb_model = joblib.load(xgb_path)
+
+     # Load all MAPIE models
+     mapie_models = {}
+     for conf_level in config["confidence_levels"]:
+         model_name = f"mapie_{conf_level:.2f}"
+         mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
+
+     # Load category mappings if they exist
+     category_mappings = {}
+     category_path = os.path.join(model_dir, "category_mappings.json")
+     if os.path.exists(category_path):
+         with open(category_path) as fp:
+             category_mappings = json.load(fp)
+
+     # Load median interval width for confidence calculation
+     median_interval_width = None
+     median_width_path = os.path.join(model_dir, "median_interval_width.json")
+     if os.path.exists(median_width_path):
+         with open(median_width_path) as fp:
+             median_interval_width = json.load(fp)
+
+     return {
+         "xgb_model": xgb_model,
+         "mapie_models": mapie_models,
+         "confidence_levels": config["confidence_levels"],
+         "category_mappings": category_mappings,
+         "median_interval_width": median_interval_width,
+     }
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         # Convert categorical columns to string to avoid fillna issues
+         for col in output_df.select_dtypes(include=["category"]).columns:
+             output_df[col] = output_df[col].astype(str)
+         csv_output = output_df.fillna("N/A").to_csv(index=False)
+         return csv_output, "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ def predict_fn(df, models) -> pd.DataFrame:
+     """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
+
+     Args:
+         df (pd.DataFrame): The input DataFrame
+         models (dict): Dictionary containing XGBoost and MAPIE models
+
+     Returns:
+         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
+     """
+
+     # Flag for outlier stretch adjustment for the prediction intervals
+     # if the predicted values are outside the intervals
+     outlier_stretch = False
+
+     # Grab our feature columns (from training)
+     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+         model_features = json.load(fp)
+
+     # Match features in a case-insensitive manner
+     matched_df = match_features_case_insensitive(df, model_features)
+
+     # Apply categorical mappings if they exist
+     if models.get("category_mappings"):
+         matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
+
+     # Get features for prediction
+     X = matched_df[model_features]
+
+     # Get XGBoost point predictions
+     df["prediction"] = models["xgb_model"].predict(X)
+
+     # Get predictions from each MAPIE model for conformalized intervals
+     for conf_level in models["confidence_levels"]:
+         model_name = f"mapie_{conf_level:.2f}"
+         model = models["mapie_models"][model_name]
+
+         # Get conformalized predictions
+         y_pred, y_pis = model.predict_interval(X)
+
+         # Map confidence levels to quantile names
+         if conf_level == 0.50:  # 50% CI
+             df["q_25"] = y_pis[:, 0, 0]
+             df["q_75"] = y_pis[:, 1, 0]
+             # y_pred is the median prediction
+             df["q_50"] = y_pred
+         elif conf_level == 0.68:  # 68% CI
+             df["q_16"] = y_pis[:, 0, 0]
+             df["q_84"] = y_pis[:, 1, 0]
+         elif conf_level == 0.80:  # 80% CI
+             df["q_10"] = y_pis[:, 0, 0]
+             df["q_90"] = y_pis[:, 1, 0]
+         elif conf_level == 0.90:  # 90% CI
+             df["q_05"] = y_pis[:, 0, 0]
+             df["q_95"] = y_pis[:, 1, 0]
+         elif conf_level == 0.95:  # 95% CI
+             df["q_025"] = y_pis[:, 0, 0]
+             df["q_975"] = y_pis[:, 1, 0]
+
+     # Calculate a pseudo-standard deviation from the 68% interval width
+     df["prediction_std"] = (df["q_84"] - df["q_16"]).abs() / 2.0
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_50", "q_75", "q_84", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]
+
+     # Adjust the outer quantiles to ensure they encompass the prediction
+     if outlier_stretch:
+         # Lower intervals adjustments
+         df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+         df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+         df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+         df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+         df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+         # Upper intervals adjustments
+         df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+         df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+         df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+         df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+         df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+
+     # Compute confidence scores using pre-computed normalization stats
+     df = compute_confidence(
+         df,
+         lower_q="q_10",
+         upper_q="q_90",
+         alpha=1.0,
+         beta=1.0,
+         median_interval_width=models["median_interval_width"],
+     )
+
+     return df
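Taken together, the four inference hooks above follow the usual SageMaker inference convention: model_fn loads the artifacts once, then each request flows through input_fn, predict_fn, and output_fn. A minimal local sketch of that flow, assuming the trained artifacts have been copied to a local ./model directory (hypothetical path) and that SM_MODEL_DIR points at it so predict_fn can locate feature_columns.json:

    import os

    os.environ["SM_MODEL_DIR"] = "./model"    # hypothetical local copy of the model artifacts

    models = model_fn("./model")              # XGBoost model plus the five MAPIE models
    payload = open("sample_rows.csv").read()  # hypothetical CSV containing the feature columns
    df_in = input_fn(payload, "text/csv")
    df_out = predict_fn(df_in, models)
    body, content_type = output_fn(df_out, "text/csv")
    print(df_out[["prediction", "q_10", "q_90", "confidence"]].head())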