workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (27):
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +67 -8
  4. workbench/core/views/training_view.py +38 -48
  5. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  6. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  7. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  8. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  9. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
  10. workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
  11. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  12. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  13. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  14. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  15. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  16. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  17. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  18. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  19. workbench/scripts/ml_pipeline_sqs.py +14 -2
  20. workbench/utils/model_utils.py +12 -2
  21. workbench/utils/xgboost_model_utils.py +161 -138
  22. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
  23. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
  24. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
  25. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
  26. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
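The most consequential change across the model-script templates below is a new `hyperparameters` entry in `TEMPLATE_PARAMS`, which each training script now unpacks directly into the `XGBRegressor` constructor. A minimal sketch of that pattern, with a hypothetical rendered dict standing in for the values Workbench injects when it renders a template:

```python
from xgboost import XGBRegressor

# Hypothetical rendered values; in the real scripts this dict is filled in
# when Workbench renders the template (see the diffs below).
TEMPLATE_PARAMS = {
    "hyperparameters": {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05},
}

hyperparameters = TEMPLATE_PARAMS["hyperparameters"]

# enable_categorical stays hard-coded; everything else now comes from the template
xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
```

The other recurring edits are cosmetic reformatting (joined multi-line calls, trailing commas, double-quoted strings) plus a new 68% confidence interval and an optional outlier-stretch step in the MAPIE-based scripts.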
workbench/model_scripts/custom_models/uq_models/generated_model_script.py

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -22,10 +18,11 @@ from typing import List, Tuple

 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "target": "udm_asy_res_value",
-    "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+    "target": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
     "compressed_features": [],
-    "train_all_data": True
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@ if __name__ == "__main__":

     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(enable_categorical=True)
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)

     # Evaluate XGBoost performance
@@ -242,7 +235,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")

     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

     # Store MAPIE models for each confidence level
     mapie_models = {}
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )

         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=['category']).columns:
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

     # Get features for prediction
     X = matched_df[model_features]
@@ -459,6 +450,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +466,28 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate uncertainty metrics based on 95% interval
-    interval_width = df["q_975"] - df["q_025"]
-    df["prediction_std"] = interval_width / 3.92
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    # Uncertainty score
-    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-    # Confidence bands
-    df["confidence_band"] = pd.cut(
-        df["uncertainty_score"],
-        bins=[0, 0.5, 1.0, 2.0, np.inf],
-        labels=["high", "medium", "low", "very_low"]
-    )
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])

     return df

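Both MAPIE scripts swap the old interval-width uncertainty metrics for a pseudo-standard deviation taken from the new 68% interval: under a roughly normal error distribution the central 68% interval spans about ±1 standard deviation, so half its width is a σ estimate. A small sketch of the calculation and the optional outlier stretch on a toy DataFrame (column names follow the diff above; the data is made up):

```python
import numpy as np
import pandas as pd

# Toy rows with 16th/84th percentile bounds; the second prediction falls below its interval
df = pd.DataFrame({
    "prediction": [1.0, 2.5],
    "q_16": [0.6, 2.7],
    "q_84": [1.4, 3.3],
})

# Half of the 68% interval width ~ one standard deviation under a normal error model
df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0  # 0.4 and 0.3

# Optional "outlier stretch": widen the bounds so they always contain the prediction
outlier_stretch = True
if outlier_stretch:
    df["q_16"] = np.minimum(df["q_16"], df["prediction"])  # second row stretches down to 2.5
    df["q_84"] = np.maximum(df["q_84"], df["prediction"])
```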
workbench/model_scripts/custom_models/uq_models/mapie.template

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -25,7 +21,8 @@ TEMPLATE_PARAMS = {
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@ if __name__ == "__main__":

     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(enable_categorical=True)
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)

     # Evaluate XGBoost performance
@@ -242,7 +235,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")

     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

     # Store MAPIE models for each confidence level
     mapie_models = {}
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )

         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=['category']).columns:
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

     # Get features for prediction
     X = matched_df[model_features]
@@ -459,6 +450,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +466,28 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate uncertainty metrics based on 50% interval
-    interval_width = df["q_75"] - df["q_25"]
-    df["prediction_std"] = interval_width / 1.348
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    # Uncertainty score
-    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-    # Confidence bands
-    df["confidence_band"] = pd.cut(
-        df["uncertainty_score"],
-        bins=[0, 0.5, 1.0, 2.0, np.inf],
-        labels=["high", "medium", "low", "very_low"]
-    )
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])

     return df

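With 0.68 added to `confidence_levels`, each conformalized quantile regressor now maps to a fixed pair of quantile columns in `predict_fn`. The mapping implied by the if/elif chain above, written out as a plain lookup (a sketch, not code from the package):

```python
# Confidence level -> (lower, upper) quantile columns written by predict_fn
CI_COLUMNS = {
    0.50: ("q_25", "q_75"),
    0.68: ("q_16", "q_84"),  # new in 0.8.179
    0.80: ("q_10", "q_90"),
    0.90: ("q_05", "q_95"),
    0.95: ("q_025", "q_975"),
}

for conf_level, (lower, upper) in CI_COLUMNS.items():
    print(f"{conf_level:.2f} interval -> {lower} / {upper}")
```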
workbench/model_scripts/custom_models/uq_models/meta_uq.template

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -24,7 +20,6 @@ from typing import List, Tuple
 from proximity import Proximity


-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
@@ -32,7 +27,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }


@@ -183,11 +178,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -222,9 +213,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -289,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)

-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


 def input_fn(input_data, content_type):
@@ -353,8 +338,8 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ

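The loc/scale lines that change here (single quotes to double quotes) are the templates' whole NGBoost uncertainty path: the predictive distribution's parameters supply the mean and standard deviation, and its `ppf` supplies interval bounds. A self-contained sketch on synthetic data, assuming only the public NGBoost API used in the diff:

```python
import numpy as np
import pandas as pd
from ngboost import NGBRegressor

# Small synthetic regression problem (made up, just to exercise the API)
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))
y = 2.0 * X[:, 0] + rng.normal(scale=0.5, size=200)

ngb_model = NGBRegressor(n_estimators=100, verbose=False).fit(X, y)

y_dists = ngb_model.pred_dist(X)
dist_params = y_dists.params

df = pd.DataFrame(X, columns=["f0", "f1", "f2"])
df["prediction"] = dist_params["loc"]        # mean of the predictive distribution
df["prediction_std"] = dist_params["scale"]  # its standard deviation
df["q_025"] = y_dists.ppf(0.025)             # lower bound of a 95% interval
df["q_975"] = y_dists.ppf(0.975)             # upper bound
```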
workbench/model_scripts/custom_models/uq_models/ngboost.template

@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -212,8 +205,8 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile

workbench/model_scripts/ensemble_xgb/ensemble_xgb.template

@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }

 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)

+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()

-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")

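The ensemble template's validation step, which this release only touches for whitespace, computes per-row residuals and pushes them to S3 with awswrangler. A hedged sketch of that flow; the bucket path and file name here are placeholders, not the package's actual values:

```python
import awswrangler as wr
import pandas as pd

# Placeholder for TEMPLATE_PARAMS["model_metrics_s3_path"] after rendering
model_metrics_s3_path = "s3://example-bucket/models/example/metrics"

result_df = pd.DataFrame({"target": [1.2, 0.8], "prediction": [1.0, 0.9]})
result_df["residual"] = result_df["target"] - result_df["prediction"]
result_df["residual_abs"] = result_df["residual"].abs()

# awswrangler writes the CSV straight to S3 (hypothetical object name)
wr.s3.to_csv(result_df, f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
```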