workbench 0.8.171__py3-none-any.whl → 0.8.172__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -87,8 +87,8 @@ class FeatureSet(FeatureSetCore):
  model_import_str: str = None,
  custom_script: Union[str, Path] = None,
  custom_args: dict = None,
- training_image: str = "xgb_training",
- inference_image: str = "xgb_inference",
+ training_image: str = "training",
+ inference_image: str = "inference",
  inference_arch: str = "x86_64",
  **kwargs,
  ) -> Union[Model, None]:
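Only the default image names change here; callers that relied on the defaults pick up the new images automatically, while code that pinned the old names must be updated. A hedged sketch of the rename (the FeatureSet name and any other to_model() arguments are illustrative placeholders, not taken from this diff):

from workbench.api import FeatureSet

fs = FeatureSet("test_features")  # hypothetical FeatureSet name

# 0.8.171 explicit pins that need updating:
#   fs.to_model(..., training_image="xgb_training", inference_image="xgb_inference")
# 0.8.172 equivalents (also the new defaults):
#   fs.to_model(..., training_image="training", inference_image="inference")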
@@ -105,8 +105,8 @@ class FeatureSet(FeatureSetCore):
  model_class (str, optional): Model class to use (e.g. "KMeans", "PyTorch", default: None)
  model_import_str (str, optional): The import for the model (e.g. "from sklearn.cluster import KMeans")
  custom_script (str, optional): The custom script to use for the model (default: None)
- training_image (str, optional): The training image to use (default: "xgb_training")
- inference_image (str, optional): The inference image to use (default: "xgb_inference")
+ training_image (str, optional): The training image to use (default: "training")
+ inference_image (str, optional): The inference image to use (default: "inference")
  inference_arch (str, optional): The architecture to use for inference (default: "x86_64")
  kwargs (dict, optional): Additional keyword arguments to pass to the model

@@ -236,6 +236,12 @@ class Artifact(ABC):
  This functionality will work for FeatureSets, Models, and Endpoints
  but not for DataSources. The DataSource class overrides this method.
  """
+
+ # Check for ReadOnly Role
+ if self.aws_account_clamp.read_only_role:
+ self.log.info("Cannot add metadata with a ReadOnly Role...")
+ return
+
  # Sanity check
  aws_arn = self.arn()
  if aws_arn is None:
@@ -444,10 +450,12 @@ class Artifact(ABC):

  if __name__ == "__main__":
  """Exercise the Artifact Class"""
- from workbench.api.data_source import DataSource
- from workbench.api.feature_set import FeatureSet
+ from workbench.api import DataSource, FeatureSet, Endpoint
+
+ # Grab an Endpoint (which is a subclass of Artifact)
+ end = Endpoint("wine-classification")

- # Create a DataSource (which is a subclass of Artifact)
+ # Grab a DataSource (which is a subclass of Artifact)
  data_source = DataSource("test_data")

  # Just some random tests
@@ -37,16 +37,45 @@ class ModelType(Enum):
  UNKNOWN = "unknown"


+ # Deprecated Images
+ """
+ # US East 1 images
+ "py312-general-ml-training"
+ ("us-east-1", "training", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+ ),
+ ("us-east-1", "inference", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+ ),
+
+ # US West 2 images
+ ("us-west-2", "training", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+ ),
+ ("us-west-2", "inference", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+ ),
+
+ # ARM64 images
+ ("us-east-1", "inference", "0.1", "arm64"): (
+ "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
+ ),
+ ("us-west-2", "inference", "0.1", "arm64"): (
+ "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
+ ),
+ """
+
+
  class ModelImages:
  """Class for retrieving workbench inference images"""

  image_uris = {
  # US East 1 images
- ("us-east-1", "xgb_training", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+ ("us-east-1", "training", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
  ),
- ("us-east-1", "xgb_inference", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+ ("us-east-1", "inference", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1"
  ),
  ("us-east-1", "pytorch_training", "0.1", "x86_64"): (
  "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-pytorch-training:0.1"
@@ -55,11 +84,11 @@ class ModelImages:
  "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.1"
  ),
  # US West 2 images
- ("us-west-2", "xgb_training", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+ ("us-west-2", "training", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
  ),
- ("us-west-2", "xgb_inference", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+ ("us-west-2", "inference", "0.1", "x86_64"): (
+ "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1"
  ),
  ("us-west-2", "pytorch_training", "0.1", "x86_64"): (
  "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-training:0.1"
@@ -68,12 +97,6 @@ class ModelImages:
  "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.1"
  ),
  # ARM64 images
- ("us-east-1", "xgb_inference", "0.1", "arm64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
- ),
- ("us-west-2", "xgb_inference", "0.1", "arm64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1-arm64"
- ),
  # Meta Endpoint inference images
  ("us-east-1", "meta-endpoint", "0.1", "x86_64"): (
  "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-meta-endpoint:0.1"
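The registry keys drop the xgb_ prefix and the x86_64 URIs now point at the py312-general-ml images (the old sklearn-xgb entries survive only in the deprecated comment block above). A self-contained sketch of the lookup, using a trimmed copy of the ModelImages.image_uris entries shown in this hunk:

image_uris = {
    ("us-east-1", "training", "0.1", "x86_64"):
        "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1",
    ("us-east-1", "inference", "0.1", "x86_64"):
        "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1",
}

# A 0.8.171-style key such as ("us-east-1", "xgb_training", "0.1", "x86_64") is gone and would KeyError
print(image_uris[("us-east-1", "training", "0.1", "x86_64")])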
@@ -54,7 +54,10 @@ class AWSAccountClamp:

  # Check our Assume Role
  self.log.info("Checking Workbench Assumed Role...")
- self.aws_session.assumed_role_info()
+ role_info = self.aws_session.assumed_role_info()
+
+ # Check if the Role is a 'ReadOnly' role
+ self.read_only_role = "readonly" in role_info["AssumedRoleArn"].lower()

  # Check our Workbench API Key and Load the License
  self.log.info("Checking Workbench API License...")
@@ -37,8 +37,8 @@ class FeaturesToModel(Transform):
  model_import_str=None,
  custom_script=None,
  custom_args=None,
- training_image="xgb_training",
- inference_image="xgb_inference",
+ training_image="training",
+ inference_image="inference",
  inference_arch="x86_64",
  ):
  """FeaturesToModel Initialization
@@ -50,8 +50,8 @@ class FeaturesToModel(Transform):
  model_import_str (str, optional): The import string for the model (default None)
  custom_script (str, optional): Custom script to use for the model (default None)
  custom_args (dict, optional): Custom arguments to pass to custom model scripts (default None)
- training_image (str, optional): Training image (default "xgb_training")
- inference_image (str, optional): Inference image (default "xgb_inference")
+ training_image (str, optional): Training image (default "training")
+ inference_image (str, optional): Inference image (default "inference")
  inference_arch (str, optional): Inference architecture (default "x86_64")
  """

@@ -1,6 +1,7 @@
- # Model: NGBoost Regressor with Distribution output
- from ngboost import NGBRegressor
- from xgboost import XGBRegressor # Base Estimator
+ # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+ from mapie.regression import ConformalizedQuantileRegressor
+ from lightgbm import LGBMRegressor
+ from xgboost import XGBRegressor
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
@@ -19,19 +20,12 @@ import numpy as np
  import pandas as pd
  from typing import List, Tuple

- # Local Imports
- from proximity import Proximity
-
-
-
  # Template Placeholders
  TEMPLATE_PARAMS = {
- "id_column": "udm_mol_bat_id",
- "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+ "target": "udm_asy_res_value",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'chiral_centers', 'r_cnt', 's_cnt', 'db_stereo', 'e_cnt', 'z_cnt', 'chiral_fp', 'db_fp'],
  "compressed_features": [],
- "train_all_data": False,
- "track_columns": ['udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein']
+ "train_all_data": True
  }


@@ -106,8 +100,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features

  Args:
  df (pd.DataFrame): The features DataFrame
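The body of decompress_features is not part of this hunk; as a hedged illustration of what the reworded docstring describes (the package's exact expansion scheme may differ), decompressing a bitstring feature typically means turning each character of a fixed-width fingerprint column into its own numeric column:

import pandas as pd

# Illustrative only: a 4-bit fingerprint column expanded into per-bit columns
df = pd.DataFrame({"chiral_fp": ["0101", "1100"]})
bits = df["chiral_fp"].apply(lambda s: pd.Series([int(b) for b in s]))
bits.columns = [f"chiral_fp_{i}" for i in range(bits.shape[1])]
df = pd.concat([df.drop(columns=["chiral_fp"]), bits], axis=1)
print(df.columns.tolist())  # ['chiral_fp_0', 'chiral_fp_1', 'chiral_fp_2', 'chiral_fp_3']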
@@ -132,7 +128,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
  )

  # Decompress the specified compressed features
- decompressed_features = features
+ decompressed_features = features.copy()
  for feature in compressed_features:
  if (feature not in df.columns) or (feature not in features):
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -161,13 +157,11 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur

  if __name__ == "__main__":
  # Template Parameters
- id_column = TEMPLATE_PARAMS["id_column"]
  target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
- track_columns = TEMPLATE_PARAMS["track_columns"] # Can be None
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -225,71 +219,175 @@ if __name__ == "__main__":
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

- # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
- xgb_model = XGBRegressor()
- ngb_model = NGBRegressor()
-
  # Prepare features and targets for training
  X_train = df_train[features]
  X_validate = df_val[features]
  y_train = df_train[target]
  y_validate = df_val[target]

- # Train both models using the training data
+ # Train XGBoost for point predictions
+ print("\nTraining XGBoost for point predictions...")
+ xgb_model = XGBRegressor(
+ n_estimators=1000,
+ max_depth=6,
+ learning_rate=0.01,
+ subsample=0.8,
+ colsample_bytree=0.8,
+ random_state=42,
+ verbosity=0
+ )
  xgb_model.fit(X_train, y_train)
- ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
-
- # Make Predictions on the Validation Set
- print(f"Making Predictions on Validation Set...")
- preds = xgb_model.predict(X_validate)
-
- # Calculate various model performance metrics (regression)
- rmse = root_mean_squared_error(y_validate, preds)
- mae = mean_absolute_error(y_validate, preds)
- r2 = r2_score(y_validate, preds)
- print(f"RMSE: {rmse:.3f}")
- print(f"MAE: {mae:.3f}")
- print(f"R2: {r2:.3f}")
+
+ # Evaluate XGBoost performance
+ y_pred_xgb = xgb_model.predict(X_validate)
+ xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+ xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+ xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+ print(f"\nXGBoost Point Prediction Performance:")
+ print(f"RMSE: {xgb_rmse:.3f}")
+ print(f"MAE: {xgb_mae:.3f}")
+ print(f"R2: {xgb_r2:.3f}")
+
+ # Define confidence levels we want to model
+ confidence_levels = [0.50, 0.80, 0.90, 0.95] # 50%, 80%, 90%, 95% confidence intervals
+
+ # Store MAPIE models for each confidence level
+ mapie_models = {}
+
+ # Train models for each confidence level
+ for confidence_level in confidence_levels:
+ alpha = 1 - confidence_level
+ lower_q = alpha / 2
+ upper_q = 1 - alpha / 2
+
+ print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+ print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+ # Train three models for this confidence level
+ quantile_estimators = []
+ for q in [lower_q, upper_q, 0.5]:
+ print(f" Training model for quantile {q:.3f}...")
+ est = LGBMRegressor(
+ objective="quantile",
+ alpha=q,
+ n_estimators=1000,
+ max_depth=6,
+ learning_rate=0.01,
+ num_leaves=31,
+ min_child_samples=20,
+ subsample=0.8,
+ colsample_bytree=0.8,
+ random_state=42,
+ verbose=-1,
+ force_col_wise=True
+ )
+ est.fit(X_train, y_train)
+ quantile_estimators.append(est)
+
+ # Create MAPIE CQR model for this confidence level
+ print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+ mapie_model = ConformalizedQuantileRegressor(
+ quantile_estimators,
+ confidence_level=confidence_level,
+ prefit=True
+ )
+
+ # Conformalize the model
+ print(f" Conformalizing with validation data...")
+ mapie_model.conformalize(X_validate, y_validate)
+
+ # Store the model
+ mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+ # Validate coverage for this confidence level
+ y_pred, y_pis = mapie_model.predict_interval(X_validate)
+ coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+ print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+ print(f"\nOverall Model Performance Summary:")
+ print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+ print(f"XGBoost MAE: {xgb_mae:.3f}")
+ print(f"XGBoost R2: {xgb_r2:.3f}")
  print(f"NumRows: {len(df_val)}")

+ # Analyze interval widths across confidence levels
+ print(f"\nInterval Width Analysis:")
+ for conf_level in confidence_levels:
+ model = mapie_models[f"mapie_{conf_level:.2f}"]
+ _, y_pis = model.predict_interval(X_validate)
+ widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+ print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
  # Save the trained XGBoost model
  xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))

- # Save the trained NGBoost model
- joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
+ # Save all MAPIE models
+ for model_name, model in mapie_models.items():
+ joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))

- # Save the features (this will validate input during predictions)
+ # Save the feature list
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
-
- # Now the Proximity model
- model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
+ json.dump(features, fp)
+
+ # Save category mappings if any
+ if category_mappings:
+ with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+ json.dump(category_mappings, fp)
+
+ # Save model configuration
+ model_config = {
+ "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+ "confidence_levels": confidence_levels,
+ "n_features": len(features),
+ "target": target,
+ "validation_metrics": {
+ "xgb_rmse": float(xgb_rmse),
+ "xgb_mae": float(xgb_mae),
+ "xgb_r2": float(xgb_r2),
+ "n_validation": len(df_val)
+ }
+ }
+ with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+ json.dump(model_config, fp, indent=2)

- # Now serialize the model
- model.serialize(args.model_dir)
+ print(f"\nModel training complete!")
+ print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")


  #
  # Inference Section
  #
  def model_fn(model_dir) -> dict:
- """Load and return XGBoost, NGBoost, and Prox Model from model directory."""
+ """Load XGBoost and all MAPIE models from the specified directory."""
+
+ # Load model configuration to know which models to load
+ with open(os.path.join(model_dir, "model_config.json")) as fp:
+ config = json.load(fp)

  # Load XGBoost regressor
  xgb_path = os.path.join(model_dir, "xgb_model.json")
  xgb_model = XGBRegressor(enable_categorical=True)
  xgb_model.load_model(xgb_path)

- # Load NGBoost regressor
- ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))
+ # Load all MAPIE models
+ mapie_models = {}
+ for conf_level in config["confidence_levels"]:
+ model_name = f"mapie_{conf_level:.2f}"
+ mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))

- # Deserialize the proximity model
- prox_model = Proximity.deserialize(model_dir)
+ # Load category mappings if they exist
+ category_mappings = {}
+ category_path = os.path.join(model_dir, "category_mappings.json")
+ if os.path.exists(category_path):
+ with open(category_path) as fp:
+ category_mappings = json.load(fp)

  return {
- "xgboost": xgb_model,
- "ngboost": ngb_model,
- "proximity": prox_model
+ "xgb_model": xgb_model,
+ "mapie_models": mapie_models,
+ "confidence_levels": config["confidence_levels"],
+ "category_mappings": category_mappings
  }
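The confidence-level bookkeeping in the training loop above is plain interval arithmetic: each confidence level c is split symmetrically (alpha = 1 - c, quantiles alpha/2 and 1 - alpha/2), three LightGBM quantile models are fit at those levels plus the median, and MAPIE then conformalizes the pair on the validation split. A small standalone check of the quantile mapping:

confidence_levels = [0.50, 0.80, 0.90, 0.95]
for confidence_level in confidence_levels:
    alpha = 1 - confidence_level
    lower_q, upper_q = alpha / 2, 1 - alpha / 2
    print(f"{confidence_level:.2f} -> quantiles ({lower_q:.3f}, {upper_q:.3f})")
# 0.50 -> quantiles (0.250, 0.750)
# 0.80 -> quantiles (0.100, 0.900)
# 0.90 -> quantiles (0.050, 0.950)
# 0.95 -> quantiles (0.025, 0.975)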
@@ -305,7 +403,7 @@ def input_fn(input_data, content_type):
  if "text/csv" in content_type:
  return pd.read_csv(StringIO(input_data))
  elif "application/json" in content_type:
- return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
+ return pd.DataFrame(json.loads(input_data))
  else:
  raise ValueError(f"{content_type} not supported!")

@@ -313,23 +411,26 @@ def input_fn(input_data, content_type):
  def output_fn(output_df, accept_type):
  """Supports both CSV and JSON output formats."""
  if "text/csv" in accept_type:
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
+ # Convert categorical columns to string to avoid fillna issues
+ for col in output_df.select_dtypes(include=['category']).columns:
+ output_df[col] = output_df[col].astype(str)
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
  return csv_output, "text/csv"
  elif "application/json" in accept_type:
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
+ return output_df.to_json(orient="records"), "application/json"
  else:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


  def predict_fn(df, models) -> pd.DataFrame:
- """Make Predictions with our XGB Quantile Regression Model
+ """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals

  Args:
  df (pd.DataFrame): The input DataFrame
- models (dict): The dictionary of models to use for predictions
+ models (dict): Dictionary containing XGBoost and MAPIE models

  Returns:
- pd.DataFrame: The DataFrame with the predictions added
+ pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
  """

  # Grab our feature columns (from training)
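The categorical-to-string cast in output_fn exists because fillna("N/A") fails on a categorical column unless "N/A" is already one of its categories, and the new confidence_band column produced by predict_fn below is exactly such a column. A hedged sketch of the failure mode and the workaround:

import pandas as pd

df = pd.DataFrame({"confidence_band": pd.Categorical(["high", None, "low"])})

try:
    df.fillna("N/A")  # "N/A" is not a category, so pandas raises
except (TypeError, ValueError) as exc:
    print(f"fillna on categorical failed: {exc}")

# Workaround used above: cast categorical columns to plain strings first
for col in df.select_dtypes(include=["category"]).columns:
    df[col] = df[col].astype(str)
print(df.fillna("N/A").to_csv(index=False))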
@@ -340,44 +441,62 @@ def predict_fn(df, models) -> pd.DataFrame:
  # Match features in a case-insensitive manner
  matched_df = match_features_case_insensitive(df, model_features)

- # Use XGBoost for point predictions
- df["prediction"] = models["xgboost"].predict(matched_df[model_features])
-
- # NGBoost predict returns distribution objects
- y_dists = models["ngboost"].pred_dist(matched_df[model_features])
-
- # Extract parameters from distribution
- dist_params = y_dists.params
-
- # Extract mean and std from distribution parameters
- df["prediction_uq"] = dist_params['loc'] # mean
- df["prediction_std"] = dist_params['scale'] # standard deviation
-
- # Add 95% prediction intervals using ppf (percent point function)
- # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
- # so we need to adjust the bounds to include the point prediction
- df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
- df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
-
- # Add 90% prediction intervals
- df["q_05"] = y_dists.ppf(0.05) # 5th percentile
- df["q_95"] = y_dists.ppf(0.95) # 95th percentile
-
- # Add 80% prediction intervals
- df["q_10"] = y_dists.ppf(0.10) # 10th percentile
- df["q_90"] = y_dists.ppf(0.90) # 90th percentile
+ # Apply categorical mappings if they exist
+ if models.get("category_mappings"):
+ matched_df, _ = convert_categorical_types(
+ matched_df,
+ model_features,
+ models["category_mappings"]
+ )

- # Add 50% prediction intervals
- df["q_25"] = y_dists.ppf(0.25) # 25th percentile
- df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+ # Get features for prediction
+ X = matched_df[model_features]
+
+ # Get XGBoost point predictions
+ df["prediction"] = models["xgb_model"].predict(X)
+
+ # Get predictions from each MAPIE model for conformalized intervals
+ for conf_level in models["confidence_levels"]:
+ model_name = f"mapie_{conf_level:.2f}"
+ model = models["mapie_models"][model_name]
+
+ # Get conformalized predictions
+ y_pred, y_pis = model.predict_interval(X)
+
+ # Map confidence levels to quantile names
+ if conf_level == 0.50: # 50% CI
+ df["q_25"] = y_pis[:, 0, 0]
+ df["q_75"] = y_pis[:, 1, 0]
+ elif conf_level == 0.80: # 80% CI
+ df["q_10"] = y_pis[:, 0, 0]
+ df["q_90"] = y_pis[:, 1, 0]
+ elif conf_level == 0.90: # 90% CI
+ df["q_05"] = y_pis[:, 0, 0]
+ df["q_95"] = y_pis[:, 1, 0]
+ elif conf_level == 0.95: # 95% CI
+ df["q_025"] = y_pis[:, 0, 0]
+ df["q_975"] = y_pis[:, 1, 0]
+
+ # Add median (q_50) from XGBoost prediction
+ df["q_50"] = df["prediction"]
+
+ # Calculate uncertainty metrics based on 95% interval
+ interval_width = df["q_975"] - df["q_025"]
+ df["prediction_std"] = interval_width / 3.92

  # Reorder the quantile columns for easier reading
  quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
  other_cols = [col for col in df.columns if col not in quantile_cols]
  df = df[other_cols + quantile_cols]

- # Compute Nearest neighbors with Proximity model
- models["proximity"].neighbors(df)
+ # Uncertainty score
+ df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
+
+ # Confidence bands
+ df["confidence_band"] = pd.cut(
+ df["uncertainty_score"],
+ bins=[0, 0.5, 1.0, 2.0, np.inf],
+ labels=["high", "medium", "low", "very_low"]
+ )

- # Return the modified DataFrame
  return df
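The prediction_std line treats the 95% conformal interval as if it were a normal plus/minus 1.96 standard deviation band, so its width is divided by 2 * 1.96 = 3.92, and uncertainty_score then normalizes that width by the magnitude of the point prediction before pd.cut assigns a confidence band. A short worked check with illustrative numbers:

import numpy as np

q_025, q_975, prediction = 2.0, 6.0, 4.1  # illustrative values for one row

interval_width = q_975 - q_025                                    # 4.0
prediction_std = interval_width / 3.92                            # ~1.02 (width ~ 2 * 1.96 * std under a normal assumption)
uncertainty_score = interval_width / (np.abs(prediction) + 1e-6)  # ~0.98

# Band edges from the pd.cut call above: (0, 0.5] high, (0.5, 1.0] medium, (1.0, 2.0] low, (2.0, inf) very_low
print(round(prediction_std, 2), round(uncertainty_score, 2))  # 1.02 0.98 -> "medium" confidence band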