workbench 0.8.170__py3-none-any.whl → 0.8.172__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench might be problematic.
- workbench/api/feature_set.py +4 -4
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/model_core.py +37 -14
- workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
- workbench/core/cloud_platform/aws/aws_meta.py +11 -4
- workbench/core/transforms/features_to_model/features_to_model.py +4 -4
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +319 -210
- workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +154 -41
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -2
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/script_generation.py +5 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +11 -11
- workbench/model_scripts/xgb_model/xgb_model.template +7 -7
- workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +1 -1
- workbench/scripts/ml_pipeline_sqs.py +139 -0
- workbench/utils/model_utils.py +13 -1
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +1 -0
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/METADATA +1 -1
- {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/RECORD +26 -25
- {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/entry_points.txt +2 -1
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/WHEEL +0 -0
- {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/top_level.txt +0 -0
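The main change in this release is the rewrite of the uncertainty-quantification (UQ) model template: the proximity/NGBoost-style generated_model_script.py and the removed mapie_xgb.template give way to a new mapie.template that pairs an XGBoost point-prediction model with per-confidence-level LightGBM quantile models conformalized by MAPIE. The sketch below condenses the pattern the new script follows; it is a minimal sketch rather than the packaged script, and the helper name fit_uq_models, the X_cal/y_cal calibration split, and the single 95% level are illustrative assumptions.

# Minimal sketch of the XGBoost + LightGBM/MAPIE CQR pattern used by the new template
# (assumes mapie >= 1.0, lightgbm, and xgboost are installed; fit_uq_models is illustrative).
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor
from xgboost import XGBRegressor


def fit_uq_models(X_train, y_train, X_cal, y_cal, confidence_level=0.95):
    # Point predictions come from a plain XGBoost regressor
    xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.01, random_state=42)
    xgb_model.fit(X_train, y_train)

    # Three pre-fit LightGBM quantile models: lower, upper, median
    alpha = 1 - confidence_level
    estimators = []
    for q in [alpha / 2, 1 - alpha / 2, 0.5]:
        est = LGBMRegressor(objective="quantile", alpha=q, random_state=42, verbose=-1)
        est.fit(X_train, y_train)
        estimators.append(est)

    # Wrap the quantile models in MAPIE CQR and conformalize on held-out data
    mapie_model = ConformalizedQuantileRegressor(
        estimators, confidence_level=confidence_level, prefit=True
    )
    mapie_model.conformalize(X_cal, y_cal)
    return xgb_model, mapie_model


# Usage: y_point = xgb_model.predict(X_new)
#        y_pred, y_pis = mapie_model.predict_interval(X_new)
#        lower, upper = y_pis[:, 0, 0], y_pis[:, 1, 0]

The diff of workbench/model_scripts/custom_models/uq_models/generated_model_script.py below shows the generated form of this template (removed lines from the old script appear truncated, as rendered by the registry diff viewer).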
@@ -1,8 +1,8 @@
-# Model:
-from
-from
+# Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+from mapie.regression import ConformalizedQuantileRegressor
+from lightgbm import LGBMRegressor
+from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split
-import numpy as np

 # Model Performance Scores
 from sklearn.metrics import (
@@ -16,20 +16,16 @@ import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
-
-# Local Imports
-from proximity import Proximity
-
-
+from typing import List, Tuple

 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "
-    "features": ['
-    "
-    "train_all_data": True
-    "track_columns": ['solubility']
+    "target": "udm_asy_res_value",
+    "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'chiral_centers', 'r_cnt', 's_cnt', 'db_stereo', 'e_cnt', 'z_cnt', 'chiral_fp', 'db_fp'],
+    "compressed_features": [],
+    "train_all_data": True
 }


@@ -73,138 +69,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)


-def
-    df_pred: pd.DataFrame,
-    prox_df: pd.DataFrame,
-    calibration_strength: float = 0.7,
-    distance_decay: float = 3.0,
-) -> pd.DataFrame:
-    """
-    Calibrate intervals using distance-weighted neighbor quantiles.
-    Uses all 10 neighbors with distance-based weighting.
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
     """
-
-    target_column = TEMPLATE_PARAMS["target"]
-
-    # Distance-weighted neighbor statistics
-    def weighted_quantile(values, weights, q):
-        """Calculate weighted quantile"""
-        if len(values) == 0:
-            return np.nan
-        sorted_indices = np.argsort(values)
-        sorted_values = values[sorted_indices]
-        sorted_weights = weights[sorted_indices]
-        cumsum = np.cumsum(sorted_weights)
-        cutoff = q * cumsum[-1]
-        return np.interp(cutoff, cumsum, sorted_values)
-
-    # Calculate distance weights (closer neighbors get more weight)
-    prox_df = prox_df.copy()
-    prox_df['weight'] = 1 / (1 + prox_df['distance'] ** distance_decay)
-
-    # Get weighted quantiles and statistics for each ID
-    neighbor_stats = []
-    for id_val, group in prox_df.groupby(id_column):
-        values = group[target_column].values
-        weights = group['weight'].values
-
-        # Normalize weights
-        weights = weights / weights.sum()
-
-        stats = {
-            id_column: id_val,
-            'local_q025': weighted_quantile(values, weights, 0.025),
-            'local_q25': weighted_quantile(values, weights, 0.25),
-            'local_q75': weighted_quantile(values, weights, 0.75),
-            'local_q975': weighted_quantile(values, weights, 0.975),
-            'local_median': weighted_quantile(values, weights, 0.5),
-            'local_std': np.sqrt(np.average((values - np.average(values, weights=weights)) ** 2, weights=weights)),
-            'avg_distance': group['distance'].mean(),
-            'min_distance': group['distance'].min(),
-            'max_distance': group['distance'].max(),
-        }
-        neighbor_stats.append(stats)
+    Converts appropriate columns to categorical type with consistent mappings.

-
-
-
-
-
-    disagreement_score = (model_disagreement / out["prediction_std"]).clip(0, 2)
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.

-
-
-
-
-
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping

-
-    local_confidence = distance_confidence * variance_confidence.clip(0.5, 1.5)
+    return df, category_mappings

-    # Calibration weight: higher when models disagree and we have good local data
-    calibration_weight = (
-        calibration_strength *
-        local_confidence *  # Weight by local data quality
-        disagreement_score.clip(0.3, 1.0)  # More calibration when models disagree
-    )

-
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features

-
-
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)

-
-
-
-        ("q_25", "local_q25"),
-        ("q_75", "local_q75"),
-        ("q_975", "local_q975")
-    ]
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression

-
-
-
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """

-
-
-
-
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
        )

-    #
-
-
-
+    # Decompress the specified compressed features
+    decompressed_features = features.copy()
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue

-
-
-    sparse_region_mask = out['min_distance'] > out['min_distance'].quantile(0.9)
-    expansion_factor = 1 + 0.2 * sparse_region_mask  # 20% expansion in sparse regions
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)

-
-
-
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]

-
-
-
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)

-
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features


-# TRAINING SECTION
-#
-# This section (__main__) is where SageMaker will execute the training job
-# and save the model artifacts to the model directory.
-#
 if __name__ == "__main__":
     # Template Parameters
-    id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-    track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -216,102 +173,221 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

-    #
+    # Read the training data into DataFrames
     training_files = [
         os.path.join(args.train, file)
-        for file in os.listdir(args.train)
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
     ]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
-
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")

-    #
-
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)

-    #
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-
-
-
-
-
-
-        print("
-        df_train =
-        df_val =
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        #
-        print("
-        df_train, df_val = train_test_split(
-
-
-
-
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")

     # Prepare features and targets for training
     X_train = df_train[features]
-
+    X_validate = df_val[features]
     y_train = df_train[target]
-
+    y_validate = df_val[target]

-    # Train
+    # Train XGBoost for point predictions
+    print("\nTraining XGBoost for point predictions...")
+    xgb_model = XGBRegressor(
+        n_estimators=1000,
+        max_depth=6,
+        learning_rate=0.01,
+        subsample=0.8,
+        colsample_bytree=0.8,
+        random_state=42,
+        verbosity=0
+    )
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)

-    #
-
-
-
-
-
-
-
-
-
-
-
-
+    # Evaluate XGBoost performance
+    y_pred_xgb = xgb_model.predict(X_validate)
+    xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+    xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+    xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+    print(f"\nXGBoost Point Prediction Performance:")
+    print(f"RMSE: {xgb_rmse:.3f}")
+    print(f"MAE: {xgb_mae:.3f}")
+    print(f"R2: {xgb_r2:.3f}")
+
+    # Define confidence levels we want to model
+    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+
+    # Store MAPIE models for each confidence level
+    mapie_models = {}
+
+    # Train models for each confidence level
+    for confidence_level in confidence_levels:
+        alpha = 1 - confidence_level
+        lower_q = alpha / 2
+        upper_q = 1 - alpha / 2
+
+        print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+        print(f"  Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+        # Train three models for this confidence level
+        quantile_estimators = []
+        for q in [lower_q, upper_q, 0.5]:
+            print(f"  Training model for quantile {q:.3f}...")
+            est = LGBMRegressor(
+                objective="quantile",
+                alpha=q,
+                n_estimators=1000,
+                max_depth=6,
+                learning_rate=0.01,
+                num_leaves=31,
+                min_child_samples=20,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                random_state=42,
+                verbose=-1,
+                force_col_wise=True
+            )
+            est.fit(X_train, y_train)
+            quantile_estimators.append(est)
+
+        # Create MAPIE CQR model for this confidence level
+        print(f"  Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+        mapie_model = ConformalizedQuantileRegressor(
+            quantile_estimators,
+            confidence_level=confidence_level,
+            prefit=True
+        )
+
+        # Conformalize the model
+        print(f"  Conformalizing with validation data...")
+        mapie_model.conformalize(X_validate, y_validate)
+
+        # Store the model
+        mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+        # Validate coverage for this confidence level
+        y_pred, y_pis = mapie_model.predict_interval(X_validate)
+        coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+        print(f"  Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+    print(f"\nOverall Model Performance Summary:")
+    print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+    print(f"XGBoost MAE: {xgb_mae:.3f}")
+    print(f"XGBoost R2: {xgb_r2:.3f}")
     print(f"NumRows: {len(df_val)}")

+    # Analyze interval widths across confidence levels
+    print(f"\nInterval Width Analysis:")
+    for conf_level in confidence_levels:
+        model = mapie_models[f"mapie_{conf_level:.2f}"]
+        _, y_pis = model.predict_interval(X_validate)
+        widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+        print(f"  {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
     # Save the trained XGBoost model
     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))

-    # Save
-
+    # Save all MAPIE models
+    for model_name, model in mapie_models.items():
+        joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))

-    # Save the feature list
+    # Save the feature list
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(features, fp)

-    #
-
+    # Save category mappings if any
+    if category_mappings:
+        with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+            json.dump(category_mappings, fp)
+
+    # Save model configuration
+    model_config = {
+        "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+        "confidence_levels": confidence_levels,
+        "n_features": len(features),
+        "target": target,
+        "validation_metrics": {
+            "xgb_rmse": float(xgb_rmse),
+            "xgb_mae": float(xgb_mae),
+            "xgb_r2": float(xgb_r2),
+            "n_validation": len(df_val)
+        }
+    }
+    with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+        json.dump(model_config, fp, indent=2)

-
-    model
+    print(f"\nModel training complete!")
+    print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")


 #
 # Inference Section
 #
 def model_fn(model_dir) -> dict:
-    """Load
+    """Load XGBoost and all MAPIE models from the specified directory."""
+
+    # Load model configuration to know which models to load
+    with open(os.path.join(model_dir, "model_config.json")) as fp:
+        config = json.load(fp)

     # Load XGBoost regressor
     xgb_path = os.path.join(model_dir, "xgb_model.json")
     xgb_model = XGBRegressor(enable_categorical=True)
     xgb_model.load_model(xgb_path)

-    # Load
-
+    # Load all MAPIE models
+    mapie_models = {}
+    for conf_level in config["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))

-    #
-
+    # Load category mappings if they exist
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as fp:
+            category_mappings = json.load(fp)

     return {
-        "
-        "
-        "
+        "xgb_model": xgb_model,
+        "mapie_models": mapie_models,
+        "confidence_levels": config["confidence_levels"],
+        "category_mappings": category_mappings
     }


@@ -327,7 +403,7 @@ def input_fn(input_data, content_type):
     if "text/csv" in content_type:
         return pd.read_csv(StringIO(input_data))
     elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))
+        return pd.DataFrame(json.loads(input_data))
     else:
         raise ValueError(f"{content_type} not supported!")

@@ -335,23 +411,26 @@ def input_fn(input_data, content_type):
 def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
-
+        # Convert categorical columns to string to avoid fillna issues
+        for col in output_df.select_dtypes(include=['category']).columns:
+            output_df[col] = output_df[col].astype(str)
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
     elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
+        return output_df.to_json(orient="records"), "application/json"
     else:
         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


 def predict_fn(df, models) -> pd.DataFrame:
-    """Make
+    """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals

     Args:
         df (pd.DataFrame): The input DataFrame
-        models (dict):
+        models (dict): Dictionary containing XGBoost and MAPIE models

     Returns:
-        pd.DataFrame:
+        pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

     # Grab our feature columns (from training)
@@ -362,32 +441,62 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Match features in a case-insensitive manner
     matched_df = match_features_case_insensitive(df, model_features)

-    #
-
-
-
-
-
-
-    dist_params = y_dists.params
-
-    # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
-
-    # Add 95% prediction intervals using ppf (percent point function)
-    df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-    df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
-
-    # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
-
-    # Compute Nearest neighbors with Proximity model
-    prox_df = models["proximity"].neighbors(df)
+    # Apply categorical mappings if they exist
+    if models.get("category_mappings"):
+        matched_df, _ = convert_categorical_types(
+            matched_df,
+            model_features,
+            models["category_mappings"]
+        )

-    #
-
+    # Get features for prediction
+    X = matched_df[model_features]
+
+    # Get XGBoost point predictions
+    df["prediction"] = models["xgb_model"].predict(X)
+
+    # Get predictions from each MAPIE model for conformalized intervals
+    for conf_level in models["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        model = models["mapie_models"][model_name]
+
+        # Get conformalized predictions
+        y_pred, y_pis = model.predict_interval(X)
+
+        # Map confidence levels to quantile names
+        if conf_level == 0.50:  # 50% CI
+            df["q_25"] = y_pis[:, 0, 0]
+            df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.80:  # 80% CI
+            df["q_10"] = y_pis[:, 0, 0]
+            df["q_90"] = y_pis[:, 1, 0]
+        elif conf_level == 0.90:  # 90% CI
+            df["q_05"] = y_pis[:, 0, 0]
+            df["q_95"] = y_pis[:, 1, 0]
+        elif conf_level == 0.95:  # 95% CI
+            df["q_025"] = y_pis[:, 0, 0]
+            df["q_975"] = y_pis[:, 1, 0]
+
+    # Add median (q_50) from XGBoost prediction
+    df["q_50"] = df["prediction"]
+
+    # Calculate uncertainty metrics based on 95% interval
+    interval_width = df["q_975"] - df["q_025"]
+    df["prediction_std"] = interval_width / 3.92
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]
+
+    # Uncertainty score
+    df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
+
+    # Confidence bands
+    df["confidence_band"] = pd.cut(
+        df["uncertainty_score"],
+        bins=[0, 0.5, 1.0, 2.0, np.inf],
+        labels=["high", "medium", "low", "very_low"]
+    )

-    # Return the modified DataFrame
     return df