workbench 0.8.168__py3-none-any.whl → 0.8.192__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +143 -102
- workbench/algorithms/graph/light/proximity_graph.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +3 -2
- workbench/api/feature_set.py +4 -4
- workbench/api/model.py +16 -12
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +113 -27
- workbench/core/artifacts/feature_set_core.py +72 -13
- workbench/core/artifacts/model_core.py +50 -15
- workbench/core/artifacts/monitor_core.py +33 -249
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +11 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +9 -4
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +49 -53
- workbench/core/views/view.py +51 -1
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
- workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
- workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +7 -2
- workbench/model_scripts/uq_models/mapie.template +492 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/xgb_model.template +31 -40
- workbench/repl/workbench_shell.py +4 -4
- workbench/scripts/lambda_launcher.py +63 -0
- workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +76 -30
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +283 -145
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/METADATA +2 -1
- {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/RECORD +74 -70
- {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
- workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
- {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.168.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
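One structural change worth calling out from the list above: the single-file workbench/utils/chem_utils.py (1,556 lines, removed) is now a workbench/utils/chem_utils/ package with topic-specific modules (fingerprints, mol_descriptors, mol_standardize, salts, toxicity, vis, etc.). A hypothetical import sketch against the new layout, using only module names visible in the file list; the functions each module exports are not shown in this diff, so treat it as illustrative only:

# Hypothetical: submodule names come from the file list above; their exported functions are not visible in this diff.
from workbench.utils.chem_utils import fingerprints, mol_descriptors, salts

Callers that previously imported helpers from the old chem_utils module would now import them from whichever submodule owns them.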
workbench/model_scripts/uq_models/mapie.template
@@ -0,0 +1,492 @@
+# Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+from mapie.regression import ConformalizedQuantileRegressor
+from lightgbm import LGBMRegressor
+from xgboost import XGBRegressor
+from sklearn.model_selection import train_test_split
+
+# Model Performance Scores
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
+
+from io import StringIO
+import json
+import argparse
+import joblib
+import os
+import numpy as np
+import pandas as pd
+from typing import List, Tuple
+
+# Template Placeholders
+TEMPLATE_PARAMS = {
+    "target": "{{target_column}}",
+    "features": "{{feature_list}}",
+    "compressed_features": "{{compressed_features}}",
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
+}
+
+
+# Function to check if dataframe is empty
+def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+    """
+    Check if the provided dataframe is empty and raise an exception if it is.
+
+    Args:
+        df (pd.DataFrame): DataFrame to check
+        df_name (str): Name of the DataFrame
+    """
+    if df.empty:
+        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+        print(msg)
+        raise ValueError(msg)
+
+
+def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+    """
+    Matches and renames DataFrame columns to match model feature names (case-insensitive).
+    Prioritizes exact matches, then case-insensitive matches.
+
+    Raises ValueError if any model features cannot be matched.
+    """
+    df_columns_lower = {col.lower(): col for col in df.columns}
+    rename_dict = {}
+    missing = []
+    for feature in model_features:
+        if feature in df.columns:
+            continue  # Exact match
+        elif feature.lower() in df_columns_lower:
+            rename_dict[df_columns_lower[feature.lower()]] = feature
+        else:
+            missing.append(feature)
+
+    if missing:
+        raise ValueError(f"Features not found: {missing}")
+
+    # Rename the DataFrame columns to match the model features
+    return df.rename(columns=rename_dict)
+
+
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+    """
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features.copy()
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
+
+
+if __name__ == "__main__":
+    # Template Parameters
+    target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
+    validation_split = 0.2
+
+    # Script arguments for input/output directories
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
+    args = parser.parse_args()
+
+    # Read the training data into DataFrames
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+    print(f"Training Files: {training_files}")
+
+    # Combine files and read them all into a single pandas dataframe
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")
+
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)
+
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
+    if train_all_data:
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
+    else:
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")
+
+    # Prepare features and targets for training
+    X_train = df_train[features]
+    X_validate = df_val[features]
+    y_train = df_train[target]
+    y_validate = df_val[target]
+
+    # Train XGBoost for point predictions
+    print("\nTraining XGBoost for point predictions...")
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
+    xgb_model.fit(X_train, y_train)
+
+    # Evaluate XGBoost performance
+    y_pred_xgb = xgb_model.predict(X_validate)
+    xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+    xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+    xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+    print(f"\nXGBoost Point Prediction Performance:")
+    print(f"RMSE: {xgb_rmse:.3f}")
+    print(f"MAE: {xgb_mae:.3f}")
+    print(f"R2: {xgb_r2:.3f}")
+
+    # Define confidence levels we want to model
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals
+
+    # Store MAPIE models for each confidence level
+    mapie_models = {}
+
+    # Train models for each confidence level
+    for confidence_level in confidence_levels:
+        alpha = 1 - confidence_level
+        lower_q = alpha / 2
+        upper_q = 1 - alpha / 2
+
+        print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+        print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+        # Train three models for this confidence level
+        quantile_estimators = []
+        for q in [lower_q, upper_q, 0.5]:
+            print(f" Training model for quantile {q:.3f}...")
+            est = LGBMRegressor(
+                objective="quantile",
+                alpha=q,
+                n_estimators=1000,
+                max_depth=6,
+                learning_rate=0.01,
+                num_leaves=31,
+                min_child_samples=20,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                random_state=42,
+                verbose=-1,
+                force_col_wise=True,
+            )
+            est.fit(X_train, y_train)
+            quantile_estimators.append(est)
+
+        # Create MAPIE CQR model for this confidence level
+        print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+        mapie_model = ConformalizedQuantileRegressor(
+            quantile_estimators, confidence_level=confidence_level, prefit=True
+        )
+
+        # Conformalize the model
+        print(f" Conformalizing with validation data...")
+        mapie_model.conformalize(X_validate, y_validate)
+
+        # Store the model
+        mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+        # Validate coverage for this confidence level
+        y_pred, y_pis = mapie_model.predict_interval(X_validate)
+        coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+        print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+    print(f"\nOverall Model Performance Summary:")
+    print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+    print(f"XGBoost MAE: {xgb_mae:.3f}")
+    print(f"XGBoost R2: {xgb_r2:.3f}")
+    print(f"NumRows: {len(df_val)}")
+
+    # Analyze interval widths across confidence levels
+    print(f"\nInterval Width Analysis:")
+    for conf_level in confidence_levels:
+        model = mapie_models[f"mapie_{conf_level:.2f}"]
+        _, y_pis = model.predict_interval(X_validate)
+        widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+        print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
+    # Save the trained XGBoost model
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save all MAPIE models
+    for model_name, model in mapie_models.items():
+        joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
+
+    # Save the feature list
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+        json.dump(features, fp)
+
+    # Save category mappings if any
+    if category_mappings:
+        with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+            json.dump(category_mappings, fp)
+
+    # Save model configuration
+    model_config = {
+        "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+        "confidence_levels": confidence_levels,
+        "n_features": len(features),
+        "target": target,
+        "validation_metrics": {
+            "xgb_rmse": float(xgb_rmse),
+            "xgb_mae": float(xgb_mae),
+            "xgb_r2": float(xgb_r2),
+            "n_validation": len(df_val),
+        },
+    }
+    with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+        json.dump(model_config, fp, indent=2)
+
+    print(f"\nModel training complete!")
+    print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
+
+
+#
+# Inference Section
+#
+def model_fn(model_dir) -> dict:
+    """Load XGBoost and all MAPIE models from the specified directory."""
+
+    # Load model configuration to know which models to load
+    with open(os.path.join(model_dir, "model_config.json")) as fp:
+        config = json.load(fp)
+
+    # Load XGBoost regressor
+    xgb_path = os.path.join(model_dir, "xgb_model.joblib")
+    xgb_model = joblib.load(xgb_path)
+
+    # Load all MAPIE models
+    mapie_models = {}
+    for conf_level in config["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
+
+    # Load category mappings if they exist
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as fp:
+            category_mappings = json.load(fp)
+
+    return {
+        "xgb_model": xgb_model,
+        "mapie_models": mapie_models,
+        "confidence_levels": config["confidence_levels"],
+        "category_mappings": category_mappings,
+    }
+
+
+def input_fn(input_data, content_type):
+    """Parse input data and return a DataFrame."""
+    if not input_data:
+        raise ValueError("Empty input data is not supported!")
+
+    # Decode bytes to string if necessary
+    if isinstance(input_data, bytes):
+        input_data = input_data.decode("utf-8")
+
+    if "text/csv" in content_type:
+        return pd.read_csv(StringIO(input_data))
+    elif "application/json" in content_type:
+        return pd.DataFrame(json.loads(input_data))
+    else:
+        raise ValueError(f"{content_type} not supported!")
+
+
+def output_fn(output_df, accept_type):
+    """Supports both CSV and JSON output formats."""
+    if "text/csv" in accept_type:
+        # Convert categorical columns to string to avoid fillna issues
+        for col in output_df.select_dtypes(include=["category"]).columns:
+            output_df[col] = output_df[col].astype(str)
+        csv_output = output_df.fillna("N/A").to_csv(index=False)
+        return csv_output, "text/csv"
+    elif "application/json" in accept_type:
+        return output_df.to_json(orient="records"), "application/json"
+    else:
+        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


+def predict_fn(df, models) -> pd.DataFrame:
+    """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
+
+    Args:
+        df (pd.DataFrame): The input DataFrame
+        models (dict): Dictionary containing XGBoost and MAPIE models
+
+    Returns:
+        pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
+    """
+
+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
+    # Grab our feature columns (from training)
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+        model_features = json.load(fp)
+
+    # Match features in a case-insensitive manner
+    matched_df = match_features_case_insensitive(df, model_features)
+
+    # Apply categorical mappings if they exist
+    if models.get("category_mappings"):
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
+
+    # Get features for prediction
+    X = matched_df[model_features]
+
+    # Get XGBoost point predictions
+    df["prediction"] = models["xgb_model"].predict(X)
+
+    # Get predictions from each MAPIE model for conformalized intervals
+    for conf_level in models["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        model = models["mapie_models"][model_name]
+
+        # Get conformalized predictions
+        y_pred, y_pis = model.predict_interval(X)
+
+        # Map confidence levels to quantile names
+        if conf_level == 0.50:  # 50% CI
+            df["q_25"] = y_pis[:, 0, 0]
+            df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
+        elif conf_level == 0.80:  # 80% CI
+            df["q_10"] = y_pis[:, 0, 0]
+            df["q_90"] = y_pis[:, 1, 0]
+        elif conf_level == 0.90:  # 90% CI
+            df["q_05"] = y_pis[:, 0, 0]
+            df["q_95"] = y_pis[:, 1, 0]
+        elif conf_level == 0.95:  # 95% CI
+            df["q_025"] = y_pis[:, 0, 0]
+            df["q_975"] = y_pis[:, 1, 0]
+
+    # Add median (q_50) from XGBoost prediction
+    df["q_50"] = df["prediction"]
+
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]).abs() / 2.0
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]
+
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+
+    return df
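The template above leans on the MAPIE 1.x conformalized quantile regression (CQR) flow: for each confidence level it fits three LightGBM quantile regressors (lower, upper, median), wraps them in a prefit ConformalizedQuantileRegressor, conformalizes on the held-out split, then reads intervals back via predict_interval. A minimal, self-contained sketch of that same flow on synthetic data with a single 90% interval follows; it mirrors the calls in the template but is illustrative only, not part of the package:

import numpy as np
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data (illustrative only)
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
y = X[:, 0] * 3.0 + rng.normal(scale=0.5, size=1000)
X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3, random_state=42)

confidence_level = 0.90
alpha = 1 - confidence_level

# One prefit estimator per quantile: lower, upper, median (same order the template uses)
estimators = []
for q in [alpha / 2, 1 - alpha / 2, 0.5]:
    est = LGBMRegressor(objective="quantile", alpha=q, verbose=-1, random_state=42)
    est.fit(X_train, y_train)
    estimators.append(est)

# Wrap, conformalize on held-out data, then predict intervals
cqr = ConformalizedQuantileRegressor(estimators, confidence_level=confidence_level, prefit=True)
cqr.conformalize(X_cal, y_cal)
y_pred, y_pis = cqr.predict_interval(X_cal)

# y_pis has shape (n_samples, 2, 1): lower bound at [:, 0, 0], upper at [:, 1, 0]
coverage = np.mean((y_cal >= y_pis[:, 0, 0]) & (y_cal <= y_pis[:, 1, 0]))
print(f"Empirical coverage at {confidence_level:.0%}: {coverage:.3f}")

In the template's predict_fn, the bounds from the five such models become the q_025 through q_975 columns, with q_50 set to the XGBoost point prediction and prediction_std derived from the 68% interval width.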
workbench/model_scripts/uq_models/requirements.txt
@@ -0,0 +1 @@
+# Note: Most libs are already in the training/inference images, ONLY specify additional libs here
workbench/model_scripts/xgb_model/xgb_model.template
@@ -29,13 +29,15 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)
 
     # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
-
+
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings
 
 
-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features
 
     Args:
         df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
     )
 
     # Decompress the specified compressed features
-    decompressed_features = features
+    decompressed_features = features.copy()
     for feature in compressed_features:
         if (feature not in df.columns) or (feature not in features):
             print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
     """The main function is for training the XGBoost model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])
 
     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression
 
     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)
 
     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
 
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
         print(f"NumRows: {len(df_val)}")
 
     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
     return model
 
 
@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
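The xgb_model.template hunks above make two notable changes: a new hyperparameters template parameter that is unpacked into the XGBoost constructor, and model persistence switched to joblib in place of the old JSON-based save/load path in model_fn. A small sketch of how the rendered placeholder is expected to flow, assuming {{hyperparameters}} renders to a plain Python dict literal (an assumption inferred from the **hyperparameters usage in the diff, not confirmed by the template engine):

import joblib
import xgboost as xgb

# Assumption: the rendered {{hyperparameters}} placeholder is a plain dict like this
hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1}

# Mirrors the diff: the dict is unpacked into the estimator constructor
xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)

# Persistence now round-trips through joblib instead of XGBoost's native save_model/load_model
# joblib.dump(xgb_model, "/opt/ml/model/xgb_model.joblib")
# model = joblib.load("/opt/ml/model/xgb_model.joblib")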