workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +67 -8
- workbench/core/views/training_view.py +38 -48
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
- workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/scripts/ml_pipeline_sqs.py +14 -2
- workbench/utils/model_utils.py +12 -2
- workbench/utils/xgboost_model_utils.py +161 -138
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
workbench/model_scripts/custom_models/uq_models/generated_model_script.py

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -22,10 +18,11 @@ from typing import List, Tuple

 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "target": "
-    "features": ['
+    "target": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
     "compressed_features": [],
-    "train_all_data":
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


 def decompress_features(
-
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@ if __name__ == "__main__":

     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)

     # Evaluate XGBoost performance
@@ -242,7 +235,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")

     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

     # Store MAPIE models for each confidence level
     mapie_models = {}
@@ -272,7 +265,7 @@ if __name__ == "__main__":
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
-           force_col_wise=True
+           force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )

         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
            "xgb_rmse": float(xgb_rmse),
            "xgb_mae": float(xgb_mae),
            "xgb_r2": float(xgb_r2),
-           "n_validation": len(df_val)
-        }
+           "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=[
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

     # Get features for prediction
     X = matched_df[model_features]
@@ -459,6 +450,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +466,28 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate
-
-    df["prediction_std"] = interval_width / 3.92
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    #
-
-
-
-
-    df["
-
-
-
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])

     return df
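A note on the prediction_std change in the last hunk: the old code scaled a full interval width down by a fixed normal-quantile divisor, while the new code takes half the width of the newly added 68% interval, which under a normal-error assumption is itself approximately one standard deviation. A small sketch (not part of the package) verifying those constants:

# Sketch: the divisors in the removed lines are standard-normal interval widths.
# Assumes normally distributed errors; scipy is used only to verify the constants.
from scipy.stats import norm

print(2 * norm.ppf(0.975))                    # ~3.92 : full width of a 95% interval, in sigmas
print(2 * norm.ppf(0.75))                     # ~1.349: full width of a 50% interval (the 1.348 in mapie.template below)
print((norm.ppf(0.84) - norm.ppf(0.16)) / 2)  # ~0.99 : half the 68% width is already ~1 sigma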
workbench/model_scripts/custom_models/uq_models/mapie.template

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -25,7 +21,8 @@ TEMPLATE_PARAMS = {
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


 def decompress_features(
-
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features

@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -227,7 +219,8 @@ if __name__ == "__main__":

     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)

     # Evaluate XGBoost performance
@@ -242,7 +235,7 @@ if __name__ == "__main__":
     print(f"R2: {xgb_r2:.3f}")

     # Define confidence levels we want to model
-    confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+    confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

     # Store MAPIE models for each confidence level
     mapie_models = {}
@@ -272,7 +265,7 @@ if __name__ == "__main__":
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
-           force_col_wise=True
+           force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )

         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
            "xgb_rmse": float(xgb_rmse),
            "xgb_mae": float(xgb_mae),
            "xgb_r2": float(xgb_r2),
-           "n_validation": len(df_val)
-        }
+           "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }


@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=[
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """

+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:

     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])

     # Get features for prediction
     X = matched_df[model_features]
@@ -459,6 +450,9 @@ def predict_fn(df, models) -> pd.DataFrame:
         if conf_level == 0.50:  # 50% CI
             df["q_25"] = y_pis[:, 0, 0]
             df["q_75"] = y_pis[:, 1, 0]
+        elif conf_level == 0.68:  # 68% CI
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
         elif conf_level == 0.80:  # 80% CI
             df["q_10"] = y_pis[:, 0, 0]
             df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +466,28 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]

-    # Calculate
-
-    df["prediction_std"] = interval_width / 1.348
+    # Calculate a pseudo-standard deviation from the 68% interval width
+    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

     # Reorder the quantile columns for easier reading
-    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
     other_cols = [col for col in df.columns if col not in quantile_cols]
     df = df[other_cols + quantile_cols]

-    #
-
-
-
-
-    df["
-
-
-
+    # Adjust the outer quantiles to ensure they encompass the prediction
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])

     return df
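The mapie.template changes mirror the generated script: the new 0.68 confidence level feeds the q_16/q_84 columns that prediction_std is now computed from. Below is a rough sketch of the conformalized-quantile-regression flow the template appears to follow; the conformalize()/predict_interval() method names (MAPIE v1-style) and the lower/upper/median estimator order are assumptions, not confirmed by this diff:

# Rough sketch of the CQR flow in mapie.template, with the new 0.68 level.
# Assumptions (not confirmed by this diff): MAPIE v1-style conformalize()/
# predict_interval() methods and estimator order [lower, upper, median].
import numpy as np
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
X = rng.random((500, 3))
y = X @ np.array([1.0, 2.0, 3.0]) + rng.normal(0, 0.1, 500)
X_fit, X_conf, y_fit, y_conf = train_test_split(X, y, test_size=0.3, random_state=42)

confidence_level = 0.68
alpha = 1 - confidence_level
quantile_estimators = [
    LGBMRegressor(objective="quantile", alpha=q, verbose=-1).fit(X_fit, y_fit)
    for q in (alpha / 2, 1 - alpha / 2, 0.5)  # lower, upper, median (assumed order)
]
mapie_model = ConformalizedQuantileRegressor(
    quantile_estimators, confidence_level=confidence_level, prefit=True
)
mapie_model.conformalize(X_conf, y_conf)       # "Conformalize the model"
y_pred, y_pis = mapie_model.predict_interval(X_conf)
q_16, q_84 = y_pis[:, 0, 0], y_pis[:, 1, 0]    # the bounds that feed prediction_std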
workbench/model_scripts/custom_models/uq_models/meta_uq.template

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor # Point Estimator
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -24,7 +20,6 @@ from typing import List, Tuple
 from proximity import Proximity


-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
@@ -32,7 +27,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }


@@ -183,11 +178,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -222,9 +213,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -289,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)

-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


 def input_fn(input_data, content_type):
@@ -353,8 +338,8 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params[
-    df["prediction_std"] = dist_params[
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
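Both meta_uq.template and ngboost.template (next hunk set) complete the previously truncated dist_params lines the same way: NGBoost's predictive distributions expose their parameters as a dict. A minimal sketch of that API, assuming the default Normal distribution (illustrative, not code from the package):

# Minimal sketch of the NGBoost API behind dist_params["loc"]/["scale"].
# Assumes the default Normal distribution; illustrative only.
import numpy as np
from ngboost import NGBRegressor

rng = np.random.default_rng(0)
X = rng.random((200, 3))
y = X.sum(axis=1) + rng.normal(0, 0.1, 200)

ngb_model = NGBRegressor(n_estimators=100).fit(X, y)
y_dists = ngb_model.pred_dist(X)   # one predictive distribution per row
loc = y_dists.params["loc"]        # means    -> df["prediction_uq"]
scale = y_dists.params["scale"]    # std devs -> df["prediction_std"]
q_025 = y_dists.ppf(0.025)         # 2.5th percentile, as in ngboost.template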
workbench/model_scripts/custom_models/uq_models/ngboost.template

@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -212,8 +205,8 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params[
-    df["prediction_std"] = dist_params[
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template

@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }

 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)

+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()

-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")