workbench 0.8.201__py3-none-any.whl → 0.8.204__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +41 -7
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +184 -75
- workbench/core/artifacts/model_core.py +11 -7
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +27 -13
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +11 -0
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/model_scripts/chemprop/chemprop.template +312 -293
- workbench/model_scripts/chemprop/generated_model_script.py +316 -297
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +278 -128
- workbench/model_scripts/pytorch_model/pytorch.template +273 -123
- workbench/model_scripts/uq_models/generated_model_script.py +20 -11
- workbench/model_scripts/uq_models/mapie.template +17 -8
- workbench/model_scripts/xgb_model/generated_model_script.py +38 -9
- workbench/model_scripts/xgb_model/xgb_model.template +34 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/utils/chemprop_utils.py +38 -1
- workbench/utils/pytorch_utils.py +38 -8
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/METADATA +2 -2
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/RECORD +33 -33
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/WHEEL +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/top_level.txt +0 -0
|
@@ -13,17 +13,19 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
|
|
|
13
13
|
# Model Performance Scores
|
|
14
14
|
from sklearn.metrics import (
|
|
15
15
|
mean_absolute_error,
|
|
16
|
+
median_absolute_error,
|
|
16
17
|
r2_score,
|
|
17
18
|
root_mean_squared_error,
|
|
18
19
|
precision_recall_fscore_support,
|
|
19
20
|
confusion_matrix,
|
|
20
21
|
)
|
|
22
|
+
from scipy.stats import spearmanr
|
|
21
23
|
|
|
22
24
|
# Classification Encoder
|
|
23
25
|
from sklearn.preprocessing import LabelEncoder
|
|
24
26
|
|
|
25
27
|
# Scikit Learn Imports
|
|
26
|
-
from sklearn.model_selection import train_test_split
|
|
28
|
+
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
|
|
27
29
|
|
|
28
30
|
from io import StringIO
|
|
29
31
|
import json
|
|
@@ -33,13 +35,13 @@ import pandas as pd
|
|
|
33
35
|
|
|
34
36
|
# Template Parameters
|
|
35
37
|
TEMPLATE_PARAMS = {
|
|
36
|
-
"model_type": "
|
|
37
|
-
"target": "
|
|
38
|
-
"features": ['
|
|
38
|
+
"model_type": "uq_regressor",
|
|
39
|
+
"target": "mppb",
|
|
40
|
+
"features": ['mollogp', 'mi', 'fr_benzene', 'smr_vsa3', 'fr_halogen', 'c2sp2', 'peoe_vsa6', 'bcut2d_mwhi', 'vsa_estate1', 'mv', 'numaromaticcarbocycles', 'vsa_estate5', 'fr_nh0', 'mm', 'smr_vsa7', 'tpsa', 'c1sp2', 'mz', 'vsa_estate2', 'peoe_vsa7', 'vsa_estate10', 'vsa_estate7', 'vsa_estate6', 'smr_vsa10', 'slogp_vsa2', 'bcut2d_logphi', 'naromatom', 'axp_2dv', 'bcut2d_mrhi', 'vsa_estate8', 'slogp_vsa3', 'vsa_estate4', 'xpc_6dv', 'slogp_vsa12', 'peoe_vsa9', 'mp', 'slogp_vsa1', 'peoe_vsa1', 'xch_5dv', 'qed', 'vsa_estate3', 'fpdensitymorgan3', 'axp_2d', 'axp_0d', 'mse', 'numhacceptors', 'bertzct', 'estate_vsa8', 'minestateindex', 'estate_vsa3', 'fpdensitymorgan2', 'smr_vsa6', 'peoe_vsa8', 'slogp_vsa6', 'xp_5dv', 'hallkieralpha', 'avgipc', 'fr_arn', 'xp_7d', 'mare', 'xp_6d', 'bcut2d_mrlow', 'estate_vsa4', 'bcut2d_logplow', 'peoe_vsa10', 'maxabspartialcharge', 'peoe_vsa3', 'bcut2d_mwlow', 'axp_7d', 'minpartialcharge', 'xpc_4d', 'axp_1d', 'estate_vsa9', 'vsa_estate9', 'estate_vsa7', 'maxestateindex', 'estate_vsa6', 'smr_vsa1', 'xpc_6d', 'xch_7d', 'xc_5d', 'phi', 'axp_0dv', 'axp_3dv', 'mpe', 'xc_3d', 'xch_5d', 'xc_5dv', 'xch_6d', 'chi4n', 'axp_7dv', 'slogp_vsa5', 'axp_1dv', 'xch_6dv', 'minabsestateindex', 'numrotatablebonds', 'peoe_vsa2', 'estate_vsa2', 'slogp_vsa8', 'bcut2d_chglo', 'xch_7dv', 'kappa2', 'axp_4dv', 'xc_3dv', 'kappa1', 'nbase', 'xpc_5dv', 'maxpartialcharge', 'bcut2d_chghi', 'axp_5d', 'balabanj', 'xpc_5d', 'fpdensitymorgan1', 'xp_5d', 'smr_vsa5', 'axp_4d', 'kappa3', 'fr_morpholine', 'estate_vsa5', 'chi2n', 'labuteasa', 'axp_5dv', 'molwt', 'smr_vsa9', 'maxabsestateindex', 'xp_7dv', 'fr_bicyclic', 'numaliphaticheterocycles', 'axp_6dv', 'slogp_vsa4', 'axp_3d', 'xp_6dv', 'nocount', 'axp_6d', 'fr_aniline', 'xpc_4dv', 'xp_1d', 'c3sp2', 'numheterocycles', 'nhohcount', 'molmr', 'numaromaticheterocycles', 'chi0', 'minabspartialcharge', 'fr_ar_n', 'xp_3d', 'chi2v', 'fr_ether', 'chi1v', 'chi1', 'xp_2d', 'xp_4dv', 'xp_4d', 'chi4v', 'fr_pyridine', 'smr_vsa4', 'sps', 'chi3n', 'heavyatommolwt', 'slogp_vsa11', 'fr_aryl_methyl', 'si', 'fractioncsp3', 'sse', 'fr_para_hydroxylation', 'slogp_vsa10', 'c1sp3', 'exactmolwt', 'numsaturatedheterocycles', 'chi1n', 'chi0n', 'fcsp3'],
|
|
41
|
+
"id_column": "molecule_name",
|
|
39
42
|
"compressed_features": [],
|
|
40
|
-
"model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/
|
|
41
|
-
"
|
|
42
|
-
"hyperparameters": {},
|
|
43
|
+
"model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/mppb-reg-pytorch/training",
|
|
44
|
+
"hyperparameters": {'n_folds': 5},
|
|
43
45
|
}
|
|
44
46
|
|
|
45
47
|
|
|
@@ -204,36 +206,57 @@ def decompress_features(
|
|
|
204
206
|
return df, decompressed_features
|
|
205
207
|
|
|
206
208
|
|
|
207
|
-
def model_fn(model_dir: str) ->
|
|
208
|
-
"""Load the PyTorch Tabular
|
|
209
|
+
def model_fn(model_dir: str) -> dict:
|
|
210
|
+
"""Load the PyTorch Tabular ensemble models from the specified directory.
|
|
209
211
|
|
|
210
212
|
Args:
|
|
211
|
-
model_dir: Directory containing the saved model
|
|
213
|
+
model_dir: Directory containing the saved model(s)
|
|
212
214
|
|
|
213
215
|
Returns:
|
|
214
|
-
|
|
216
|
+
Dictionary with ensemble models and metadata
|
|
215
217
|
"""
|
|
218
|
+
import torch
|
|
219
|
+
from functools import partial
|
|
220
|
+
|
|
221
|
+
# Load ensemble metadata if present
|
|
222
|
+
ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
|
|
223
|
+
if os.path.exists(ensemble_metadata_path):
|
|
224
|
+
ensemble_metadata = joblib.load(ensemble_metadata_path)
|
|
225
|
+
n_ensemble = ensemble_metadata["n_ensemble"]
|
|
226
|
+
else:
|
|
227
|
+
n_ensemble = 1
|
|
228
|
+
|
|
229
|
+
# Determine map_location for loading models (handle CUDA trained models on CPU inference)
|
|
230
|
+
map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
231
|
+
|
|
232
|
+
# Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
|
|
233
|
+
# This handles the case where pytorch-tabular loads callbacks.sav via joblib,
|
|
234
|
+
# which internally calls torch.load without map_location
|
|
235
|
+
original_torch_load = torch.load
|
|
236
|
+
torch.load = partial(original_torch_load, map_location=map_location)
|
|
237
|
+
|
|
216
238
|
# Save current working directory
|
|
217
239
|
original_cwd = os.getcwd()
|
|
240
|
+
ensemble_models = []
|
|
241
|
+
|
|
218
242
|
try:
|
|
219
243
|
# Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
|
|
220
244
|
os.chdir("/tmp")
|
|
221
245
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
# Load the model (map_location="cpu" ensures GPU-trained models work on CPU endpoints)
|
|
230
|
-
model = TabularModel.load_model(model_path, map_location="cpu")
|
|
246
|
+
for ens_idx in range(n_ensemble):
|
|
247
|
+
# Try numbered model path first, fall back to legacy path
|
|
248
|
+
model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
|
|
249
|
+
if not os.path.exists(model_path):
|
|
250
|
+
model_path = os.path.join(model_dir, "tabular_model")
|
|
251
|
+
model = TabularModel.load_model(model_path, map_location=map_location)
|
|
252
|
+
ensemble_models.append(model)
|
|
231
253
|
|
|
232
254
|
finally:
|
|
233
|
-
# Restore
|
|
255
|
+
# Restore torch.load and working directory
|
|
256
|
+
torch.load = original_torch_load
|
|
234
257
|
os.chdir(original_cwd)
|
|
235
258
|
|
|
236
|
-
return
|
|
259
|
+
return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
|
|
237
260
|
|
|
238
261
|
|
|
239
262
|
def input_fn(input_data, content_type: str) -> pd.DataFrame:
|
|
@@ -264,18 +287,23 @@ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
|
|
|
264
287
|
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
265
288
|
|
|
266
289
|
|
|
267
|
-
def predict_fn(df: pd.DataFrame,
|
|
268
|
-
"""Make Predictions with our PyTorch Tabular Model
|
|
290
|
+
def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
|
|
291
|
+
"""Make Predictions with our PyTorch Tabular Model ensemble.
|
|
269
292
|
|
|
270
293
|
Args:
|
|
271
294
|
df (pd.DataFrame): The input DataFrame
|
|
272
|
-
|
|
295
|
+
model_dict: Dictionary containing ensemble models and metadata
|
|
273
296
|
|
|
274
297
|
Returns:
|
|
275
|
-
pd.DataFrame: The DataFrame with
|
|
298
|
+
pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
|
|
276
299
|
"""
|
|
300
|
+
model_type = TEMPLATE_PARAMS["model_type"]
|
|
277
301
|
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
278
302
|
|
|
303
|
+
# Extract ensemble models
|
|
304
|
+
ensemble_models = model_dict["ensemble_models"]
|
|
305
|
+
n_ensemble = model_dict["n_ensemble"]
|
|
306
|
+
|
|
279
307
|
# Grab our feature columns (from training)
|
|
280
308
|
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
281
309
|
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
@@ -308,8 +336,10 @@ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
|
|
|
308
336
|
if missing_mask.any():
|
|
309
337
|
print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
|
|
310
338
|
|
|
311
|
-
# Initialize prediction
|
|
339
|
+
# Initialize prediction columns
|
|
312
340
|
df["prediction"] = np.nan
|
|
341
|
+
if model_type in ["regressor", "uq_regressor"]:
|
|
342
|
+
df["prediction_std"] = np.nan
|
|
313
343
|
|
|
314
344
|
# Only predict on complete rows
|
|
315
345
|
complete_df = matched_df[~missing_mask]
|
|
@@ -317,37 +347,63 @@ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
|
|
|
317
347
|
print("Warning: No complete rows to predict on")
|
|
318
348
|
return df
|
|
319
349
|
|
|
320
|
-
# Make predictions using the TabularModel
|
|
321
|
-
result = model.predict(complete_df[features])
|
|
322
|
-
|
|
323
350
|
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
324
351
|
target = TEMPLATE_PARAMS["target"]
|
|
325
352
|
prediction_column = f"{target}_prediction"
|
|
326
|
-
if prediction_column in result.columns:
|
|
327
|
-
predictions = result[prediction_column].values
|
|
328
|
-
else:
|
|
329
|
-
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
330
353
|
|
|
331
|
-
#
|
|
332
|
-
|
|
333
|
-
|
|
354
|
+
# Collect predictions from all ensemble members
|
|
355
|
+
all_ensemble_preds = []
|
|
356
|
+
all_ensemble_probs = []
|
|
334
357
|
|
|
335
|
-
|
|
336
|
-
|
|
358
|
+
for ens_idx, ens_model in enumerate(ensemble_models):
|
|
359
|
+
result = ens_model.predict(complete_df[features])
|
|
360
|
+
|
|
361
|
+
if prediction_column in result.columns:
|
|
362
|
+
ens_preds = result[prediction_column].values
|
|
363
|
+
else:
|
|
364
|
+
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
337
365
|
|
|
338
|
-
|
|
366
|
+
all_ensemble_preds.append(ens_preds)
|
|
367
|
+
|
|
368
|
+
# For classification, collect probabilities
|
|
369
|
+
if label_encoder is not None:
|
|
370
|
+
prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
|
|
371
|
+
if prob_cols:
|
|
372
|
+
all_ensemble_probs.append(result[prob_cols].values)
|
|
373
|
+
|
|
374
|
+
# Stack and compute mean/std (std is 0 for single model)
|
|
375
|
+
ensemble_preds = np.stack(all_ensemble_preds, axis=0) # (n_ensemble, n_samples)
|
|
376
|
+
preds = np.mean(ensemble_preds, axis=0)
|
|
377
|
+
preds_std = np.std(ensemble_preds, axis=0) # Will be 0s for n_ensemble=1
|
|
378
|
+
|
|
379
|
+
print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
|
|
380
|
+
|
|
381
|
+
# Handle classification vs regression
|
|
339
382
|
if label_encoder is not None:
|
|
340
|
-
|
|
341
|
-
if
|
|
342
|
-
|
|
383
|
+
# For classification, average probabilities then take argmax
|
|
384
|
+
if all_ensemble_probs:
|
|
385
|
+
ensemble_probs = np.stack(all_ensemble_probs, axis=0) # (n_ensemble, n_samples, n_classes)
|
|
386
|
+
avg_probs = np.mean(ensemble_probs, axis=0) # (n_samples, n_classes)
|
|
387
|
+
class_preds = np.argmax(avg_probs, axis=1)
|
|
388
|
+
predictions = label_encoder.inverse_transform(class_preds)
|
|
343
389
|
|
|
344
390
|
# Build full proba Series with None for missing rows
|
|
345
391
|
all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
|
|
346
|
-
all_proba.loc[~missing_mask] = [p.tolist() for p in
|
|
392
|
+
all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
|
|
347
393
|
df["pred_proba"] = all_proba
|
|
348
394
|
|
|
349
395
|
# Expand the pred_proba column into separate columns for each class
|
|
350
396
|
df = expand_proba_column(df, label_encoder.classes_)
|
|
397
|
+
else:
|
|
398
|
+
# No probabilities, use averaged predictions
|
|
399
|
+
predictions = label_encoder.inverse_transform(preds.astype(int))
|
|
400
|
+
else:
|
|
401
|
+
# Regression (includes uq_regressor)
|
|
402
|
+
predictions = preds
|
|
403
|
+
df.loc[~missing_mask, "prediction_std"] = preds_std
|
|
404
|
+
|
|
405
|
+
# Set predictions only for complete rows
|
|
406
|
+
df.loc[~missing_mask, "prediction"] = predictions
|
|
351
407
|
|
|
352
408
|
return df
|
|
353
409
|
|
|
@@ -359,12 +415,11 @@ if __name__ == "__main__":
|
|
|
359
415
|
target = TEMPLATE_PARAMS["target"]
|
|
360
416
|
features = TEMPLATE_PARAMS["features"]
|
|
361
417
|
orig_features = features.copy()
|
|
418
|
+
id_column = TEMPLATE_PARAMS["id_column"]
|
|
362
419
|
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
363
420
|
model_type = TEMPLATE_PARAMS["model_type"]
|
|
364
421
|
model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
|
|
365
|
-
train_all_data = TEMPLATE_PARAMS["train_all_data"]
|
|
366
422
|
hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
|
|
367
|
-
validation_split = 0.2
|
|
368
423
|
|
|
369
424
|
# Script arguments for input/output directories
|
|
370
425
|
parser = argparse.ArgumentParser()
|
|
@@ -423,72 +478,71 @@ if __name__ == "__main__":
|
|
|
423
478
|
# Cast continuous columns to float
|
|
424
479
|
all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
|
|
425
480
|
|
|
426
|
-
#
|
|
427
|
-
if train_all_data:
|
|
428
|
-
print("Training on ALL of the data")
|
|
429
|
-
df_train = all_df.copy()
|
|
430
|
-
df_val = all_df.copy()
|
|
431
|
-
|
|
432
|
-
# Does the dataframe have a training column?
|
|
433
|
-
elif "training" in all_df.columns:
|
|
434
|
-
print("Found training column, splitting data based on training column")
|
|
435
|
-
df_train = all_df[all_df["training"]].copy()
|
|
436
|
-
df_val = all_df[~all_df["training"]].copy()
|
|
437
|
-
else:
|
|
438
|
-
# Just do a random training Split
|
|
439
|
-
print("WARNING: No training column found, splitting data with random state=42")
|
|
440
|
-
df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
|
|
441
|
-
print(f"FIT/TRAIN: {df_train.shape}")
|
|
442
|
-
print(f"VALIDATION: {df_val.shape}")
|
|
443
|
-
|
|
444
|
-
# Set up PyTorch Tabular configuration
|
|
445
|
-
data_config = DataConfig(
|
|
446
|
-
target=[target],
|
|
447
|
-
continuous_cols=continuous_cols,
|
|
448
|
-
categorical_cols=categorical_cols,
|
|
449
|
-
)
|
|
450
|
-
|
|
451
|
-
# Choose the 'task' based on model type also set up the label encoder if needed
|
|
481
|
+
# Choose the 'task' based on model type and set up the label encoder if needed
|
|
452
482
|
if model_type == "classifier":
|
|
453
483
|
task = "classification"
|
|
454
|
-
# Encode the target column
|
|
484
|
+
# Encode the target column on full dataset for consistent encoding
|
|
455
485
|
label_encoder = LabelEncoder()
|
|
456
|
-
|
|
457
|
-
|
|
486
|
+
all_df[target] = label_encoder.fit_transform(all_df[target])
|
|
487
|
+
num_classes = len(label_encoder.classes_)
|
|
458
488
|
else:
|
|
459
489
|
task = "regression"
|
|
460
490
|
label_encoder = None
|
|
491
|
+
num_classes = None
|
|
461
492
|
|
|
462
493
|
# Use any hyperparameters to set up both the trainer and model configurations
|
|
463
494
|
print(f"Hyperparameters: {hyperparameters}")
|
|
495
|
+
n_folds = hyperparameters.get("n_folds", 5) # Number of CV folds (default: 5)
|
|
496
|
+
|
|
497
|
+
# =========================================================================
|
|
498
|
+
# UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
|
|
499
|
+
# =========================================================================
|
|
500
|
+
print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
|
|
501
|
+
|
|
502
|
+
# Create fold splits
|
|
503
|
+
if n_folds == 1:
|
|
504
|
+
# Single fold: use train/val split from "training" column or random split
|
|
505
|
+
if "training" in all_df.columns:
|
|
506
|
+
print("Found training column, splitting data based on training column")
|
|
507
|
+
train_idx = np.where(all_df["training"])[0]
|
|
508
|
+
val_idx = np.where(~all_df["training"])[0]
|
|
509
|
+
else:
|
|
510
|
+
print("WARNING: No training column found, splitting data with random 80/20 split")
|
|
511
|
+
indices = np.arange(len(all_df))
|
|
512
|
+
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
|
|
513
|
+
folds = [(train_idx, val_idx)]
|
|
514
|
+
else:
|
|
515
|
+
# K-Fold CV
|
|
516
|
+
if model_type == "classifier":
|
|
517
|
+
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
518
|
+
split_target = all_df[target]
|
|
519
|
+
else:
|
|
520
|
+
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
521
|
+
split_target = None
|
|
522
|
+
folds = list(kfold.split(all_df, split_target))
|
|
523
|
+
|
|
524
|
+
# Initialize storage for out-of-fold predictions
|
|
525
|
+
oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
|
|
526
|
+
if model_type == "classifier" and num_classes and num_classes > 1:
|
|
527
|
+
oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
|
|
528
|
+
else:
|
|
529
|
+
oof_proba = None
|
|
464
530
|
|
|
465
|
-
|
|
466
|
-
trainer_defaults = {
|
|
467
|
-
"auto_lr_find": False,
|
|
468
|
-
"batch_size": min(128, max(32, len(df_train) // 16)),
|
|
469
|
-
"max_epochs": 100,
|
|
470
|
-
"min_epochs": 10,
|
|
471
|
-
"early_stopping": "valid_loss",
|
|
472
|
-
"early_stopping_patience": 10,
|
|
473
|
-
"checkpoints": "valid_loss",
|
|
474
|
-
"accelerator": "auto",
|
|
475
|
-
"progress_bar": "none",
|
|
476
|
-
"gradient_clip_val": 1.0,
|
|
477
|
-
}
|
|
531
|
+
ensemble_models = []
|
|
478
532
|
|
|
479
|
-
#
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
533
|
+
# Set up PyTorch Tabular data configuration (shared across folds)
|
|
534
|
+
data_config = DataConfig(
|
|
535
|
+
target=[target],
|
|
536
|
+
continuous_cols=continuous_cols,
|
|
537
|
+
categorical_cols=categorical_cols,
|
|
538
|
+
)
|
|
485
539
|
|
|
486
540
|
# Model config defaults
|
|
487
541
|
model_defaults = {
|
|
488
542
|
"layers": "256-128-64",
|
|
489
543
|
"activation": "LeakyReLU",
|
|
490
544
|
"learning_rate": 1e-3,
|
|
491
|
-
"dropout": 0.
|
|
545
|
+
"dropout": 0.1,
|
|
492
546
|
"use_batch_norm": True,
|
|
493
547
|
"initialization": "kaiming",
|
|
494
548
|
}
|
|
@@ -498,41 +552,107 @@ if __name__ == "__main__":
|
|
|
498
552
|
print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
|
|
499
553
|
model_params = {**model_defaults, **model_overrides}
|
|
500
554
|
|
|
501
|
-
# Use CategoryEmbedding model configuration for general-purpose tabular modeling.
|
|
502
555
|
model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
|
|
503
556
|
optimizer_config = OptimizerConfig()
|
|
504
557
|
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
558
|
+
for fold_idx, (train_idx, val_idx) in enumerate(folds):
|
|
559
|
+
print(f"\n{'='*50}")
|
|
560
|
+
print(f"Training Fold {fold_idx + 1}/{len(folds)}")
|
|
561
|
+
print(f"{'='*50}")
|
|
562
|
+
|
|
563
|
+
# Split data for this fold
|
|
564
|
+
df_train = all_df.iloc[train_idx].reset_index(drop=True)
|
|
565
|
+
df_val = all_df.iloc[val_idx].reset_index(drop=True)
|
|
566
|
+
|
|
567
|
+
print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
|
|
568
|
+
|
|
569
|
+
# Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
|
|
570
|
+
# Calculate batch size that avoids single-sample last batch (batch norm requires >1)
|
|
571
|
+
batch_size = min(128, max(32, len(df_train) // 16))
|
|
572
|
+
if len(df_train) % batch_size == 1:
|
|
573
|
+
batch_size += 1 # Adjust to avoid last batch of size 1
|
|
574
|
+
trainer_defaults = {
|
|
575
|
+
"auto_lr_find": False,
|
|
576
|
+
"batch_size": batch_size,
|
|
577
|
+
"max_epochs": 200,
|
|
578
|
+
"min_epochs": 10,
|
|
579
|
+
"early_stopping": "valid_loss",
|
|
580
|
+
"early_stopping_patience": 20,
|
|
581
|
+
"checkpoints": "valid_loss",
|
|
582
|
+
"accelerator": "auto",
|
|
583
|
+
"progress_bar": "none",
|
|
584
|
+
"gradient_clip_val": 1.0,
|
|
585
|
+
"seed": 42 + fold_idx,
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
# Override defaults with training_config if present
|
|
589
|
+
training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
|
|
590
|
+
if fold_idx == 0: # Only print overrides once
|
|
591
|
+
for key, value in training_overrides.items():
|
|
592
|
+
print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
|
|
593
|
+
trainer_params = {**trainer_defaults, **training_overrides}
|
|
594
|
+
trainer_config = TrainerConfig(**trainer_params)
|
|
595
|
+
|
|
596
|
+
# Create and train the TabularModel for this fold
|
|
597
|
+
tabular_model = TabularModel(
|
|
598
|
+
data_config=data_config,
|
|
599
|
+
model_config=model_config,
|
|
600
|
+
optimizer_config=optimizer_config,
|
|
601
|
+
trainer_config=trainer_config,
|
|
602
|
+
)
|
|
603
|
+
tabular_model.fit(train=df_train, validation=df_val)
|
|
604
|
+
ensemble_models.append(tabular_model)
|
|
605
|
+
|
|
606
|
+
# Make out-of-fold predictions
|
|
607
|
+
result = tabular_model.predict(df_val, include_input_features=False)
|
|
608
|
+
fold_preds = result[f"{target}_prediction"].values
|
|
609
|
+
|
|
610
|
+
# Store out-of-fold predictions
|
|
611
|
+
if model_type == "classifier":
|
|
612
|
+
oof_predictions[val_idx] = fold_preds.astype(int)
|
|
613
|
+
prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
|
|
614
|
+
if prob_cols and oof_proba is not None:
|
|
615
|
+
oof_proba[val_idx] = result[prob_cols].values
|
|
616
|
+
else:
|
|
617
|
+
oof_predictions[val_idx] = fold_preds.flatten()
|
|
515
618
|
|
|
516
|
-
|
|
517
|
-
print("Making Predictions on Validation Set...")
|
|
518
|
-
result = tabular_model.predict(df_val, include_input_features=False)
|
|
619
|
+
print(f"Fold {fold_idx + 1} complete!")
|
|
519
620
|
|
|
520
|
-
|
|
521
|
-
|
|
621
|
+
print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
|
|
622
|
+
|
|
623
|
+
# Use out-of-fold predictions for metrics
|
|
624
|
+
# For n_folds=1, we only have predictions for val_idx, so filter to those rows
|
|
625
|
+
if n_folds == 1:
|
|
626
|
+
val_mask = ~np.isnan(oof_predictions)
|
|
627
|
+
preds = oof_predictions[val_mask]
|
|
628
|
+
df_val = all_df[val_mask].copy()
|
|
629
|
+
if oof_proba is not None:
|
|
630
|
+
oof_proba = oof_proba[val_mask]
|
|
631
|
+
else:
|
|
632
|
+
preds = oof_predictions
|
|
633
|
+
df_val = all_df.copy()
|
|
634
|
+
|
|
635
|
+
# Compute prediction_std by running all ensemble models on validation data
|
|
636
|
+
# For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
|
|
637
|
+
preds_std = None
|
|
638
|
+
if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
|
|
639
|
+
print("Computing prediction_std from ensemble predictions on validation data...")
|
|
640
|
+
all_ensemble_preds_for_std = []
|
|
641
|
+
for ens_model in ensemble_models:
|
|
642
|
+
result = ens_model.predict(df_val[features], include_input_features=False)
|
|
643
|
+
ens_preds = result[f"{target}_prediction"].values.flatten()
|
|
644
|
+
all_ensemble_preds_for_std.append(ens_preds)
|
|
645
|
+
|
|
646
|
+
ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
|
|
647
|
+
preds_std = np.std(ensemble_preds_stacked, axis=0)
|
|
648
|
+
print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
|
|
522
649
|
|
|
523
650
|
if model_type == "classifier":
|
|
524
651
|
# Get probabilities for classification
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
probs = result[prob_cols].values
|
|
529
|
-
df_val = df_val.copy() # Avoid SettingWithCopyWarning
|
|
530
|
-
df_val["pred_proba"] = [p.tolist() for p in probs]
|
|
531
|
-
|
|
532
|
-
# Expand the pred_proba column into separate columns for each class
|
|
533
|
-
print(df_val.columns.tolist())
|
|
652
|
+
if oof_proba is not None:
|
|
653
|
+
df_val = df_val.copy()
|
|
654
|
+
df_val["pred_proba"] = [p.tolist() for p in oof_proba]
|
|
534
655
|
df_val = expand_proba_column(df_val, label_encoder.classes_)
|
|
535
|
-
print(df_val.columns.tolist())
|
|
536
656
|
|
|
537
657
|
# Decode the target and prediction labels
|
|
538
658
|
y_validate = label_encoder.inverse_transform(df_val[target])
|
|
@@ -544,7 +664,22 @@ if __name__ == "__main__":
|
|
|
544
664
|
# Save predictions to S3
|
|
545
665
|
df_val = df_val.copy()
|
|
546
666
|
df_val["prediction"] = preds_decoded
|
|
547
|
-
|
|
667
|
+
|
|
668
|
+
# Build output columns - include id_column if it exists
|
|
669
|
+
output_columns = []
|
|
670
|
+
if id_column in df_val.columns:
|
|
671
|
+
output_columns.append(id_column)
|
|
672
|
+
output_columns += [target, "prediction"]
|
|
673
|
+
|
|
674
|
+
# Add prediction_std for regression models (always present, 0 for single model)
|
|
675
|
+
if model_type in ["regressor", "uq_regressor"]:
|
|
676
|
+
if preds_std is not None:
|
|
677
|
+
df_val["prediction_std"] = preds_std
|
|
678
|
+
else:
|
|
679
|
+
df_val["prediction_std"] = 0.0
|
|
680
|
+
output_columns.append("prediction_std")
|
|
681
|
+
print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
|
|
682
|
+
|
|
548
683
|
output_columns += [col for col in df_val.columns if col.endswith("_proba")]
|
|
549
684
|
wr.s3.to_csv(
|
|
550
685
|
df_val[output_columns],
|
|
@@ -589,14 +724,29 @@ if __name__ == "__main__":
|
|
|
589
724
|
# Calculate various model performance metrics (regression)
|
|
590
725
|
rmse = root_mean_squared_error(y_validate, preds_decoded)
|
|
591
726
|
mae = mean_absolute_error(y_validate, preds_decoded)
|
|
727
|
+
medae = median_absolute_error(y_validate, preds_decoded)
|
|
592
728
|
r2 = r2_score(y_validate, preds_decoded)
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
print(f"
|
|
596
|
-
print(f"
|
|
729
|
+
spearman_corr = spearmanr(y_validate, preds_decoded).correlation
|
|
730
|
+
support = len(df_val)
|
|
731
|
+
print(f"rmse: {rmse:.3f}")
|
|
732
|
+
print(f"mae: {mae:.3f}")
|
|
733
|
+
print(f"medae: {medae:.3f}")
|
|
734
|
+
print(f"r2: {r2:.3f}")
|
|
735
|
+
print(f"spearmanr: {spearman_corr:.3f}")
|
|
736
|
+
print(f"support: {support}")
|
|
737
|
+
|
|
738
|
+
# Save ensemble models
|
|
739
|
+
for model_idx, ens_model in enumerate(ensemble_models):
|
|
740
|
+
model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
|
|
741
|
+
ens_model.save_model(model_path)
|
|
742
|
+
print(f"Saved model {model_idx + 1} to {model_path}")
|
|
743
|
+
|
|
744
|
+
# Save ensemble metadata
|
|
745
|
+
n_ensemble = len(ensemble_models)
|
|
746
|
+
ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
|
|
747
|
+
joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
|
|
748
|
+
print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
|
|
597
749
|
|
|
598
|
-
# Save the model to the standard place/name
|
|
599
|
-
tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
|
|
600
750
|
if label_encoder:
|
|
601
751
|
joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
|
|
602
752
|
|