workbench-0.8.201-py3-none-any.whl → workbench-0.8.204-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. workbench/api/df_store.py +17 -108
  2. workbench/api/feature_set.py +41 -7
  3. workbench/api/parameter_store.py +3 -52
  4. workbench/core/artifacts/artifact.py +5 -5
  5. workbench/core/artifacts/df_store_core.py +114 -0
  6. workbench/core/artifacts/endpoint_core.py +184 -75
  7. workbench/core/artifacts/model_core.py +11 -7
  8. workbench/core/artifacts/parameter_store_core.py +98 -0
  9. workbench/core/transforms/features_to_model/features_to_model.py +27 -13
  10. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +11 -0
  11. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  12. workbench/model_scripts/chemprop/chemprop.template +312 -293
  13. workbench/model_scripts/chemprop/generated_model_script.py +316 -297
  14. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
  15. workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
  16. workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
  17. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
  18. workbench/model_scripts/pytorch_model/generated_model_script.py +278 -128
  19. workbench/model_scripts/pytorch_model/pytorch.template +273 -123
  20. workbench/model_scripts/uq_models/generated_model_script.py +20 -11
  21. workbench/model_scripts/uq_models/mapie.template +17 -8
  22. workbench/model_scripts/xgb_model/generated_model_script.py +38 -9
  23. workbench/model_scripts/xgb_model/xgb_model.template +34 -5
  24. workbench/resources/open_source_api.key +1 -1
  25. workbench/utils/chemprop_utils.py +38 -1
  26. workbench/utils/pytorch_utils.py +38 -8
  27. workbench/web_interface/components/model_plot.py +7 -1
  28. {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/METADATA +2 -2
  29. {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/RECORD +33 -33
  30. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  31. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  32. {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/WHEEL +0 -0
  33. {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/entry_points.txt +0 -0
  34. {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/licenses/LICENSE +0 -0
  35. {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/top_level.txt +0 -0
@@ -13,17 +13,19 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
 # Model Performance Scores
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr

 # Classification Encoder
 from sklearn.preprocessing import LabelEncoder

 # Scikit Learn Imports
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

 from io import StringIO
 import json
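The new imports above add median absolute error and Spearman rank correlation to the metrics the training script reports. A small standalone sketch (not part of the template) of what the two additions measure:

    import numpy as np
    from sklearn.metrics import mean_absolute_error, median_absolute_error
    from scipy.stats import spearmanr

    y_true = np.array([1.0, 2.0, 3.0, 4.0, 100.0])
    y_pred = np.array([1.1, 2.2, 2.9, 4.2, 10.0])  # one large miss

    print(mean_absolute_error(y_true, y_pred))    # dominated by the outlier (~18.1)
    print(median_absolute_error(y_true, y_pred))  # robust to the outlier (~0.2)
    print(spearmanr(y_true, y_pred).correlation)  # 1.0 -- rank order is preserved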
@@ -36,9 +38,9 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
+    "id_column": "{{id_column}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}",
     "hyperparameters": "{{hyperparameters}}",
 }

@@ -204,36 +206,57 @@ def decompress_features(
     return df, decompressed_features


-def model_fn(model_dir: str) -> TabularModel:
-    """Load the PyTorch Tabular model from the specified directory.
+def model_fn(model_dir: str) -> dict:
+    """Load the PyTorch Tabular ensemble models from the specified directory.

     Args:
-        model_dir: Directory containing the saved model
+        model_dir: Directory containing the saved model(s)

     Returns:
-        Loaded TabularModel instance
+        Dictionary with ensemble models and metadata
     """
+    import torch
+    from functools import partial
+
+    # Load ensemble metadata if present
+    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(ensemble_metadata_path):
+        ensemble_metadata = joblib.load(ensemble_metadata_path)
+        n_ensemble = ensemble_metadata["n_ensemble"]
+    else:
+        n_ensemble = 1
+
+    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
+    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
+    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
+    # which internally calls torch.load without map_location
+    original_torch_load = torch.load
+    torch.load = partial(original_torch_load, map_location=map_location)
+
     # Save current working directory
     original_cwd = os.getcwd()
+    ensemble_models = []
+
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
         os.chdir("/tmp")

-        # Remove callbacks.sav if it exists - it's not needed for inference and causes
-        # GPU->CPU loading issues (joblib.load doesn't support map_location)
-        model_path = os.path.join(model_dir, "tabular_model")
-        callbacks_path = os.path.join(model_path, "callbacks.sav")
-        if os.path.exists(callbacks_path):
-            os.remove(callbacks_path)
-
-        # Load the model (map_location="cpu" ensures GPU-trained models work on CPU endpoints)
-        model = TabularModel.load_model(model_path, map_location="cpu")
+        for ens_idx in range(n_ensemble):
+            # Try numbered model path first, fall back to legacy path
+            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
+            if not os.path.exists(model_path):
+                model_path = os.path.join(model_dir, "tabular_model")
+            model = TabularModel.load_model(model_path, map_location=map_location)
+            ensemble_models.append(model)

     finally:
-        # Restore the original working directory
+        # Restore torch.load and working directory
+        torch.load = original_torch_load
        os.chdir(original_cwd)

-    return model
+    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}


 def input_fn(input_data, content_type: str) -> pd.DataFrame:
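The rewritten model_fn monkey-patches torch.load so that anything loaded indirectly (such as the joblib-loaded callbacks) picks up the right map_location, then restores the original function. A self-contained sketch of that pattern (the checkpoint path here is illustrative, not one of the template's artifacts):

    import torch
    from functools import partial

    # Save a tiny checkpoint so the sketch is self-contained
    torch.save({"w": torch.zeros(3)}, "/tmp/example_ckpt.pt")

    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    original_torch_load = torch.load
    torch.load = partial(original_torch_load, map_location=map_location)
    try:
        # Any library code that calls torch.load(path) internally now lands on
        # the available device, even if the checkpoint was saved on a GPU.
        state = torch.load("/tmp/example_ckpt.pt")
        print(state["w"].device)
    finally:
        torch.load = original_torch_load  # always restore the global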
@@ -264,18 +287,23 @@ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
     raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


-def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model ensemble.

     Args:
         df (pd.DataFrame): The input DataFrame
-        model: The TabularModel use for predictions
+        model_dict: Dictionary containing ensemble models and metadata

     Returns:
-        pd.DataFrame: The DataFrame with the predictions added
+        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
     """
+    model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]

+    # Extract ensemble models
+    ensemble_models = model_dict["ensemble_models"]
+    n_ensemble = model_dict["n_ensemble"]
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
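predict_fn now receives the dictionary returned by model_fn instead of a single TabularModel. Roughly, the serving handlers chain as sketched below; emulate_request is a hypothetical helper (not part of the template), it assumes the handler functions from this script are in scope, and the order of the output_fn return values is assumed from its signature:

    def emulate_request(csv_payload: str, model_dir: str) -> str:
        # model_fn runs once per container start; the others run per request
        model_dict = model_fn(model_dir)
        df = input_fn(csv_payload, "text/csv")
        df = predict_fn(df, model_dict)
        body, accept = output_fn(df, "text/csv")  # assumed order: (payload, accept_type)
        return body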
@@ -308,8 +336,10 @@ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
     if missing_mask.any():
         print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")

-    # Initialize prediction column with NaN
+    # Initialize prediction columns
     df["prediction"] = np.nan
+    if model_type in ["regressor", "uq_regressor"]:
+        df["prediction_std"] = np.nan

     # Only predict on complete rows
     complete_df = matched_df[~missing_mask]
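For reference, one plausible way such a missing_mask is built and applied; the mask construction itself sits outside this hunk, so this is an assumption, and the "model" here is a stand-in:

    import numpy as np
    import pandas as pd

    features = ["f1", "f2"]
    df = pd.DataFrame({"f1": [1.0, np.nan, 3.0], "f2": [0.5, 0.6, np.nan]})
    missing_mask = df[features].isna().any(axis=1)  # assumed construction

    df["prediction"] = np.nan                 # incomplete rows stay NaN
    complete_df = df[~missing_mask]
    df.loc[~missing_mask, "prediction"] = complete_df["f1"] * 0.0 + 1.0  # stand-in model output
    print(df)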
@@ -317,37 +347,63 @@ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
         print("Warning: No complete rows to predict on")
         return df

-    # Make predictions using the TabularModel
-    result = model.predict(complete_df[features])
-
     # pytorch-tabular returns predictions using f"{target}_prediction" column
     target = TEMPLATE_PARAMS["target"]
     prediction_column = f"{target}_prediction"
-    if prediction_column in result.columns:
-        predictions = result[prediction_column].values
-    else:
-        raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")

-    # If we have a label encoder, decode the predictions
-    if label_encoder:
-        predictions = label_encoder.inverse_transform(predictions.astype(int))
+    # Collect predictions from all ensemble members
+    all_ensemble_preds = []
+    all_ensemble_probs = []

-    # Set predictions only for complete rows
-    df.loc[~missing_mask, "prediction"] = predictions
+    for ens_idx, ens_model in enumerate(ensemble_models):
+        result = ens_model.predict(complete_df[features])
+
+        if prediction_column in result.columns:
+            ens_preds = result[prediction_column].values
+        else:
+            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")

-    # For classification, get probabilities
+        all_ensemble_preds.append(ens_preds)
+
+        # For classification, collect probabilities
+        if label_encoder is not None:
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols:
+                all_ensemble_probs.append(result[prob_cols].values)
+
+    # Stack and compute mean/std (std is 0 for single model)
+    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    preds = np.mean(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+
+    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
+
+    # Handle classification vs regression
     if label_encoder is not None:
-        prob_cols = [col for col in result.columns if col.endswith("_probability")]
-        if prob_cols:
-            probs = result[prob_cols].values
+        # For classification, average probabilities then take argmax
+        if all_ensemble_probs:
+            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
+            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
+            class_preds = np.argmax(avg_probs, axis=1)
+            predictions = label_encoder.inverse_transform(class_preds)

             # Build full proba Series with None for missing rows
             all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
-            all_proba.loc[~missing_mask] = [p.tolist() for p in probs]
+            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
             df["pred_proba"] = all_proba

             # Expand the pred_proba column into separate columns for each class
             df = expand_proba_column(df, label_encoder.classes_)
+        else:
+            # No probabilities, use averaged predictions
+            predictions = label_encoder.inverse_transform(preds.astype(int))
+    else:
+        # Regression (includes uq_regressor)
+        predictions = preds
+        df.loc[~missing_mask, "prediction_std"] = preds_std
+
+    # Set predictions only for complete rows
+    df.loc[~missing_mask, "prediction"] = predictions

     return df

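A toy illustration of the aggregation the new predict_fn performs: stack the per-model outputs, then take mean/std for regression, or average the class probabilities and argmax for classification. All numbers are made up:

    import numpy as np

    # Regression: 3 ensemble members, 4 samples
    member_preds = np.array([
        [1.0, 2.0, 3.0, 4.0],
        [1.2, 1.9, 3.1, 4.4],
        [0.8, 2.1, 2.9, 3.6],
    ])
    preds = member_preds.mean(axis=0)      # point prediction
    preds_std = member_preds.std(axis=0)   # ensemble disagreement -> "prediction_std"
    print(preds, preds_std)

    # Classification: 2 members, 2 samples, 3 classes
    member_probs = np.array([
        [[0.7, 0.2, 0.1], [0.1, 0.3, 0.6]],
        [[0.5, 0.4, 0.1], [0.2, 0.2, 0.6]],
    ])
    avg_probs = member_probs.mean(axis=0)
    class_preds = avg_probs.argmax(axis=1)  # averaged probabilities, then argmax
    print(avg_probs, class_preds)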
@@ -359,12 +415,11 @@ if __name__ == "__main__":
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2

     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -423,72 +478,71 @@
     # Cast continuous columns to float
     all_df[continuous_cols] = all_df[continuous_cols].astype("float64")

-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]].copy()
-        df_val = all_df[~all_df["training"]].copy()
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-
-    # Set up PyTorch Tabular configuration
-    data_config = DataConfig(
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
-
-    # Choose the 'task' based on model type also set up the label encoder if needed
+    # Choose the 'task' based on model type and set up the label encoder if needed
     if model_type == "classifier":
         task = "classification"
-        # Encode the target column
+        # Encode the target column on full dataset for consistent encoding
         label_encoder = LabelEncoder()
-        df_train[target] = label_encoder.fit_transform(df_train[target])
-        df_val[target] = label_encoder.transform(df_val[target])
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        num_classes = len(label_encoder.classes_)
     else:
         task = "regression"
         label_encoder = None
+        num_classes = None

     # Use any hyperparameters to set up both the trainer and model configurations
     print(f"Hyperparameters: {hyperparameters}")
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+
+    # =========================================================================
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
+    # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+
+    # Create fold splits
+    if n_folds == 1:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
+            print("Found training column, splitting data based on training column")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+
+    # Initialize storage for out-of-fold predictions
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None

-    # Set up PyTorch Tabular configuration with defaults
-    trainer_defaults = {
-        "auto_lr_find": False,
-        "batch_size": min(128, max(32, len(df_train) // 16)),
-        "max_epochs": 100,
-        "min_epochs": 10,
-        "early_stopping": "valid_loss",
-        "early_stopping_patience": 10,
-        "checkpoints": "valid_loss",
-        "accelerator": "auto",
-        "progress_bar": "none",
-        "gradient_clip_val": 1.0,
-    }
+    ensemble_models = []

-    # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
-    for key, value in training_overrides.items():
-        print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
-    trainer_params = {**trainer_defaults, **training_overrides}
-    trainer_config = TrainerConfig(**trainer_params)
+    # Set up PyTorch Tabular data configuration (shared across folds)
+    data_config = DataConfig(
+        target=[target],
+        continuous_cols=continuous_cols,
+        categorical_cols=categorical_cols,
+    )

     # Model config defaults
     model_defaults = {
         "layers": "256-128-64",
         "activation": "LeakyReLU",
         "learning_rate": 1e-3,
-        "dropout": 0.3,
+        "dropout": 0.1,
         "use_batch_norm": True,
         "initialization": "kaiming",
     }
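The fold construction above replaces the old train_all_data / validation_split logic. A small sketch with synthetic data showing the StratifiedKFold vs KFold choice and the out-of-fold array sizing (column names here are synthetic, not the template's):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold, StratifiedKFold

    rng = np.random.default_rng(42)
    all_df = pd.DataFrame({"x": rng.normal(size=20), "y": [0, 1] * 10})
    n_folds, model_type, target = 5, "classifier", "y"

    if model_type == "classifier":
        kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
        split_target = all_df[target]   # preserves class balance per fold
    else:
        kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        split_target = None
    folds = list(kfold.split(all_df, split_target))

    oof_predictions = np.full(len(all_df), np.nan)
    for fold_idx, (train_idx, val_idx) in enumerate(folds):
        # ...train a model on all_df.iloc[train_idx] here...
        oof_predictions[val_idx] = 0.0  # placeholder for that fold's predictions
    assert not np.isnan(oof_predictions).any()  # every row predicted exactly once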
@@ -498,41 +552,107 @@
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
     model_params = {**model_defaults, **model_overrides}

-    # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()

-    #####################################
-    # Create and train the TabularModel #
-    #####################################
-    tabular_model = TabularModel(
-        data_config=data_config,
-        model_config=model_config,
-        optimizer_config=optimizer_config,
-        trainer_config=trainer_config,
-    )
-    tabular_model.fit(train=df_train, validation=df_val)
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
+
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+
+        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
+        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
+        batch_size = min(128, max(32, len(df_train) // 16))
+        if len(df_train) % batch_size == 1:
+            batch_size += 1  # Adjust to avoid last batch of size 1
+        trainer_defaults = {
+            "auto_lr_find": False,
+            "batch_size": batch_size,
+            "max_epochs": 200,
+            "min_epochs": 10,
+            "early_stopping": "valid_loss",
+            "early_stopping_patience": 20,
+            "checkpoints": "valid_loss",
+            "accelerator": "auto",
+            "progress_bar": "none",
+            "gradient_clip_val": 1.0,
+            "seed": 42 + fold_idx,
+        }
+
+        # Override defaults with training_config if present
+        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
+        if fold_idx == 0:  # Only print overrides once
+            for key, value in training_overrides.items():
+                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
+        trainer_params = {**trainer_defaults, **training_overrides}
+        trainer_config = TrainerConfig(**trainer_params)
+
+        # Create and train the TabularModel for this fold
+        tabular_model = TabularModel(
+            data_config=data_config,
+            model_config=model_config,
+            optimizer_config=optimizer_config,
+            trainer_config=trainer_config,
+        )
+        tabular_model.fit(train=df_train, validation=df_val)
+        ensemble_models.append(tabular_model)
+
+        # Make out-of-fold predictions
+        result = tabular_model.predict(df_val, include_input_features=False)
+        fold_preds = result[f"{target}_prediction"].values
+
+        # Store out-of-fold predictions
+        if model_type == "classifier":
+            oof_predictions[val_idx] = fold_preds.astype(int)
+            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
+            if prob_cols and oof_proba is not None:
+                oof_proba[val_idx] = result[prob_cols].values
+        else:
+            oof_predictions[val_idx] = fold_preds.flatten()

-    # Make Predictions on the Validation Set
-    print("Making Predictions on Validation Set...")
-    result = tabular_model.predict(df_val, include_input_features=False)
+        print(f"Fold {fold_idx + 1} complete!")

-    # pytorch-tabular returns predictions using f"{target}_prediction" column
-    preds = result[f"{target}_prediction"].values
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        val_mask = ~np.isnan(oof_predictions)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        preds = oof_predictions
+        df_val = all_df.copy()
+
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        all_ensemble_preds_for_std = []
+        for ens_model in ensemble_models:
+            result = ens_model.predict(df_val[features], include_input_features=False)
+            ens_preds = result[f"{target}_prediction"].values.flatten()
+            all_ensemble_preds_for_std.append(ens_preds)
+
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")

     if model_type == "classifier":
         # Get probabilities for classification
-        print("Processing Probabilities...")
-        prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
-        if prob_cols:
-            probs = result[prob_cols].values
-            df_val = df_val.copy()  # Avoid SettingWithCopyWarning
-            df_val["pred_proba"] = [p.tolist() for p in probs]
-
-            # Expand the pred_proba column into separate columns for each class
-            print(df_val.columns.tolist())
+        if oof_proba is not None:
+            df_val = df_val.copy()
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_encoder.classes_)
-            print(df_val.columns.tolist())

         # Decode the target and prediction labels
         y_validate = label_encoder.inverse_transform(df_val[target])
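The per-fold batch-size tweak above avoids a final batch of size 1, which batch norm cannot handle. A quick check of that adjustment in isolation:

    def safe_batch_size(n_train: int) -> int:
        batch_size = min(128, max(32, n_train // 16))
        if n_train % batch_size == 1:
            batch_size += 1  # bump so the last batch never has exactly one sample
        return batch_size

    for n in (513, 640, 2049, 4097):
        bs = safe_batch_size(n)
        # last column: remainder of the final batch (0 means it divides evenly; never 1)
        print(n, bs, n % bs)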
@@ -544,7 +664,22 @@
     # Save predictions to S3
     df_val = df_val.copy()
     df_val["prediction"] = preds_decoded
-    output_columns = [target, "prediction"]
+
+    # Build output columns - include id_column if it exists
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
+
+    # Add prediction_std for regression models (always present, 0 for single model)
+    if model_type in ["regressor", "uq_regressor"]:
+        if preds_std is not None:
+            df_val["prediction_std"] = preds_std
+        else:
+            df_val["prediction_std"] = 0.0
+        output_columns.append("prediction_std")
+        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+
     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
     wr.s3.to_csv(
         df_val[output_columns],
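Sketch of how the output-column list above comes together, using a toy validation frame (column names like compound_id and solubility are hypothetical; the template writes the result with wr.s3.to_csv):

    import pandas as pd

    id_column, target, model_type = "compound_id", "solubility", "uq_regressor"
    df_val = pd.DataFrame({
        "compound_id": ["a", "b"], "solubility": [1.0, 2.0],
        "prediction": [1.1, 1.9], "prediction_std": [0.05, 0.2],
    })

    output_columns = []
    if id_column in df_val.columns:
        output_columns.append(id_column)
    output_columns += [target, "prediction"]
    if model_type in ["regressor", "uq_regressor"]:
        output_columns.append("prediction_std")
    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
    print(df_val[output_columns])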
@@ -589,14 +724,29 @@
         # Calculate various model performance metrics (regression)
         rmse = root_mean_squared_error(y_validate, preds_decoded)
         mae = mean_absolute_error(y_validate, preds_decoded)
+        medae = median_absolute_error(y_validate, preds_decoded)
         r2 = r2_score(y_validate, preds_decoded)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
+        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
+        support = len(df_val)
+        print(f"rmse: {rmse:.3f}")
+        print(f"mae: {mae:.3f}")
+        print(f"medae: {medae:.3f}")
+        print(f"r2: {r2:.3f}")
+        print(f"spearmanr: {spearman_corr:.3f}")
+        print(f"support: {support}")
+
+    # Save ensemble models
+    for model_idx, ens_model in enumerate(ensemble_models):
+        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
+        ens_model.save_model(model_path)
+        print(f"Saved model {model_idx + 1} to {model_path}")
+
+    # Save ensemble metadata
+    n_ensemble = len(ensemble_models)
+    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
+    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")

-    # Save the model to the standard place/name
-    tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

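The training script now writes one tabular_model_&lt;i&gt; directory per fold plus an ensemble_metadata.joblib that model_fn reads back. A minimal sketch of that artifact layout, using a temporary directory in place of SageMaker's model_dir:

    import os
    import tempfile
    import joblib

    model_dir = tempfile.mkdtemp()
    n_ensemble, n_folds = 5, 5

    # Training side: one "tabular_model_<i>" directory per fold plus metadata
    for i in range(n_ensemble):
        os.makedirs(os.path.join(model_dir, f"tabular_model_{i}"), exist_ok=True)
    joblib.dump({"n_ensemble": n_ensemble, "n_folds": n_folds},
                os.path.join(model_dir, "ensemble_metadata.joblib"))

    # Inference side: model_fn reads the metadata to know how many models to load
    meta = joblib.load(os.path.join(model_dir, "ensemble_metadata.joblib"))
    print(meta["n_ensemble"], sorted(os.listdir(model_dir)))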
@@ -5,7 +5,8 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
+from scipy.stats import spearmanr

 from io import StringIO
 import json
@@ -18,11 +19,11 @@ from typing import List, Tuple, Optional, Dict
 
  # Template Placeholders
  TEMPLATE_PARAMS = {
- "target": "caco_2_efflux",
- "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+ "target": "udm_asy_res_efflux_ratio",
+ "features": ['smr_vsa4', 'tpsa', 'nhohcount', 'mollogp', 'peoe_vsa1', 'smr_vsa3', 'nitrogen_span', 'numhdonors', 'minpartialcharge', 'vsa_estate3', 'vsa_estate6', 'tertiary_amine_count', 'hba_hbd_ratio', 'peoe_vsa8', 'estate_vsa4', 'xc_4dv', 'vsa_estate2', 'molmr', 'xp_2dv', 'mi', 'molecular_axis_length', 'vsa_estate4', 'xp_6dv', 'qed', 'estate_vsa8', 'chi1v', 'asphericity', 'axp_1d', 'bcut2d_logphi', 'kappa3', 'axp_7d', 'num_s_centers', 'amphiphilic_moment', 'molecular_asymmetry', 'charge_centroid_distance', 'estate_vsa3', 'vsa_estate8', 'aromatic_interaction_score', 'molecular_volume_3d', 'axp_7dv', 'peoe_vsa3', 'smr_vsa6', 'bcut2d_mrhi', 'radius_of_gyration', 'xpc_4dv', 'minabsestateindex', 'axp_0dv', 'chi4n', 'balabanj', 'bcut2d_mwlow', 'estate_vsa2', 'axp_5d', 'maxestateindex', 'bcut2d_mrlow', 'type_ii_pattern_count', 'avgipc', 'slogp_vsa1', 'fr_nhpyrrole', 'xch_7d', 'axp_1dv', 'peoe_vsa9', 'xch_6d', 'xch_5dv', 'bcut2d_chglo', 'fpdensitymorgan1', 'fr_al_oh', 'axp_5dv', 'smr_vsa5', 'chi2v', 'estate_vsa6', 'smr_vsa9', 'minestateindex', 'bcut2d_logplow', 'c3sp3', 'xp_3d', 'vsa_estate9', 'nbase', 'peoe_vsa2', 'numatomstereocenters', 'xc_5dv', 'bcut2d_mwhi', 'nocount', 'slogp_vsa2', 'smr_vsa1', 'axp_6d', 'maxabspartialcharge', 'vsa_estate5', 'fpdensitymorgan2', 'xp_7d', 'peoe_vsa10', 'num_r_centers', 'mv', 'vsa_estate10', 'xp_0dv', 'axp_4d', 'fractioncsp3', 'smr_vsa10', 'xp_7dv', 'xp_4dv', 'hallkieralpha', 'numhacceptors', 'axp_3d', 'vsa_estate7', 'slogp_vsa3', 'peoe_vsa7', 'estate_vsa10', 'axp_2d', 'c1sp3', 'axp_2dv', 'slogp_vsa4', 'estate_vsa9', 'xch_7dv', 'sps', 'chi0n', 'axp_6dv', 'fr_imidazole', 'xpc_4d', 'bcut2d_chghi', 'chi3n', 'peoe_vsa11', 'xpc_6d', 'estate_vsa1', 'xch_6dv', 'chi3v', 'axp_3dv', 'xc_5d', 'slogp_vsa5', 'maxpartialcharge', 'estate_vsa5', 'fr_hoccn', 'heavyatommolwt', 'fr_ar_n', 'mz', 'xpc_5d', 'axp_4dv', 'xc_3dv', 'xp_6d', 'bertzct', 'peoe_vsa6', 'xc_3d', 'c2sp2', 'fpdensitymorgan3', 'xpc_5dv', 'intramolecular_hbond_potential', 'molwt', 'estate_vsa7', 'xp_5d', 'kappa1', 'xp_5dv', 'chi2n', 'axp_0d', 'xch_4dv', 'xp_4d', 'mp', 'chi1n', 'mm', 'fr_nh0', 'phi', 'labuteasa', 'xp_3dv', 'mse', 'xc_6dv', 'fr_piperzine', 'chi4v', 'xch_4d', 'fr_pyridine', 'xp_2d', 'num_stereocenters', 'minabspartialcharge', 'numaliphaticheterocycles', 'chi0v', 'type_i_pattern_count', 'fr_ketone_topliss', 'fr_ether', 'fr_priamide', 'num_defined_stereocenters', 'mare', 'peoe_vsa12', 'numheteroatoms', 'peoe_vsa4', 'peoe_vsa13', 'hybratio', 'numheterocycles', 'sse', 'fr_c_o_nocoo', 'fr_thiazole', 'slogp_vsa6', 'smr_vsa7', 'vsa_estate1', 'xch_5d', 'smr_vsa2', 'amide_count', 'fr_amide', 'chi1', 'fr_urea', 'fr_aniline', 'fr_aryl_methyl', 'fr_piperdine', 'numrotatablebonds', 'fr_nitrile', 'si', 'sp', 'fr_morpholine', 'numsaturatedheterocycles', 'kappa2', 'chi0', 'mpe', 'sare', 'numaromaticheterocycles', 'naromatom', 'xpc_6dv', 'fr_nh1', 'slogp_vsa10', 'numsaturatedrings', 'c2sp3', 'slogp_vsa8', 'c1sp2', 'fr_ndealkylation2', 'spe', 'xc_6d', 'slogp_vsa11', 'numaliphaticcarbocycles', 'frac_defined_stereo', 'numunspecifiedatomstereocenters', 'fr_benzene', 'xp_1d', 'xc_4d', 'fr_methoxy', 'c4sp3', 'numvalenceelectrons', 'fr_bicyclic', 'fr_imine', 'fr_sulfone', 'c3sp2', 'ringcount', 'slogp_vsa7', 'exactmolwt', 'fr_al_oh_notert', 'fr_tetrazole', 'peoe_vsa14', 'xch_3d', 'fr_para_hydroxylation', 'numspiroatoms'],
  "compressed_features": [],
  "train_all_data": True,
- "hyperparameters": {},
+ "hyperparameters": {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.04},
  }
 
 
@@ -277,10 +278,15 @@
     xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
     xgb_r2 = r2_score(y_validate, y_pred_xgb)

+    xgb_medae = median_absolute_error(y_validate, y_pred_xgb)
+    xgb_spearman = spearmanr(y_validate, y_pred_xgb).correlation
+
     print(f"\nXGBoost Point Prediction Performance:")
-    print(f"RMSE: {xgb_rmse:.3f}")
-    print(f"MAE: {xgb_mae:.3f}")
-    print(f"R2: {xgb_r2:.3f}")
+    print(f"rmse: {xgb_rmse:.3f}")
+    print(f"mae: {xgb_mae:.3f}")
+    print(f"medae: {xgb_medae:.3f}")
+    print(f"r2: {xgb_r2:.3f}")
+    print(f"spearmanr: {xgb_spearman:.3f}")

     # Define confidence levels we want to model
     confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals
@@ -336,11 +342,14 @@
         coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
         print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")

+    support = len(df_val)
     print(f"\nOverall Model Performance Summary:")
-    print(f"XGBoost RMSE: {xgb_rmse:.3f}")
-    print(f"XGBoost MAE: {xgb_mae:.3f}")
-    print(f"XGBoost R2: {xgb_r2:.3f}")
-    print(f"NumRows: {len(df_val)}")
+    print(f"rmse: {xgb_rmse:.3f}")
+    print(f"mae: {xgb_mae:.3f}")
+    print(f"medae: {xgb_medae:.3f}")
+    print(f"r2: {xgb_r2:.3f}")
+    print(f"spearmanr: {xgb_spearman:.3f}")
+    print(f"support: {support}")

     # Analyze interval widths across confidence levels
     print(f"\nInterval Width Analysis:")
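The coverage printout above compares the nominal confidence level with empirical coverage of the prediction intervals. A toy version of that check with synthetic data and hypothetical symmetric 90% intervals:

    import numpy as np

    rng = np.random.default_rng(0)
    y_true = rng.normal(size=1000)
    # Noisy point predictions with known error scale sigma = 0.1
    y_pred = y_true + rng.normal(scale=0.1, size=1000)
    half_width = 1.645 * 0.1  # two-sided ~90% interval for Gaussian error with sigma = 0.1
    lower, upper = y_pred - half_width, y_pred + half_width
    coverage = np.mean((y_true >= lower) & (y_true <= upper))
    print(f"Coverage: Target=90%, Empirical={coverage * 100:.1f}%")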