workbench-0.8.201-py3-none-any.whl → workbench-0.8.203-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. workbench/api/df_store.py +17 -108
  2. workbench/api/feature_set.py +41 -7
  3. workbench/api/parameter_store.py +3 -52
  4. workbench/core/artifacts/artifact.py +5 -5
  5. workbench/core/artifacts/df_store_core.py +114 -0
  6. workbench/core/artifacts/endpoint_core.py +203 -58
  7. workbench/core/artifacts/model_core.py +11 -7
  8. workbench/core/artifacts/parameter_store_core.py +98 -0
  9. workbench/core/transforms/features_to_model/features_to_model.py +27 -13
  10. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  11. workbench/model_scripts/chemprop/chemprop.template +297 -295
  12. workbench/model_scripts/chemprop/generated_model_script.py +300 -298
  13. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
  14. workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
  15. workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
  16. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +278 -128
  18. workbench/model_scripts/pytorch_model/pytorch.template +273 -123
  19. workbench/model_scripts/uq_models/generated_model_script.py +19 -10
  20. workbench/model_scripts/uq_models/mapie.template +17 -8
  21. workbench/model_scripts/xgb_model/generated_model_script.py +38 -9
  22. workbench/model_scripts/xgb_model/xgb_model.template +34 -5
  23. workbench/resources/open_source_api.key +1 -1
  24. workbench/utils/chemprop_utils.py +38 -1
  25. workbench/utils/pytorch_utils.py +38 -8
  26. workbench/web_interface/components/model_plot.py +7 -1
  27. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/METADATA +2 -2
  28. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/RECORD +32 -32
  29. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  30. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  31. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
  32. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +0 -0
  33. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
  34. {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
@@ -39,11 +39,13 @@ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 import joblib
 
 # ChemProp imports
@@ -51,12 +53,12 @@ from chemprop import data, models, nn
 
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "classifier",
-    "target": "solubility_class",
+    "model_type": "regressor",
+    "targets": ['logd', 'ksol', 'hlm_clint', 'mlm_clint', 'caco_2_papp_a_b', 'caco_2_efflux', 'mppb', 'mbpb', 'mgmb'],  # List of target columns (single or multi-task)
     "feature_list": ['smiles'],
-    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-chemprop-class/training",
-    "train_all_data": False,
-    "hyperparameters": {'max_epochs': 400, 'hidden_dim': 300, 'depth': 3, 'n_folds': 5},
+    "id_column": "molecule_name",
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/open-admet-chemprop-mt/training",
+    "hyperparameters": {},
 }
 
 
@@ -108,14 +110,14 @@ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFra
 
 def create_molecule_datapoints(
     smiles_list: list[str],
-    targets: list[float] | None = None,
+    targets: list[float] | np.ndarray | None = None,
     extra_descriptors: np.ndarray | None = None,
 ) -> tuple[list[data.MoleculeDatapoint], list[int]]:
     """Create ChemProp MoleculeDatapoints from SMILES strings.
 
     Args:
         smiles_list: List of SMILES strings
-        targets: Optional list of target values (for training)
+        targets: Optional target values as 2D array (n_samples, n_targets). NaN allowed for missing targets.
         extra_descriptors: Optional array of extra features (n_samples, n_features)
 
     Returns:
@@ -127,6 +129,12 @@ def create_molecule_datapoints(
     valid_indices = []
     invalid_count = 0
 
+    # Convert targets to 2D array if provided
+    if targets is not None:
+        targets = np.atleast_2d(np.array(targets))
+        if targets.shape[0] == 1 and len(smiles_list) > 1:
+            targets = targets.T  # Shape was (1, n_samples), transpose to (n_samples, 1)
+
     for i, smi in enumerate(smiles_list):
         # Validate SMILES with RDKit first
         mol = Chem.MolFromSmiles(smi)
@@ -134,8 +142,9 @@ def create_molecule_datapoints(
             invalid_count += 1
             continue
 
-        # Build datapoint with optional target and extra descriptors
-        y = [targets[i]] if targets is not None else None
+        # Build datapoint with optional target(s) and extra descriptors
+        # For multi-task, y is a list of values (can include NaN for missing targets)
+        y = targets[i].tolist() if targets is not None else None
         x_d = extra_descriptors[i] if extra_descriptors is not None else None
 
         dp = data.MoleculeDatapoint.from_smi(smi, y=y, x_d=x_d)
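
The multi-task target handling above comes down to a small amount of NumPy reshaping. Below is a standalone sketch of the convention the new `create_molecule_datapoints` expects; the SMILES strings and values are made up and it is illustrative only:

```python
# Sketch of the target-shaping convention used above (illustrative, not part of the template).
import numpy as np

smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]
# Multi-task: one row per molecule, one column per target; NaN marks a missing label.
targets = np.array([[1.2, np.nan], [0.7, 3.4], [np.nan, 2.1]], dtype=np.float32)

targets = np.atleast_2d(np.array(targets))
if targets.shape[0] == 1 and len(smiles_list) > 1:
    targets = targets.T  # a flat single-task list arrives as (1, n_samples)

for i, smi in enumerate(smiles_list):
    y = targets[i].tolist()  # per-molecule list, e.g. [1.2, nan]; NaN entries mark missing targets
    print(smi, y)
```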
@@ -152,9 +161,11 @@ def build_mpnn_model(
     hyperparameters: dict,
     task: str = "regression",
     num_classes: int | None = None,
+    n_targets: int = 1,
     n_extra_descriptors: int = 0,
     x_d_transform: nn.ScaleTransform | None = None,
     output_transform: nn.UnscaleTransform | None = None,
+    task_weights: np.ndarray | None = None,
 ) -> models.MPNN:
     """Build an MPNN model with the specified hyperparameters.
 
@@ -162,19 +173,21 @@ def build_mpnn_model(
         hyperparameters: Dictionary of model hyperparameters
         task: Either "regression" or "classification"
         num_classes: Number of classes for classification tasks
+        n_targets: Number of target columns (for multi-task regression)
         n_extra_descriptors: Number of extra descriptor features (for hybrid mode)
         x_d_transform: Optional transform for extra descriptors (scaling)
         output_transform: Optional transform for regression output (unscaling targets)
+        task_weights: Optional array of weights for each task (multi-task learning)
 
     Returns:
         Configured MPNN model
     """
     # Model hyperparameters with defaults
-    hidden_dim = hyperparameters.get("hidden_dim", 300)
-    depth = hyperparameters.get("depth", 3)
-    dropout = hyperparameters.get("dropout", 0.0)
-    ffn_hidden_dim = hyperparameters.get("ffn_hidden_dim", 300)
-    ffn_num_layers = hyperparameters.get("ffn_num_layers", 1)
+    hidden_dim = hyperparameters.get("hidden_dim", 700)
+    depth = hyperparameters.get("depth", 6)
+    dropout = hyperparameters.get("dropout", 0.25)
+    ffn_hidden_dim = hyperparameters.get("ffn_hidden_dim", 2000)
+    ffn_num_layers = hyperparameters.get("ffn_num_layers", 2)
 
     # Message passing component
     mp = nn.BondMessagePassing(d_h=hidden_dim, depth=depth, dropout=dropout)
@@ -197,12 +210,20 @@ def build_mpnn_model(
         )
     else:
         # Regression with optional output transform to unscale predictions
+        # n_tasks controls the number of output heads for multi-task learning
+        # task_weights goes here (in RegressionFFN) to weight loss per task
+        weights_tensor = None
+        if task_weights is not None:
+            weights_tensor = torch.tensor(task_weights, dtype=torch.float32)
+
         ffn = nn.RegressionFFN(
             input_dim=ffn_input_dim,
             hidden_dim=ffn_hidden_dim,
             n_layers=ffn_num_layers,
             dropout=dropout,
+            n_tasks=n_targets,
             output_transform=output_transform,
+            task_weights=weights_tensor,
         )
 
     # Create the MPNN model
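
For readers unfamiliar with per-task loss weighting, the sketch below shows the general idea of a NaN-masked, task-weighted regression loss. It is not ChemProp's internal `RegressionFFN` implementation; it only illustrates what `task_weights` and the NaN-target convention accomplish:

```python
# Conceptual sketch only: NaN-masked, per-task weighted MSE (not ChemProp internals).
import torch

def masked_weighted_mse(preds: torch.Tensor, targets: torch.Tensor, task_weights: torch.Tensor) -> torch.Tensor:
    """preds/targets: (batch, n_tasks); NaN targets are excluded from the loss."""
    mask = ~torch.isnan(targets)                        # which (sample, task) pairs carry labels
    sq_err = (preds - torch.nan_to_num(targets)) ** 2   # squared error, NaN replaced by 0
    weighted = sq_err * task_weights                     # up-weight under-sampled tasks
    return (weighted * mask).sum() / mask.sum().clamp(min=1)

preds = torch.tensor([[0.5, 1.0], [2.0, 0.0]])
targets = torch.tensor([[0.0, float("nan")], [2.5, 0.5]])
weights = torch.tensor([1.0, 3.0])
print(masked_weighted_mse(preds, targets, weights))
```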
@@ -227,31 +248,26 @@ def model_fn(model_dir: str) -> dict:
     Returns:
         Dictionary with ensemble models and metadata
     """
-    # Load ensemble metadata
+    # Load ensemble metadata (required)
     ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
-    if os.path.exists(ensemble_metadata_path):
-        ensemble_metadata = joblib.load(ensemble_metadata_path)
-        n_ensemble = ensemble_metadata["n_ensemble"]
-    else:
-        # Backwards compatibility: single model without ensemble metadata
-        n_ensemble = 1
+    ensemble_metadata = joblib.load(ensemble_metadata_path)
+    n_ensemble = ensemble_metadata["n_ensemble"]
+    target_columns = ensemble_metadata["target_columns"]
 
     # Load all ensemble models
     ensemble_models = []
     for ens_idx in range(n_ensemble):
         model_path = os.path.join(model_dir, f"chemprop_model_{ens_idx}.pt")
-        if not os.path.exists(model_path):
-            # Backwards compatibility: try old single model path
-            model_path = os.path.join(model_dir, "chemprop_model.pt")
         model = models.MPNN.load_from_file(model_path)
         model.eval()
         ensemble_models.append(model)
 
-    print(f"Loaded {len(ensemble_models)} ensemble model(s)")
+    print(f"Loaded {len(ensemble_models)} ensemble model(s), n_targets={len(target_columns)}")
 
     return {
         "ensemble_models": ensemble_models,
         "n_ensemble": n_ensemble,
+        "target_columns": target_columns,
     }
 
 
@@ -297,9 +313,10 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
     model_type = TEMPLATE_PARAMS["model_type"]
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
 
-    # Extract ensemble models
+    # Extract ensemble models and metadata
     ensemble_models = model_dict["ensemble_models"]
     n_ensemble = model_dict["n_ensemble"]
+    target_columns = model_dict["target_columns"]
 
     # Load label encoder if present (classification)
     label_encoder = None
@@ -337,13 +354,14 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
     valid_mask = np.array(valid_mask)
     print(f"Valid SMILES: {sum(valid_mask)} / {len(smiles_list)}")
 
-    # Initialize prediction column (use object dtype for classifiers to avoid FutureWarning)
+    # Initialize prediction columns (use object dtype for classifiers to avoid FutureWarning)
     if model_type == "classifier":
         df["prediction"] = pd.Series([None] * len(df), dtype=object)
     else:
-        df["prediction"] = np.nan
-        if n_ensemble > 1:
-            df["prediction_std"] = np.nan
+        # Regression: create prediction column for each target
+        for tc in target_columns:
+            df[f"{tc}_pred"] = np.nan
+            df[f"{tc}_pred_std"] = np.nan
 
     if sum(valid_mask) == 0:
         print("Warning: No valid SMILES to predict on")
@@ -408,10 +426,15 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
             ens_preds = ens_preds.squeeze(axis=1)
         all_ensemble_preds.append(ens_preds)
 
-    # Stack and compute mean/std
+    # Stack and compute mean/std (std is 0 for single model)
     ensemble_preds = np.stack(all_ensemble_preds, axis=0)
     preds = np.mean(ensemble_preds, axis=0)
-    preds_std = np.std(ensemble_preds, axis=0) if n_ensemble > 1 else None
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+
+    # Ensure 2D: (n_samples, n_targets)
+    if preds.ndim == 1:
+        preds = preds.reshape(-1, 1)
+        preds_std = preds_std.reshape(-1, 1)
 
     print(f"Inference: Ensemble predictions shape: {preds.shape}")
 
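
A quick shape check of the ensemble aggregation used above, with illustrative random values:

```python
# Shape sketch for the ensemble mean/std computed in predict_fn (illustrative only).
import numpy as np

n_ensemble, n_samples, n_targets = 5, 3, 2
all_ensemble_preds = [np.random.rand(n_samples, n_targets) for _ in range(n_ensemble)]

ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples, n_targets)
preds = np.mean(ensemble_preds, axis=0)                # (n_samples, n_targets)
preds_std = np.std(ensemble_preds, axis=0)             # (n_samples, n_targets), zeros if n_ensemble == 1

# A single-task model may return a flat vector; the template reshapes it to (n_samples, 1)
flat = np.random.rand(n_samples)
print(flat.reshape(-1, 1).shape)  # (3, 1)
```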
 
@@ -440,12 +463,10 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
440
463
  decoded_preds = label_encoder.inverse_transform(class_preds)
441
464
  df.loc[valid_mask, "prediction"] = decoded_preds
442
465
  else:
443
- # Regression: direct predictions
444
- df.loc[valid_mask, "prediction"] = preds.flatten()
445
-
446
- # Add prediction_std for ensemble models
447
- if preds_std is not None:
448
- df.loc[valid_mask, "prediction_std"] = preds_std.flatten()
466
+ # Regression: store predictions for each target
467
+ for t_idx, tc in enumerate(target_columns):
468
+ df.loc[valid_mask, f"{tc}_pred"] = preds[:, t_idx]
469
+ df.loc[valid_mask, f"{tc}_pred_std"] = preds_std[:, t_idx]
449
470
 
450
471
  return df
451
472
 
@@ -454,13 +475,18 @@ if __name__ == "__main__":
     """Training script for ChemProp MPNN model"""
 
     # Template Parameters
-    target = TEMPLATE_PARAMS["target"]
+    target_columns = TEMPLATE_PARAMS["targets"]  # List of target columns
     model_type = TEMPLATE_PARAMS["model_type"]
     feature_list = TEMPLATE_PARAMS["feature_list"]
+    id_column = TEMPLATE_PARAMS["id_column"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2
+
+    # Validate target_columns
+    if not target_columns or not isinstance(target_columns, list) or len(target_columns) == 0:
+        raise ValueError("'targets' must be a non-empty list of target column names")
+    n_targets = len(target_columns)
+    print(f"Target columns ({n_targets}): {target_columns}")
 
     # Get the SMILES column name from feature_list (user defines this, so we use their exact name)
     smiles_column = find_smiles_column(feature_list)
@@ -502,21 +528,29 @@ if __name__ == "__main__":
 
     check_dataframe(all_df, "training_df")
 
-    # Drop rows with missing SMILES or target values
+    # Drop rows with missing SMILES or all target values
     initial_count = len(all_df)
-    all_df = all_df.dropna(subset=[smiles_column, target])
+    all_df = all_df.dropna(subset=[smiles_column])
+    # Keep rows that have at least one non-null target (works for single and multi-task)
+    has_any_target = all_df[target_columns].notna().any(axis=1)
+    all_df = all_df[has_any_target]
     dropped = initial_count - len(all_df)
     if dropped > 0:
-        print(f"Dropped {dropped} rows with missing SMILES or target values")
+        print(f"Dropped {dropped} rows with missing SMILES or all target values")
 
-    print(f"Target: {target}")
+    print(f"Target columns: {target_columns}")
     print(f"Data Shape after cleaning: {all_df.shape}")
+    for tc in target_columns:
+        n_valid = all_df[tc].notna().sum()
+        print(f"  {tc}: {n_valid} samples with values")
 
-    # Set up label encoder for classification
+    # Set up label encoder for classification (single-target only)
     label_encoder = None
     if model_type == "classifier":
+        if n_targets > 1:
+            raise ValueError("Multi-task classification is not supported. Use regression for multi-task.")
         label_encoder = LabelEncoder()
-        all_df[target] = label_encoder.fit_transform(all_df[target])
+        all_df[target_columns[0]] = label_encoder.fit_transform(all_df[target_columns[0]])
         num_classes = len(label_encoder.classes_)
         print(
             f"Classification task with {num_classes} classes: {label_encoder.classes_}"
@@ -528,10 +562,10 @@ if __name__ == "__main__":
     print(f"Hyperparameters: {hyperparameters}")
     task = "classification" if model_type == "classifier" else "regression"
     n_extra = len(extra_feature_cols) if use_extra_features else 0
-    max_epochs = hyperparameters.get("max_epochs", 50)
-    patience = hyperparameters.get("patience", 10)
-    n_folds = hyperparameters.get("n_folds", 1)  # Number of CV folds (default: 1 = no CV)
-    batch_size = hyperparameters.get("batch_size", min(64, max(16, len(all_df) // 16)))
+    max_epochs = hyperparameters.get("max_epochs", 400)
+    patience = hyperparameters.get("patience", 40)
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    batch_size = hyperparameters.get("batch_size", 16)
 
     # Check extra feature columns exist
     if use_extra_features:
@@ -540,60 +574,108 @@ if __name__ == "__main__":
             raise ValueError(f"Missing extra feature columns in training data: {missing_cols}")
 
     # =========================================================================
-    # SINGLE MODEL TRAINING (n_folds=1) - uses train/val split
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
     # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+
+    # Prepare extra features and validate SMILES upfront
+    all_extra_features = None
+    col_means = None
+    if use_extra_features:
+        all_extra_features = all_df[extra_feature_cols].values.astype(np.float32)
+        col_means = np.nanmean(all_extra_features, axis=0)
+        for i in range(all_extra_features.shape[1]):
+            all_extra_features[np.isnan(all_extra_features[:, i]), i] = col_means[i]
+
+    # Prepare target array: always 2D (n_samples, n_targets)
+    all_targets = all_df[target_columns].values.astype(np.float32)
+
+    # Filter invalid SMILES from the full dataset
+    _, valid_indices = create_molecule_datapoints(
+        all_df[smiles_column].tolist(), all_targets, all_extra_features
+    )
+    all_df = all_df.iloc[valid_indices].reset_index(drop=True)
+    all_targets = all_targets[valid_indices]
+    if all_extra_features is not None:
+        all_extra_features = all_extra_features[valid_indices]
+    print(f"Data after SMILES validation: {all_df.shape}")
+
+    # Compute dynamic task weights for multi-task regression
+    # Weight = inverse of sample count (normalized so min weight = 1.0)
+    # This gives higher weight to targets with fewer samples
+    task_weights = None
+    if n_targets > 1 and model_type != "classifier":
+        sample_counts = np.array([np.sum(~np.isnan(all_targets[:, t])) for t in range(n_targets)])
+        # Inverse weighting: fewer samples = higher weight
+        inverse_counts = 1.0 / sample_counts
+        # Normalize so minimum weight is 1.0
+        task_weights = inverse_counts / inverse_counts.min()
+        print(f"Task weights (inverse sample count):")
+        for t_idx, t_name in enumerate(target_columns):
+            print(f"  {t_name}: {task_weights[t_idx]:.3f} (n={sample_counts[t_idx]})")
+
+    # Create fold splits
     if n_folds == 1:
-        print("Training single model (no cross-validation)...")
-
-        # Split data
-        if train_all_data:
-            print("Training on ALL of the data")
-            df_train = all_df.copy()
-            df_val = all_df.copy()
-        elif "training" in all_df.columns:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
             print("Found training column, splitting data based on training column")
-            df_train = all_df[all_df["training"]].copy()
-            df_val = all_df[~all_df["training"]].copy()
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
         else:
-            print("WARNING: No training column found, splitting data with random state=42")
-            df_train, df_val = train_test_split(
-                all_df, test_size=validation_split, random_state=42
-            )
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target_columns[0]]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+
+    # Initialize storage for out-of-fold predictions: always 2D (n_samples, n_targets)
+    oof_predictions = np.full((len(all_df), n_targets), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None
 
-        print(f"TRAIN: {df_train.shape}")
-        print(f"VALIDATION: {df_val.shape}")
+    ensemble_models = []
 
-        # Extract and prepare extra features
-        train_extra_features = None
-        val_extra_features = None
-        col_means = None
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
 
-        if use_extra_features:
-            train_extra_features = df_train[extra_feature_cols].values.astype(np.float32)
-            val_extra_features = df_val[extra_feature_cols].values.astype(np.float32)
-            col_means = np.nanmean(train_extra_features, axis=0)
-            for i in range(train_extra_features.shape[1]):
-                train_extra_features[np.isnan(train_extra_features[:, i]), i] = col_means[i]
-                val_extra_features[np.isnan(val_extra_features[:, i]), i] = col_means[i]
-
-        # Create ChemProp datasets
-        train_datapoints, train_valid_idx = create_molecule_datapoints(
-            df_train[smiles_column].tolist(), df_train[target].tolist(), train_extra_features
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+        train_targets = all_targets[train_idx]
+        val_targets = all_targets[val_idx]
+
+        train_extra = all_extra_features[train_idx] if all_extra_features is not None else None
+        val_extra = all_extra_features[val_idx] if all_extra_features is not None else None
+
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+
+        # Create ChemProp datasets for this fold
+        train_datapoints, _ = create_molecule_datapoints(
+            df_train[smiles_column].tolist(), train_targets, train_extra
         )
-        val_datapoints, val_valid_idx = create_molecule_datapoints(
-            df_val[smiles_column].tolist(), df_val[target].tolist(), val_extra_features
+        val_datapoints, _ = create_molecule_datapoints(
+            df_val[smiles_column].tolist(), val_targets, val_extra
         )
 
-        df_train = df_train.iloc[train_valid_idx].reset_index(drop=True)
-        df_val = df_val.iloc[val_valid_idx].reset_index(drop=True)
-
         train_dataset = data.MoleculeDataset(train_datapoints)
         val_dataset = data.MoleculeDataset(val_datapoints)
 
-        # Save raw validation features for predictions later
-        val_extra_raw = val_extra_features[val_valid_idx] if val_extra_features is not None else None
+        # Save raw val features for prediction
+        val_extra_raw = val_extra.copy() if val_extra is not None else None
 
-        # Scale features and targets
+        # Scale features and targets for this fold
         x_d_transform = None
         if use_extra_features:
             feature_scaler = train_dataset.normalize_inputs("X_d")
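
A worked example of the inverse-sample-count task weights introduced in this hunk; the sample counts are made up:

```python
# Worked example of the inverse-sample-count weighting (illustrative counts).
import numpy as np

target_columns = ["logd", "ksol", "hlm_clint"]
sample_counts = np.array([4000, 1000, 250])   # labelled rows per target

inverse_counts = 1.0 / sample_counts
task_weights = inverse_counts / inverse_counts.min()  # target with the most samples gets weight 1.0
print({t: float(w) for t, w in zip(target_columns, task_weights)})
# {'logd': 1.0, 'ksol': 4.0, 'hlm_clint': 16.0}
```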
@@ -601,7 +683,7 @@ if __name__ == "__main__":
             x_d_transform = nn.ScaleTransform.from_standard_scaler(feature_scaler)
 
         output_transform = None
-        if model_type == "regressor":
+        if model_type in ["regressor", "uq_regressor"]:
             target_scaler = train_dataset.normalize_targets()
             val_dataset.normalize_targets(target_scaler)
             output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
@@ -609,17 +691,18 @@ if __name__ == "__main__":
         train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
         val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
 
-        # Build and train single model
-        pl.seed_everything(42)
+        # Build and train model for this fold
+        pl.seed_everything(42 + fold_idx)
         mpnn = build_mpnn_model(
-            hyperparameters, task=task, num_classes=num_classes,
+            hyperparameters, task=task, num_classes=num_classes, n_targets=n_targets,
             n_extra_descriptors=n_extra, x_d_transform=x_d_transform, output_transform=output_transform,
+            task_weights=task_weights,
         )
 
         callbacks = [
             pl.callbacks.EarlyStopping(monitor="val_loss", patience=patience, mode="min"),
             pl.callbacks.ModelCheckpoint(
-                dirpath=args.model_dir, filename="best_model_0",
+                dirpath=args.model_dir, filename=f"best_model_{fold_idx}",
                 monitor="val_loss", mode="min", save_top_k=1,
             ),
         ]
@@ -636,201 +719,95 @@ if __name__ == "__main__":
             mpnn.load_state_dict(checkpoint["state_dict"])
 
         mpnn.eval()
-        ensemble_models = [mpnn]
+        ensemble_models.append(mpnn)
 
-        # Make predictions on validation set
+        # Make out-of-fold predictions using raw features
         val_datapoints_raw, _ = create_molecule_datapoints(
-            df_val[smiles_column].tolist(), df_val[target].tolist(), val_extra_raw
+            df_val[smiles_column].tolist(), val_targets, val_extra_raw
         )
         val_dataset_raw = data.MoleculeDataset(val_datapoints_raw)
         val_loader_pred = data.build_dataloader(val_dataset_raw, batch_size=batch_size, shuffle=False)
 
         with torch.inference_mode():
-            val_predictions = trainer.predict(mpnn, val_loader_pred)
-            preds = np.concatenate([p.numpy() for p in val_predictions], axis=0)
-            if preds.ndim == 3 and preds.shape[1] == 1:
-                preds = preds.squeeze(axis=1)
-
-        preds_std = None
-        y_validate = df_val[target].values
-
-    # =========================================================================
-    # K-FOLD CROSS-VALIDATION (n_folds > 1) - trains n_folds models
-    # =========================================================================
-    else:
-        print(f"Training {n_folds}-fold cross-validation ensemble...")
-
-        # Validate all SMILES upfront and filter invalid ones
-        all_extra_features = None
-        if use_extra_features:
-            all_extra_features = all_df[extra_feature_cols].values.astype(np.float32)
-            col_means = np.nanmean(all_extra_features, axis=0)
-            for i in range(all_extra_features.shape[1]):
-                all_extra_features[np.isnan(all_extra_features[:, i]), i] = col_means[i]
-        else:
-            col_means = None
-
-        # Filter invalid SMILES from the full dataset
-        _, valid_indices = create_molecule_datapoints(
-            all_df[smiles_column].tolist(), all_df[target].tolist(), all_extra_features
-        )
-        all_df = all_df.iloc[valid_indices].reset_index(drop=True)
-        if all_extra_features is not None:
-            all_extra_features = all_extra_features[valid_indices]
-        print(f"Data after SMILES validation: {all_df.shape}")
-
-        # Set up K-Fold
-        if model_type == "classifier":
-            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = all_df[target]
-        else:
-            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = None
-
-        # Initialize storage for out-of-fold predictions
-        oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
-        if model_type == "classifier" and num_classes and num_classes > 1:
-            oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+            fold_predictions = trainer.predict(mpnn, val_loader_pred)
+            fold_preds = np.concatenate([p.numpy() for p in fold_predictions], axis=0)
+            if fold_preds.ndim == 3 and fold_preds.shape[1] == 1:
+                fold_preds = fold_preds.squeeze(axis=1)
+
+        # Store out-of-fold predictions
+        if model_type == "classifier" and fold_preds.ndim == 2:
+            # Store class index in first column for classification
+            oof_predictions[val_idx, 0] = np.argmax(fold_preds, axis=1)
+            if oof_proba is not None:
+                oof_proba[val_idx] = fold_preds
         else:
-            oof_proba = None
+            # Regression: fold_preds shape is (n_val, n_targets) or (n_val,)
+            if fold_preds.ndim == 1:
+                fold_preds = fold_preds.reshape(-1, 1)
+            oof_predictions[val_idx] = fold_preds
 
-        ensemble_models = []
+        print(f"Fold {fold_idx + 1} complete!")
 
-        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(all_df, split_target)):
-            print(f"\n{'='*50}")
-            print(f"Training Fold {fold_idx + 1}/{n_folds}")
-            print(f"{'='*50}")
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
 
-            # Split data for this fold
-            df_train = all_df.iloc[train_idx].reset_index(drop=True)
-            df_val = all_df.iloc[val_idx].reset_index(drop=True)
-
-            train_extra = all_extra_features[train_idx] if all_extra_features is not None else None
-            val_extra = all_extra_features[val_idx] if all_extra_features is not None else None
-
-            print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
-
-            # Create ChemProp datasets for this fold
-            train_datapoints, _ = create_molecule_datapoints(
-                df_train[smiles_column].tolist(), df_train[target].tolist(), train_extra
-            )
-            val_datapoints, _ = create_molecule_datapoints(
-                df_val[smiles_column].tolist(), df_val[target].tolist(), val_extra
-            )
-
-            train_dataset = data.MoleculeDataset(train_datapoints)
-            val_dataset = data.MoleculeDataset(val_datapoints)
-
-            # Save raw val features for prediction
-            val_extra_raw = val_extra.copy() if val_extra is not None else None
-
-            # Scale features and targets for this fold
-            x_d_transform = None
-            if use_extra_features:
-                feature_scaler = train_dataset.normalize_inputs("X_d")
-                val_dataset.normalize_inputs("X_d", feature_scaler)
-                x_d_transform = nn.ScaleTransform.from_standard_scaler(feature_scaler)
-
-            output_transform = None
-            if model_type == "regressor":
-                target_scaler = train_dataset.normalize_targets()
-                val_dataset.normalize_targets(target_scaler)
-                output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
-
-            train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
-            val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
-
-            # Build and train model for this fold
-            pl.seed_everything(42 + fold_idx)
-            mpnn = build_mpnn_model(
-                hyperparameters, task=task, num_classes=num_classes,
-                n_extra_descriptors=n_extra, x_d_transform=x_d_transform, output_transform=output_transform,
-            )
-
-            callbacks = [
-                pl.callbacks.EarlyStopping(monitor="val_loss", patience=patience, mode="min"),
-                pl.callbacks.ModelCheckpoint(
-                    dirpath=args.model_dir, filename=f"best_model_{fold_idx}",
-                    monitor="val_loss", mode="min", save_top_k=1,
-                ),
-            ]
-
-            trainer = pl.Trainer(
-                accelerator="auto", max_epochs=max_epochs, callbacks=callbacks,
-                logger=False, enable_progress_bar=True,
-            )
-
-            trainer.fit(mpnn, train_loader, val_loader)
-
-            if trainer.checkpoint_callback and trainer.checkpoint_callback.best_model_path:
-                checkpoint = torch.load(trainer.checkpoint_callback.best_model_path, weights_only=False)
-                mpnn.load_state_dict(checkpoint["state_dict"])
-
-            mpnn.eval()
-            ensemble_models.append(mpnn)
-
-            # Make out-of-fold predictions using raw features
-            val_datapoints_raw, _ = create_molecule_datapoints(
-                df_val[smiles_column].tolist(), df_val[target].tolist(), val_extra_raw
-            )
-            val_dataset_raw = data.MoleculeDataset(val_datapoints_raw)
-            val_loader_pred = data.build_dataloader(val_dataset_raw, batch_size=batch_size, shuffle=False)
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        # oof_predictions is always 2D now: check if any column has a value
+        val_mask = ~np.isnan(oof_predictions).all(axis=1)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        y_validate = all_targets[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+        val_extra_features = all_extra_features[val_mask] if all_extra_features is not None else None
+    else:
+        preds = oof_predictions
+        df_val = all_df.copy()
+        y_validate = all_targets
+        val_extra_features = all_extra_features
+
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        val_datapoints_for_std, _ = create_molecule_datapoints(
+            df_val[smiles_column].tolist(),
+            y_validate,
+            val_extra_features
+        )
+        val_dataset_for_std = data.MoleculeDataset(val_datapoints_for_std)
+        val_loader_for_std = data.build_dataloader(val_dataset_for_std, batch_size=batch_size, shuffle=False)
 
+        all_ensemble_preds_for_std = []
+        trainer_pred = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)
+        for ens_model in ensemble_models:
             with torch.inference_mode():
-                fold_predictions = trainer.predict(mpnn, val_loader_pred)
-                fold_preds = np.concatenate([p.numpy() for p in fold_predictions], axis=0)
-                if fold_preds.ndim == 3 and fold_preds.shape[1] == 1:
-                    fold_preds = fold_preds.squeeze(axis=1)
-
-                # Store out-of-fold predictions
-                if model_type == "classifier" and fold_preds.ndim == 2:
-                    oof_predictions[val_idx] = np.argmax(fold_preds, axis=1)
-                    if oof_proba is not None:
-                        oof_proba[val_idx] = fold_preds
-                else:
-                    oof_predictions[val_idx] = fold_preds.flatten()
-
-            print(f"Fold {fold_idx + 1} complete!")
-
-        print(f"\nCross-validation complete! Trained {len(ensemble_models)} models.")
-
-        # Use out-of-fold predictions for metrics
-        preds = oof_predictions
-        preds_std = None  # Will compute from ensemble at inference time
-        y_validate = all_df[target].values
-        df_val = all_df  # For saving predictions
+                ens_preds = trainer_pred.predict(ens_model, val_loader_for_std)
+            ens_preds = np.concatenate([p.numpy() for p in ens_preds], axis=0)
+            if ens_preds.ndim == 3 and ens_preds.shape[1] == 1:
+                ens_preds = ens_preds.squeeze(axis=1)
+            all_ensemble_preds_for_std.append(ens_preds)
+
+        # Stack ensemble predictions: shape (n_ensemble, n_samples, n_targets)
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        # Ensure 2D
+        if preds_std.ndim == 1:
+            preds_std = preds_std.reshape(-1, 1)
+        print(f"Ensemble prediction_std - mean per target: {np.nanmean(preds_std, axis=0)}")
 
     if model_type == "classifier":
-        # Classification metrics - handle multi-class output
-        # For CV mode, preds already contains class indices; for single model, preds are probabilities
-        if preds.ndim == 2 and preds.shape[1] > 1:
-            # Multi-class probabilities: (n_samples, n_classes), take argmax
-            class_preds = np.argmax(preds, axis=1)
-            has_proba = True
-        elif preds.ndim == 1:
-            # Either class indices (CV mode) or binary probabilities
-            if n_folds > 1:
-                # CV mode: preds are already class indices
-                class_preds = preds.astype(int)
-                has_proba = False
-            else:
-                # Single model: preds are probabilities
-                class_preds = (preds > 0.5).astype(int)
-                has_proba = False
-        else:
-            # Squeeze extra dimensions if needed
-            preds = preds.squeeze()
-            if preds.ndim == 2:
-                class_preds = np.argmax(preds, axis=1)
-                has_proba = True
-            else:
-                class_preds = (preds > 0.5).astype(int)
-                has_proba = False
+        # Classification metrics - preds contains class indices in first column from OOF predictions
+        class_preds = preds[:, 0].astype(int)
+        has_proba = oof_proba is not None
 
         print(f"class_preds shape: {class_preds.shape}")
 
-        # Decode labels for metrics
-        y_validate_decoded = label_encoder.inverse_transform(y_validate.astype(int))
+        # Decode labels for metrics (classification is single-target only)
+        target_name = target_columns[0]
+        y_validate_decoded = label_encoder.inverse_transform(y_validate[:, 0].astype(int))
         preds_decoded = label_encoder.inverse_transform(class_preds)
 
         # Calculate metrics
@@ -841,7 +818,7 @@ if __name__ == "__main__":
 
         score_df = pd.DataFrame(
             {
-                target: label_names,
+                target_name: label_names,
                 "precision": scores[0],
                 "recall": scores[1],
                 "f1": scores[2],
@@ -853,7 +830,7 @@ if __name__ == "__main__":
         metrics = ["precision", "recall", "f1", "support"]
         for t in label_names:
             for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
+                value = score_df.loc[score_df[target_name] == t, m].iloc[0]
                 print(f"Metrics:{t}:{m} {value}")
 
         # Confusion matrix
@@ -868,34 +845,55 @@ if __name__ == "__main__":
         # Save validation predictions
         df_val = df_val.copy()
         df_val["prediction"] = preds_decoded
-        if has_proba and preds.ndim == 2 and preds.shape[1] > 1:
-            df_val["pred_proba"] = [p.tolist() for p in preds]
+        if has_proba and oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_names)
 
     else:
-        # Regression metrics
-        preds_flat = preds.flatten()
-        rmse = root_mean_squared_error(y_validate, preds_flat)
-        mae = mean_absolute_error(y_validate, preds_flat)
-        r2 = r2_score(y_validate, preds_flat)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-
+        # Regression metrics: compute per target (works for single or multi-task)
         df_val = df_val.copy()
-        df_val["prediction"] = preds_flat
-
-        # Add prediction_std for ensemble models
-        if preds_std is not None:
-            df_val["prediction_std"] = preds_std.flatten()
-            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        print("\n--- Per-target metrics ---")
+        for t_idx, t_name in enumerate(target_columns):
+            # Get valid (non-NaN) indices for this target
+            target_valid_mask = ~np.isnan(y_validate[:, t_idx])
+            y_true = y_validate[target_valid_mask, t_idx]
+            y_pred = preds[target_valid_mask, t_idx]
+
+            if len(y_true) > 0:
+                rmse = root_mean_squared_error(y_true, y_pred)
+                mae = mean_absolute_error(y_true, y_pred)
+                medae = median_absolute_error(y_true, y_pred)
+                r2 = r2_score(y_true, y_pred)
+                spearman_corr = spearmanr(y_true, y_pred).correlation
+                support = len(y_true)
+                # Print metrics in format expected by SageMaker metric definitions
+                print(f"rmse: {rmse:.3f}")
+                print(f"mae: {mae:.3f}")
+                print(f"medae: {medae:.3f}")
+                print(f"r2: {r2:.3f}")
+                print(f"spearmanr: {spearman_corr:.3f}")
+                print(f"support: {support}")
+
+            # Store predictions in dataframe
+            df_val[f"{t_name}_pred"] = preds[:, t_idx]
+            if preds_std is not None:
+                df_val[f"{t_name}_pred_std"] = preds_std[:, t_idx]
+            else:
+                df_val[f"{t_name}_pred_std"] = 0.0
 
     # Save validation predictions to S3
-    output_columns = [target, "prediction"]
-    if "prediction_std" in df_val.columns:
-        output_columns.append("prediction_std")
+    # Include id_column if it exists in df_val
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    # Include all target columns and their predictions
+    output_columns += target_columns
+    output_columns += [f"{t}_pred" for t in target_columns]
+    output_columns += [f"{t}_pred_std" for t in target_columns]
+    # Add proba columns for classifiers
    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+    # Filter to only columns that exist
+    output_columns = [c for c in output_columns if c in df_val.columns]
    wr.s3.to_csv(
        df_val[output_columns],
        path=f"{model_metrics_s3_path}/validation_predictions.csv",
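
The per-target metric loop above masks out rows that lack a label for a given target before scoring. A small self-contained sketch of that pattern, with toy arrays:

```python
# Sketch of the per-target, NaN-masked metric loop (toy data, illustrative only).
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error

target_columns = ["logd", "ksol"]
y_validate = np.array([[1.0, np.nan], [2.0, 3.0], [3.0, 4.0], [np.nan, 5.0]])
preds = np.array([[1.1, 2.0], [1.9, 2.8], [3.2, 4.1], [2.5, 5.3]])

for t_idx, t_name in enumerate(target_columns):
    mask = ~np.isnan(y_validate[:, t_idx])        # score only rows labelled for this target
    y_true, y_pred = y_validate[mask, t_idx], preds[mask, t_idx]
    print(t_name,
          f"rmse={root_mean_squared_error(y_true, y_pred):.3f}",
          f"mae={mean_absolute_error(y_true, y_pred):.3f}",
          f"medae={median_absolute_error(y_true, y_pred):.3f}",
          f"r2={r2_score(y_true, y_pred):.3f}",
          f"spearmanr={spearmanr(y_true, y_pred).correlation:.3f}",
          f"support={mask.sum()}")
```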
@@ -910,9 +908,13 @@ if __name__ == "__main__":
 
     # Save ensemble metadata (n_ensemble = number of models for inference)
     n_ensemble = len(ensemble_models)
-    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
+    ensemble_metadata = {
+        "n_ensemble": n_ensemble,
+        "n_folds": n_folds,
+        "target_columns": target_columns,
+    }
     joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
-    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds}, targets={target_columns})")
 
     # Save label encoder if classification
     if label_encoder is not None:
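
A hedged sketch of the `ensemble_metadata.joblib` round trip that ties this training step to `model_fn` earlier in the diff; the path and values below are hypothetical, while the keys mirror what the script saves:

```python
# Illustrative round trip for the ensemble metadata contract (hypothetical path and values).
import os
import joblib

model_dir = "/tmp/example_model_dir"
os.makedirs(model_dir, exist_ok=True)

ensemble_metadata = {
    "n_ensemble": 5,                      # one model per CV fold
    "n_folds": 5,
    "target_columns": ["logd", "ksol"],   # drives the <target>_pred / <target>_pred_std columns
}
joblib.dump(ensemble_metadata, os.path.join(model_dir, "ensemble_metadata.joblib"))

loaded = joblib.load(os.path.join(model_dir, "ensemble_metadata.joblib"))
assert loaded["target_columns"] == ["logd", "ksol"]
```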