workbench 0.8.201__py3-none-any.whl → 0.8.204__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +41 -7
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +184 -75
- workbench/core/artifacts/model_core.py +11 -7
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +27 -13
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +11 -0
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/model_scripts/chemprop/chemprop.template +312 -293
- workbench/model_scripts/chemprop/generated_model_script.py +316 -297
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +278 -128
- workbench/model_scripts/pytorch_model/pytorch.template +273 -123
- workbench/model_scripts/uq_models/generated_model_script.py +20 -11
- workbench/model_scripts/uq_models/mapie.template +17 -8
- workbench/model_scripts/xgb_model/generated_model_script.py +38 -9
- workbench/model_scripts/xgb_model/xgb_model.template +34 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/utils/chemprop_utils.py +38 -1
- workbench/utils/pytorch_utils.py +38 -8
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/METADATA +2 -2
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/RECORD +33 -33
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/WHEEL +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.204.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@
 # - argparse, file loading, S3 writes
 # =============================

+import glob
 import os
 import argparse
 import json
@@ -39,11 +40,13 @@ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 import joblib

 # ChemProp imports
@@ -52,10 +55,10 @@ from chemprop import data, models, nn
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "targets": "{{target_column}}",  # List of target columns (single or multi-task)
     "feature_list": "{{feature_list}}",
+    "id_column": "{{id_column}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}",
     "hyperparameters": "{{hyperparameters}}",
 }

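For orientation, these placeholders are substituted when Workbench renders the template into a runnable script. A hypothetical rendered block might look like the sketch below; all values are illustrative and not taken from the package:

```python
# Hypothetical rendered TEMPLATE_PARAMS -- values are illustrative only
TEMPLATE_PARAMS = {
    "model_type": "regressor",
    "targets": ["logS", "logP"],          # single- or multi-task target list
    "feature_list": ["smiles", "mol_wt"],
    "id_column": "compound_id",
    "model_metrics_s3_path": "s3://example-bucket/models/chemprop/metrics",
    "hyperparameters": {"max_epochs": 400, "n_folds": 5},
}
```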
@@ -108,14 +111,14 @@ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFra

 def create_molecule_datapoints(
     smiles_list: list[str],
-    targets: list[float] | None = None,
+    targets: list[float] | np.ndarray | None = None,
     extra_descriptors: np.ndarray | None = None,
 ) -> tuple[list[data.MoleculeDatapoint], list[int]]:
     """Create ChemProp MoleculeDatapoints from SMILES strings.

     Args:
         smiles_list: List of SMILES strings
-        targets: Optional
+        targets: Optional target values as 2D array (n_samples, n_targets). NaN allowed for missing targets.
         extra_descriptors: Optional array of extra features (n_samples, n_features)

     Returns:
@@ -127,6 +130,12 @@ def create_molecule_datapoints(
     valid_indices = []
     invalid_count = 0

+    # Convert targets to 2D array if provided
+    if targets is not None:
+        targets = np.atleast_2d(np.array(targets))
+        if targets.shape[0] == 1 and len(smiles_list) > 1:
+            targets = targets.T  # Shape was (1, n_samples), transpose to (n_samples, 1)
+
     for i, smi in enumerate(smiles_list):
         # Validate SMILES with RDKit first
         mol = Chem.MolFromSmiles(smi)
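For context, the new target handling above accepts either a flat per-molecule list (single task) or a 2D array (multi-task) and normalizes both to shape (n_samples, n_targets). A minimal sketch of that shape logic with toy values (not from the package):

```python
import numpy as np

smiles = ["CCO", "c1ccccc1", "CC(=O)O"]          # 3 toy molecules
single = [0.5, 1.2, np.nan]                      # single-task: one value per molecule
multi = np.array([[0.5, 7.4], [1.2, np.nan], [np.nan, 6.1]])  # two tasks, NaN = missing

for targets in (single, multi):
    t = np.atleast_2d(np.array(targets))
    if t.shape[0] == 1 and len(smiles) > 1:
        t = t.T                                  # (1, n_samples) -> (n_samples, 1)
    print(t.shape)                               # (3, 1) then (3, 2)
```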
@@ -134,8 +143,9 @@ def create_molecule_datapoints(
             invalid_count += 1
             continue

-        # Build datapoint with optional target and extra descriptors
-        y
+        # Build datapoint with optional target(s) and extra descriptors
+        # For multi-task, y is a list of values (can include NaN for missing targets)
+        y = targets[i].tolist() if targets is not None else None
         x_d = extra_descriptors[i] if extra_descriptors is not None else None

         dp = data.MoleculeDatapoint.from_smi(smi, y=y, x_d=x_d)
@@ -152,9 +162,11 @@ def build_mpnn_model(
     hyperparameters: dict,
     task: str = "regression",
     num_classes: int | None = None,
+    n_targets: int = 1,
     n_extra_descriptors: int = 0,
     x_d_transform: nn.ScaleTransform | None = None,
     output_transform: nn.UnscaleTransform | None = None,
+    task_weights: np.ndarray | None = None,
 ) -> models.MPNN:
     """Build an MPNN model with the specified hyperparameters.

@@ -162,18 +174,20 @@ def build_mpnn_model(
         hyperparameters: Dictionary of model hyperparameters
         task: Either "regression" or "classification"
         num_classes: Number of classes for classification tasks
+        n_targets: Number of target columns (for multi-task regression)
         n_extra_descriptors: Number of extra descriptor features (for hybrid mode)
         x_d_transform: Optional transform for extra descriptors (scaling)
         output_transform: Optional transform for regression output (unscaling targets)
+        task_weights: Optional array of weights for each task (multi-task learning)

     Returns:
         Configured MPNN model
     """
-    # Model hyperparameters with defaults
-    hidden_dim = hyperparameters.get("hidden_dim",
-    depth = hyperparameters.get("depth",
-    dropout = hyperparameters.get("dropout", 0.
-    ffn_hidden_dim = hyperparameters.get("ffn_hidden_dim",
+    # Model hyperparameters with defaults
+    hidden_dim = hyperparameters.get("hidden_dim", 700)
+    depth = hyperparameters.get("depth", 6)
+    dropout = hyperparameters.get("dropout", 0.15)
+    ffn_hidden_dim = hyperparameters.get("ffn_hidden_dim", 2000)
     ffn_num_layers = hyperparameters.get("ffn_num_layers", 2)

     # Message passing component
@@ -197,12 +211,20 @@ def build_mpnn_model(
         )
     else:
         # Regression with optional output transform to unscale predictions
+        # n_tasks controls the number of output heads for multi-task learning
+        # task_weights goes here (in RegressionFFN) to weight loss per task
+        weights_tensor = None
+        if task_weights is not None:
+            weights_tensor = torch.tensor(task_weights, dtype=torch.float32)
+
         ffn = nn.RegressionFFN(
             input_dim=ffn_input_dim,
             hidden_dim=ffn_hidden_dim,
             n_layers=ffn_num_layers,
             dropout=dropout,
+            n_tasks=n_targets,
             output_transform=output_transform,
+            task_weights=weights_tensor,
         )

     # Create the MPNN model
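The task_weights tensor above is handed to ChemProp's RegressionFFN so that each task's loss term is weighted. Purely to illustrate the idea (this is not ChemProp's internal implementation), a weighted, NaN-masked MSE over multiple targets could look like:

```python
import torch

def weighted_masked_mse(preds, targets, task_weights):
    """Toy weighted MSE: ignore NaN targets, weight each task's loss. Illustrative only."""
    mask = ~torch.isnan(targets)                       # (n_samples, n_tasks)
    targets = torch.nan_to_num(targets)                # replace NaN so the subtraction is defined
    sq_err = (preds - targets) ** 2 * mask             # zero out missing entries
    per_task = sq_err.sum(dim=0) / mask.sum(dim=0).clamp(min=1)
    return (per_task * task_weights).sum() / task_weights.sum()

preds = torch.tensor([[0.4, 7.0], [1.0, 6.0]])
targets = torch.tensor([[0.5, float("nan")], [1.2, 6.1]])
print(weighted_masked_mse(preds, targets, torch.tensor([1.0, 2.0])))
```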
@@ -227,31 +249,26 @@ def model_fn(model_dir: str) -> dict:
     Returns:
         Dictionary with ensemble models and metadata
     """
-    # Load ensemble metadata
+    # Load ensemble metadata (required)
     ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
-
-
-
-    else:
-        # Backwards compatibility: single model without ensemble metadata
-        n_ensemble = 1
+    ensemble_metadata = joblib.load(ensemble_metadata_path)
+    n_ensemble = ensemble_metadata["n_ensemble"]
+    target_columns = ensemble_metadata["target_columns"]

     # Load all ensemble models
     ensemble_models = []
     for ens_idx in range(n_ensemble):
         model_path = os.path.join(model_dir, f"chemprop_model_{ens_idx}.pt")
-        if not os.path.exists(model_path):
-            # Backwards compatibility: try old single model path
-            model_path = os.path.join(model_dir, "chemprop_model.pt")
         model = models.MPNN.load_from_file(model_path)
         model.eval()
         ensemble_models.append(model)

-    print(f"Loaded {len(ensemble_models)} ensemble model(s)")
+    print(f"Loaded {len(ensemble_models)} ensemble model(s), n_targets={len(target_columns)}")

     return {
         "ensemble_models": ensemble_models,
         "n_ensemble": n_ensemble,
+        "target_columns": target_columns,
     }


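With the backwards-compatibility path removed, model_fn now assumes ensemble_metadata.joblib is always present in the model directory. A small sketch of the save/load round trip this relies on (directory and values are illustrative, the keys match the diff):

```python
import joblib
import os
import tempfile

model_dir = tempfile.mkdtemp()  # stand-in for SM_MODEL_DIR
metadata = {"n_ensemble": 5, "n_folds": 5, "target_columns": ["logS", "logP"]}
joblib.dump(metadata, os.path.join(model_dir, "ensemble_metadata.joblib"))

loaded = joblib.load(os.path.join(model_dir, "ensemble_metadata.joblib"))
assert loaded["n_ensemble"] == 5
print(loaded["target_columns"])  # ['logS', 'logP']
```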
@@ -297,9 +314,10 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
     model_type = TEMPLATE_PARAMS["model_type"]
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

-    # Extract ensemble models
+    # Extract ensemble models and metadata
     ensemble_models = model_dict["ensemble_models"]
     n_ensemble = model_dict["n_ensemble"]
+    target_columns = model_dict["target_columns"]

     # Load label encoder if present (classification)
     label_encoder = None
@@ -337,13 +355,14 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
     valid_mask = np.array(valid_mask)
     print(f"Valid SMILES: {sum(valid_mask)} / {len(smiles_list)}")

-    # Initialize prediction
+    # Initialize prediction columns (use object dtype for classifiers to avoid FutureWarning)
     if model_type == "classifier":
         df["prediction"] = pd.Series([None] * len(df), dtype=object)
     else:
-
-
-        df["
+        # Regression: create prediction column for each target
+        for tc in target_columns:
+            df[f"{tc}_pred"] = np.nan
+            df[f"{tc}_pred_std"] = np.nan

     if sum(valid_mask) == 0:
         print("Warning: No valid SMILES to predict on")
@@ -408,10 +427,15 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
             ens_preds = ens_preds.squeeze(axis=1)
         all_ensemble_preds.append(ens_preds)

-    # Stack and compute mean/std
+    # Stack and compute mean/std (std is 0 for single model)
     ensemble_preds = np.stack(all_ensemble_preds, axis=0)
     preds = np.mean(ensemble_preds, axis=0)
-    preds_std = np.std(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+
+    # Ensure 2D: (n_samples, n_targets)
+    if preds.ndim == 1:
+        preds = preds.reshape(-1, 1)
+        preds_std = preds_std.reshape(-1, 1)

     print(f"Inference: Ensemble predictions shape: {preds.shape}")

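The ensemble aggregation above stacks per-model predictions and reduces over the ensemble axis, then reshapes so downstream code always sees (n_samples, n_targets). A toy sketch of the shapes involved (random placeholder data):

```python
import numpy as np

# Hypothetical predictions from 3 ensemble members for 4 molecules and 2 targets
all_ensemble_preds = [np.random.rand(4, 2) for _ in range(3)]

ensemble_preds = np.stack(all_ensemble_preds, axis=0)   # (3, 4, 2)
preds = np.mean(ensemble_preds, axis=0)                 # (4, 2)
preds_std = np.std(ensemble_preds, axis=0)              # (4, 2); all zeros if only one member

if preds.ndim == 1:                                     # single-target models may come back 1D
    preds = preds.reshape(-1, 1)
    preds_std = preds_std.reshape(-1, 1)
print(preds.shape, preds_std.shape)
```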
@@ -440,12 +464,15 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
         decoded_preds = label_encoder.inverse_transform(class_preds)
         df.loc[valid_mask, "prediction"] = decoded_preds
     else:
-        # Regression:
-
+        # Regression: store predictions for each target
+        for t_idx, tc in enumerate(target_columns):
+            df.loc[valid_mask, f"{tc}_pred"] = preds[:, t_idx]
+            df.loc[valid_mask, f"{tc}_pred_std"] = preds_std[:, t_idx]

-        # Add prediction_std for
-
-
+        # Add prediction/prediction_std aliases for first target
+        first_target = target_columns[0]
+        df["prediction"] = df[f"{first_target}_pred"]
+        df["prediction_std"] = df[f"{first_target}_pred_std"]

     return df

@@ -454,13 +481,18 @@ if __name__ == "__main__":
     """Training script for ChemProp MPNN model"""

     # Template Parameters
-
+    target_columns = TEMPLATE_PARAMS["targets"]  # List of target columns
     model_type = TEMPLATE_PARAMS["model_type"]
     feature_list = TEMPLATE_PARAMS["feature_list"]
+    id_column = TEMPLATE_PARAMS["id_column"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-
+
+    # Validate target_columns
+    if not target_columns or not isinstance(target_columns, list) or len(target_columns) == 0:
+        raise ValueError("'targets' must be a non-empty list of target column names")
+    n_targets = len(target_columns)
+    print(f"Target columns ({n_targets}): {target_columns}")

     # Get the SMILES column name from feature_list (user defines this, so we use their exact name)
     smiles_column = find_smiles_column(feature_list)
@@ -502,21 +534,29 @@ if __name__ == "__main__":

     check_dataframe(all_df, "training_df")

-    # Drop rows with missing SMILES or target values
+    # Drop rows with missing SMILES or all target values
     initial_count = len(all_df)
-    all_df = all_df.dropna(subset=[smiles_column
+    all_df = all_df.dropna(subset=[smiles_column])
+    # Keep rows that have at least one non-null target (works for single and multi-task)
+    has_any_target = all_df[target_columns].notna().any(axis=1)
+    all_df = all_df[has_any_target]
     dropped = initial_count - len(all_df)
     if dropped > 0:
-        print(f"Dropped {dropped} rows with missing SMILES or target values")
+        print(f"Dropped {dropped} rows with missing SMILES or all target values")

-    print(f"Target: {
+    print(f"Target columns: {target_columns}")
     print(f"Data Shape after cleaning: {all_df.shape}")
+    for tc in target_columns:
+        n_valid = all_df[tc].notna().sum()
+        print(f"  {tc}: {n_valid} samples with values")

-    # Set up label encoder for classification
+    # Set up label encoder for classification (single-target only)
     label_encoder = None
     if model_type == "classifier":
+        if n_targets > 1:
+            raise ValueError("Multi-task classification is not supported. Use regression for multi-task.")
         label_encoder = LabelEncoder()
-        all_df[
+        all_df[target_columns[0]] = label_encoder.fit_transform(all_df[target_columns[0]])
         num_classes = len(label_encoder.classes_)
         print(
             f"Classification task with {num_classes} classes: {label_encoder.classes_}"
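The new row filter keeps any row that has at least one target value, rather than requiring all targets to be present. A quick sketch of the same pandas pattern on a hypothetical two-target frame:

```python
import numpy as np
import pandas as pd

target_columns = ["logS", "logP"]  # hypothetical targets
all_df = pd.DataFrame({
    "smiles": ["CCO", "CCN", "CCC"],
    "logS": [0.5, np.nan, np.nan],
    "logP": [np.nan, 1.3, np.nan],
})

has_any_target = all_df[target_columns].notna().any(axis=1)
print(all_df[has_any_target])  # keeps rows 0 and 1, drops row 2 (no targets at all)
```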
@@ -528,10 +568,10 @@ if __name__ == "__main__":
     print(f"Hyperparameters: {hyperparameters}")
     task = "classification" if model_type == "classifier" else "regression"
     n_extra = len(extra_feature_cols) if use_extra_features else 0
-    max_epochs = hyperparameters.get("max_epochs",
-    patience = hyperparameters.get("patience",
-    n_folds = hyperparameters.get("n_folds",
-    batch_size = hyperparameters.get("batch_size",
+    max_epochs = hyperparameters.get("max_epochs", 400)
+    patience = hyperparameters.get("patience", 40)
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    batch_size = hyperparameters.get("batch_size", 16)

     # Check extra feature columns exist
     if use_extra_features:
@@ -540,60 +580,108 @@ if __name__ == "__main__":
             raise ValueError(f"Missing extra feature columns in training data: {missing_cols}")

     # =========================================================================
-    #
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
     # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+
+    # Prepare extra features and validate SMILES upfront
+    all_extra_features = None
+    col_means = None
+    if use_extra_features:
+        all_extra_features = all_df[extra_feature_cols].values.astype(np.float32)
+        col_means = np.nanmean(all_extra_features, axis=0)
+        for i in range(all_extra_features.shape[1]):
+            all_extra_features[np.isnan(all_extra_features[:, i]), i] = col_means[i]
+
+    # Prepare target array: always 2D (n_samples, n_targets)
+    all_targets = all_df[target_columns].values.astype(np.float32)
+
+    # Filter invalid SMILES from the full dataset
+    _, valid_indices = create_molecule_datapoints(
+        all_df[smiles_column].tolist(), all_targets, all_extra_features
+    )
+    all_df = all_df.iloc[valid_indices].reset_index(drop=True)
+    all_targets = all_targets[valid_indices]
+    if all_extra_features is not None:
+        all_extra_features = all_extra_features[valid_indices]
+    print(f"Data after SMILES validation: {all_df.shape}")
+
+    # Compute dynamic task weights for multi-task regression
+    # Weight = inverse of sample count (normalized so min weight = 1.0)
+    # This gives higher weight to targets with fewer samples
+    task_weights = None
+    if n_targets > 1 and model_type != "classifier":
+        sample_counts = np.array([np.sum(~np.isnan(all_targets[:, t])) for t in range(n_targets)])
+        # Inverse weighting: fewer samples = higher weight
+        inverse_counts = 1.0 / sample_counts
+        # Normalize so minimum weight is 1.0
+        task_weights = inverse_counts / inverse_counts.min()
+        print(f"Task weights (inverse sample count):")
+        for t_idx, t_name in enumerate(target_columns):
+            print(f"  {t_name}: {task_weights[t_idx]:.3f} (n={sample_counts[t_idx]})")
+
+    # Create fold splits
     if n_folds == 1:
-
-
-        # Split data
-        if train_all_data:
-            print("Training on ALL of the data")
-            df_train = all_df.copy()
-            df_val = all_df.copy()
-        elif "training" in all_df.columns:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
             print("Found training column, splitting data based on training column")
-
-
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
         else:
-            print("WARNING: No training column found, splitting data with random
-
-
-
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target_columns[0]]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+
+    # Initialize storage for out-of-fold predictions: always 2D (n_samples, n_targets)
+    oof_predictions = np.full((len(all_df), n_targets), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None

-
-        print(f"VALIDATION: {df_val.shape}")
+    ensemble_models = []

-
-
-
-
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")

-
-
-
-
-
-
-
-
-
-
-
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+        train_targets = all_targets[train_idx]
+        val_targets = all_targets[val_idx]
+
+        train_extra = all_extra_features[train_idx] if all_extra_features is not None else None
+        val_extra = all_extra_features[val_idx] if all_extra_features is not None else None
+
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+
+        # Create ChemProp datasets for this fold
+        train_datapoints, _ = create_molecule_datapoints(
+            df_train[smiles_column].tolist(), train_targets, train_extra
         )
-        val_datapoints,
-            df_val[smiles_column].tolist(),
+        val_datapoints, _ = create_molecule_datapoints(
+            df_val[smiles_column].tolist(), val_targets, val_extra
         )

-        df_train = df_train.iloc[train_valid_idx].reset_index(drop=True)
-        df_val = df_val.iloc[val_valid_idx].reset_index(drop=True)
-
         train_dataset = data.MoleculeDataset(train_datapoints)
         val_dataset = data.MoleculeDataset(val_datapoints)

-        # Save raw
-        val_extra_raw =
+        # Save raw val features for prediction
+        val_extra_raw = val_extra.copy() if val_extra is not None else None

-        # Scale features and targets
+        # Scale features and targets for this fold
         x_d_transform = None
         if use_extra_features:
             feature_scaler = train_dataset.normalize_inputs("X_d")
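The dynamic task weights introduced above are just inverse per-target sample counts, rescaled so the best-covered target gets weight 1.0. A worked example with hypothetical counts:

```python
import numpy as np

# Hypothetical non-NaN sample counts per target
sample_counts = np.array([1000, 250, 500])

inverse_counts = 1.0 / sample_counts
task_weights = inverse_counts / inverse_counts.min()  # largest count -> weight 1.0
print(task_weights)                                   # [1. 4. 2.]
```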
@@ -601,7 +689,7 @@ if __name__ == "__main__":
             x_d_transform = nn.ScaleTransform.from_standard_scaler(feature_scaler)

         output_transform = None
-        if model_type
+        if model_type in ["regressor", "uq_regressor"]:
             target_scaler = train_dataset.normalize_targets()
             val_dataset.normalize_targets(target_scaler)
             output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
@@ -609,17 +697,18 @@ if __name__ == "__main__":
         train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
         val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)

-        # Build and train
-        pl.seed_everything(42)
+        # Build and train model for this fold
+        pl.seed_everything(42 + fold_idx)
         mpnn = build_mpnn_model(
-            hyperparameters, task=task, num_classes=num_classes,
+            hyperparameters, task=task, num_classes=num_classes, n_targets=n_targets,
             n_extra_descriptors=n_extra, x_d_transform=x_d_transform, output_transform=output_transform,
+            task_weights=task_weights,
         )

         callbacks = [
             pl.callbacks.EarlyStopping(monitor="val_loss", patience=patience, mode="min"),
             pl.callbacks.ModelCheckpoint(
-                dirpath=args.model_dir, filename="
+                dirpath=args.model_dir, filename=f"best_model_{fold_idx}",
                 monitor="val_loss", mode="min", save_top_k=1,
             ),
         ]
@@ -636,201 +725,95 @@ if __name__ == "__main__":
         mpnn.load_state_dict(checkpoint["state_dict"])

         mpnn.eval()
-        ensemble_models
+        ensemble_models.append(mpnn)

-        # Make predictions
+        # Make out-of-fold predictions using raw features
         val_datapoints_raw, _ = create_molecule_datapoints(
-            df_val[smiles_column].tolist(),
+            df_val[smiles_column].tolist(), val_targets, val_extra_raw
         )
         val_dataset_raw = data.MoleculeDataset(val_datapoints_raw)
         val_loader_pred = data.build_dataloader(val_dataset_raw, batch_size=batch_size, shuffle=False)

         with torch.inference_mode():
-
-
-            if
-
-
-
-
-
-
-
-
-    else:
-        print(f"Training {n_folds}-fold cross-validation ensemble...")
-
-        # Validate all SMILES upfront and filter invalid ones
-        all_extra_features = None
-        if use_extra_features:
-            all_extra_features = all_df[extra_feature_cols].values.astype(np.float32)
-            col_means = np.nanmean(all_extra_features, axis=0)
-            for i in range(all_extra_features.shape[1]):
-                all_extra_features[np.isnan(all_extra_features[:, i]), i] = col_means[i]
+            fold_predictions = trainer.predict(mpnn, val_loader_pred)
+            fold_preds = np.concatenate([p.numpy() for p in fold_predictions], axis=0)
+            if fold_preds.ndim == 3 and fold_preds.shape[1] == 1:
+                fold_preds = fold_preds.squeeze(axis=1)
+
+        # Store out-of-fold predictions
+        if model_type == "classifier" and fold_preds.ndim == 2:
+            # Store class index in first column for classification
+            oof_predictions[val_idx, 0] = np.argmax(fold_preds, axis=1)
+            if oof_proba is not None:
+                oof_proba[val_idx] = fold_preds
         else:
-
+            # Regression: fold_preds shape is (n_val, n_targets) or (n_val,)
+            if fold_preds.ndim == 1:
+                fold_preds = fold_preds.reshape(-1, 1)
+            oof_predictions[val_idx] = fold_preds

-
-        _, valid_indices = create_molecule_datapoints(
-            all_df[smiles_column].tolist(), all_df[target].tolist(), all_extra_features
-        )
-        all_df = all_df.iloc[valid_indices].reset_index(drop=True)
-        if all_extra_features is not None:
-            all_extra_features = all_extra_features[valid_indices]
-        print(f"Data after SMILES validation: {all_df.shape}")
+        print(f"Fold {fold_idx + 1} complete!")

-
-        if model_type == "classifier":
-            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = all_df[target]
-        else:
-            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = None
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-            train_dataset = data.MoleculeDataset(train_datapoints)
-            val_dataset = data.MoleculeDataset(val_datapoints)
-
-            # Save raw val features for prediction
-            val_extra_raw = val_extra.copy() if val_extra is not None else None
-
-            # Scale features and targets for this fold
-            x_d_transform = None
-            if use_extra_features:
-                feature_scaler = train_dataset.normalize_inputs("X_d")
-                val_dataset.normalize_inputs("X_d", feature_scaler)
-                x_d_transform = nn.ScaleTransform.from_standard_scaler(feature_scaler)
-
-            output_transform = None
-            if model_type == "regressor":
-                target_scaler = train_dataset.normalize_targets()
-                val_dataset.normalize_targets(target_scaler)
-                output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
-
-            train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
-            val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
-
-            # Build and train model for this fold
-            pl.seed_everything(42 + fold_idx)
-            mpnn = build_mpnn_model(
-                hyperparameters, task=task, num_classes=num_classes,
-                n_extra_descriptors=n_extra, x_d_transform=x_d_transform, output_transform=output_transform,
-            )
-
-            callbacks = [
-                pl.callbacks.EarlyStopping(monitor="val_loss", patience=patience, mode="min"),
-                pl.callbacks.ModelCheckpoint(
-                    dirpath=args.model_dir, filename=f"best_model_{fold_idx}",
-                    monitor="val_loss", mode="min", save_top_k=1,
-                ),
-            ]
-
-            trainer = pl.Trainer(
-                accelerator="auto", max_epochs=max_epochs, callbacks=callbacks,
-                logger=False, enable_progress_bar=True,
-            )
-
-            trainer.fit(mpnn, train_loader, val_loader)
-
-            if trainer.checkpoint_callback and trainer.checkpoint_callback.best_model_path:
-                checkpoint = torch.load(trainer.checkpoint_callback.best_model_path, weights_only=False)
-                mpnn.load_state_dict(checkpoint["state_dict"])
-
-            mpnn.eval()
-            ensemble_models.append(mpnn)
-
-            # Make out-of-fold predictions using raw features
-            val_datapoints_raw, _ = create_molecule_datapoints(
-                df_val[smiles_column].tolist(), df_val[target].tolist(), val_extra_raw
-            )
-            val_dataset_raw = data.MoleculeDataset(val_datapoints_raw)
-            val_loader_pred = data.build_dataloader(val_dataset_raw, batch_size=batch_size, shuffle=False)
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        # oof_predictions is always 2D now: check if any column has a value
+        val_mask = ~np.isnan(oof_predictions).all(axis=1)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        y_validate = all_targets[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+        val_extra_features = all_extra_features[val_mask] if all_extra_features is not None else None
+    else:
+        preds = oof_predictions
+        df_val = all_df.copy()
+        y_validate = all_targets
+        val_extra_features = all_extra_features
+
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        val_datapoints_for_std, _ = create_molecule_datapoints(
+            df_val[smiles_column].tolist(),
+            y_validate,
+            val_extra_features
+        )
+        val_dataset_for_std = data.MoleculeDataset(val_datapoints_for_std)
+        val_loader_for_std = data.build_dataloader(val_dataset_for_std, batch_size=batch_size, shuffle=False)

+        all_ensemble_preds_for_std = []
+        trainer_pred = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)
+        for ens_model in ensemble_models:
            with torch.inference_mode():
-
-
-                if
-
-
-
-
-
-
-
-
-
-
-            print(f"Fold {fold_idx + 1} complete!")
-
-        print(f"\nCross-validation complete! Trained {len(ensemble_models)} models.")
-
-        # Use out-of-fold predictions for metrics
-        preds = oof_predictions
-        preds_std = None  # Will compute from ensemble at inference time
-        y_validate = all_df[target].values
-        df_val = all_df  # For saving predictions
+                ens_preds = trainer_pred.predict(ens_model, val_loader_for_std)
+                ens_preds = np.concatenate([p.numpy() for p in ens_preds], axis=0)
+                if ens_preds.ndim == 3 and ens_preds.shape[1] == 1:
+                    ens_preds = ens_preds.squeeze(axis=1)
+                all_ensemble_preds_for_std.append(ens_preds)
+
+        # Stack ensemble predictions: shape (n_ensemble, n_samples, n_targets)
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        # Ensure 2D
+        if preds_std.ndim == 1:
+            preds_std = preds_std.reshape(-1, 1)
+        print(f"Ensemble prediction_std - mean per target: {np.nanmean(preds_std, axis=0)}")

     if model_type == "classifier":
-        # Classification metrics -
-
-
-            # Multi-class probabilities: (n_samples, n_classes), take argmax
-            class_preds = np.argmax(preds, axis=1)
-            has_proba = True
-        elif preds.ndim == 1:
-            # Either class indices (CV mode) or binary probabilities
-            if n_folds > 1:
-                # CV mode: preds are already class indices
-                class_preds = preds.astype(int)
-                has_proba = False
-            else:
-                # Single model: preds are probabilities
-                class_preds = (preds > 0.5).astype(int)
-                has_proba = False
-        else:
-            # Squeeze extra dimensions if needed
-            preds = preds.squeeze()
-            if preds.ndim == 2:
-                class_preds = np.argmax(preds, axis=1)
-                has_proba = True
-            else:
-                class_preds = (preds > 0.5).astype(int)
-                has_proba = False
+        # Classification metrics - preds contains class indices in first column from OOF predictions
+        class_preds = preds[:, 0].astype(int)
+        has_proba = oof_proba is not None

         print(f"class_preds shape: {class_preds.shape}")

-        # Decode labels for metrics
-
+        # Decode labels for metrics (classification is single-target only)
+        target_name = target_columns[0]
+        y_validate_decoded = label_encoder.inverse_transform(y_validate[:, 0].astype(int))
         preds_decoded = label_encoder.inverse_transform(class_preds)

         # Calculate metrics
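The out-of-fold bookkeeping above fills a single (n_samples, n_targets) array as each fold predicts its held-out rows, so every row ends up with exactly one prediction from a model that never trained on it. A toy sketch of the pattern, with a random stand-in for the fold model:

```python
import numpy as np
from sklearn.model_selection import KFold

n_samples, n_targets = 10, 2
oof_predictions = np.full((n_samples, n_targets), np.nan)

for train_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=42).split(np.arange(n_samples)):
    # stand-in for trainer.predict(...) on the fold's validation rows
    fold_preds = np.random.rand(len(val_idx), n_targets)
    oof_predictions[val_idx] = fold_preds

assert not np.isnan(oof_predictions).any()  # every row covered exactly once
```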
@@ -841,7 +824,7 @@ if __name__ == "__main__":

         score_df = pd.DataFrame(
             {
-
+                target_name: label_names,
                 "precision": scores[0],
                 "recall": scores[1],
                 "f1": scores[2],
@@ -853,7 +836,7 @@ if __name__ == "__main__":
         metrics = ["precision", "recall", "f1", "support"]
         for t in label_names:
             for m in metrics:
-                value = score_df.loc[score_df[
+                value = score_df.loc[score_df[target_name] == t, m].iloc[0]
                 print(f"Metrics:{t}:{m} {value}")

         # Confusion matrix
@@ -868,34 +851,61 @@ if __name__ == "__main__":
         # Save validation predictions
         df_val = df_val.copy()
         df_val["prediction"] = preds_decoded
-        if has_proba and
-            df_val["pred_proba"] = [p.tolist() for p in
+        if has_proba and oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_names)

     else:
-        # Regression metrics
-        preds_flat = preds.flatten()
-        rmse = root_mean_squared_error(y_validate, preds_flat)
-        mae = mean_absolute_error(y_validate, preds_flat)
-        r2 = r2_score(y_validate, preds_flat)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-
+        # Regression metrics: compute per target (works for single or multi-task)
         df_val = df_val.copy()
-
+        print("\n--- Per-target metrics ---")
+        for t_idx, t_name in enumerate(target_columns):
+            # Get valid (non-NaN) indices for this target
+            target_valid_mask = ~np.isnan(y_validate[:, t_idx])
+            y_true = y_validate[target_valid_mask, t_idx]
+            y_pred = preds[target_valid_mask, t_idx]
+
+            if len(y_true) > 0:
+                rmse = root_mean_squared_error(y_true, y_pred)
+                mae = mean_absolute_error(y_true, y_pred)
+                medae = median_absolute_error(y_true, y_pred)
+                r2 = r2_score(y_true, y_pred)
+                spearman_corr = spearmanr(y_true, y_pred).correlation
+                support = len(y_true)
+                # Print metrics in format expected by SageMaker metric definitions
+                print(f"rmse: {rmse:.3f}")
+                print(f"mae: {mae:.3f}")
+                print(f"medae: {medae:.3f}")
+                print(f"r2: {r2:.3f}")
+                print(f"spearmanr: {spearman_corr:.3f}")
+                print(f"support: {support}")
+
+            # Store predictions in dataframe
+            df_val[f"{t_name}_pred"] = preds[:, t_idx]
+            if preds_std is not None:
+                df_val[f"{t_name}_pred_std"] = preds_std[:, t_idx]
+            else:
+                df_val[f"{t_name}_pred_std"] = 0.0

-        # Add prediction_std for
-
-
-
+        # Add prediction/prediction_std aliases for first target
+        first_target = target_columns[0]
+        df_val["prediction"] = df_val[f"{first_target}_pred"]
+        df_val["prediction_std"] = df_val[f"{first_target}_pred_std"]

     # Save validation predictions to S3
-
-
-
+    # Include id_column if it exists in df_val
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    # Include all target columns and their predictions
+    output_columns += target_columns
+    output_columns += [f"{t}_pred" for t in target_columns]
+    output_columns += [f"{t}_pred_std" for t in target_columns]
+    output_columns += ["prediction", "prediction_std"]
+    # Add proba columns for classifiers
     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+    # Filter to only columns that exist
+    output_columns = [c for c in output_columns if c in df_val.columns]
     wr.s3.to_csv(
         df_val[output_columns],
         path=f"{model_metrics_s3_path}/validation_predictions.csv",
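The per-target metrics above are computed only over rows where that target is present, which is what makes sparse multi-task labels workable. A small sketch of the masking, with hypothetical values:

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

y_validate = np.array([[0.5, np.nan], [1.2, 6.1], [np.nan, 6.4], [0.9, 5.8]])  # (n_samples, n_targets)
preds = np.array([[0.6, 7.0], [1.0, 6.0], [2.0, 6.5], [1.1, 5.5]])

for t_idx in range(y_validate.shape[1]):
    mask = ~np.isnan(y_validate[:, t_idx])            # rows where this target has a label
    y_true, y_pred = y_validate[mask, t_idx], preds[mask, t_idx]
    print(f"target {t_idx}: n={mask.sum()}, mae={mean_absolute_error(y_true, y_pred):.3f}, "
          f"r2={r2_score(y_true, y_pred):.3f}")
```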
@@ -908,11 +918,20 @@ if __name__ == "__main__":
         models.save_model(model_path, ens_model)
         print(f"Saved model {model_idx + 1} to {model_path}")

+    # Clean up checkpoint files (not needed for inference, reduces artifact size)
+    for ckpt_file in glob.glob(os.path.join(args.model_dir, "best_model_*.ckpt")):
+        os.remove(ckpt_file)
+        print(f"Removed checkpoint: {ckpt_file}")
+
     # Save ensemble metadata (n_ensemble = number of models for inference)
     n_ensemble = len(ensemble_models)
-    ensemble_metadata = {
+    ensemble_metadata = {
+        "n_ensemble": n_ensemble,
+        "n_folds": n_folds,
+        "target_columns": target_columns,
+    }
     joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
-    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds}, targets={target_columns})")

     # Save label encoder if classification
     if label_encoder is not None: