workbench 0.8.219__py3-none-any.whl → 0.8.231__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +2 -0
  3. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
  5. workbench/algorithms/dataframe/projection_2d.py +8 -2
  6. workbench/algorithms/dataframe/proximity.py +3 -0
  7. workbench/algorithms/dataframe/smart_aggregator.py +161 -0
  8. workbench/algorithms/sql/column_stats.py +0 -1
  9. workbench/algorithms/sql/correlations.py +0 -1
  10. workbench/algorithms/sql/descriptive_stats.py +0 -1
  11. workbench/api/feature_set.py +0 -1
  12. workbench/api/meta.py +0 -1
  13. workbench/cached/cached_meta.py +0 -1
  14. workbench/cached/cached_model.py +37 -7
  15. workbench/core/artifacts/endpoint_core.py +12 -2
  16. workbench/core/artifacts/feature_set_core.py +238 -225
  17. workbench/core/cloud_platform/cloud_meta.py +0 -1
  18. workbench/core/transforms/features_to_model/features_to_model.py +2 -8
  19. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
  20. workbench/model_script_utils/model_script_utils.py +30 -0
  21. workbench/model_script_utils/uq_harness.py +0 -1
  22. workbench/model_scripts/chemprop/chemprop.template +196 -68
  23. workbench/model_scripts/chemprop/generated_model_script.py +197 -72
  24. workbench/model_scripts/chemprop/model_script_utils.py +30 -0
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
  26. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  27. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +0 -1
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +52 -34
  29. workbench/model_scripts/pytorch_model/model_script_utils.py +30 -0
  30. workbench/model_scripts/pytorch_model/pytorch.template +47 -29
  31. workbench/model_scripts/pytorch_model/uq_harness.py +0 -1
  32. workbench/model_scripts/script_generation.py +0 -1
  33. workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
  34. workbench/model_scripts/xgb_model/model_script_utils.py +30 -0
  35. workbench/model_scripts/xgb_model/uq_harness.py +0 -1
  36. workbench/scripts/ml_pipeline_sqs.py +71 -2
  37. workbench/themes/dark/custom.css +85 -8
  38. workbench/themes/dark/plotly.json +6 -6
  39. workbench/themes/light/custom.css +172 -64
  40. workbench/themes/light/plotly.json +9 -9
  41. workbench/themes/midnight_blue/custom.css +82 -29
  42. workbench/themes/midnight_blue/plotly.json +1 -1
  43. workbench/utils/aws_utils.py +0 -1
  44. workbench/utils/chem_utils/mol_descriptors.py +0 -1
  45. workbench/utils/chem_utils/projections.py +16 -6
  46. workbench/utils/chem_utils/vis.py +137 -27
  47. workbench/utils/clientside_callbacks.py +41 -0
  48. workbench/utils/markdown_utils.py +57 -0
  49. workbench/utils/model_utils.py +0 -1
  50. workbench/utils/pipeline_utils.py +0 -1
  51. workbench/utils/plot_utils.py +52 -36
  52. workbench/utils/theme_manager.py +95 -30
  53. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  54. workbench/web_interface/components/model_plot.py +2 -0
  55. workbench/web_interface/components/plugin_unit_test.py +0 -1
  56. workbench/web_interface/components/plugins/ag_table.py +2 -4
  57. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  58. workbench/web_interface/components/plugins/model_details.py +10 -6
  59. workbench/web_interface/components/plugins/scatter_plot.py +184 -85
  60. workbench/web_interface/components/settings_menu.py +185 -0
  61. workbench/web_interface/page_views/main_page.py +0 -1
  62. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/METADATA +34 -41
  63. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/RECORD +67 -69
  64. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/WHEEL +1 -1
  65. workbench/themes/quartz/base_css.url +0 -1
  66. workbench/themes/quartz/custom.css +0 -117
  67. workbench/themes/quartz/plotly.json +0 -642
  68. workbench/themes/quartz_dark/base_css.url +0 -1
  69. workbench/themes/quartz_dark/custom.css +0 -131
  70. workbench/themes/quartz_dark/plotly.json +0 -642
  71. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/entry_points.txt +0 -0
  72. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/licenses/LICENSE +0 -0
  73. {workbench-0.8.219.dist-info → workbench-0.8.231.dist-info}/top_level.txt +0 -0
@@ -34,7 +34,7 @@ DEFAULT_HYPERPARAMETERS = {
  "max_epochs": 400,
  "patience": 50,
  "batch_size": 32,
- # Message Passing
+ # Message Passing (ignored when using foundation model)
  "hidden_dim": 700,
  "depth": 6,
  "dropout": 0.1, # Lower dropout - ensemble provides regularization
@@ -45,15 +45,23 @@ DEFAULT_HYPERPARAMETERS = {
  "criterion": "mae",
  # Random seed
  "seed": 42,
+ # Foundation model support
+ # - "CheMeleon": Load CheMeleon pretrained weights (auto-downloads on first use)
+ # - Path to .pt file: Load custom pretrained Chemprop model
+ # - None: Train from scratch (default)
+ "from_foundation": None,
+ # Freeze MPNN for N epochs, then unfreeze (0 = no freezing, train all params from start)
+ # Recommended: 5-20 epochs when using foundation models to stabilize FFN before fine-tuning MPNN
+ "freeze_mpnn_epochs": 0,
  }

  # Template parameters (filled in by Workbench)
  TEMPLATE_PARAMS = {
  "model_type": "uq_regressor",
- "targets": ['logd'],
- "feature_list": ['smiles', 'mollogp', 'fr_halogen', 'nbase', 'peoe_vsa6', 'bcut2d_mrlow', 'peoe_vsa7', 'peoe_vsa9', 'vsa_estate1', 'peoe_vsa1', 'numhdonors', 'vsa_estate5', 'smr_vsa3', 'slogp_vsa1', 'vsa_estate7', 'bcut2d_mwhi', 'axp_2dv', 'axp_3dv', 'mi', 'smr_vsa9', 'vsa_estate3', 'estate_vsa9', 'bcut2d_mwlow', 'tpsa', 'vsa_estate10', 'xch_5dv', 'slogp_vsa2', 'nhohcount', 'bcut2d_logplow', 'hallkieralpha', 'c2sp2', 'bcut2d_chglo', 'smr_vsa4', 'maxabspartialcharge', 'estate_vsa6', 'qed', 'slogp_vsa6', 'vsa_estate2', 'bcut2d_logphi', 'vsa_estate8', 'xch_7dv', 'fpdensitymorgan3', 'xpc_6d', 'smr_vsa10', 'axp_0d', 'fr_nh1', 'axp_4dv', 'peoe_vsa2', 'estate_vsa8', 'peoe_vsa5', 'vsa_estate6'],
- "id_column": "molecule_name",
- "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/logd-reg-chemprop-hybrid/training",
+ "targets": ['udm_asy_res_value'],
+ "feature_list": ['smiles'],
+ "id_column": "udm_mol_bat_id",
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/logd-value-reg-chemprop-1-dt/training",
  "hyperparameters": {},
  }

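The two new keys are ordinary entries in the hyperparameters dict, so they can be overridden through the TEMPLATE_PARAMS["hyperparameters"] slot like any other ChemProp setting. A minimal sketch of such an override dict (values are illustrative; how Workbench merges it over DEFAULT_HYPERPARAMETERS is not shown in this diff):

    # Hypothetical user-supplied overrides passed via TEMPLATE_PARAMS["hyperparameters"]
    hyperparameters = {
        "from_foundation": "CheMeleon",  # or a path to a pretrained .pt file, or None
        "freeze_mpnn_epochs": 10,        # phase 1: train only the FFN head for up to 10 epochs
    }
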
@@ -114,26 +122,27 @@ def _create_molecule_datapoints(
  # Model Loading (for SageMaker inference)
  # =============================================================================
  def model_fn(model_dir: str) -> dict:
- """Load ChemProp MPNN ensemble from the specified directory."""
- from lightning import pytorch as pl
+ """Load ChemProp MPNN ensemble from the specified directory.

+ Optimized for serverless cold starts - uses direct PyTorch inference
+ instead of Lightning Trainer to minimize startup time.
+ """
  metadata = joblib.load(os.path.join(model_dir, "ensemble_metadata.joblib"))
+
+ # Load all ensemble models (keep on CPU for serverless compatibility)
+ # ChemProp handles device placement internally
  ensemble_models = []
  for i in range(metadata["n_ensemble"]):
  model = models.MPNN.load_from_file(os.path.join(model_dir, f"chemprop_model_{i}.pt"))
  model.eval()
  ensemble_models.append(model)

- # Pre-initialize trainer once during model loading (expensive operation)
- trainer = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)
-
  print(f"Loaded {len(ensemble_models)} model(s), targets={metadata['target_columns']}")
  return {
  "ensemble_models": ensemble_models,
  "n_ensemble": metadata["n_ensemble"],
  "target_columns": metadata["target_columns"],
  "median_std": metadata["median_std"],
- "trainer": trainer,
  }

@@ -141,13 +150,15 @@ def model_fn(model_dir: str) -> dict:
  # Inference (for SageMaker inference)
  # =============================================================================
  def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
- """Make predictions with ChemProp MPNN ensemble."""
+ """Make predictions with ChemProp MPNN ensemble.
+
+ Uses direct PyTorch inference (no Lightning Trainer) for fast serverless inference.
+ """
  model_type = TEMPLATE_PARAMS["model_type"]
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

  ensemble_models = model_dict["ensemble_models"]
  target_columns = model_dict["target_columns"]
- trainer = model_dict["trainer"] # Use pre-initialized trainer

  # Load artifacts
  label_encoder = None
@@ -202,18 +213,34 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  return df

  dataset = data.MoleculeDataset(datapoints)
- dataloader = data.build_dataloader(dataset, shuffle=False)
+ dataloader = data.build_dataloader(dataset, shuffle=False, batch_size=64)

- # Ensemble predictions
+ # Ensemble predictions using direct PyTorch inference (no Lightning Trainer)
  all_preds = []
  for model in ensemble_models:
+ model_preds = []
+ model.eval()
  with torch.inference_mode():
- predictions = trainer.predict(model, dataloader)
- preds = np.concatenate([p.numpy() for p in predictions], axis=0)
+ for batch in dataloader:
+ # TrainingBatch contains (bmg, V_d, X_d, targets, weights, lt_mask, gt_mask)
+ # For inference we only need bmg, V_d, X_d
+ bmg, V_d, X_d, *_ = batch
+ output = model(bmg, V_d, X_d)
+ model_preds.append(output.detach().cpu().numpy())
+
+ if len(model_preds) == 0:
+ print(f"Warning: No predictions generated. Dataset size: {len(datapoints)}")
+ continue
+
+ preds = np.concatenate(model_preds, axis=0)
  if preds.ndim == 3 and preds.shape[1] == 1:
  preds = preds.squeeze(axis=1)
  all_preds.append(preds)

+ if len(all_preds) == 0:
+ print("Error: No ensemble predictions generated")
+ return df
+
  preds = np.mean(np.stack(all_preds), axis=0)
  preds_std = np.std(np.stack(all_preds), axis=0)
  if preds.ndim == 1:
@@ -243,8 +270,11 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  df["prediction"] = df[f"{target_columns[0]}_pred"]
  df["prediction_std"] = df[f"{target_columns[0]}_pred_std"]

- # Compute confidence from ensemble std
- df = _compute_std_confidence(df, model_dict["median_std"])
+ # Compute confidence from ensemble std (or NaN if single model)
+ if model_dict["median_std"] is not None:
+ df = _compute_std_confidence(df, model_dict["median_std"])
+ else:
+ df["confidence"] = np.nan

  return df

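Together, model_fn and predict_fn follow the standard SageMaker inference-script contract (load once, then predict per request), so the Trainer-free path can be smoke-tested locally. A rough sketch, assuming a model archive has been extracted to a local directory and using the id/smiles columns from TEMPLATE_PARAMS above (the path and input row are illustrative):

    import os
    import pandas as pd

    os.environ["SM_MODEL_DIR"] = "/tmp/chemprop_model"   # predict_fn reads artifacts from here
    model_dict = model_fn("/tmp/chemprop_model")          # loads the ensemble, no Lightning Trainer
    smiles_df = pd.DataFrame({"udm_mol_bat_id": ["X-001"], "smiles": ["CCO"]})
    out = predict_fn(smiles_df, model_dict)
    print(out[["prediction", "prediction_std", "confidence"]])
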
@@ -279,54 +309,107 @@ if __name__ == "__main__":
  )

  # -------------------------------------------------------------------------
- # Training-only helper function
+ # Training-only helper functions
  # -------------------------------------------------------------------------
- def build_mpnn_model(
- hyperparameters: dict,
- task: str = "regression",
- num_classes: int | None = None,
- n_targets: int = 1,
- n_extra_descriptors: int = 0,
- x_d_transform: nn.ScaleTransform | None = None,
- output_transform: nn.UnscaleTransform | None = None,
- task_weights: np.ndarray | None = None,
- ) -> models.MPNN:
- """Build an MPNN model with specified hyperparameters."""
- hidden_dim = hyperparameters["hidden_dim"]
- depth = hyperparameters["depth"]
+ def _load_foundation_weights(from_foundation: str) -> tuple[nn.BondMessagePassing, nn.Aggregation]:
+ """Load pretrained MPNN weights from foundation model.
+
+ Args:
+ from_foundation: "CheMeleon" or path to .pt file
+
+ Returns:
+ Tuple of (message_passing, aggregation) modules
+ """
+ import urllib.request
+ from pathlib import Path
+
+ print(f"Loading foundation model: {from_foundation}")
+
+ if from_foundation.lower() == "chemeleon":
+ # Download from Zenodo if not cached
+ cache_dir = Path.home() / ".chemprop" / "foundation"
+ cache_dir.mkdir(parents=True, exist_ok=True)
+ chemeleon_path = cache_dir / "chemeleon_mp.pt"
+
+ if not chemeleon_path.exists():
+ print(" Downloading CheMeleon weights from Zenodo...")
+ urllib.request.urlretrieve(
+ "https://zenodo.org/records/15460715/files/chemeleon_mp.pt", chemeleon_path
+ )
+ print(f" Downloaded to {chemeleon_path}")
+
+ ckpt = torch.load(chemeleon_path, weights_only=True)
+ mp = nn.BondMessagePassing(**ckpt["hyper_parameters"])
+ mp.load_state_dict(ckpt["state_dict"])
+ print(f" Loaded CheMeleon MPNN (hidden_dim={mp.output_dim})")
+ return mp, nn.MeanAggregation()
+
+ if not os.path.exists(from_foundation):
+ raise ValueError(f"Foundation model not found: {from_foundation}. Use 'CheMeleon' or a valid .pt path.")
+
+ ckpt = torch.load(from_foundation, weights_only=False)
+ if "hyper_parameters" in ckpt and "state_dict" in ckpt:
+ # CheMeleon-style checkpoint
+ mp = nn.BondMessagePassing(**ckpt["hyper_parameters"])
+ mp.load_state_dict(ckpt["state_dict"])
+ print(f" Loaded custom foundation weights (hidden_dim={mp.output_dim})")
+ return mp, nn.MeanAggregation()
+
+ # Full MPNN model file
+ pretrained = models.MPNN.load_from_file(from_foundation)
+ print(f" Loaded custom MPNN (hidden_dim={pretrained.message_passing.output_dim})")
+ return pretrained.message_passing, pretrained.agg
+
+ def _build_ffn(
+ task: str, input_dim: int, hyperparameters: dict,
+ num_classes: int | None, n_targets: int,
+ output_transform: nn.UnscaleTransform | None, task_weights: np.ndarray | None,
+ ) -> nn.Predictor:
+ """Build task-specific FFN head."""
  dropout = hyperparameters["dropout"]
  ffn_hidden_dim = hyperparameters["ffn_hidden_dim"]
  ffn_num_layers = hyperparameters["ffn_num_layers"]

- mp = nn.BondMessagePassing(d_h=hidden_dim, depth=depth, dropout=dropout)
- agg = nn.NormAggregation()
- ffn_input_dim = hidden_dim + n_extra_descriptors
-
  if task == "classification" and num_classes is not None:
- ffn = nn.MulticlassClassificationFFN(
- n_classes=num_classes, input_dim=ffn_input_dim,
+ return nn.MulticlassClassificationFFN(
+ n_classes=num_classes, input_dim=input_dim,
  hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers, dropout=dropout,
  )
+
+ from chemprop.nn.metrics import MAE, MSE
+ criterion_map = {"mae": MAE, "mse": MSE}
+ criterion_name = hyperparameters.get("criterion", "mae")
+ if criterion_name not in criterion_map:
+ raise ValueError(f"Unknown criterion '{criterion_name}'. Supported: {list(criterion_map.keys())}")
+
+ weights_tensor = torch.tensor(task_weights, dtype=torch.float32) if task_weights is not None else None
+ return nn.RegressionFFN(
+ input_dim=input_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
+ dropout=dropout, n_tasks=n_targets, output_transform=output_transform,
+ task_weights=weights_tensor, criterion=criterion_map[criterion_name](),
+ )
+
+ def build_mpnn_model(
+ hyperparameters: dict, task: str = "regression", num_classes: int | None = None,
+ n_targets: int = 1, n_extra_descriptors: int = 0,
+ x_d_transform: nn.ScaleTransform | None = None,
+ output_transform: nn.UnscaleTransform | None = None, task_weights: np.ndarray | None = None,
+ ) -> models.MPNN:
+ """Build MPNN model, optionally loading pretrained weights."""
+ from_foundation = hyperparameters.get("from_foundation")
+
+ if from_foundation:
+ mp, agg = _load_foundation_weights(from_foundation)
+ ffn_input_dim = mp.output_dim + n_extra_descriptors
  else:
- # Map criterion name to ChemProp metric class (must have .clone() method)
- from chemprop.nn.metrics import MAE, MSE
-
- criterion_map = {
- "mae": MAE,
- "mse": MSE,
- }
- criterion_name = hyperparameters.get("criterion", "mae")
- if criterion_name not in criterion_map:
- raise ValueError(f"Unknown criterion '{criterion_name}'. Supported: {list(criterion_map.keys())}")
- criterion = criterion_map[criterion_name]()
-
- weights_tensor = torch.tensor(task_weights, dtype=torch.float32) if task_weights is not None else None
- ffn = nn.RegressionFFN(
- input_dim=ffn_input_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
- dropout=dropout, n_tasks=n_targets, output_transform=output_transform, task_weights=weights_tensor,
- criterion=criterion,
+ mp = nn.BondMessagePassing(
+ d_h=hyperparameters["hidden_dim"], depth=hyperparameters["depth"],
+ dropout=hyperparameters["dropout"],
  )
+ agg = nn.NormAggregation()
+ ffn_input_dim = hyperparameters["hidden_dim"] + n_extra_descriptors

+ ffn = _build_ffn(task, ffn_input_dim, hyperparameters, num_classes, n_targets, output_transform, task_weights)
  return models.MPNN(message_passing=mp, agg=agg, predictor=ffn, batch_norm=True, metrics=None, X_d_transform=x_d_transform)

  # -------------------------------------------------------------------------
@@ -359,6 +442,14 @@ if __name__ == "__main__":
  print(f"Extra features: {extra_feature_cols if use_extra_features else 'None (SMILES only)'}")
  print(f"Hyperparameters: {hyperparameters}")

+ # Log foundation model configuration
+ if hyperparameters.get("from_foundation"):
+ freeze_epochs = hyperparameters.get("freeze_mpnn_epochs", 0)
+ freeze_msg = f"MPNN frozen for {freeze_epochs} epochs" if freeze_epochs > 0 else "no freezing"
+ print(f"Foundation model: {hyperparameters['from_foundation']} ({freeze_msg})")
+ else:
+ print("Foundation model: None (training from scratch)")
+
  # Load training data
  training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
  print(f"Training Files: {training_files}")
@@ -456,7 +547,7 @@ if __name__ == "__main__":
  print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
  print(f"{'='*50}")

- # Split data
+ # Split data (val_extra_raw preserves unscaled features for OOF predictions)
  df_train, df_val = all_df.iloc[train_idx].reset_index(drop=True), all_df.iloc[val_idx].reset_index(drop=True)
  train_targets, val_targets = all_targets[train_idx], all_targets[val_idx]
  train_extra = all_extra_features[train_idx] if all_extra_features is not None else None
@@ -484,7 +575,7 @@ if __name__ == "__main__":
  train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=3)
  val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=3)

- # Build and train model
+ # Build model
  pl.seed_everything(hyperparameters["seed"] + fold_idx)
  mpnn = build_mpnn_model(
  hyperparameters, task=task, num_classes=num_classes, n_targets=n_targets,
@@ -492,14 +583,39 @@ if __name__ == "__main__":
  output_transform=output_transform, task_weights=task_weights,
  )

- trainer = pl.Trainer(
- accelerator="auto", max_epochs=hyperparameters["max_epochs"], logger=False, enable_progress_bar=True,
- callbacks=[
- pl.callbacks.EarlyStopping(monitor="val_loss", patience=hyperparameters["patience"], mode="min"),
- pl.callbacks.ModelCheckpoint(dirpath=args.model_dir, filename=f"best_{fold_idx}", monitor="val_loss", mode="min", save_top_k=1),
- ],
- )
- trainer.fit(mpnn, train_loader, val_loader)
+ # Train model (with optional two-phase foundation training)
+ freeze_mpnn_epochs = hyperparameters.get("freeze_mpnn_epochs", 0)
+ use_two_phase = hyperparameters.get("from_foundation") and freeze_mpnn_epochs > 0
+
+ def _set_mpnn_frozen(frozen: bool):
+ for param in mpnn.message_passing.parameters():
+ param.requires_grad = not frozen
+ for param in mpnn.agg.parameters():
+ param.requires_grad = not frozen
+
+ def _make_trainer(max_epochs: int, save_checkpoint: bool = False):
+ callbacks = [pl.callbacks.EarlyStopping(monitor="val_loss", patience=hyperparameters["patience"], mode="min")]
+ if save_checkpoint:
+ callbacks.append(pl.callbacks.ModelCheckpoint(
+ dirpath=args.model_dir, filename=f"best_{fold_idx}", monitor="val_loss", mode="min", save_top_k=1
+ ))
+ return pl.Trainer(accelerator="auto", max_epochs=max_epochs, logger=False, enable_progress_bar=True, callbacks=callbacks)
+
+ if use_two_phase:
+ # Phase 1: Freeze MPNN, train FFN only
+ print(f"Phase 1: Training with frozen MPNN for {freeze_mpnn_epochs} epochs...")
+ _set_mpnn_frozen(True)
+ _make_trainer(freeze_mpnn_epochs).fit(mpnn, train_loader, val_loader)
+
+ # Phase 2: Unfreeze and fine-tune all
+ print("Phase 2: Unfreezing MPNN, continuing training...")
+ _set_mpnn_frozen(False)
+ remaining_epochs = max(1, hyperparameters["max_epochs"] - freeze_mpnn_epochs)
+ trainer = _make_trainer(remaining_epochs, save_checkpoint=True)
+ trainer.fit(mpnn, train_loader, val_loader)
+ else:
+ trainer = _make_trainer(hyperparameters["max_epochs"], save_checkpoint=True)
+ trainer.fit(mpnn, train_loader, val_loader)

  # Load best checkpoint
  if trainer.checkpoint_callback and trainer.checkpoint_callback.best_model_path:
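Worth noting: in the two-phase path the epoch budget is split rather than doubled. With the defaults above (max_epochs=400) and, say, freeze_mpnn_epochs=10, phase 1 runs at most 10 epochs with the message passing and aggregation frozen (only the FFN head trainable), and phase 2 runs at most max(1, 400 - 10) = 390 epochs with everything unfrozen. Early stopping with the same patience applies in each phase, and only the phase-2 trainer writes the best_{fold_idx} checkpoint.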
@@ -509,7 +625,7 @@ if __name__ == "__main__":
  mpnn.eval()
  ensemble_models.append(mpnn)

- # Out-of-fold predictions (using raw features)
+ # Out-of-fold predictions (using unscaled features - model's x_d_transform handles scaling)
  val_dps_raw, _ = _create_molecule_datapoints(df_val[smiles_column].tolist(), val_targets, val_extra_raw)
  val_loader_pred = data.build_dataloader(data.MoleculeDataset(val_dps_raw), batch_size=batch_size, shuffle=False)

@@ -599,11 +715,17 @@ if __name__ == "__main__":
  df_val["prediction"] = df_val[f"{target_columns[0]}_pred"]
  df_val["prediction_std"] = df_val[f"{target_columns[0]}_pred_std"]

- # Compute confidence from ensemble std
- median_std = float(np.median(preds_std[:, 0]))
- print(f"\nComputing confidence scores (median_std={median_std:.6f})...")
- df_val = _compute_std_confidence(df_val, median_std)
- print(f" Confidence: mean={df_val['confidence'].mean():.3f}, min={df_val['confidence'].min():.3f}, max={df_val['confidence'].max():.3f}")
+ # Compute confidence from ensemble std (or NaN for single model)
+ if preds_std is not None:
+ median_std = float(np.median(preds_std[:, 0]))
+ print(f"\nComputing confidence scores (median_std={median_std:.6f})...")
+ df_val = _compute_std_confidence(df_val, median_std)
+ print(f" Confidence: mean={df_val['confidence'].mean():.3f}, min={df_val['confidence'].min():.3f}, max={df_val['confidence'].max():.3f}")
+ else:
+ # Single model - no ensemble std available, confidence is undefined
+ median_std = None
+ df_val["confidence"] = np.nan
+ print("\nSingle model (n_folds=1): No ensemble std, confidence set to NaN")

  # -------------------------------------------------------------------------
  # Save validation predictions to S3
@@ -633,6 +755,9 @@ if __name__ == "__main__":
  "n_folds": n_folds,
  "target_columns": target_columns,
  "median_std": median_std, # For confidence calculation during inference
+ # Foundation model provenance (for tracking/reproducibility)
+ "from_foundation": hyperparameters.get("from_foundation", None),
+ "freeze_mpnn_epochs": hyperparameters.get("freeze_mpnn_epochs", 0),
  }
  joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))

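Since the foundation settings now travel with the trained artifact, provenance can be checked without digging through training logs. A small sketch (the local path is illustrative):

    import joblib

    meta = joblib.load("/tmp/chemprop_model/ensemble_metadata.joblib")
    print(meta["from_foundation"], meta["freeze_mpnn_epochs"])  # e.g. "CheMeleon", 10
    print(meta["n_ensemble"], meta["target_columns"], meta["median_std"])
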
@@ -249,6 +249,36 @@ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


+ def cap_std_outliers(std_array: np.ndarray) -> np.ndarray:
+ """Cap extreme outliers in prediction_std using IQR method.
+
+ Uses the standard IQR fence (Q3 + 1.5*IQR) to cap extreme values.
+ This prevents unreasonably large std values while preserving the
+ relative ordering and keeping meaningful high-uncertainty signals.
+
+ Args:
+ std_array: Array of standard deviations (n_samples,) or (n_samples, n_targets)
+
+ Returns:
+ Array with outliers capped at the upper fence
+ """
+ if std_array.ndim == 1:
+ std_array = std_array.reshape(-1, 1)
+ squeeze = True
+ else:
+ squeeze = False
+
+ capped = std_array.copy()
+ for col in range(capped.shape[1]):
+ col_data = capped[:, col]
+ q1, q3 = np.percentile(col_data, [25, 75])
+ iqr = q3 - q1
+ upper_bound = q3 + 1.5 * iqr
+ capped[:, col] = np.minimum(col_data, upper_bound)
+
+ return capped.squeeze() if squeeze else capped
+
+
  def compute_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
  """Compute standard regression metrics.

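A quick worked example of the IQR fence, assuming cap_std_outliers is in scope (it is added to the model_script_utils helpers in this release):

    import numpy as np

    std = np.array([0.10, 0.12, 0.15, 0.20, 5.0])
    # Q1 = 0.12, Q3 = 0.20, IQR = 0.08, upper fence = 0.20 + 1.5 * 0.08 = 0.32
    capped = cap_std_outliers(std)
    # Only the extreme 5.0 is capped (to 0.32); the ordering of the rest is unchanged
    print(capped)  # [0.1  0.12 0.15 0.2  0.32]
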
@@ -99,7 +99,6 @@ from rdkit.ML.Descriptors import MoleculeDescriptors
  from mordred import Calculator as MordredCalculator
  from mordred import AcidBase, Aromatic, Constitutional, Chi, CarbonTypes

-
  logger = logging.getLogger("workbench")
  logger.setLevel(logging.DEBUG)

@@ -15,7 +15,6 @@ import json
  from mol_standardize import standardize
  from mol_descriptors import compute_descriptors

-
  # TRAINING SECTION
  #
  # This section (__main__) is where SageMaker will execute the training job
@@ -17,7 +17,6 @@ import json
  # Local imports
  from fingerprints import compute_morgan_fingerprints

-
  # TRAINING SECTION
  #
  # This section (__main__) is where SageMaker will execute the training job
@@ -59,12 +59,12 @@ DEFAULT_HYPERPARAMETERS = {

  # Template parameters (filled in by Workbench)
  TEMPLATE_PARAMS = {
- "model_type": "uq_regressor",
- "target": "udm_asy_res_efflux_ratio",
- "features": ['fingerprint'],
+ "model_type": "classifier",
+ "target": "class",
+ "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
  "id_column": "udm_mol_bat_id",
- "compressed_features": ['fingerprint'],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-fp-pytorch/training",
+ "compressed_features": [],
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-class-pytorch-1-fr/training",
  "hyperparameters": {},
  }

@@ -152,24 +152,30 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  print("Decompressing features for prediction...")
  matched_df, features = decompress_features(matched_df, features, compressed_features)

- # Track missing features
- missing_mask = matched_df[features].isna().any(axis=1)
- if missing_mask.any():
- print(f"Warning: {missing_mask.sum()} rows have missing features")
+ # Impute missing values (categorical with mode, continuous handled by scaler)
+ missing_counts = matched_df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(f"Imputing missing values: {missing_features.to_dict()}")
+
+ # Load categorical imputation values if available
+ impute_path = os.path.join(model_dir, "categorical_impute.json")
+ if os.path.exists(impute_path):
+ with open(impute_path) as f:
+ cat_impute_values = json.load(f)
+ for col in categorical_cols:
+ if col in cat_impute_values and matched_df[col].isna().any():
+ matched_df[col] = matched_df[col].fillna(cat_impute_values[col])
+ # Continuous features are imputed by FeatureScaler.transform() using column means

  # Initialize output columns
  df["prediction"] = np.nan
  if model_type in ["regressor", "uq_regressor"]:
  df["prediction_std"] = np.nan

- complete_df = matched_df[~missing_mask].copy()
- if len(complete_df) == 0:
- print("Warning: No complete rows to predict on")
- return df
-
- # Prepare data for inference (with standardization)
+ # Prepare data for inference (with standardization and continuous imputation)
  x_cont, x_cat, _, _, _ = prepare_data(
- complete_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+ matched_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
  )

  # Collect ensemble predictions
@@ -191,28 +197,20 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
  class_preds = np.argmax(avg_probs, axis=1)
  predictions = label_encoder.inverse_transform(class_preds)

- all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
- all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
- df["pred_proba"] = all_proba
+ df["pred_proba"] = [p.tolist() for p in avg_probs]
  df = expand_proba_column(df, label_encoder.classes_)
  else:
  # Regression
  predictions = preds.flatten()
- df.loc[~missing_mask, "prediction_std"] = preds_std.flatten()
+ df["prediction_std"] = preds_std.flatten()

  # Add UQ intervals if available
  if uq_models and uq_metadata:
- X_complete = complete_df[features]
- df_complete = df.loc[~missing_mask].copy()
- df_complete["prediction"] = predictions # Set prediction before compute_confidence
- df_complete = predict_intervals(df_complete, X_complete, uq_models, uq_metadata)
- df_complete = compute_confidence(df_complete, uq_metadata["median_interval_width"], "q_10", "q_90")
- # Copy UQ columns back to main dataframe
- for col in df_complete.columns:
- if col.startswith("q_") or col == "confidence":
- df.loc[~missing_mask, col] = df_complete[col].values
-
- df.loc[~missing_mask, "prediction"] = predictions
+ df["prediction"] = predictions # Set prediction before compute_confidence
+ df = predict_intervals(df, matched_df[features], uq_models, uq_metadata)
+ df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
+
+ df["prediction"] = predictions
  return df

@@ -275,11 +273,11 @@ if __name__ == "__main__":
  all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
  check_dataframe(all_df, "training_df")

- # Drop rows with missing features
+ # Drop rows with missing target (required for training)
  initial_count = len(all_df)
- all_df = all_df.dropna(subset=features)
+ all_df = all_df.dropna(subset=[target])
  if len(all_df) < initial_count:
- print(f"Dropped {initial_count - len(all_df)} rows with missing features")
+ print(f"Dropped {initial_count - len(all_df)} rows with missing target")

  print(f"Target: {target}")
  print(f"Features: {features}")
@@ -301,6 +299,23 @@ if __name__ == "__main__":
  print(f"Categorical: {categorical_cols}")
  print(f"Continuous: {len(continuous_cols)} columns")

+ # Report and handle missing values in features
+ # Compute categorical imputation values (mode) for use at inference time
+ cat_impute_values = {}
+ for col in categorical_cols:
+ mode_val = all_df[col].mode().iloc[0] if not all_df[col].mode().empty else all_df[col].cat.categories[0]
+ cat_impute_values[col] = str(mode_val) # Convert to string for JSON serialization
+
+ missing_counts = all_df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(f"Missing values in features (will be imputed): {missing_features.to_dict()}")
+ # Impute categorical features with mode (most frequent value)
+ for col in categorical_cols:
+ if all_df[col].isna().any():
+ all_df[col] = all_df[col].fillna(cat_impute_values[col])
+ # Continuous features are imputed by FeatureScaler.transform() using column means
+
  # -------------------------------------------------------------------------
  # Classification setup
@@ -506,6 +521,9 @@ if __name__ == "__main__":
  with open(os.path.join(args.model_dir, "feature_metadata.json"), "w") as f:
  json.dump({"continuous_cols": continuous_cols, "categorical_cols": categorical_cols}, f)

+ with open(os.path.join(args.model_dir, "categorical_impute.json"), "w") as f:
+ json.dump(cat_impute_values, f)
+
  with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
  json.dump(hyperparameters, f, indent=2)