workbench 0.8.217__py3-none-any.whl → 0.8.224__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
  3. workbench/algorithms/dataframe/projection_2d.py +8 -2
  4. workbench/algorithms/dataframe/proximity.py +3 -0
  5. workbench/algorithms/sql/outliers.py +3 -3
  6. workbench/api/feature_set.py +0 -1
  7. workbench/core/artifacts/endpoint_core.py +2 -2
  8. workbench/core/artifacts/feature_set_core.py +185 -230
  9. workbench/core/transforms/features_to_model/features_to_model.py +2 -8
  10. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
  11. workbench/model_script_utils/model_script_utils.py +15 -11
  12. workbench/model_scripts/chemprop/chemprop.template +195 -70
  13. workbench/model_scripts/chemprop/generated_model_script.py +198 -73
  14. workbench/model_scripts/chemprop/model_script_utils.py +15 -11
  15. workbench/model_scripts/custom_models/chem_info/fingerprints.py +80 -43
  16. workbench/model_scripts/pytorch_model/generated_model_script.py +2 -2
  17. workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
  18. workbench/model_scripts/xgb_model/generated_model_script.py +7 -7
  19. workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
  20. workbench/scripts/meta_model_sim.py +35 -0
  21. workbench/scripts/ml_pipeline_sqs.py +71 -2
  22. workbench/themes/light/custom.css +7 -1
  23. workbench/themes/midnight_blue/custom.css +34 -0
  24. workbench/utils/chem_utils/fingerprints.py +80 -43
  25. workbench/utils/chem_utils/projections.py +16 -6
  26. workbench/utils/meta_model_simulator.py +41 -13
  27. workbench/utils/model_utils.py +0 -1
  28. workbench/utils/plot_utils.py +146 -28
  29. workbench/utils/shap_utils.py +1 -55
  30. workbench/utils/theme_manager.py +95 -30
  31. workbench/web_interface/components/plugins/scatter_plot.py +152 -66
  32. workbench/web_interface/components/settings_menu.py +184 -0
  33. {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/METADATA +4 -13
  34. {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/RECORD +38 -37
  35. {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/entry_points.txt +1 -0
  36. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  37. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  38. {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/WHEEL +0 -0
  39. {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/licenses/LICENSE +0 -0
  40. {workbench-0.8.217.dist-info → workbench-0.8.224.dist-info}/top_level.txt +0 -0
@@ -34,7 +34,7 @@ DEFAULT_HYPERPARAMETERS = {
     "max_epochs": 400,
     "patience": 50,
     "batch_size": 32,
-    # Message Passing
+    # Message Passing (ignored when using foundation model)
     "hidden_dim": 700,
     "depth": 6,
     "dropout": 0.1,  # Lower dropout - ensemble provides regularization
@@ -45,16 +45,24 @@ DEFAULT_HYPERPARAMETERS = {
     "criterion": "mae",
     # Random seed
     "seed": 42,
+    # Foundation model support
+    # - "CheMeleon": Load CheMeleon pretrained weights (auto-downloads on first use)
+    # - Path to .pt file: Load custom pretrained Chemprop model
+    # - None: Train from scratch (default)
+    "from_foundation": None,
+    # Freeze MPNN for N epochs, then unfreeze (0 = no freezing, train all params from start)
+    # Recommended: 5-20 epochs when using foundation models to stabilize FFN before fine-tuning MPNN
+    "freeze_mpnn_epochs": 0,
 }
 
 # Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "uq_regressor",
-    "targets": ['udm_asy_res_efflux_ratio'],
+    "targets": ['udm_asy_res_free_percent'],
     "feature_list": ['smiles'],
     "id_column": "udm_mol_bat_id",
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-chemprop/training",
-    "hyperparameters": {},
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/ppb-human-free-reg-chemprop-foundation-1-dt/training",
+    "hyperparameters": {'from_foundation': 'CheMeleon', 'freeze_mpnn_epochs': 10, 'n_folds': 5, 'max_epochs': 100, 'patience': 20, 'ffn_hidden_dim': 512, 'dropout': 0.15},
 }
 
 
@@ -114,26 +122,27 @@ def _create_molecule_datapoints(
 # Model Loading (for SageMaker inference)
 # =============================================================================
 def model_fn(model_dir: str) -> dict:
-    """Load ChemProp MPNN ensemble from the specified directory."""
-    from lightning import pytorch as pl
+    """Load ChemProp MPNN ensemble from the specified directory.
 
+    Optimized for serverless cold starts - uses direct PyTorch inference
+    instead of Lightning Trainer to minimize startup time.
+    """
     metadata = joblib.load(os.path.join(model_dir, "ensemble_metadata.joblib"))
+
+    # Load all ensemble models (keep on CPU for serverless compatibility)
+    # ChemProp handles device placement internally
     ensemble_models = []
     for i in range(metadata["n_ensemble"]):
         model = models.MPNN.load_from_file(os.path.join(model_dir, f"chemprop_model_{i}.pt"))
         model.eval()
         ensemble_models.append(model)
 
-    # Pre-initialize trainer once during model loading (expensive operation)
-    trainer = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)
-
     print(f"Loaded {len(ensemble_models)} model(s), targets={metadata['target_columns']}")
     return {
         "ensemble_models": ensemble_models,
         "n_ensemble": metadata["n_ensemble"],
         "target_columns": metadata["target_columns"],
         "median_std": metadata["median_std"],
-        "trainer": trainer,
     }
 
 
@@ -141,13 +150,15 @@ def model_fn(model_dir: str) -> dict:
 # Inference (for SageMaker inference)
 # =============================================================================
 def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
-    """Make predictions with ChemProp MPNN ensemble."""
+    """Make predictions with ChemProp MPNN ensemble.
+
+    Uses direct PyTorch inference (no Lightning Trainer) for fast serverless inference.
+    """
     model_type = TEMPLATE_PARAMS["model_type"]
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
 
     ensemble_models = model_dict["ensemble_models"]
     target_columns = model_dict["target_columns"]
-    trainer = model_dict["trainer"]  # Use pre-initialized trainer
 
     # Load artifacts
     label_encoder = None
@@ -202,18 +213,34 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
         return df
 
     dataset = data.MoleculeDataset(datapoints)
-    dataloader = data.build_dataloader(dataset, shuffle=False)
+    dataloader = data.build_dataloader(dataset, shuffle=False, batch_size=64)
 
-    # Ensemble predictions
+    # Ensemble predictions using direct PyTorch inference (no Lightning Trainer)
     all_preds = []
     for model in ensemble_models:
+        model_preds = []
+        model.eval()
         with torch.inference_mode():
-            predictions = trainer.predict(model, dataloader)
-            preds = np.concatenate([p.numpy() for p in predictions], axis=0)
+            for batch in dataloader:
+                # TrainingBatch contains (bmg, V_d, X_d, targets, weights, lt_mask, gt_mask)
+                # For inference we only need bmg, V_d, X_d
+                bmg, V_d, X_d, *_ = batch
+                output = model(bmg, V_d, X_d)
+                model_preds.append(output.detach().cpu().numpy())
+
+        if len(model_preds) == 0:
+            print(f"Warning: No predictions generated. Dataset size: {len(datapoints)}")
+            continue
+
+        preds = np.concatenate(model_preds, axis=0)
         if preds.ndim == 3 and preds.shape[1] == 1:
             preds = preds.squeeze(axis=1)
         all_preds.append(preds)
 
+    if len(all_preds) == 0:
+        print("Error: No ensemble predictions generated")
+        return df
+
     preds = np.mean(np.stack(all_preds), axis=0)
     preds_std = np.std(np.stack(all_preds), axis=0)
     if preds.ndim == 1:
@@ -243,8 +270,11 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
         df["prediction"] = df[f"{target_columns[0]}_pred"]
         df["prediction_std"] = df[f"{target_columns[0]}_pred_std"]
 
-    # Compute confidence from ensemble std
-    df = _compute_std_confidence(df, model_dict["median_std"])
+    # Compute confidence from ensemble std (or NaN if single model)
+    if model_dict["median_std"] is not None:
+        df = _compute_std_confidence(df, model_dict["median_std"])
+    else:
+        df["confidence"] = np.nan
 
     return df
 
@@ -279,54 +309,107 @@ if __name__ == "__main__":
     )
 
     # -------------------------------------------------------------------------
-    # Training-only helper function
+    # Training-only helper functions
     # -------------------------------------------------------------------------
-    def build_mpnn_model(
-        hyperparameters: dict,
-        task: str = "regression",
-        num_classes: int | None = None,
-        n_targets: int = 1,
-        n_extra_descriptors: int = 0,
-        x_d_transform: nn.ScaleTransform | None = None,
-        output_transform: nn.UnscaleTransform | None = None,
-        task_weights: np.ndarray | None = None,
-    ) -> models.MPNN:
-        """Build an MPNN model with specified hyperparameters."""
-        hidden_dim = hyperparameters["hidden_dim"]
-        depth = hyperparameters["depth"]
+    def _load_foundation_weights(from_foundation: str) -> tuple[nn.BondMessagePassing, nn.Aggregation]:
+        """Load pretrained MPNN weights from foundation model.
+
+        Args:
+            from_foundation: "CheMeleon" or path to .pt file
+
+        Returns:
+            Tuple of (message_passing, aggregation) modules
+        """
+        import urllib.request
+        from pathlib import Path
+
+        print(f"Loading foundation model: {from_foundation}")
+
+        if from_foundation.lower() == "chemeleon":
+            # Download from Zenodo if not cached
+            cache_dir = Path.home() / ".chemprop" / "foundation"
+            cache_dir.mkdir(parents=True, exist_ok=True)
+            chemeleon_path = cache_dir / "chemeleon_mp.pt"
+
+            if not chemeleon_path.exists():
+                print(" Downloading CheMeleon weights from Zenodo...")
+                urllib.request.urlretrieve(
+                    "https://zenodo.org/records/15460715/files/chemeleon_mp.pt", chemeleon_path
+                )
+                print(f" Downloaded to {chemeleon_path}")
+
+            ckpt = torch.load(chemeleon_path, weights_only=True)
+            mp = nn.BondMessagePassing(**ckpt["hyper_parameters"])
+            mp.load_state_dict(ckpt["state_dict"])
+            print(f" Loaded CheMeleon MPNN (hidden_dim={mp.output_dim})")
+            return mp, nn.MeanAggregation()
+
+        if not os.path.exists(from_foundation):
+            raise ValueError(f"Foundation model not found: {from_foundation}. Use 'CheMeleon' or a valid .pt path.")
+
+        ckpt = torch.load(from_foundation, weights_only=False)
+        if "hyper_parameters" in ckpt and "state_dict" in ckpt:
+            # CheMeleon-style checkpoint
+            mp = nn.BondMessagePassing(**ckpt["hyper_parameters"])
+            mp.load_state_dict(ckpt["state_dict"])
+            print(f" Loaded custom foundation weights (hidden_dim={mp.output_dim})")
+            return mp, nn.MeanAggregation()
+
+        # Full MPNN model file
+        pretrained = models.MPNN.load_from_file(from_foundation)
+        print(f" Loaded custom MPNN (hidden_dim={pretrained.message_passing.output_dim})")
+        return pretrained.message_passing, pretrained.agg
+
+    def _build_ffn(
+        task: str, input_dim: int, hyperparameters: dict,
+        num_classes: int | None, n_targets: int,
+        output_transform: nn.UnscaleTransform | None, task_weights: np.ndarray | None,
+    ) -> nn.Predictor:
+        """Build task-specific FFN head."""
         dropout = hyperparameters["dropout"]
         ffn_hidden_dim = hyperparameters["ffn_hidden_dim"]
         ffn_num_layers = hyperparameters["ffn_num_layers"]
 
-        mp = nn.BondMessagePassing(d_h=hidden_dim, depth=depth, dropout=dropout)
-        agg = nn.NormAggregation()
-        ffn_input_dim = hidden_dim + n_extra_descriptors
-
         if task == "classification" and num_classes is not None:
-            ffn = nn.MulticlassClassificationFFN(
-                n_classes=num_classes, input_dim=ffn_input_dim,
+            return nn.MulticlassClassificationFFN(
+                n_classes=num_classes, input_dim=input_dim,
                 hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers, dropout=dropout,
             )
+
+        from chemprop.nn.metrics import MAE, MSE
+        criterion_map = {"mae": MAE, "mse": MSE}
+        criterion_name = hyperparameters.get("criterion", "mae")
+        if criterion_name not in criterion_map:
+            raise ValueError(f"Unknown criterion '{criterion_name}'. Supported: {list(criterion_map.keys())}")
+
+        weights_tensor = torch.tensor(task_weights, dtype=torch.float32) if task_weights is not None else None
+        return nn.RegressionFFN(
+            input_dim=input_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
+            dropout=dropout, n_tasks=n_targets, output_transform=output_transform,
+            task_weights=weights_tensor, criterion=criterion_map[criterion_name](),
+        )
+
+    def build_mpnn_model(
+        hyperparameters: dict, task: str = "regression", num_classes: int | None = None,
+        n_targets: int = 1, n_extra_descriptors: int = 0,
+        x_d_transform: nn.ScaleTransform | None = None,
+        output_transform: nn.UnscaleTransform | None = None, task_weights: np.ndarray | None = None,
+    ) -> models.MPNN:
+        """Build MPNN model, optionally loading pretrained weights."""
+        from_foundation = hyperparameters.get("from_foundation")
+
+        if from_foundation:
+            mp, agg = _load_foundation_weights(from_foundation)
+            ffn_input_dim = mp.output_dim + n_extra_descriptors
         else:
-            # Map criterion name to ChemProp metric class (must have .clone() method)
-            from chemprop.nn.metrics import MAE, MSE
-
-            criterion_map = {
-                "mae": MAE,
-                "mse": MSE,
-            }
-            criterion_name = hyperparameters.get("criterion", "mae")
-            if criterion_name not in criterion_map:
-                raise ValueError(f"Unknown criterion '{criterion_name}'. Supported: {list(criterion_map.keys())}")
-            criterion = criterion_map[criterion_name]()
-
-            weights_tensor = torch.tensor(task_weights, dtype=torch.float32) if task_weights is not None else None
-            ffn = nn.RegressionFFN(
-                input_dim=ffn_input_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
-                dropout=dropout, n_tasks=n_targets, output_transform=output_transform, task_weights=weights_tensor,
-                criterion=criterion,
+            mp = nn.BondMessagePassing(
+                d_h=hyperparameters["hidden_dim"], depth=hyperparameters["depth"],
+                dropout=hyperparameters["dropout"],
             )
+            agg = nn.NormAggregation()
+            ffn_input_dim = hyperparameters["hidden_dim"] + n_extra_descriptors
 
+        ffn = _build_ffn(task, ffn_input_dim, hyperparameters, num_classes, n_targets, output_transform, task_weights)
         return models.MPNN(message_passing=mp, agg=agg, predictor=ffn, batch_norm=True, metrics=None, X_d_transform=x_d_transform)
 
     # -------------------------------------------------------------------------
@@ -359,6 +442,14 @@ if __name__ == "__main__":
     print(f"Extra features: {extra_feature_cols if use_extra_features else 'None (SMILES only)'}")
     print(f"Hyperparameters: {hyperparameters}")
 
+    # Log foundation model configuration
+    if hyperparameters.get("from_foundation"):
+        freeze_epochs = hyperparameters.get("freeze_mpnn_epochs", 0)
+        freeze_msg = f"MPNN frozen for {freeze_epochs} epochs" if freeze_epochs > 0 else "no freezing"
+        print(f"Foundation model: {hyperparameters['from_foundation']} ({freeze_msg})")
+    else:
+        print("Foundation model: None (training from scratch)")
+
     # Load training data
     training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
@@ -456,7 +547,7 @@ if __name__ == "__main__":
         print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
         print(f"{'='*50}")
 
-        # Split data
+        # Split data (val_extra_raw preserves unscaled features for OOF predictions)
         df_train, df_val = all_df.iloc[train_idx].reset_index(drop=True), all_df.iloc[val_idx].reset_index(drop=True)
         train_targets, val_targets = all_targets[train_idx], all_targets[val_idx]
         train_extra = all_extra_features[train_idx] if all_extra_features is not None else None
@@ -481,10 +572,10 @@ if __name__ == "__main__":
         val_dataset.normalize_targets(target_scaler)
         output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
 
-        train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
-        val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
+        train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=3)
+        val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=3)
 
-        # Build and train model
+        # Build model
         pl.seed_everything(hyperparameters["seed"] + fold_idx)
         mpnn = build_mpnn_model(
             hyperparameters, task=task, num_classes=num_classes, n_targets=n_targets,
@@ -492,14 +583,39 @@ if __name__ == "__main__":
             output_transform=output_transform, task_weights=task_weights,
         )
 
-        trainer = pl.Trainer(
-            accelerator="auto", max_epochs=hyperparameters["max_epochs"], logger=False, enable_progress_bar=True,
-            callbacks=[
-                pl.callbacks.EarlyStopping(monitor="val_loss", patience=hyperparameters["patience"], mode="min"),
-                pl.callbacks.ModelCheckpoint(dirpath=args.model_dir, filename=f"best_{fold_idx}", monitor="val_loss", mode="min", save_top_k=1),
-            ],
-        )
-        trainer.fit(mpnn, train_loader, val_loader)
+        # Train model (with optional two-phase foundation training)
+        freeze_mpnn_epochs = hyperparameters.get("freeze_mpnn_epochs", 0)
+        use_two_phase = hyperparameters.get("from_foundation") and freeze_mpnn_epochs > 0
+
+        def _set_mpnn_frozen(frozen: bool):
+            for param in mpnn.message_passing.parameters():
+                param.requires_grad = not frozen
+            for param in mpnn.agg.parameters():
+                param.requires_grad = not frozen
+
+        def _make_trainer(max_epochs: int, save_checkpoint: bool = False):
+            callbacks = [pl.callbacks.EarlyStopping(monitor="val_loss", patience=hyperparameters["patience"], mode="min")]
+            if save_checkpoint:
+                callbacks.append(pl.callbacks.ModelCheckpoint(
+                    dirpath=args.model_dir, filename=f"best_{fold_idx}", monitor="val_loss", mode="min", save_top_k=1
+                ))
+            return pl.Trainer(accelerator="auto", max_epochs=max_epochs, logger=False, enable_progress_bar=True, callbacks=callbacks)
+
+        if use_two_phase:
+            # Phase 1: Freeze MPNN, train FFN only
+            print(f"Phase 1: Training with frozen MPNN for {freeze_mpnn_epochs} epochs...")
+            _set_mpnn_frozen(True)
+            _make_trainer(freeze_mpnn_epochs).fit(mpnn, train_loader, val_loader)
+
+            # Phase 2: Unfreeze and fine-tune all
+            print("Phase 2: Unfreezing MPNN, continuing training...")
+            _set_mpnn_frozen(False)
+            remaining_epochs = max(1, hyperparameters["max_epochs"] - freeze_mpnn_epochs)
+            trainer = _make_trainer(remaining_epochs, save_checkpoint=True)
+            trainer.fit(mpnn, train_loader, val_loader)
+        else:
+            trainer = _make_trainer(hyperparameters["max_epochs"], save_checkpoint=True)
+            trainer.fit(mpnn, train_loader, val_loader)
 
         # Load best checkpoint
         if trainer.checkpoint_callback and trainer.checkpoint_callback.best_model_path:
@@ -509,7 +625,7 @@ if __name__ == "__main__":
         mpnn.eval()
         ensemble_models.append(mpnn)
 
-        # Out-of-fold predictions (using raw features)
+        # Out-of-fold predictions (using unscaled features - model's x_d_transform handles scaling)
         val_dps_raw, _ = _create_molecule_datapoints(df_val[smiles_column].tolist(), val_targets, val_extra_raw)
         val_loader_pred = data.build_dataloader(data.MoleculeDataset(val_dps_raw), batch_size=batch_size, shuffle=False)
 
@@ -599,11 +715,17 @@ if __name__ == "__main__":
         df_val["prediction"] = df_val[f"{target_columns[0]}_pred"]
         df_val["prediction_std"] = df_val[f"{target_columns[0]}_pred_std"]
 
-    # Compute confidence from ensemble std
-    median_std = float(np.median(preds_std[:, 0]))
-    print(f"\nComputing confidence scores (median_std={median_std:.6f})...")
-    df_val = _compute_std_confidence(df_val, median_std)
-    print(f" Confidence: mean={df_val['confidence'].mean():.3f}, min={df_val['confidence'].min():.3f}, max={df_val['confidence'].max():.3f}")
+    # Compute confidence from ensemble std (or NaN for single model)
+    if preds_std is not None:
+        median_std = float(np.median(preds_std[:, 0]))
+        print(f"\nComputing confidence scores (median_std={median_std:.6f})...")
+        df_val = _compute_std_confidence(df_val, median_std)
+        print(f" Confidence: mean={df_val['confidence'].mean():.3f}, min={df_val['confidence'].min():.3f}, max={df_val['confidence'].max():.3f}")
+    else:
+        # Single model - no ensemble std available, confidence is undefined
+        median_std = None
+        df_val["confidence"] = np.nan
+        print("\nSingle model (n_folds=1): No ensemble std, confidence set to NaN")
 
     # -------------------------------------------------------------------------
     # Save validation predictions to S3
  # Save validation predictions to S3
@@ -633,6 +755,9 @@ if __name__ == "__main__":
633
755
  "n_folds": n_folds,
634
756
  "target_columns": target_columns,
635
757
  "median_std": median_std, # For confidence calculation during inference
758
+ # Foundation model provenance (for tracking/reproducibility)
759
+ "from_foundation": hyperparameters.get("from_foundation", None),
760
+ "freeze_mpnn_epochs": hyperparameters.get("freeze_mpnn_epochs", 0),
636
761
  }
637
762
  joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
638
763
 
@@ -148,12 +148,16 @@ def convert_categorical_types(
 def decompress_features(
     df: pd.DataFrame, features: list[str], compressed_features: list[str]
 ) -> tuple[pd.DataFrame, list[str]]:
-    """Decompress bitstring features into individual bit columns.
+    """Decompress compressed features (bitstrings or count vectors) into individual columns.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
+    - Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)
 
     Args:
         df: The features DataFrame
         features: Full list of feature names
-        compressed_features: List of feature names to decompress (bitstrings)
+        compressed_features: List of feature names to decompress
 
     Returns:
         Tuple of (DataFrame with decompressed features, updated feature list)
@@ -178,18 +182,18 @@ def decompress_features(
         # Remove the feature from the list to avoid duplication
         decompressed_features.remove(feature)
 
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
+        # Auto-detect format and parse: comma-separated counts or bitstring
+        sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
+        parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
+        feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)
 
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+        # Create new columns with prefix from feature name
+        prefix = feature[:3]
+        new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
+        new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)
 
-        # Add to features list
+        # Update features list and dataframe
         decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
         df = df.drop(columns=[feature])
         df = pd.concat([df, new_df], axis=1)
 
@@ -1,11 +1,19 @@
-"""Molecular fingerprint computation utilities"""
+"""Molecular fingerprint computation utilities for ADMET modeling.
+
+This module provides Morgan count fingerprints, the standard for ADMET prediction.
+Count fingerprints outperform binary fingerprints for molecular property prediction.
+
+References:
+    - Count vs Binary: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
+    - ECFP/Morgan: https://pubs.acs.org/doi/10.1021/ci100050t
+"""
 
 import logging
-import pandas as pd
 
-# Molecular Descriptor Imports
+import numpy as np
+import pandas as pd
 from rdkit import Chem, RDLogger
-from rdkit.Chem import rdFingerprintGenerator
+from rdkit.Chem import AllChem
 from rdkit.Chem.MolStandardize import rdMolStandardize
 
 # Suppress RDKit warnings (e.g., "not removing hydrogen atom without neighbors")
@@ -16,20 +24,25 @@ RDLogger.DisableLog("rdApp.warning")
 log = logging.getLogger("workbench")
 
 
-def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
-    """Compute and add Morgan fingerprints to the DataFrame.
+def compute_morgan_fingerprints(df: pd.DataFrame, radius: int = 2, n_bits: int = 2048) -> pd.DataFrame:
+    """Compute Morgan count fingerprints for ADMET modeling.
+
+    Generates true count fingerprints where each bit position contains the
+    number of times that substructure appears in the molecule (clamped to 0-255).
+    This is the recommended approach for ADMET prediction per 2025 research.
 
     Args:
-        df (pd.DataFrame): Input DataFrame containing SMILES strings.
-        radius (int): Radius for the Morgan fingerprint.
-        n_bits (int): Number of bits for the fingerprint.
-        counts (bool): Count simulation for the fingerprint.
+        df: Input DataFrame containing SMILES strings.
+        radius: Radius for the Morgan fingerprint (default 2 = ECFP4 equivalent).
+        n_bits: Number of bits for the fingerprint (default 2048).
 
     Returns:
-        pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.
+        pd.DataFrame: Input DataFrame with 'fingerprint' column added.
+            Values are comma-separated uint8 counts.
 
     Note:
-        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
+        Count fingerprints outperform binary for ADMET prediction.
+        See: https://pubs.acs.org/doi/10.1021/acs.est.3c02198
     """
     delete_mol_column = False
 
@@ -43,7 +56,7 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
         log.warning("Detected serialized molecules in 'molecule' column. Removing...")
         del df["molecule"]
 
-    # Convert SMILES to RDKit molecule objects (vectorized)
+    # Convert SMILES to RDKit molecule objects
     if "molecule" not in df.columns:
         log.info("Converting SMILES to RDKit Molecules...")
         delete_mol_column = True
@@ -59,15 +72,24 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
         lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
     )
 
-    # Create a Morgan fingerprint generator
-    if counts:
-        n_bits *= 4  # Multiply by 4 to simulate counts
-    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)
+    def mol_to_count_string(mol):
+        """Convert molecule to comma-separated count fingerprint string."""
+        if mol is None:
+            return pd.NA
 
-    # Compute Morgan fingerprints (vectorized)
-    fingerprints = largest_frags.apply(
-        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
-    )
+        # Get hashed Morgan fingerprint with counts
+        fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=n_bits)
+
+        # Initialize array and populate with counts (clamped to uint8 range)
+        counts = np.zeros(n_bits, dtype=np.uint8)
+        for idx, count in fp.GetNonzeroElements().items():
+            counts[idx] = min(count, 255)
+
+        # Return as comma-separated string
+        return ",".join(map(str, counts))
+
+    # Compute Morgan count fingerprints
+    fingerprints = largest_frags.apply(mol_to_count_string)
 
     # Add the fingerprints to the DataFrame
     df["fingerprint"] = fingerprints
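The encoding above can be sanity-checked directly against RDKit: `GetHashedMorganFingerprint` returns a sparse count vector whose nonzero entries are the per-substructure counts that end up in the comma-separated string.

    from rdkit import Chem
    from rdkit.Chem import AllChem

    mol = Chem.MolFromSmiles("CC(=O)OC1=CC=CC=C1C(=O)O")  # aspirin
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=2048)
    # bit index -> substructure count (only nonzero entries are stored)
    print(sorted(fp.GetNonzeroElements().items())[:5])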
@@ -75,59 +97,62 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
     # Drop the intermediate 'molecule' column if it was added
     if delete_mol_column:
         del df["molecule"]
+
     return df
 
 
 if __name__ == "__main__":
-    print("Running molecular fingerprint tests...")
-    print("Note: This requires molecular_screening module to be available")
+    print("Running Morgan count fingerprint tests...")
 
     # Test molecules
     test_molecules = {
         "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
         "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
         "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
-        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
+        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt (largest fragment used)
         "benzene": "c1ccccc1",
         "butene_e": "C/C=C/C",  # E-butene
        "butene_z": "C/C=C\\C",  # Z-butene
     }
 
-    # Test 1: Morgan Fingerprints
-    print("\n1. Testing Morgan fingerprint generation...")
+    # Test 1: Morgan Count Fingerprints (default parameters)
+    print("\n1. Testing Morgan fingerprint generation (radius=2, n_bits=2048)...")
 
     test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
-
-    fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
+    fp_df = compute_morgan_fingerprints(test_df.copy())
 
     print(" Fingerprint generation results:")
     for _, row in fp_df.iterrows():
         fp = row.get("fingerprint", "N/A")
-        fp_len = len(fp) if fp != "N/A" else 0
-        print(f" {row['name']:15} {fp_len} bits")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            non_zero = sum(1 for c in counts if c > 0)
+            max_count = max(counts)
+            print(f" {row['name']:15} → {len(counts)} features, {non_zero} non-zero, max={max_count}")
+        else:
+            print(f" {row['name']:15} → N/A")
 
-    # Test 2: Different fingerprint parameters
-    print("\n2. Testing different fingerprint parameters...")
+    # Test 2: Different parameters
+    print("\n2. Testing with different parameters (radius=3, n_bits=1024)...")
 
-    # Test with counts enabled
-    fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
+    fp_df_custom = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=1024)
 
-    print(" With count simulation (256 bits * 4):")
-    for _, row in fp_counts_df.iterrows():
+    for _, row in fp_df_custom.iterrows():
         fp = row.get("fingerprint", "N/A")
-        fp_len = len(fp) if fp != "N/A" else 0
-        print(f" {row['name']:15} {fp_len} bits")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            non_zero = sum(1 for c in counts if c > 0)
+            print(f" {row['name']:15} → {len(counts)} features, {non_zero} non-zero")
+        else:
+            print(f" {row['name']:15} → N/A")
 
     # Test 3: Edge cases
     print("\n3. Testing edge cases...")
 
     # Invalid SMILES
     invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
-    try:
-        fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
-        print(f" ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
-    except Exception as e:
-        print(f" ✓ Invalid SMILES properly raised error: {type(e).__name__}")
+    fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+    print(f" ✓ Invalid SMILES handled: {len(fp_invalid)} rows returned")
 
     # Test with pre-existing molecule column
     mol_df = test_df.copy()
@@ -135,4 +160,16 @@ if __name__ == "__main__":
     fp_with_mol = compute_morgan_fingerprints(mol_df)
     print(f" ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")
 
+    # Test 4: Verify count values are reasonable
+    print("\n4. Verifying count distribution...")
+    all_counts = []
+    for _, row in fp_df.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        if pd.notna(fp):
+            counts = [int(x) for x in fp.split(",")]
+            all_counts.extend([c for c in counts if c > 0])
+
+    if all_counts:
+        print(f" Non-zero counts: min={min(all_counts)}, max={max(all_counts)}, mean={np.mean(all_counts):.2f}")
+
     print("\n✅ All fingerprint tests completed!")