workbench 0.8.212__py3-none-any.whl → 0.8.217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
- workbench/algorithms/dataframe/projection_2d.py +38 -21
- workbench/algorithms/dataframe/proximity.py +75 -150
- workbench/algorithms/graph/light/proximity_graph.py +5 -5
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +2 -2
- workbench/api/__init__.py +3 -0
- workbench/api/endpoint.py +10 -5
- workbench/api/feature_set.py +76 -6
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +43 -4
- workbench/core/artifacts/endpoint_core.py +75 -129
- workbench/core/artifacts/feature_set_core.py +1 -1
- workbench/core/artifacts/model_core.py +6 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
- workbench/model_script_utils/pytorch_utils.py +11 -1
- workbench/model_scripts/chemprop/chemprop.template +145 -69
- workbench/model_scripts/chemprop/generated_model_script.py +147 -71
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
- workbench/model_scripts/pytorch_model/pytorch.template +42 -24
- workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
- workbench/model_scripts/script_generation.py +4 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
- workbench/model_scripts/xgb_model/xgb_model.template +163 -152
- workbench/repl/workbench_shell.py +0 -5
- workbench/scripts/endpoint_test.py +2 -2
- workbench/utils/chem_utils/fingerprints.py +7 -3
- workbench/utils/chemprop_utils.py +23 -5
- workbench/utils/meta_model_simulator.py +471 -0
- workbench/utils/metrics_utils.py +94 -10
- workbench/utils/model_utils.py +91 -9
- workbench/utils/pytorch_utils.py +1 -1
- workbench/web_interface/components/plugins/scatter_plot.py +4 -8
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0
@@ -5,36 +5,24 @@
 # - Multi-task regression support
 # - Hybrid mode (SMILES + extra molecular descriptors)
 # - Classification (single-target only)
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (lightning, sklearn, awswrangler) are deferred to training time.

-import argparse
-import glob
 import json
 import os

-import awswrangler as wr
 import joblib
 import numpy as np
 import pandas as pd
 import torch
-from lightning import pytorch as pl
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-from sklearn.preprocessing import LabelEncoder
-
-# Enable Tensor Core optimization for GPUs that support it
-torch.set_float32_matmul_precision("medium")

-from chemprop import data, models
+from chemprop import data, models

 from model_script_utils import (
-    check_dataframe,
-    compute_classification_metrics,
-    compute_regression_metrics,
     expand_proba_column,
     input_fn,
     output_fn,
-    print_classification_metrics,
-    print_confusion_matrix,
-    print_regression_metrics,
 )

 # =============================================================================
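The NOTE comment added above captures the motivation for this hunk: on a serverless endpoint, module import time dominates cold-start latency, so only the inference path keeps its imports at module level and everything training-only moves into the `__main__` block (see the `@@ -266,6 +253,82 @@` hunk below). A minimal sketch of the pattern, using names taken from this diff:

    # Module level: cheap imports shared by inference and training
    import json

    def model_fn(model_dir):
        # Deferred heavy import: paid on first model load, not at module import
        from lightning import pytorch as pl
        ...

    if __name__ == "__main__":
        # Training-only heavy imports: never executed on the endpoint
        import awswrangler as wr
        from sklearn.model_selection import KFold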
@@ -44,15 +32,17 @@ DEFAULT_HYPERPARAMETERS = {
     # Training
     "n_folds": 5,
     "max_epochs": 400,
-    "patience":
-    "batch_size":
+    "patience": 50,
+    "batch_size": 32,
     # Message Passing
     "hidden_dim": 700,
     "depth": 6,
-    "dropout": 0.
+    "dropout": 0.1,  # Lower dropout - ensemble provides regularization
     # FFN
     "ffn_hidden_dim": 2000,
     "ffn_num_layers": 2,
+    # Loss function for regression (mae, mse)
+    "criterion": "mae",
     # Random seed
     "seed": 42,
 }
@@ -61,9 +51,9 @@ DEFAULT_HYPERPARAMETERS = {
 TEMPLATE_PARAMS = {
     "model_type": "uq_regressor",
     "targets": ['udm_asy_res_efflux_ratio'],
-    "feature_list": ['smiles'
+    "feature_list": ['smiles'],
     "id_column": "udm_mol_bat_id",
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-chemprop
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-chemprop/training",
     "hyperparameters": {},
 }

@@ -71,7 +61,26 @@ TEMPLATE_PARAMS = {
 # =============================================================================
 # Helper Functions
 # =============================================================================
-def find_smiles_column(columns: list[str]) -> str:
+def _compute_std_confidence(df: pd.DataFrame, median_std: float, std_col: str = "prediction_std") -> pd.DataFrame:
+    """Compute confidence score from ensemble prediction_std.
+
+    Uses exponential decay: confidence = exp(-std / median_std)
+    - Low std (ensemble agreement) -> high confidence
+    - High std (ensemble disagreement) -> low confidence
+
+    Args:
+        df: DataFrame with prediction_std column
+        median_std: Median std from training validation set (normalization factor)
+        std_col: Name of the std column to use
+
+    Returns:
+        DataFrame with added 'confidence' column (0.0 to 1.0)
+    """
+    df["confidence"] = np.exp(-df[std_col] / median_std)
+    return df
+
+
+def _find_smiles_column(columns: list[str]) -> str:
     """Find SMILES column (case-insensitive match for 'smiles')."""
     smiles_col = next((c for c in columns if c.lower() == "smiles"), None)
     if smiles_col is None:
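To make the decay curve in `_compute_std_confidence` concrete (a worked example with an illustrative `median_std` value, not one from the package): a row whose ensemble std equals the training median gets exp(-1) ≈ 0.368; half the median gives exp(-0.5) ≈ 0.607; zero std gives exactly 1.0.

    import numpy as np

    median_std = 0.25  # illustrative; the script derives this from out-of-fold validation
    for std in (0.0, 0.125, 0.25, 0.5):
        print(f"std={std}: confidence={np.exp(-std / median_std):.3f}")
    # std=0.0: 1.000, std=0.125: 0.607, std=0.25: 0.368, std=0.5: 0.135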
@@ -79,7 +88,7 @@ def find_smiles_column(columns: list[str]) -> str:
     return smiles_col


-def create_molecule_datapoints(
+def _create_molecule_datapoints(
     smiles_list: list[str],
     targets: np.ndarray | None = None,
     extra_descriptors: np.ndarray | None = None,
@@ -101,47 +110,13 @@ def create_molecule_datapoints(
     return datapoints, valid_indices


-def build_mpnn_model(
-    hyperparameters: dict,
-    task: str = "regression",
-    num_classes: int | None = None,
-    n_targets: int = 1,
-    n_extra_descriptors: int = 0,
-    x_d_transform: nn.ScaleTransform | None = None,
-    output_transform: nn.UnscaleTransform | None = None,
-    task_weights: np.ndarray | None = None,
-) -> models.MPNN:
-    """Build an MPNN model with specified hyperparameters."""
-    hidden_dim = hyperparameters["hidden_dim"]
-    depth = hyperparameters["depth"]
-    dropout = hyperparameters["dropout"]
-    ffn_hidden_dim = hyperparameters["ffn_hidden_dim"]
-    ffn_num_layers = hyperparameters["ffn_num_layers"]
-
-    mp = nn.BondMessagePassing(d_h=hidden_dim, depth=depth, dropout=dropout)
-    agg = nn.NormAggregation()
-    ffn_input_dim = hidden_dim + n_extra_descriptors
-
-    if task == "classification" and num_classes is not None:
-        ffn = nn.MulticlassClassificationFFN(
-            n_classes=num_classes, input_dim=ffn_input_dim,
-            hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers, dropout=dropout,
-        )
-    else:
-        weights_tensor = torch.tensor(task_weights, dtype=torch.float32) if task_weights is not None else None
-        ffn = nn.RegressionFFN(
-            input_dim=ffn_input_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
-            dropout=dropout, n_tasks=n_targets, output_transform=output_transform, task_weights=weights_tensor,
-        )
-
-    return models.MPNN(message_passing=mp, agg=agg, predictor=ffn, batch_norm=True, metrics=None, X_d_transform=x_d_transform)
-
-
 # =============================================================================
 # Model Loading (for SageMaker inference)
 # =============================================================================
 def model_fn(model_dir: str) -> dict:
     """Load ChemProp MPNN ensemble from the specified directory."""
+    from lightning import pytorch as pl
+
     metadata = joblib.load(os.path.join(model_dir, "ensemble_metadata.joblib"))
     ensemble_models = []
     for i in range(metadata["n_ensemble"]):
@@ -149,8 +124,17 @@ def model_fn(model_dir: str) -> dict:
         model.eval()
         ensemble_models.append(model)

+    # Pre-initialize trainer once during model loading (expensive operation)
+    trainer = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)
+
     print(f"Loaded {len(ensemble_models)} model(s), targets={metadata['target_columns']}")
-    return {
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": metadata["n_ensemble"],
+        "target_columns": metadata["target_columns"],
+        "median_std": metadata["median_std"],
+        "trainer": trainer,
+    }


 # =============================================================================
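The hunk above is a standard SageMaker optimization: `model_fn` runs once per container while `predict_fn` runs per request, so the `pl.Trainer` construction (previously built inside `predict_fn` and removed in the `@@ -212,13 +197,12 @@` hunk below) is now paid once and passed through the model dict. A sketch of the request flow under that contract, reusing the functions defined in this script (`/opt/ml/model` is the standard SageMaker model path):

    # Once, at container startup:
    model_dict = model_fn("/opt/ml/model")       # loads ensemble + builds Trainer

    # Per request:
    result = predict_fn(input_df, model_dict)    # reuses model_dict["trainer"]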
@@ -163,6 +147,7 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:

     ensemble_models = model_dict["ensemble_models"]
     target_columns = model_dict["target_columns"]
+    trainer = model_dict["trainer"]  # Use pre-initialized trainer

     # Load artifacts
     label_encoder = None
@@ -177,7 +162,7 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
         print(f"Hybrid mode: {len(feature_metadata['extra_feature_cols'])} extra features")

     # Find SMILES column and validate
-    smiles_column = find_smiles_column(df.columns.tolist())
+    smiles_column = _find_smiles_column(df.columns.tolist())
     smiles_list = df[smiles_column].tolist()

     valid_mask = np.array([bool(s and isinstance(s, str) and s.strip()) for s in smiles_list])
@@ -212,13 +197,12 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
             extra_features[:, j] = col_means[j]

     # Create datapoints and predict
-    datapoints, rdkit_valid = create_molecule_datapoints(valid_smiles, extra_descriptors=extra_features)
+    datapoints, rdkit_valid = _create_molecule_datapoints(valid_smiles, extra_descriptors=extra_features)
     if len(datapoints) == 0:
         return df

     dataset = data.MoleculeDataset(datapoints)
     dataloader = data.build_dataloader(dataset, shuffle=False)
-    trainer = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)

     # Ensemble predictions
     all_preds = []
@@ -259,6 +243,9 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
         df["prediction"] = df[f"{target_columns[0]}_pred"]
         df["prediction_std"] = df[f"{target_columns[0]}_pred_std"]

+        # Compute confidence from ensemble std
+        df = _compute_std_confidence(df, model_dict["median_std"])
+
     return df


@@ -266,6 +253,82 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
 # Training
 # =============================================================================
 if __name__ == "__main__":
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+    import glob
+
+    import awswrangler as wr
+    from lightning import pytorch as pl
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    # Enable Tensor Core optimization for GPUs that support it
+    torch.set_float32_matmul_precision("medium")
+
+    from chemprop import nn
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+
+    # -------------------------------------------------------------------------
+    # Training-only helper function
+    # -------------------------------------------------------------------------
+    def build_mpnn_model(
+        hyperparameters: dict,
+        task: str = "regression",
+        num_classes: int | None = None,
+        n_targets: int = 1,
+        n_extra_descriptors: int = 0,
+        x_d_transform: nn.ScaleTransform | None = None,
+        output_transform: nn.UnscaleTransform | None = None,
+        task_weights: np.ndarray | None = None,
+    ) -> models.MPNN:
+        """Build an MPNN model with specified hyperparameters."""
+        hidden_dim = hyperparameters["hidden_dim"]
+        depth = hyperparameters["depth"]
+        dropout = hyperparameters["dropout"]
+        ffn_hidden_dim = hyperparameters["ffn_hidden_dim"]
+        ffn_num_layers = hyperparameters["ffn_num_layers"]
+
+        mp = nn.BondMessagePassing(d_h=hidden_dim, depth=depth, dropout=dropout)
+        agg = nn.NormAggregation()
+        ffn_input_dim = hidden_dim + n_extra_descriptors
+
+        if task == "classification" and num_classes is not None:
+            ffn = nn.MulticlassClassificationFFN(
+                n_classes=num_classes, input_dim=ffn_input_dim,
+                hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers, dropout=dropout,
+            )
+        else:
+            # Map criterion name to ChemProp metric class (must have .clone() method)
+            from chemprop.nn.metrics import MAE, MSE
+
+            criterion_map = {
+                "mae": MAE,
+                "mse": MSE,
+            }
+            criterion_name = hyperparameters.get("criterion", "mae")
+            if criterion_name not in criterion_map:
+                raise ValueError(f"Unknown criterion '{criterion_name}'. Supported: {list(criterion_map.keys())}")
+            criterion = criterion_map[criterion_name]()
+
+            weights_tensor = torch.tensor(task_weights, dtype=torch.float32) if task_weights is not None else None
+            ffn = nn.RegressionFFN(
+                input_dim=ffn_input_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
+                dropout=dropout, n_tasks=n_targets, output_transform=output_transform, task_weights=weights_tensor,
+                criterion=criterion,
+            )
+
+        return models.MPNN(message_passing=mp, agg=agg, predictor=ffn, batch_norm=True, metrics=None, X_d_transform=x_d_transform)
+
     # -------------------------------------------------------------------------
     # Setup: Parse arguments and load data
     # -------------------------------------------------------------------------
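The criterion map above resolves the `"criterion"` hyperparameter to a ChemProp metric class and fails fast on unknown names, so extending it is one more dictionary entry. An illustrative extension, not part of this diff (it assumes `chemprop.nn.metrics` also exposes an `RMSE` class):

    from chemprop.nn.metrics import MAE, MSE, RMSE  # RMSE availability is an assumption

    criterion_map = {"mae": MAE, "mse": MSE, "rmse": RMSE}
    criterion = criterion_map["rmse"]()  # instantiate the metric, as the script does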
@@ -287,7 +350,7 @@ if __name__ == "__main__":
         raise ValueError("'targets' must be a non-empty list of target column names")
     n_targets = len(target_columns)

-    smiles_column = find_smiles_column(feature_list)
+    smiles_column = _find_smiles_column(feature_list)
     extra_feature_cols = [f for f in feature_list if f != smiles_column]
     use_extra_features = len(extra_feature_cols) > 0

@@ -342,7 +405,7 @@ if __name__ == "__main__":
     all_targets = all_df[target_columns].values.astype(np.float32)

     # Filter invalid SMILES
-    _, valid_indices = create_molecule_datapoints(all_df[smiles_column].tolist(), all_targets, all_extra_features)
+    _, valid_indices = _create_molecule_datapoints(all_df[smiles_column].tolist(), all_targets, all_extra_features)
     all_df = all_df.iloc[valid_indices].reset_index(drop=True)
     all_targets = all_targets[valid_indices]
     if all_extra_features is not None:
@@ -401,8 +464,8 @@ if __name__ == "__main__":
         val_extra_raw = val_extra.copy() if val_extra is not None else None

         # Create datasets
-        train_dps, _ = create_molecule_datapoints(df_train[smiles_column].tolist(), train_targets, train_extra)
-        val_dps, _ = create_molecule_datapoints(df_val[smiles_column].tolist(), val_targets, val_extra)
+        train_dps, _ = _create_molecule_datapoints(df_train[smiles_column].tolist(), train_targets, train_extra)
+        val_dps, _ = _create_molecule_datapoints(df_val[smiles_column].tolist(), val_targets, val_extra)
         train_dataset, val_dataset = data.MoleculeDataset(train_dps), data.MoleculeDataset(val_dps)

         # Scale features/targets
@@ -447,7 +510,7 @@ if __name__ == "__main__":
         ensemble_models.append(mpnn)

         # Out-of-fold predictions (using raw features)
-        val_dps_raw, _ = create_molecule_datapoints(df_val[smiles_column].tolist(), val_targets, val_extra_raw)
+        val_dps_raw, _ = _create_molecule_datapoints(df_val[smiles_column].tolist(), val_targets, val_extra_raw)
         val_loader_pred = data.build_dataloader(data.MoleculeDataset(val_dps_raw), batch_size=batch_size, shuffle=False)

         with torch.inference_mode():
@@ -486,6 +549,7 @@ if __name__ == "__main__":
     # -------------------------------------------------------------------------
     # Compute metrics and prepare output
     # -------------------------------------------------------------------------
+    median_std = None  # Only set for regression models with ensemble
     if model_type == "classifier":
         class_preds = preds[:, 0].astype(int)
         target_name = target_columns[0]
@@ -507,7 +571,7 @@ if __name__ == "__main__":
         preds_std = None
         if len(ensemble_models) > 1:
             print("Computing prediction_std from ensemble...")
-            val_dps, _ = create_molecule_datapoints(df_val[smiles_column].tolist(), y_validate, val_extra_features)
+            val_dps, _ = _create_molecule_datapoints(df_val[smiles_column].tolist(), y_validate, val_extra_features)
             val_loader = data.build_dataloader(data.MoleculeDataset(val_dps), batch_size=batch_size, shuffle=False)
             trainer_pred = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)

@@ -535,13 +599,19 @@ if __name__ == "__main__":
         df_val["prediction"] = df_val[f"{target_columns[0]}_pred"]
         df_val["prediction_std"] = df_val[f"{target_columns[0]}_pred_std"]

+        # Compute confidence from ensemble std
+        median_std = float(np.median(preds_std[:, 0]))
+        print(f"\nComputing confidence scores (median_std={median_std:.6f})...")
+        df_val = _compute_std_confidence(df_val, median_std)
+        print(f"  Confidence: mean={df_val['confidence'].mean():.3f}, min={df_val['confidence'].min():.3f}, max={df_val['confidence'].max():.3f}")
+
     # -------------------------------------------------------------------------
     # Save validation predictions to S3
     # -------------------------------------------------------------------------
     output_columns = [id_column] if id_column in df_val.columns else []
     output_columns += target_columns
     output_columns += [f"{t}_pred" for t in target_columns] + [f"{t}_pred_std" for t in target_columns]
-    output_columns += ["prediction", "prediction_std"]
+    output_columns += ["prediction", "prediction_std", "confidence"]
     output_columns += [c for c in df_val.columns if c.endswith("_proba")]
     output_columns = [c for c in output_columns if c in df_val.columns]

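This hunk closes the loop on the confidence feature: `median_std` is derived once from the out-of-fold ensemble spread, applied to the validation predictions here, persisted in `ensemble_metadata.joblib` (next hunk), and reloaded by `model_fn` so inference uses the same normalizer. A small runnable sketch with illustrative numbers:

    import numpy as np

    preds_std = np.array([[0.1], [0.3], [0.2]])       # illustrative out-of-fold stds
    median_std = float(np.median(preds_std[:, 0]))    # -> 0.2 (train time)

    prediction_std = 0.4                              # illustrative serve-time std
    confidence = np.exp(-prediction_std / median_std) # ~0.135, same scale as training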
@@ -558,7 +628,13 @@ if __name__ == "__main__":
     for ckpt in glob.glob(os.path.join(args.model_dir, "best_*.ckpt")):
         os.remove(ckpt)

-
+    ensemble_metadata = {
+        "n_ensemble": len(ensemble_models),
+        "n_folds": n_folds,
+        "target_columns": target_columns,
+        "median_std": median_std,  # For confidence calculation during inference
+    }
+    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))

     with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
         json.dump(hyperparameters, f, indent=2)
@@ -4,10 +4,14 @@ import logging
 import pandas as pd

 # Molecular Descriptor Imports
-from rdkit import Chem
+from rdkit import Chem, RDLogger
 from rdkit.Chem import rdFingerprintGenerator
 from rdkit.Chem.MolStandardize import rdMolStandardize

+# Suppress RDKit warnings (e.g., "not removing hydrogen atom without neighbors")
+# Keep errors enabled so we see actual problems
+RDLogger.DisableLog("rdApp.warning")
+
 # Set up the logger
 log = logging.getLogger("workbench")

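`RDLogger.DisableLog` silences only the named channel, so errors on `rdApp.error` still surface. When debugging SMILES issues, the warnings can be switched back on with the matching RDKit call:

    from rdkit import RDLogger
    RDLogger.EnableLog("rdApp.warning")  # restore warnings while debugging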
@@ -47,8 +51,8 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
     # Make sure our molecules are not None
     failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
     if failed_smiles:
-        log.
-        df = df.dropna(subset=["molecule"])
+        log.warning(f"Failed to convert {len(failed_smiles)} SMILES to molecules ({failed_smiles})")
+        df = df.dropna(subset=["molecule"]).copy()

     # If we have fragments in our compounds, get the largest fragment before computing fingerprints
     largest_frags = df["molecule"].apply(
@@ -0,0 +1,194 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from typing import List, Optional
+import logging
+
+# Workbench Imports
+from workbench.algorithms.dataframe.proximity import Proximity
+from workbench.algorithms.dataframe.projection_2d import Projection2D
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class FeatureSpaceProximity(Proximity):
+    """Proximity computations for numeric feature spaces using Euclidean distance."""
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: Optional[str] = None,
+        include_all_columns: bool = False,
+    ):
+        """
+        Initialize the FeatureSpaceProximity class.
+
+        Args:
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
+        """
+        # Validate and filter features before calling parent init
+        self._raw_features = features
+        super().__init__(
+            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+
+    def _prepare_data(self) -> None:
+        """Filter out non-numeric features and drop NaN rows."""
+        # Validate features
+        self.features = self._validate_features(self.df, self._raw_features)
+
+        # Drop NaN rows for the features we're using
+        self.df = self.df.dropna(subset=self.features).copy()
+
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
+
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features using the fitted scaler."""
+        return self.scaler.transform(df[self.features])
+
+    def _project_2d(self) -> None:
+        """Project the numeric features to 2D for visualization."""
+        if len(self.features) >= 2:
+            self.df = Projection2D().fit_transform(self.df, features=self.features)
+
+
+# Testing the FeatureSpaceProximity class
+if __name__ == "__main__":
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame
+    data = {
+        "ID": [1, 2, 3, 4, 5],
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+    }
+    df = pd.DataFrame(data)
+
+    # Test the FeatureSpaceProximity class
+    features = ["Feature1", "Feature2", "Feature3"]
+    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
+
+    # Test the neighbors method with radius
+    print(prox.neighbors(1, radius=2.0))
+
+    # Test with Features list
+    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
+
+    # Create a sample DataFrame
+    data = {
+        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+
+    # Test with String Ids
+    prox = FeatureSpaceProximity(
+        df,
+        id_column="id",
+        features=["Feature1", "Feature2"],
+        target="target",
+        include_all_columns=True,
+    )
+    print(prox.neighbors(["a", "b"]))
+
+    # Test duplicate IDs
+    data = {
+        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    print(df.equals(prox.df))
+
+    # Test on real data from Workbench
+    from workbench.api import FeatureSet, Model
+
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
+    df = fs.pull_dataframe()
+    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+    print("\n" + "=" * 80)
+    print("Testing Neighbors...")
+    print("=" * 80)
+    test_id = df[fs.id_column].tolist()[0]
+    print(f"\nNeighbors for ID {test_id}:")
+    print(prox.neighbors(test_id))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct)
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct)
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
+    print(gradients_1pct)
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(gradients_5pct)
+
+    # Test proximity_stats
+    print("\n" + "=" * 80)
+    print("Testing proximity_stats...")
+    print("=" * 80)
+    stats = prox.proximity_stats()
+    print(stats)
+
+    # Plot the distance distribution using pandas
+    print("\n" + "=" * 80)
+    print("Plotting distance distribution...")
+    print("=" * 80)
+    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+
+    # Visualize the 2D projection
+    print("\n" + "=" * 80)
+    print("Visualizing 2D Projection...")
+    print("=" * 80)
+    from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
+    from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
+
+    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
+    unit_test.run()
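The new class above is a thin subclass overriding what appear to be the `Proximity` base-class hooks (`_prepare_data`, `_build_model`, `_transform_features`, `_project_2d`); the base class itself is not shown in this diff, so the contract is inferred from the code. A hypothetical sibling subclass under that same assumed contract:

    from sklearn.neighbors import NearestNeighbors
    from workbench.algorithms.dataframe.proximity import Proximity

    class CosineProximity(Proximity):
        """Hypothetical variant: cosine distance instead of scaled Euclidean."""

        def _build_model(self) -> None:
            # Cosine distance is scale-invariant per row, so no StandardScaler step
            self.nn = NearestNeighbors(metric="cosine").fit(self.df[self.features])

        def _transform_features(self, df):
            return df[self.features].to_numpy()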
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "
+    "include_all_columns": "{{include_all_columns}}",
 }

 from io import StringIO
@@ -18,7 +18,7 @@ import os
 import pandas as pd

 # Local Imports
-from
+from feature_space_proximity import FeatureSpaceProximity


 # Function to check if dataframe is empty
@@ -61,7 +61,7 @@ if __name__ == "__main__":
     id_column = TEMPLATE_PARAMS["id_column"]
     features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]  # Can be None for unsupervised models
-
+    include_all_columns = TEMPLATE_PARAMS["include_all_columns"]  # Defaults to False

     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -79,8 +79,8 @@ if __name__ == "__main__":
     # Check if the DataFrame is empty
     check_dataframe(all_df, "training_df")

-    # Create the
-    model =
+    # Create the FeatureSpaceProximity model
+    model = FeatureSpaceProximity(all_df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns)

     # Now serialize the model
     model.serialize(args.model_dir)
@@ -90,7 +90,7 @@ if __name__ == "__main__":
 def model_fn(model_dir):

     # Deserialize the model
-    model =
+    model = FeatureSpaceProximity.deserialize(model_dir)
     return model
