PyPI - workbench - Versions diffs - 0.8.212__py3-none-any.whl → 0.8.217__py3-none-any.whl - Mend

workbench 0.8.212__py3-none-any.whl → 0.8.217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
workbench/algorithms/dataframe/projection_2d.py +38 -21
workbench/algorithms/dataframe/proximity.py +75 -150
workbench/algorithms/graph/light/proximity_graph.py +5 -5
workbench/algorithms/models/cleanlab_model.py +382 -0
workbench/algorithms/models/noise_model.py +2 -2
workbench/api/__init__.py +3 -0
workbench/api/endpoint.py +10 -5
workbench/api/feature_set.py +76 -6
workbench/api/meta_model.py +289 -0
workbench/api/model.py +43 -4
workbench/core/artifacts/endpoint_core.py +75 -129
workbench/core/artifacts/feature_set_core.py +1 -1
workbench/core/artifacts/model_core.py +6 -4
workbench/core/pipelines/pipeline_executor.py +1 -1
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
workbench/model_script_utils/pytorch_utils.py +11 -1
workbench/model_scripts/chemprop/chemprop.template +145 -69
workbench/model_scripts/chemprop/generated_model_script.py +147 -71
workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
workbench/model_scripts/meta_model/generated_model_script.py +209 -0
workbench/model_scripts/meta_model/meta_model.template +209 -0
workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
workbench/model_scripts/pytorch_model/pytorch.template +42 -24
workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
workbench/model_scripts/script_generation.py +4 -0
workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
workbench/model_scripts/xgb_model/xgb_model.template +163 -152
workbench/repl/workbench_shell.py +0 -5
workbench/scripts/endpoint_test.py +2 -2
workbench/utils/chem_utils/fingerprints.py +7 -3
workbench/utils/chemprop_utils.py +23 -5
workbench/utils/meta_model_simulator.py +471 -0
workbench/utils/metrics_utils.py +94 -10
workbench/utils/model_utils.py +91 -9
workbench/utils/pytorch_utils.py +1 -1
workbench/web_interface/components/plugins/scatter_plot.py +4 -8
{workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
{workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
{workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
{workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
{workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0

workbench/algorithms/dataframe/proximity.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import pandas as pd
 import numpy as np
-from sklearn.preprocessing import StandardScaler
-from sklearn.neighbors import NearestNeighbors
+from abc import ABC, abstractmethod
 from typing import List, Dict, Optional, Union
 import logging
@@ -9,14 +8,16 @@ import logging
 log = logging.getLogger("workbench")
-class Proximity:
+class Proximity(ABC):
+    """Abstract base class for proximity/neighbor computations."""
     def __init__(
         self,
         df: pd.DataFrame,
         id_column: str,
         features: List[str],
         target: Optional[str] = None,
-        track_columns: Optional[List[str]] = None,
+        include_all_columns: bool = False,
     ):
         """
         Initialize the Proximity class.
@@ -26,29 +27,61 @@ class Proximity:
             id_column: Name of the column used as the identifier.
             features: List of feature column names to be used for neighbor computations.
             target: Name of the target column. Defaults to None.
-            track_columns: Additional columns to track in results. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
         """
         self.id_column = id_column
+        self.features = features
         self.target = target
-        self.track_columns = track_columns or []
+        self.include_all_columns = include_all_columns
-        # Filter out non-numeric features
-        self.features = self._validate_features(df, features)
+        # Store the DataFrame (subclasses may filter/modify in _prepare_data)
+        self.df = df.copy()
-        # Drop NaN rows and set up DataFrame
-        self.df = df.dropna(subset=self.features).copy()
+        # Prepare data (subclasses can override)
+        self._prepare_data()
         # Compute target range if target is provided
         self.target_range = None
         if self.target and self.target in self.df.columns:
             self.target_range = self.df[self.target].max() - self.df[self.target].min()
-        # Build the proximity model
+        # Build the proximity model (subclass-specific)
         self._build_model()
         # Precompute landscape metrics
         self._precompute_metrics()
+        # Define core columns for output (subclasses can override)
+        self._set_core_columns()
+        # Project the data to 2D (subclass-specific)
+        self._project_2d()
+    def _prepare_data(self) -> None:
+        """Prepare the data before building the model. Subclasses can override."""
+        pass
+    def _set_core_columns(self) -> None:
+        """Set the core columns for output. Subclasses can override."""
+        self.core_columns = [self.id_column, "nn_distance", "nn_id"]
+        if self.target:
+            self.core_columns.extend([self.target, "nn_target", "nn_target_diff"])
+    @abstractmethod
+    def _build_model(self) -> None:
+        """Build the proximity model. Must set self.nn (NearestNeighbors instance)."""
+        pass
+    @abstractmethod
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features for querying. Returns feature matrix for nearest neighbor lookup."""
+        pass
+    @abstractmethod
+    def _project_2d(self) -> None:
+        """Project the data to 2D for visualization. Updates self.df with 'x' and 'y' columns."""
+        pass
     def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
         """
         Find isolated data points based on distance to nearest neighbor.
@@ -62,7 +95,19 @@ class Proximity:
         percentile = 100 - top_percent
         threshold = np.percentile(self.df["nn_distance"], percentile)
         isolated = self.df[self.df["nn_distance"] >= threshold].copy()
-        return isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
+        isolated = isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
+        return isolated if self.include_all_columns else isolated[self.core_columns]
+    def proximity_stats(self) -> pd.DataFrame:
+        """
+        Return distribution statistics for nearest neighbor distances.
+        Returns:
+            DataFrame with proximity distribution statistics (count, mean, std, percentiles)
+        """
+        return (
+            self.df["nn_distance"].describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_frame()
+        )
     def target_gradients(
         self,
@@ -90,7 +135,7 @@ class Proximity:
         if self.target is None:
             raise ValueError("Target column must be specified")
-        epsilon = 1e-5
+        epsilon = 1e-6
         # Phase 1: Quick filter using precomputed nearest neighbor
         candidates = self.df.copy()
@@ -111,13 +156,13 @@ class Proximity:
             threshold = np.percentile(candidates["gradient"], percentile)
             candidates = candidates[candidates["gradient"] >= threshold].copy()
-        # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
+        # Phase 2: Verify with K-neighbor median to filter out cases where nearest neighbor is the outlier
         results = []
         for _, row in candidates.iterrows():
             cmpd_id = row[self.id_column]
             cmpd_target = row[self.target]
-            # Get k nearest neighbors (excluding self)
+            # Get K nearest neighbors (excluding self)
             nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
             # Calculate median target of k neighbors, excluding the nearest neighbor (index 0)
@@ -146,10 +191,12 @@ class Proximity:
                 columns=[
                     self.id_column,
                     self.target,
+                    "nn_target",
+                    "nn_target_diff",
+                    "nn_distance",
+                    "gradient",
                     "neighbor_median",
                     "neighbor_median_diff",
-                    "mean_distance",
-                    "gradient",
                 ]
             )
@@ -188,8 +235,8 @@ class Proximity:
         query_df = self.df[self.df[self.id_column].isin(ids)]
         query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
-        # Transform query features
-        X_query = self.scaler.transform(query_df[self.features])
+        # Transform query features (subclass-specific)
+        X_query = self._transform_features(query_df)
         # Get neighbors
         if radius is not None:
@@ -216,20 +263,7 @@ class Proximity:
         df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
         return df_results.drop("is_self", axis=1).reset_index(drop=True)
-    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
-        """Remove non-numeric features and log warnings."""
-        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
-        if non_numeric:
-            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
-        return [f for f in features if f not in non_numeric]
-    def _build_model(self) -> None:
-        """Standardize features and fit Nearest Neighbors model."""
-        self.scaler = StandardScaler()
-        X = self.scaler.fit_transform(self.df[self.features])
-        self.nn = NearestNeighbors().fit(X)
-    def _precompute_metrics(self, n_neighbors: int = 10) -> None:
+    def _precompute_metrics(self) -> None:
         """
         Precompute landscape metrics for all compounds.
@@ -243,12 +277,9 @@ class Proximity:
         """
         log.info("Precomputing proximity metrics...")
-        # Make sure n_neighbors isn't greater than dataset size
-        n_neighbors = min(n_neighbors, len(self.df) - 1)
-        # Get nearest neighbors for all points (including self)
-        X = self.scaler.transform(self.df[self.features])
-        distances, indices = self.nn.kneighbors(X, n_neighbors=2)  # Just need nearest neighbor
+        # Get nearest neighbors for all points (n=2 because index 0 is self)
+        X = self._transform_features(self.df)
+        distances, indices = self.nn.kneighbors(X, n_neighbors=2)
         # Extract nearest neighbor (index 1, since index 0 is self)
         self.df["nn_distance"] = distances[:, 1]
@@ -285,126 +316,20 @@ class Proximity:
         result = {
             self.id_column: query_id,
             "neighbor_id": neighbor_id,
-            "distance": 0.0 if distance < 1e-5 else distance,
+            "distance": 0.0 if distance < 1e-6 else distance,
         }
         # Add target if present
         if self.target and self.target in self.df.columns:
             result[self.target] = neighbor_row[self.target]
-        # Add tracked columns
-        for col in self.track_columns:
-            if col in self.df.columns:
-                result[col] = neighbor_row[col]
         # Add prediction/probability columns if they exist
         for col in self.df.columns:
             if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
                 result[col] = neighbor_row[col]
-        return result
+        # Include all columns if requested
+        if self.include_all_columns:
+            result.update(neighbor_row.to_dict())
-# Testing the Proximity class
-if __name__ == "__main__":
-    pd.set_option("display.max_columns", None)
-    pd.set_option("display.width", 1000)
-    # Create a sample DataFrame
-    data = {
-        "ID": [1, 2, 3, 4, 5],
-        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
-        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
-        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
-    }
-    df = pd.DataFrame(data)
-    # Test the Proximity class
-    features = ["Feature1", "Feature2", "Feature3"]
-    prox = Proximity(df, id_column="ID", features=features)
-    print(prox.neighbors(1, n_neighbors=2))
-    # Test the neighbors method with radius
-    print(prox.neighbors(1, radius=2.0))
-    # Test with Features list
-    prox = Proximity(df, id_column="ID", features=["Feature1"])
-    print(prox.neighbors(1))
-    # Create a sample DataFrame
-    data = {
-        "foo_id": ["a", "b", "c", "d", "e"],  # Testing string IDs
-        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
-        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
-        "target": [1, 0, 1, 0, 5],
-    }
-    df = pd.DataFrame(data)
-    # Test with String Ids
-    prox = Proximity(
-        df,
-        id_column="foo_id",
-        features=["Feature1", "Feature2"],
-        target="target",
-        track_columns=["Feature1", "Feature2"],
-    )
-    print(prox.neighbors(["a", "b"]))
-    # Test duplicate IDs
-    data = {
-        "foo_id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
-        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
-        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
-        "target": [1, 0, 1, 0, 5],
-    }
-    df = pd.DataFrame(data)
-    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
-    print(df.equals(prox.df))
-    # Test with a categorical feature
-    from workbench.api import FeatureSet, Model
-    fs = FeatureSet("aqsol_features")
-    model = Model("aqsol-regression")
-    features = model.features()
-    df = fs.pull_dataframe()
-    prox = Proximity(
-        df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
-    )
-    print(prox.neighbors(df[fs.id_column].tolist()[:3]))
-    print("\n" + "=" * 80)
-    print("Testing isolated_compounds...")
-    print("=" * 80)
-    # Test isolated data in the top 1%
-    isolated_1pct = prox.isolated(top_percent=1.0)
-    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
-    print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
-    # Test isolated data in the top 5%
-    isolated_5pct = prox.isolated(top_percent=5.0)
-    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
-    print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
-    print("\n" + "=" * 80)
-    print("Testing target_gradients...")
-    print("=" * 80)
-    # Test with different parameters
-    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
-    print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
-    print(
-        gradients_1pct[
-            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
-        ].head(10)
-    )
-    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
-    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
-    print(
-        gradients_5pct[
-            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
-        ].head(10)
-    )
+        return result

workbench/algorithms/graph/light/proximity_graph.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Union
 import logging
 # Workbench Imports
-from workbench.algorithms.dataframe import Proximity
+from workbench.algorithms.dataframe.proximity import Proximity
 from workbench.api.graph_store import GraphStore
 # Set up logging
@@ -132,7 +132,7 @@ class ProximityGraph:
 if __name__ == "__main__":
-    from workbench.algorithms.dataframe.proximity import Proximity
+    from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
     from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity
     from workbench.web_interface.components.plugins.graph_plot import GraphPlot
     from workbench.api import DFStore
@@ -157,9 +157,9 @@ if __name__ == "__main__":
     }
     feature_df = pd.DataFrame(feature_data)
-    # Build a graph using the base Proximity class
-    print("\n--- Proximity Class ---")
-    prox = Proximity(feature_df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    # Build a graph using FeatureSpaceProximity
+    print("\n--- FeatureSpaceProximity Class ---")
+    prox = FeatureSpaceProximity(feature_df, id_column="id", features=["Feature1", "Feature2"], target="target")
     feature_graph = ProximityGraph()
     feature_graph.build_graph(prox)
     nx_graph = feature_graph.nx_graph

workbench 0.8.212__py3-none-any.whl → 0.8.217__py3-none-any.whl

workbench 0.8.212__py3-none-any.whl → 0.8.217__py3-none-any.whl