PyPI - workbench - Versions diffs - 0.8.213__py3-none-any.whl → 0.8.217__py3-none-any.whl - Mend

workbench 0.8.213__py3-none-any.whl → 0.8.217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
workbench/algorithms/dataframe/projection_2d.py +38 -21
workbench/algorithms/dataframe/proximity.py +75 -150
workbench/algorithms/graph/light/proximity_graph.py +5 -5
workbench/algorithms/models/cleanlab_model.py +382 -0
workbench/algorithms/models/noise_model.py +2 -2
workbench/api/__init__.py +3 -0
workbench/api/endpoint.py +10 -5
workbench/api/feature_set.py +76 -6
workbench/api/meta_model.py +289 -0
workbench/api/model.py +43 -4
workbench/core/artifacts/endpoint_core.py +63 -115
workbench/core/artifacts/feature_set_core.py +1 -1
workbench/core/artifacts/model_core.py +6 -4
workbench/core/pipelines/pipeline_executor.py +1 -1
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
workbench/model_script_utils/pytorch_utils.py +11 -1
workbench/model_scripts/chemprop/chemprop.template +145 -69
workbench/model_scripts/chemprop/generated_model_script.py +147 -71
workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
workbench/model_scripts/meta_model/generated_model_script.py +209 -0
workbench/model_scripts/meta_model/meta_model.template +209 -0
workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
workbench/model_scripts/pytorch_model/pytorch.template +42 -24
workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
workbench/model_scripts/script_generation.py +4 -0
workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
workbench/model_scripts/xgb_model/xgb_model.template +163 -152
workbench/repl/workbench_shell.py +0 -5
workbench/scripts/endpoint_test.py +2 -2
workbench/utils/chem_utils/fingerprints.py +7 -3
workbench/utils/chemprop_utils.py +23 -5
workbench/utils/meta_model_simulator.py +471 -0
workbench/utils/metrics_utils.py +94 -10
workbench/utils/model_utils.py +91 -9
workbench/utils/pytorch_utils.py +1 -1
workbench/web_interface/components/plugins/scatter_plot.py +4 -8
{workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
{workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
{workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
{workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
{workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0

workbench/algorithms/dataframe/feature_space_proximity.py CHANGED Viewed

@@ -1,101 +1,194 @@
 import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from typing import List, Optional
 import logging
 # Workbench Imports
 from workbench.algorithms.dataframe.proximity import Proximity
 from workbench.algorithms.dataframe.projection_2d import Projection2D
-from workbench.core.views.inference_view import InferenceView
-from workbench.api import FeatureSet, Model
 # Set up logging
 log = logging.getLogger("workbench")
 class FeatureSpaceProximity(Proximity):
-    def __init__(self, model: Model, n_neighbors: int = 10) -> None:
+    """Proximity computations for numeric feature spaces using Euclidean distance."""
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: Optional[str] = None,
+        include_all_columns: bool = False,
+    ):
         """
         Initialize the FeatureSpaceProximity class.
         Args:
-            model (Model): A Workbench model object.
-            n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
         """
-        # Grab the features and target from the model
-        features = model.features()
-        target = model.target()
-        # Grab the feature set for the model
-        fs = FeatureSet(model.get_input())
-        # If we have a "inference" view, pull the data from that view
-        view_name = f"inf_{model.name.replace('-', '_')}"
-        if view_name in fs.views():
-            self.df = fs.view(view_name).pull_dataframe()
-        # Otherwise, pull the data from the feature set and run inference
-        else:
-            inf_view = InferenceView.create(model)
-            self.df = inf_view.pull_dataframe()
-        # Call the parent class constructor
-        super().__init__(self.df, id_column=fs.id_column, features=features, target=target, n_neighbors=n_neighbors)
-        # Project the data to 2D
-        self.df = Projection2D().fit_transform(self.df, features=features)
+        # Keep the requested feature list; _prepare_data() validates and filters it
+        self._raw_features = features
+        super().__init__(
+            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+    def _prepare_data(self) -> None:
+        """Filter out non-numeric features and drop NaN rows."""
+        # Validate features
+        self.features = self._validate_features(self.df, self._raw_features)
+        # Drop NaN rows for the features we're using
+        self.df = self.df.dropna(subset=self.features).copy()
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features using the fitted scaler."""
+        return self.scaler.transform(df[self.features])
+    def _project_2d(self) -> None:
+        """Project the numeric features to 2D for visualization."""
+        if len(self.features) >= 2:
+            self.df = Projection2D().fit_transform(self.df, features=self.features)
+# Testing the FeatureSpaceProximity class
 if __name__ == "__main__":
     pd.set_option("display.max_columns", None)
     pd.set_option("display.width", 1000)
-    # Test a Workbench classification Model
-    m = Model("wine-classification")
-    fsp = FeatureSpaceProximity(m)
-    # Neighbors Test using a single row from FeatureSet
-    fs = FeatureSet(m.get_input())
+    # Create a sample DataFrame
+    data = {
+        "ID": [1, 2, 3, 4, 5],
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+    }
+    df = pd.DataFrame(data)
+    # Test the FeatureSpaceProximity class
+    features = ["Feature1", "Feature2", "Feature3"]
+    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
+    # Test the neighbors method with radius
+    print(prox.neighbors(1, radius=2.0))
+    # Test with a single-feature list
+    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
+    # Create a sample DataFrame
+    data = {
+        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    # Test with String Ids
+    prox = FeatureSpaceProximity(
+        df,
+        id_column="id",
+        features=["Feature1", "Feature2"],
+        target="target",
+        include_all_columns=True,
+    )
+    print(prox.neighbors(["a", "b"]))
+    # Test duplicate IDs
+    data = {
+        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    print(df.equals(prox.df))
+    # Test on real data from Workbench
+    from workbench.api import FeatureSet, Model
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
     df = fs.pull_dataframe()
-    single_query_neighbors = fsp.neighbors(df.iloc[[0]])
-    print("\nNeighbors for Query ID:", df.iloc[0][fs.id_column])
-    print(single_query_neighbors)
-    # Test a Workbench regression model
-    m = Model("abalone-regression")
-    fsp = FeatureSpaceProximity(m)
-    # Neighbors Test using a multiple rows from FeatureSet
-    fs = FeatureSet(m.get_input())
-    df = fs.pull_dataframe()
-    query_neighbors = fsp.neighbors(df.iloc[0:2])
-    print("\nNeighbors for Query ID:", df.iloc[0][fs.id_column])
-    print(query_neighbors)
-    # Test a Workbench regression model
-    m = Model("aqsol-regression")
-    fsp = FeatureSpaceProximity(m)
-    # Neighbors Test using a multiple rows from FeatureSet
-    fs = FeatureSet(m.get_input())
-    df = fs.pull_dataframe()
-    query_neighbors = fsp.neighbors(df.iloc[5:7])
-    print("\nNeighbors for Query ID:", df.iloc[5][fs.id_column])
-    print(query_neighbors)
-    # Time the all_neighbors method
-    import time
-    start_time = time.time()
-    all_neighbors_df = fsp.all_neighbors()
-    end_time = time.time()
-    print("\nTime taken for all_neighbors:", end_time - start_time)
-    print("\nAll Neighbors DataFrame:")
-    print(all_neighbors_df)
-    # Show a scatter plot of the data
+    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+    print("\n" + "=" * 80)
+    print("Testing Neighbors...")
+    print("=" * 80)
+    test_id = df[fs.id_column].tolist()[0]
+    print(f"\nNeighbors for ID {test_id}:")
+    print(prox.neighbors(test_id))
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct)
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct)
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
+    print(gradients_1pct)
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(gradients_5pct)
+    # Test proximity_stats
+    print("\n" + "=" * 80)
+    print("Testing proximity_stats...")
+    print("=" * 80)
+    stats = prox.proximity_stats()
+    print(stats)
+    # Plot the distance distribution using pandas
+    print("\n" + "=" * 80)
+    print("Plotting distance distribution...")
+    print("=" * 80)
+    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+    # Visualize the 2D projection
+    print("\n" + "=" * 80)
+    print("Visualizing 2D Projection...")
+    print("=" * 80)
     from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
     from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
-    # Run the Unit Test on the Plugin using the new DataFrame with 'x' and 'y'
-    unit_test = PluginUnitTest(ScatterPlot, input_data=fsp.df, x="x", y="y")
+    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
     unit_test.run()

workbench 0.8.213__py3-none-any.whl → 0.8.217__py3-none-any.whl

workbench 0.8.213__py3-none-any.whl → 0.8.217__py3-none-any.whl