PyPI - workbench - Versions diffs - 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl - Mend

workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of workbench might be problematic. Click here for more details.

Files changed (145)

workbench/__init__.py +1 -0
workbench/algorithms/dataframe/__init__.py +1 -2
workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
workbench/algorithms/dataframe/projection_2d.py +44 -21
workbench/algorithms/dataframe/proximity.py +259 -305
workbench/algorithms/graph/light/proximity_graph.py +12 -11
workbench/algorithms/models/cleanlab_model.py +382 -0
workbench/algorithms/models/noise_model.py +388 -0
workbench/algorithms/sql/column_stats.py +0 -1
workbench/algorithms/sql/correlations.py +0 -1
workbench/algorithms/sql/descriptive_stats.py +0 -1
workbench/algorithms/sql/outliers.py +3 -3
workbench/api/__init__.py +5 -1
workbench/api/df_store.py +17 -108
workbench/api/endpoint.py +14 -12
workbench/api/feature_set.py +117 -11
workbench/api/meta.py +0 -1
workbench/api/meta_model.py +289 -0
workbench/api/model.py +52 -21
workbench/api/parameter_store.py +3 -52
workbench/cached/cached_meta.py +0 -1
workbench/cached/cached_model.py +49 -11
workbench/core/artifacts/__init__.py +11 -2
workbench/core/artifacts/artifact.py +7 -7
workbench/core/artifacts/data_capture_core.py +8 -1
workbench/core/artifacts/df_store_core.py +114 -0
workbench/core/artifacts/endpoint_core.py +323 -205
workbench/core/artifacts/feature_set_core.py +249 -45
workbench/core/artifacts/model_core.py +133 -101
workbench/core/artifacts/parameter_store_core.py +98 -0
workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
workbench/core/cloud_platform/cloud_meta.py +0 -1
workbench/core/pipelines/pipeline_executor.py +1 -1
workbench/core/transforms/features_to_model/features_to_model.py +60 -44
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
workbench/core/views/training_view.py +113 -42
workbench/core/views/view.py +53 -3
workbench/core/views/view_utils.py +4 -4
workbench/model_script_utils/model_script_utils.py +339 -0
workbench/model_script_utils/pytorch_utils.py +405 -0
workbench/model_script_utils/uq_harness.py +277 -0
workbench/model_scripts/chemprop/chemprop.template +774 -0
workbench/model_scripts/chemprop/generated_model_script.py +774 -0
workbench/model_scripts/chemprop/model_script_utils.py +339 -0
workbench/model_scripts/chemprop/requirements.txt +3 -0
workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
workbench/model_scripts/meta_model/generated_model_script.py +209 -0
workbench/model_scripts/meta_model/meta_model.template +209 -0
workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
workbench/model_scripts/pytorch_model/pytorch.template +440 -496
workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
workbench/model_scripts/pytorch_model/requirements.txt +1 -1
workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
workbench/model_scripts/script_generation.py +15 -12
workbench/model_scripts/uq_models/generated_model_script.py +248 -0
workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
workbench/model_scripts/xgb_model/uq_harness.py +277 -0
workbench/model_scripts/xgb_model/xgb_model.template +367 -399
workbench/repl/workbench_shell.py +18 -14
workbench/resources/open_source_api.key +1 -1
workbench/scripts/endpoint_test.py +162 -0
workbench/scripts/lambda_test.py +73 -0
workbench/scripts/meta_model_sim.py +35 -0
workbench/scripts/ml_pipeline_sqs.py +122 -6
workbench/scripts/training_test.py +85 -0
workbench/themes/dark/custom.css +59 -0
workbench/themes/dark/plotly.json +5 -5
workbench/themes/light/custom.css +153 -40
workbench/themes/light/plotly.json +9 -9
workbench/themes/midnight_blue/custom.css +59 -0
workbench/utils/aws_utils.py +0 -1
workbench/utils/chem_utils/fingerprints.py +87 -46
workbench/utils/chem_utils/mol_descriptors.py +18 -7
workbench/utils/chem_utils/mol_standardize.py +80 -58
workbench/utils/chem_utils/projections.py +16 -6
workbench/utils/chem_utils/vis.py +25 -27
workbench/utils/chemprop_utils.py +141 -0
workbench/utils/config_manager.py +2 -6
workbench/utils/endpoint_utils.py +5 -7
workbench/utils/license_manager.py +2 -6
workbench/utils/markdown_utils.py +57 -0
workbench/utils/meta_model_simulator.py +499 -0
workbench/utils/metrics_utils.py +256 -0
workbench/utils/model_utils.py +274 -87
workbench/utils/pipeline_utils.py +0 -1
workbench/utils/plot_utils.py +159 -34
workbench/utils/pytorch_utils.py +87 -0
workbench/utils/shap_utils.py +11 -57
workbench/utils/theme_manager.py +95 -30
workbench/utils/xgboost_local_crossfold.py +267 -0
workbench/utils/xgboost_model_utils.py +127 -220
workbench/web_interface/components/experiments/outlier_plot.py +0 -1
workbench/web_interface/components/model_plot.py +16 -2
workbench/web_interface/components/plugin_unit_test.py +5 -3
workbench/web_interface/components/plugins/ag_table.py +2 -4
workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
workbench/web_interface/components/plugins/model_details.py +48 -80
workbench/web_interface/components/plugins/scatter_plot.py +192 -92
workbench/web_interface/components/settings_menu.py +184 -0
workbench/web_interface/page_views/main_page.py +0 -1
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
workbench/model_scripts/quant_regression/quant_regression.template +0 -279
workbench/model_scripts/quant_regression/requirements.txt +0 -1
workbench/themes/quartz/base_css.url +0 -1
workbench/themes/quartz/custom.css +0 -117
workbench/themes/quartz/plotly.json +0 -642
workbench/themes/quartz_dark/base_css.url +0 -1
workbench/themes/quartz_dark/custom.css +0 -131
workbench/themes/quartz_dark/plotly.json +0 -642
workbench/utils/fast_inference.py +0 -167
workbench/utils/resource_utils.py +0 -39
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
{workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0

workbench/__init__.py CHANGED Viewed

@@ -29,6 +29,7 @@ Workbench Main Classes
      |      json_to_data.set_output_tags(["abalone", "json", "whatever"])
      |      json_to_data.transform()
 """
 import os
 from importlib.metadata import version

workbench/algorithms/dataframe/__init__.py CHANGED Viewed

@@ -5,14 +5,13 @@ These classes provide functionality for Pandas Dataframes
 - TBD: TBD
 """
-from .proximity import Proximity, ProximityType
+from .proximity import Proximity
 from .feature_space_proximity import FeatureSpaceProximity
 from .fingerprint_proximity import FingerprintProximity
 from .projection_2d import Projection2D
 __all__ = [
     "Proximity",
-    "ProximityType",
     "FeatureSpaceProximity",
     "FingerprintProximity",
     "Projection2D",

workbench/algorithms/dataframe/compound_dataset_overlap.py ADDED Viewed

@@ -0,0 +1,321 @@
+"""Compound Dataset Overlap Analysis
+This module provides utilities for comparing two molecular datasets based on
+Tanimoto similarity in fingerprint space. It helps quantify the "overlap"
+between datasets in chemical space.
+Use cases:
+    - Train/test split validation: Ensure test set isn't too similar to training
+    - Dataset comparison: Compare proprietary vs public datasets
+    - Novelty assessment: Find compounds in query dataset that are novel vs reference
+"""
+import logging
+from typing import Optional, Tuple
+import pandas as pd
+from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity
+# Set up logging
+log = logging.getLogger("workbench")
+class CompoundDatasetOverlap:
+    """Compare two molecular datasets using Tanimoto similarity.
+    Builds a FingerprintProximity model on the reference dataset, then queries
+    with SMILES from the query dataset to find the nearest neighbor in the
+    reference for each query compound. This guarantees cross-dataset matches.
+    Attributes:
+        prox: FingerprintProximity instance on reference dataset
+        overlap_df: Results DataFrame with similarity scores for each query compound
+    """
+    def __init__(
+        self,
+        df_reference: pd.DataFrame,
+        df_query: pd.DataFrame,
+        id_column_reference: str = "id",
+        id_column_query: str = "id",
+        radius: int = 2,
+        n_bits: int = 2048,
+    ) -> None:
+        """
+        Initialize the CompoundDatasetOverlap analysis.
+        Args:
+            df_reference: Reference dataset (DataFrame with SMILES)
+            df_query: Query dataset (DataFrame with SMILES)
+            id_column_reference: ID column name in df_reference
+            id_column_query: ID column name in df_query
+            radius: Morgan fingerprint radius (default: 2 = ECFP4)
+            n_bits: Number of fingerprint bits (default: 2048)
+        """
+        self.id_column_reference = id_column_reference
+        self.id_column_query = id_column_query
+        self._radius = radius
+        self._n_bits = n_bits
+        # Store copies of the dataframes
+        self.df_reference = df_reference.copy()
+        self.df_query = df_query.copy()
+        # Find SMILES columns
+        self._smiles_col_reference = self._find_smiles_column(self.df_reference)
+        self._smiles_col_query = self._find_smiles_column(self.df_query)
+        if self._smiles_col_reference is None:
+            raise ValueError("Reference dataset must have a SMILES column")
+        if self._smiles_col_query is None:
+            raise ValueError("Query dataset must have a SMILES column")
+        log.info(f"Reference dataset: {len(self.df_reference)} compounds")
+        log.info(f"Query dataset: {len(self.df_query)} compounds")
+        # Build FingerprintProximity on reference dataset only
+        self.prox = FingerprintProximity(
+            self.df_reference,
+            id_column=id_column_reference,
+            radius=radius,
+            n_bits=n_bits,
+        )
+        # Compute cross-dataset overlap
+        self.overlap_df = self._compute_cross_dataset_overlap()
+    @staticmethod
+    def _find_smiles_column(df: pd.DataFrame) -> Optional[str]:
+        """Find the SMILES column in a DataFrame (case-insensitive)."""
+        for col in df.columns:
+            if col.lower() == "smiles":
+                return col
+        return None
+    def _compute_cross_dataset_overlap(self) -> pd.DataFrame:
+        """For each query compound, find nearest neighbor in reference using neighbors_from_smiles."""
+        log.info(f"Computing nearest neighbors in reference for {len(self.df_query)} query compounds")
+        # Get SMILES list from query dataset
+        query_smiles = self.df_query[self._smiles_col_query].tolist()
+        query_ids = self.df_query[self.id_column_query].tolist()
+        # Query all compounds against reference (get only nearest neighbor)
+        neighbors_df = self.prox.neighbors_from_smiles(query_smiles, n_neighbors=1)
+        # Build results with query IDs
+        results = []
+        for i, (q_id, q_smi) in enumerate(zip(query_ids, query_smiles)):
+            # Find the row for this query SMILES
+            match = neighbors_df[neighbors_df["query_id"] == q_smi]
+            if len(match) > 0:
+                row = match.iloc[0]
+                results.append(
+                    {
+                        "id": q_id,
+                        "smiles": q_smi,
+                        "nearest_neighbor_id": row["neighbor_id"],
+                        "tanimoto_similarity": row["similarity"],
+                    }
+                )
+            else:
+                # Should not happen, but handle gracefully
+                results.append(
+                    {
+                        "id": q_id,
+                        "smiles": q_smi,
+                        "nearest_neighbor_id": None,
+                        "tanimoto_similarity": 0.0,
+                    }
+                )
+        result_df = pd.DataFrame(results)
+        # Add nearest neighbor SMILES from reference
+        ref_smiles_map = self.df_reference.set_index(self.id_column_reference)[self._smiles_col_reference]
+        result_df["nearest_neighbor_smiles"] = result_df["nearest_neighbor_id"].map(ref_smiles_map)
+        return result_df.sort_values("tanimoto_similarity", ascending=False).reset_index(drop=True)
+    def summary_stats(self) -> pd.DataFrame:
+        """Return distribution statistics for nearest-neighbor Tanimoto similarities."""
+        return (
+            self.overlap_df["tanimoto_similarity"]
+            .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
+            .to_frame()
+        )
+    def novel_compounds(self, threshold: float = 0.4) -> pd.DataFrame:
+        """Return query compounds that are novel (low similarity to reference).
+        Args:
+            threshold: Maximum Tanimoto similarity to consider "novel" (default: 0.4)
+        Returns:
+            DataFrame of query compounds with similarity below threshold
+        """
+        novel = self.overlap_df[self.overlap_df["tanimoto_similarity"] < threshold].copy()
+        return novel.sort_values("tanimoto_similarity", ascending=True).reset_index(drop=True)
+    def similar_compounds(self, threshold: float = 0.7) -> pd.DataFrame:
+        """Return query compounds that are similar to reference (high overlap).
+        Args:
+            threshold: Minimum Tanimoto similarity to consider "similar" (default: 0.7)
+        Returns:
+            DataFrame of query compounds with similarity above threshold
+        """
+        similar = self.overlap_df[self.overlap_df["tanimoto_similarity"] >= threshold].copy()
+        return similar.sort_values("tanimoto_similarity", ascending=False).reset_index(drop=True)
+    def overlap_fraction(self, threshold: float = 0.7) -> float:
+        """Return fraction of query compounds that overlap with reference above similarity threshold.
+        Args:
+            threshold: Minimum Tanimoto similarity to consider "overlapping"
+        Returns:
+            Fraction of query compounds with nearest neighbor similarity >= threshold
+        """
+        n_overlapping = (self.overlap_df["tanimoto_similarity"] >= threshold).sum()
+        return n_overlapping / len(self.overlap_df)
+    def plot_histogram(self, bins: int = 50, figsize: Tuple[int, int] = (10, 6)) -> None:
+        """Plot histogram of nearest-neighbor Tanimoto similarities.
+        Args:
+            bins: Number of histogram bins
+            figsize: Figure size (width, height)
+        """
+        import matplotlib.pyplot as plt
+        fig, ax = plt.subplots(figsize=figsize)
+        ax.hist(self.overlap_df["tanimoto_similarity"], bins=bins, edgecolor="black", alpha=0.7)
+        ax.set_xlabel("Tanimoto Similarity (query → nearest in reference)")
+        ax.set_ylabel("Count")
+        ax.set_title(f"Dataset Overlap: {len(self.overlap_df)} query compounds")
+        ax.axvline(x=0.4, color="red", linestyle="--", label="Novel threshold (0.4)")
+        ax.axvline(x=0.7, color="green", linestyle="--", label="Similar threshold (0.7)")
+        ax.legend()
+        # Add summary stats as text
+        stats = self.overlap_df["tanimoto_similarity"]
+        textstr = f"Mean: {stats.mean():.3f}\nMedian: {stats.median():.3f}\nStd: {stats.std():.3f}"
+        ax.text(
+            0.02,
+            0.98,
+            textstr,
+            transform=ax.transAxes,
+            verticalalignment="top",
+            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5),
+        )
+        plt.tight_layout()
+        plt.show()
+# =============================================================================
+# Testing
+# =============================================================================
+if __name__ == "__main__":
+    print("=" * 80)
+    print("Testing CompoundDatasetOverlap")
+    print("=" * 80)
+    # Test 1: Basic functionality with SMILES data
+    print("\n1. Testing with SMILES data...")
+    # Reference dataset: Known drug-like compounds
+    reference_data = {
+        "id": ["aspirin", "caffeine", "glucose", "ibuprofen", "naproxen", "ethanol", "methanol", "propanol"],
+        "smiles": [
+            "CC(=O)OC1=CC=CC=C1C(=O)O",  # aspirin
+            "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # caffeine
+            "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # glucose
+            "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # ibuprofen
+            "COC1=CC2=CC(C(C)C(O)=O)=CC=C2C=C1",  # naproxen
+            "CCO",  # ethanol
+            "CO",  # methanol
+            "CCCO",  # propanol
+        ],
+    }
+    # Query dataset: Compounds to compare against reference
+    query_data = {
+        "id": ["acetaminophen", "theophylline", "benzene", "toluene", "phenol", "aniline"],
+        "smiles": [
+            "CC(=O)NC1=CC=C(C=C1)O",  # acetaminophen - similar to aspirin
+            "CN1C=NC2=C1C(=O)NC(=O)N2",  # theophylline - similar to caffeine
+            "c1ccccc1",  # benzene - simple aromatic
+            "Cc1ccccc1",  # toluene - similar to benzene
+            "Oc1ccccc1",  # phenol - hydroxyl benzene
+            "Nc1ccccc1",  # aniline - amino benzene
+        ],
+    }
+    df_reference = pd.DataFrame(reference_data)
+    df_query = pd.DataFrame(query_data)
+    print(f"   Reference: {len(df_reference)} compounds, Query: {len(df_query)} compounds")
+    overlap = CompoundDatasetOverlap(
+        df_reference, df_query, id_column_reference="id", id_column_query="id", radius=2, n_bits=1024
+    )
+    print("\n   Overlap results:")
+    print(overlap.overlap_df[["id", "nearest_neighbor_id", "tanimoto_similarity"]].to_string(index=False))
+    print("\n   Summary statistics:")
+    print(overlap.summary_stats())
+    # Test 2: Novel and similar compound identification
+    print("\n2. Testing novel/similar compound identification...")
+    similar = overlap.similar_compounds(threshold=0.3)
+    print(f"   Similar compounds (sim >= 0.3): {len(similar)}")
+    if len(similar) > 0:
+        print(similar[["id", "nearest_neighbor_id", "tanimoto_similarity"]].to_string(index=False))
+    novel = overlap.novel_compounds(threshold=0.3)
+    print(f"\n   Novel compounds (sim < 0.3): {len(novel)}")
+    if len(novel) > 0:
+        print(novel[["id", "nearest_neighbor_id", "tanimoto_similarity"]].to_string(index=False))
+    # Test 3: With Workbench data (if available)
+    print("\n3. Testing with Workbench FeatureSet (if available)...")
+    try:
+        from workbench.api import FeatureSet
+        fs = FeatureSet("aqsol_features")
+        full_df = fs.pull_dataframe()[:1000]  # Limit to first 1000 for testing
+        # Split into reference and query sets
+        df_reference = full_df.sample(frac=0.8, random_state=42)
+        df_query = full_df.drop(df_reference.index)
+        print(f"   Reference set: {len(df_reference)} compounds")
+        print(f"   Query set: {len(df_query)} compounds")
+        overlap = CompoundDatasetOverlap(
+            df_reference, df_query, id_column_reference=fs.id_column, id_column_query=fs.id_column
+        )
+        print("\n   Summary statistics:")
+        print(overlap.summary_stats())
+        print(f"\n   Overlap fraction (sim >= 0.7): {overlap.overlap_fraction(0.7):.2%}")
+        print(f"   Overlap fraction (sim >= 0.5): {overlap.overlap_fraction(0.5):.2%}")
+        print(f"   Novel compounds (sim < 0.4): {len(overlap.novel_compounds(0.4))}")
+        # Uncomment to show histogram
+        overlap.plot_histogram()
+    except Exception as e:
+        print(f"   Skipping Workbench test: {e}")
+    print("\n" + "=" * 80)
+    print("✅ All CompoundDatasetOverlap tests completed!")
+    print("=" * 80)

workbench/algorithms/dataframe/feature_space_proximity.py CHANGED Viewed

@@ -1,101 +1,194 @@
 import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from typing import List, Optional
 import logging
 # Workbench Imports
 from workbench.algorithms.dataframe.proximity import Proximity
 from workbench.algorithms.dataframe.projection_2d import Projection2D
-from workbench.core.views.inference_view import InferenceView
-from workbench.api import FeatureSet, Model
 # Set up logging
 log = logging.getLogger("workbench")
 class FeatureSpaceProximity(Proximity):
-    def __init__(self, model: Model, n_neighbors: int = 10) -> None:
+    """Proximity computations for numeric feature spaces using Euclidean distance."""
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: Optional[str] = None,
+        include_all_columns: bool = False,
+    ):
         """
         Initialize the FeatureSpaceProximity class.
         Args:
-            model (Model): A Workbench model object.
-            n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
         """
-        # Grab the features and target from the model
-        features = model.features()
-        target = model.target()
-        # Grab the feature set for the model
-        fs = FeatureSet(model.get_input())
-        # If we have a "inference" view, pull the data from that view
-        view_name = f"inf_{model.name.replace('-', '_')}"
-        if view_name in fs.views():
-            self.df = fs.view(view_name).pull_dataframe()
-        # Otherwise, pull the data from the feature set and run inference
-        else:
-            inf_view = InferenceView.create(model)
-            self.df = inf_view.pull_dataframe()
-        # Call the parent class constructor
-        super().__init__(self.df, id_column=fs.id_column, features=features, target=target, n_neighbors=n_neighbors)
-        # Project the data to 2D
-        self.df = Projection2D().fit_transform(self.df, features=features)
+        # Validate and filter features before calling parent init
+        self._raw_features = features
+        super().__init__(
+            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+    def _prepare_data(self) -> None:
+        """Filter out non-numeric features and drop NaN rows."""
+        # Validate features
+        self.features = self._validate_features(self.df, self._raw_features)
+        # Drop NaN rows for the features we're using
+        self.df = self.df.dropna(subset=self.features).copy()
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features using the fitted scaler."""
+        return self.scaler.transform(df[self.features])
+    def _project_2d(self) -> None:
+        """Project the numeric features to 2D for visualization."""
+        if len(self.features) >= 2:
+            self.df = Projection2D().fit_transform(self.df, features=self.features)
+# Testing the FeatureSpaceProximity class
 if __name__ == "__main__":
     pd.set_option("display.max_columns", None)
     pd.set_option("display.width", 1000)
-    # Test a Workbench classification Model
-    m = Model("wine-classification")
-    fsp = FeatureSpaceProximity(m)
-    # Neighbors Test using a single row from FeatureSet
-    fs = FeatureSet(m.get_input())
+    # Create a sample DataFrame
+    data = {
+        "ID": [1, 2, 3, 4, 5],
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+    }
+    df = pd.DataFrame(data)
+    # Test the FeatureSpaceProximity class
+    features = ["Feature1", "Feature2", "Feature3"]
+    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
+    # Test the neighbors method with radius
+    print(prox.neighbors(1, radius=2.0))
+    # Test with Features list
+    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
+    # Create a sample DataFrame
+    data = {
+        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    # Test with String Ids
+    prox = FeatureSpaceProximity(
+        df,
+        id_column="id",
+        features=["Feature1", "Feature2"],
+        target="target",
+        include_all_columns=True,
+    )
+    print(prox.neighbors(["a", "b"]))
+    # Test duplicate IDs
+    data = {
+        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    print(df.equals(prox.df))
+    # Test on real data from Workbench
+    from workbench.api import FeatureSet, Model
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
     df = fs.pull_dataframe()
-    single_query_neighbors = fsp.neighbors(df.iloc[[0]])
-    print("\nNeighbors for Query ID:", df.iloc[0][fs.id_column])
-    print(single_query_neighbors)
-    # Test a Workbench regression model
-    m = Model("abalone-regression")
-    fsp = FeatureSpaceProximity(m)
-    # Neighbors Test using a multiple rows from FeatureSet
-    fs = FeatureSet(m.get_input())
-    df = fs.pull_dataframe()
-    query_neighbors = fsp.neighbors(df.iloc[0:2])
-    print("\nNeighbors for Query ID:", df.iloc[0][fs.id_column])
-    print(query_neighbors)
-    # Test a Workbench regression model
-    m = Model("aqsol-regression")
-    fsp = FeatureSpaceProximity(m)
-    # Neighbors Test using a multiple rows from FeatureSet
-    fs = FeatureSet(m.get_input())
-    df = fs.pull_dataframe()
-    query_neighbors = fsp.neighbors(df.iloc[5:7])
-    print("\nNeighbors for Query ID:", df.iloc[5][fs.id_column])
-    print(query_neighbors)
-    # Time the all_neighbors method
-    import time
-    start_time = time.time()
-    all_neighbors_df = fsp.all_neighbors()
-    end_time = time.time()
-    print("\nTime taken for all_neighbors:", end_time - start_time)
-    print("\nAll Neighbors DataFrame:")
-    print(all_neighbors_df)
-    # Show a scatter plot of the data
+    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+    print("\n" + "=" * 80)
+    print("Testing Neighbors...")
+    print("=" * 80)
+    test_id = df[fs.id_column].tolist()[0]
+    print(f"\nNeighbors for ID {test_id}:")
+    print(prox.neighbors(test_id))
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct)
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct)
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
+    print(gradients_1pct)
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(gradients_5pct)
+    # Test proximity_stats
+    print("\n" + "=" * 80)
+    print("Testing proximity_stats...")
+    print("=" * 80)
+    stats = prox.proximity_stats()
+    print(stats)
+    # Plot the distance distribution using pandas
+    print("\n" + "=" * 80)
+    print("Plotting distance distribution...")
+    print("=" * 80)
+    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+    # Visualize the 2D projection
+    print("\n" + "=" * 80)
+    print("Visualizing 2D Projection...")
+    print("=" * 80)
     from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
     from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
-    # Run the Unit Test on the Plugin using the new DataFrame with 'x' and 'y'
-    unit_test = PluginUnitTest(ScatterPlot, input_data=fsp.df, x="x", y="y")
+    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
     unit_test.run()

workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl

Potentially problematic release.

workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl