workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench might be problematic.

Files changed (113)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +261 -235
  4. workbench/algorithms/graph/light/proximity_graph.py +10 -8
  5. workbench/api/__init__.py +2 -1
  6. workbench/api/compound.py +1 -1
  7. workbench/api/endpoint.py +11 -0
  8. workbench/api/feature_set.py +11 -8
  9. workbench/api/meta.py +5 -2
  10. workbench/api/model.py +16 -15
  11. workbench/api/monitor.py +1 -16
  12. workbench/core/artifacts/__init__.py +11 -2
  13. workbench/core/artifacts/artifact.py +11 -3
  14. workbench/core/artifacts/data_capture_core.py +355 -0
  15. workbench/core/artifacts/endpoint_core.py +256 -118
  16. workbench/core/artifacts/feature_set_core.py +265 -16
  17. workbench/core/artifacts/model_core.py +107 -60
  18. workbench/core/artifacts/monitor_core.py +33 -248
  19. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  20. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  21. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  22. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  23. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  24. workbench/core/transforms/features_to_model/features_to_model.py +42 -32
  25. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  26. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  27. workbench/core/views/training_view.py +113 -42
  28. workbench/core/views/view.py +53 -3
  29. workbench/core/views/view_utils.py +4 -4
  30. workbench/model_scripts/chemprop/chemprop.template +852 -0
  31. workbench/model_scripts/chemprop/generated_model_script.py +852 -0
  32. workbench/model_scripts/chemprop/requirements.txt +11 -0
  33. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  34. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  35. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  36. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  37. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  38. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  39. workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
  40. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  41. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  42. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  43. workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
  44. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  45. workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
  46. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  47. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  48. workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
  49. workbench/model_scripts/pytorch_model/pytorch.template +370 -187
  50. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  51. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  52. workbench/model_scripts/script_generation.py +17 -9
  53. workbench/model_scripts/uq_models/generated_model_script.py +605 -0
  54. workbench/model_scripts/uq_models/mapie.template +605 -0
  55. workbench/model_scripts/uq_models/requirements.txt +1 -0
  56. workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
  57. workbench/model_scripts/xgb_model/xgb_model.template +44 -46
  58. workbench/repl/workbench_shell.py +28 -14
  59. workbench/scripts/endpoint_test.py +162 -0
  60. workbench/scripts/lambda_test.py +73 -0
  61. workbench/scripts/ml_pipeline_batch.py +137 -0
  62. workbench/scripts/ml_pipeline_sqs.py +186 -0
  63. workbench/scripts/monitor_cloud_watch.py +20 -100
  64. workbench/utils/aws_utils.py +4 -3
  65. workbench/utils/chem_utils/__init__.py +0 -0
  66. workbench/utils/chem_utils/fingerprints.py +134 -0
  67. workbench/utils/chem_utils/misc.py +194 -0
  68. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  69. workbench/utils/chem_utils/mol_standardize.py +450 -0
  70. workbench/utils/chem_utils/mol_tagging.py +348 -0
  71. workbench/utils/chem_utils/projections.py +209 -0
  72. workbench/utils/chem_utils/salts.py +256 -0
  73. workbench/utils/chem_utils/sdf.py +292 -0
  74. workbench/utils/chem_utils/toxicity.py +250 -0
  75. workbench/utils/chem_utils/vis.py +253 -0
  76. workbench/utils/chemprop_utils.py +760 -0
  77. workbench/utils/cloudwatch_handler.py +1 -1
  78. workbench/utils/cloudwatch_utils.py +137 -0
  79. workbench/utils/config_manager.py +3 -7
  80. workbench/utils/endpoint_utils.py +5 -7
  81. workbench/utils/license_manager.py +2 -6
  82. workbench/utils/model_utils.py +95 -34
  83. workbench/utils/monitor_utils.py +44 -62
  84. workbench/utils/pandas_utils.py +3 -3
  85. workbench/utils/pytorch_utils.py +526 -0
  86. workbench/utils/shap_utils.py +10 -2
  87. workbench/utils/workbench_logging.py +0 -3
  88. workbench/utils/workbench_sqs.py +1 -1
  89. workbench/utils/xgboost_model_utils.py +371 -156
  90. workbench/web_interface/components/model_plot.py +7 -1
  91. workbench/web_interface/components/plugin_unit_test.py +5 -2
  92. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  93. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  94. workbench/web_interface/components/plugins/model_details.py +9 -7
  95. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  96. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
  97. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
  98. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
  99. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
  100. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  101. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  102. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  103. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  104. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  105. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  106. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  107. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  108. workbench/utils/chem_utils.py +0 -1556
  109. workbench/utils/execution_environment.py +0 -211
  110. workbench/utils/fast_inference.py +0 -167
  111. workbench/utils/resource_utils.py +0 -39
  112. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
  113. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -2,275 +2,307 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
-from typing import List, Dict
+from typing import List, Dict, Optional, Union
 import logging
-import pickle
-import os
-import json
-from pathlib import Path
-from enum import Enum

 # Set up logging
 log = logging.getLogger("workbench")


-# Enumerated Proximity Types (distance or similarity)
-class ProximityType(Enum):
-    DISTANCE = "distance"
-    SIMILARITY = "similarity"
-
-
 class Proximity:
     def __init__(
         self,
         df: pd.DataFrame,
         id_column: str,
         features: List[str],
-        target: str = None,
-        track_columns: List[str] = None,
-        n_neighbors: int = 10,
+        target: Optional[str] = None,
+        track_columns: Optional[List[str]] = None,
     ):
         """
         Initialize the Proximity class.

         Args:
-            df (pd.DataFrame): DataFrame containing data for neighbor computations.
-            id_column (str): Name of the column used as the identifier.
-            features (List[str]): List of feature column names to be used for neighbor computations.
-            target (str, optional): Name of the target column. Defaults to None.
-            track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
-            n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            track_columns: Additional columns to track in results. Defaults to None.
         """
-        self.df = df.dropna(subset=features).copy()
         self.id_column = id_column
-        self.n_neighbors = min(n_neighbors, len(self.df) - 1)
         self.target = target
-        self.features = features
-        self.scaler = None
-        self.X = None
-        self.nn = None
-        self.proximity_type = None
         self.track_columns = track_columns or []

-        # Right now we only support numeric features, so remove any columns that are not numeric
-        non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
-        if non_numeric_features:
-            log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
-            self.features = [f for f in self.features if f not in non_numeric_features]
+        # Filter out non-numeric features
+        self.features = self._validate_features(df, features)
+
+        # Drop NaN rows and set up DataFrame
+        self.df = df.dropna(subset=self.features).copy()
+
+        # Compute target range if target is provided
+        self.target_range = None
+        if self.target and self.target in self.df.columns:
+            self.target_range = self.df[self.target].max() - self.df[self.target].min()

         # Build the proximity model
-        self.build_proximity_model()
+        self._build_model()

-    def build_proximity_model(self) -> None:
-        """Standardize features and fit Nearest Neighbors model.
-        Note: This method can be overridden in subclasses for custom behavior."""
-        self.proximity_type = ProximityType.DISTANCE
-        self.scaler = StandardScaler()
-        self.X = self.scaler.fit_transform(self.df[self.features])
-        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(self.X)
+        # Precompute landscape metrics
+        self._precompute_metrics()

-    def all_neighbors(self) -> pd.DataFrame:
+    def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
         """
-        Compute nearest neighbors for all rows in the dataset.
+        Find isolated data points based on distance to nearest neighbor.
+
+        Args:
+            top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)

         Returns:
-            pd.DataFrame: A DataFrame of neighbors and their distances.
+            DataFrame of observations above the percentile threshold, sorted by distance (descending)
         """
-        distances, indices = self.nn.kneighbors(self.X)
-        results = []
+        percentile = 100 - top_percent
+        threshold = np.percentile(self.df["nn_distance"], percentile)
+        isolated = self.df[self.df["nn_distance"] >= threshold].copy()
+        return isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)

-        for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            query_id = self.df.iloc[i][self.id_column]
+    def target_gradients(
+        self,
+        top_percent: float = 1.0,
+        min_delta: Optional[float] = None,
+        k_neighbors: int = 4,
+        only_coincident: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Find compounds with steep target gradients (data quality issues and activity cliffs).

-            # Process neighbors
-            for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip self (neighbor index == current row index)
-                if neighbor_idx == i:
-                    continue
-                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+        Uses a two-phase approach:
+        1. Quick filter using nearest neighbor gradient
+        2. Verify using k-neighbor median to handle cases where the nearest neighbor is the outlier
+
+        Args:
+            top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
+            min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
+            k_neighbors: Number of neighbors to use for median calculation (default: 4)
+            only_coincident: If True, only consider compounds that are coincident (default: False)
+
+        Returns:
+            DataFrame of compounds with steepest gradients, sorted by gradient (descending)
+        """
+        if self.target is None:
+            raise ValueError("Target column must be specified")
+
+        epsilon = 1e-5
+
+        # Phase 1: Quick filter using precomputed nearest neighbor
+        candidates = self.df.copy()
+        candidates["gradient"] = candidates["nn_target_diff"] / (candidates["nn_distance"] + epsilon)
+
+        # Apply min_delta
+        if min_delta is None:
+            min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
+        candidates = candidates[candidates["nn_target_diff"] >= min_delta]
+
+        # Filter based on mode
+        if only_coincident:
+            # Only keep coincident points (nn_distance ~= 0)
+            candidates = candidates[candidates["nn_distance"] < epsilon].copy()
+        else:
+            # Get top X% by initial gradient
+            percentile = 100 - top_percent
+            threshold = np.percentile(candidates["gradient"], percentile)
+            candidates = candidates[candidates["gradient"] >= threshold].copy()
+
+        # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
+        results = []
+        for _, row in candidates.iterrows():
+            cmpd_id = row[self.id_column]
+            cmpd_target = row[self.target]
+
+            # Get k nearest neighbors (excluding self)
+            nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
+
+            # Calculate median target of k neighbors, excluding the nearest neighbor (index 0)
+            neighbor_median = nbrs.iloc[1:k_neighbors][self.target].median()
+            median_diff = abs(cmpd_target - neighbor_median)
+
+            # Only keep if compound differs from neighborhood median
+            # This filters out cases where the nearest neighbor is the outlier
+            if median_diff >= min_delta:
+                results.append(
+                    {
+                        self.id_column: cmpd_id,
+                        self.target: cmpd_target,
+                        "nn_target": row["nn_target"],
+                        "nn_target_diff": row["nn_target_diff"],
+                        "nn_distance": row["nn_distance"],
+                        "gradient": row["gradient"],  # Keep Phase 1 gradient
+                        "neighbor_median": neighbor_median,
+                        "neighbor_median_diff": median_diff,
+                    }
+                )

-        return pd.DataFrame(results)
+        # Handle empty results
+        if not results:
+            return pd.DataFrame(
+                columns=[
+                    self.id_column,
+                    self.target,
+                    "neighbor_median",
+                    "neighbor_median_diff",
+                    "mean_distance",
+                    "gradient",
+                ]
+            )
+
+        results_df = pd.DataFrame(results)
+        results_df = results_df.sort_values("gradient", ascending=False).reset_index(drop=True)
+        return results_df

     def neighbors(
         self,
-        query_df: pd.DataFrame,
-        radius: float = None,
+        id_or_ids: Union[str, int, List[Union[str, int]]],
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
         include_self: bool = True,
     ) -> pd.DataFrame:
         """
-        Return neighbors for rows in a query DataFrame.
+        Return neighbors for ID(s) from the existing dataset.

         Args:
-            query_df: DataFrame containing query points
+            id_or_ids: Single ID or list of IDs to look up
+            n_neighbors: Number of neighbors to return (default: 5, ignored if radius is set)
             radius: If provided, find all neighbors within this radius
-            include_self: Whether to include self in results (if present)
+            include_self: Whether to include self in results (default: True)

         Returns:
             DataFrame containing neighbors and distances
-
-        Note: The query DataFrame must include the feature columns. The id_column is optional.
         """
-        # Check if all required features are present
-        missing = set(self.features) - set(query_df.columns)
-        if missing:
-            raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")
-
-        # Check if id_column is present
-        id_column_present = self.id_column in query_df.columns
+        # Normalize to list
+        ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids

-        # None of the features can be NaNs, so report rows with NaNs and then drop them
-        rows_with_nan = query_df[self.features].isna().any(axis=1)
+        # Validate IDs exist
+        missing_ids = set(ids) - set(self.df[self.id_column])
+        if missing_ids:
+            raise ValueError(f"IDs not found in dataset: {missing_ids}")

-        # Print the ID column for rows with NaNs
-        if rows_with_nan.any():
-            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
-            log.warning(query_df.loc[rows_with_nan, self.id_column])
+        # Filter to requested IDs and preserve order
+        query_df = self.df[self.df[self.id_column].isin(ids)]
+        query_df = query_df.set_index(self.id_column).loc[ids].reset_index()

-        # Drop rows with NaNs in feature columns and reassign to query_df
-        query_df = query_df.dropna(subset=self.features)
-
-        # Transform the query features using the model's scaler
+        # Transform query features
         X_query = self.scaler.transform(query_df[self.features])

-        # Get neighbors using either radius or k-nearest neighbors
+        # Get neighbors
         if radius is not None:
             distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
         else:
-            distances, indices = self.nn.kneighbors(X_query)
+            distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)

         # Build results
-        all_results = []
+        results = []
         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            # Use the ID from the query DataFrame if available, otherwise use the row index
-            query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"
+            query_id = query_df.iloc[i][self.id_column]

             for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip if the neighbor is the query itself and include_self is False
                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+                # Skip self if requested
                 if not include_self and neighbor_id == query_id:
                     continue

-                all_results.append(
-                    self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
-                )
+                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))

-        return pd.DataFrame(all_results)
+        df_results = pd.DataFrame(results)
+        df_results["is_self"] = df_results["neighbor_id"] == df_results[self.id_column]
+        df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
+        return df_results.drop("is_self", axis=1).reset_index(drop=True)

-    def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
-        """
-        Internal: Build a result dictionary for a single neighbor.
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]

-        Args:
-            query_id: ID of the query point
-            neighbor_idx: Index of the neighbor in the original DataFrame
-            distance: Distance between query and neighbor
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)

-        Returns:
-            Dictionary containing neighbor information
+    def _precompute_metrics(self, n_neighbors: int = 10) -> None:
         """
-        neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
-
-        # Basic neighbor info
-        neighbor_info = {
-            self.id_column: query_id,
-            "neighbor_id": neighbor_id,
-            "distance": distance,
-        }
-
-        # Determine which additional columns to include
-        relevant_cols = [self.target, "prediction"] if self.target else []
-        relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
-        relevant_cols += ["outlier"]
-
-        # Add user-specified columns
-        relevant_cols += self.track_columns
+        Precompute landscape metrics for all compounds.

-        # Add values for each relevant column that exists in the dataframe
-        for col in filter(lambda c: c in self.df.columns, relevant_cols):
-            neighbor_info[col] = self.df.iloc[neighbor_idx][col]
+        Adds columns to self.df:
+        - nn_distance: Distance to nearest neighbor
+        - nn_id: ID of nearest neighbor

-        return neighbor_info
-
-    def serialize(self, directory: str) -> None:
+        If target is specified, also adds:
+        - nn_target: Target value of nearest neighbor
+        - nn_target_diff: Absolute difference from nearest neighbor target
         """
-        Serialize the Proximity model to a directory.
+        log.info("Precomputing proximity metrics...")

-        Args:
-            directory: Directory path to save the model components
-        """
-        # Create directory if it doesn't exist
-        os.makedirs(directory, exist_ok=True)
-
-        # Save metadata
-        metadata = {
-            "id_column": self.id_column,
-            "features": self.features,
-            "target": self.target,
-            "track_columns": self.track_columns,
-            "n_neighbors": self.n_neighbors,
-        }
+        # Make sure n_neighbors isn't greater than dataset size
+        n_neighbors = min(n_neighbors, len(self.df) - 1)

-        with open(os.path.join(directory, "metadata.json"), "w") as f:
-            json.dump(metadata, f)
+        # Get nearest neighbors for all points (including self)
+        X = self.scaler.transform(self.df[self.features])
+        distances, indices = self.nn.kneighbors(X, n_neighbors=2)  # Just need nearest neighbor

-        # Save the DataFrame
-        self.df.to_pickle(os.path.join(directory, "df.pkl"))
+        # Extract nearest neighbor (index 1, since index 0 is self)
+        self.df["nn_distance"] = distances[:, 1]
+        self.df["nn_id"] = self.df.iloc[indices[:, 1]][self.id_column].values

-        # Save the scaler and nearest neighbors model
-        with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
-            pickle.dump(self.scaler, f)
+        # If target exists, compute target-based metrics
+        if self.target and self.target in self.df.columns:
+            # Get target values for nearest neighbor
+            nn_target_values = self.df.iloc[indices[:, 1]][self.target].values
+            self.df["nn_target"] = nn_target_values
+            self.df["nn_target_diff"] = np.abs(self.df[self.target].values - nn_target_values)

-        with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
-            pickle.dump(self.nn, f)
+            # Precompute target range for min_delta default
+            self.target_range = self.df[self.target].max() - self.df[self.target].min()

-        log.info(f"Proximity model serialized to {directory}")
+        log.info("Proximity metrics precomputed successfully")

-    @classmethod
-    def deserialize(cls, directory: str) -> "Proximity":
+    def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
         """
-        Deserialize a Proximity model from a directory.
+        Build a result dictionary for a single neighbor.

         Args:
-            directory: Directory path containing the serialized model components
+            query_id: ID of the query point
+            neighbor_idx: Index of the neighbor in the original DataFrame
+            distance: Distance between query and neighbor

         Returns:
-            Proximity: A new Proximity instance
+            Dictionary containing neighbor information
         """
-        directory_path = Path(directory)
-        if not directory_path.exists() or not directory_path.is_dir():
-            raise ValueError(f"Directory {directory} does not exist or is not a directory")
+        neighbor_row = self.df.iloc[neighbor_idx]
+        neighbor_id = neighbor_row[self.id_column]

-        # Load metadata
-        with open(os.path.join(directory, "metadata.json"), "r") as f:
-            metadata = json.load(f)
-
-        # Load DataFrame
-        df_path = os.path.join(directory, "df.pkl")
-        if not os.path.exists(df_path):
-            raise FileNotFoundError(f"DataFrame file not found at {df_path}")
-        df = pd.read_pickle(df_path)
-
-        # Create instance but skip _prepare_data
-        instance = cls.__new__(cls)
-        instance.df = df
-        instance.id_column = metadata["id_column"]
-        instance.features = metadata["features"]
-        instance.target = metadata["target"]
-        instance.track_columns = metadata["track_columns"]
-        instance.n_neighbors = metadata["n_neighbors"]
+        # Start with basic info
+        result = {
+            self.id_column: query_id,
+            "neighbor_id": neighbor_id,
+            "distance": 0.0 if distance < 1e-5 else distance,
+        }

-        # Load scaler and nn model
-        with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
-            instance.scaler = pickle.load(f)
+        # Add target if present
+        if self.target and self.target in self.df.columns:
+            result[self.target] = neighbor_row[self.target]

-        with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
-            instance.nn = pickle.load(f)
+        # Add tracked columns
+        for col in self.track_columns:
+            if col in self.df.columns:
+                result[col] = neighbor_row[col]

-        # Load X from scaler transform
-        instance.X = instance.scaler.transform(instance.df[instance.features])
+        # Add prediction/probability columns if they exist
+        for col in self.df.columns:
+            if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
+                result[col] = neighbor_row[col]

-        log.info(f"Proximity model deserialized from {directory}")
-        return instance
+        return result


 # Testing the Proximity class
@@ -290,28 +322,15 @@ if __name__ == "__main__":

     # Test the Proximity class
     features = ["Feature1", "Feature2", "Feature3"]
-    prox = Proximity(df, id_column="ID", features=features, n_neighbors=3)
-    print(prox.all_neighbors())
-
-    # Test the neighbors method
-    print(prox.neighbors(query_df=df.iloc[[0]]))
+    prox = Proximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))

     # Test the neighbors method with radius
-    print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
-
-    # Test with data that isn't in the 'train' dataframe
-    query_data = {
-        "ID": [6],
-        "Feature1": [0.31],
-        "Feature2": [0.31],
-        "Feature3": [2.31],
-    }
-    query_df = pd.DataFrame(query_data)
-    print(prox.neighbors(query_df=query_df))
+    print(prox.neighbors(1, radius=2.0))

     # Test with Features list
-    prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
-    print(prox.all_neighbors())
+    prox = Proximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))

     # Create a sample DataFrame
     data = {
@@ -329,39 +348,8 @@ if __name__ == "__main__":
         features=["Feature1", "Feature2"],
         target="target",
         track_columns=["Feature1", "Feature2"],
-        n_neighbors=3,
     )
-    print(prox.all_neighbors())
-
-    # Test the neighbors method
-    print(prox.neighbors(query_df=df.iloc[0:2]))
-
-    # Time neighbors with all IDs versus calling all_neighbors
-    import time
-
-    start_time = time.time()
-    prox_df = prox.neighbors(query_df=df, include_self=False)
-    end_time = time.time()
-    print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
-    start_time = time.time()
-    prox_df_all = prox.all_neighbors()
-    end_time = time.time()
-    print(f"Time taken for all_neighbors: {end_time - start_time:.4f} seconds")
-
-    # Now compare the two dataframes
-    print("Neighbors DataFrame:")
-    print(prox_df)
-    print("\nAll Neighbors DataFrame:")
-    print(prox_df_all)
-    # Check for any discrepancies
-    if prox_df.equals(prox_df_all):
-        print("The two DataFrames are equal :)")
-    else:
-        print("ERROR: The two DataFrames are not equal!")
-
-    # Test querying without the id_column
-    df_no_id = df.drop(columns=["foo_id"])
-    print(prox.neighbors(query_df=df_no_id, include_self=False))
+    print(prox.neighbors(["a", "b"]))

     # Test duplicate IDs
     data = {
@@ -371,14 +359,52 @@ if __name__ == "__main__":
         "target": [1, 0, 1, 0, 5],
     }
     df = pd.DataFrame(data)
-    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target", n_neighbors=3)
+    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
     print(df.equals(prox.df))

     # Test with a categorical feature
     from workbench.api import FeatureSet, Model

-    fs = FeatureSet("abalone_features")
-    model = Model("abalone-regression")
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
     df = fs.pull_dataframe()
-    prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
-    print(prox.neighbors(query_df=df[0:2]))
+    prox = Proximity(
+        df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+    )
+    print(prox.neighbors(df[fs.id_column].tolist()[:3]))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
+    print(
+        gradients_1pct[
+            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
+        ].head(10)
+    )
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(
+        gradients_5pct[
+            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
+        ].head(10)
+    )
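
The hunks above rework the Proximity class: neighbors() now looks up IDs in the fitted dataset instead of accepting a query DataFrame, pickle-based serialize/deserialize is removed, and the new isolated() and target_gradients() helpers operate on the precomputed nn_distance / nn_target columns. A minimal usage sketch of that new API follows; the import path and the toy data (id, feat1, feat2, activity) are illustrative assumptions, not content taken from the diff.

# Sketch only: assumed import path and synthetic data for illustration
import pandas as pd
from workbench.algorithms.dataframe.proximity import Proximity  # assumed location of the class above

df = pd.DataFrame(
    {
        "id": ["a", "b", "c", "d", "e"],
        "feat1": [0.10, 0.20, 0.25, 3.00, 3.10],
        "feat2": [1.00, 1.10, 1.05, 5.00, 5.20],
        "activity": [2.0, 2.1, 9.0, 4.0, 4.1],
    }
)
prox = Proximity(df, id_column="id", features=["feat1", "feat2"], target="activity")

# Neighbors are looked up by ID (single value or list) rather than a query DataFrame
print(prox.neighbors("a", n_neighbors=3))
print(prox.neighbors(["a", "d"], radius=2.0))

# New helpers built on the precomputed nn_distance / nn_target_diff columns
print(prox.isolated(top_percent=20.0))          # most isolated points
print(prox.target_gradients(top_percent=20.0))  # steep target gradients / activity cliffs

With only numeric features and a target column supplied, the constructor standardizes the features, fits a NearestNeighbors model, and precomputes the nearest-neighbor metrics the two helpers rely on.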
@@ -14,7 +14,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
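
The hunk above only strips trailing whitespace inside match_features_case_insensitive; since the rest of that function is not shown in this diff, here is a hedged sketch of the behavior its docstring describes (exact matches take priority, then case-insensitive renames, with a ValueError for anything unmatched). The helper below is an illustration, not the shipped template code.

# Sketch only: reimplements the documented behavior, not copied from the template
import pandas as pd

def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
    """Illustrative: rename df columns to the model's feature names, case-insensitively."""
    df_columns_lower = {col.lower(): col for col in df.columns}
    rename_map = {}
    missing = []
    for feature in model_features:
        if feature in df.columns:
            continue  # exact match takes priority, no rename needed
        if feature.lower() in df_columns_lower:
            rename_map[df_columns_lower[feature.lower()]] = feature  # case-insensitive match
        else:
            missing.append(feature)
    if missing:
        raise ValueError(f"Features not found in DataFrame: {missing}")
    return df.rename(columns=rename_map)

# Example: the model expects "LogP" but the incoming CSV delivered "logp"
df = pd.DataFrame({"logp": [1.2, 3.4], "MolWt": [180.2, 250.1]})
print(match_features_case_insensitive(df, ["LogP", "MolWt"]).columns.tolist())  # ['LogP', 'MolWt']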
@@ -81,10 +81,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
     # Create and train the Regression/Confidence model
     # model = BayesianRidge()
     model = BayesianRidge(
-        alpha_1=1e-6, alpha_2=1e-6,  # Noise precision
-        lambda_1=1e-6, lambda_2=1e-6,  # Weight precision
+        alpha_1=1e-6,
+        alpha_2=1e-6,  # Noise precision
+        lambda_1=1e-6,
+        lambda_2=1e-6,  # Weight precision
         fit_intercept=True,
     )
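
The final hunk only reflows the BayesianRidge constructor in the UQ template. For context, a short sketch (standard scikit-learn API, synthetic data, not template code) of why a Bayesian ridge model suits uncertainty quantification: predict(..., return_std=True) returns a per-row standard deviation alongside the point prediction.

# Sketch only: synthetic data; the priors mirror the constructor shown in the hunk above
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

model = BayesianRidge(
    alpha_1=1e-6,
    alpha_2=1e-6,  # noise precision priors
    lambda_1=1e-6,
    lambda_2=1e-6,  # weight precision priors
    fit_intercept=True,
)
model.fit(X, y)

# return_std=True yields a per-row predictive standard deviation alongside the mean
y_mean, y_std = model.predict(X[:5], return_std=True)
print(y_mean, y_std)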