workbench 0.8.183__py3-none-any.whl → 0.8.185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of workbench has been flagged as potentially problematic.

@@ -2,10 +2,9 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
-from typing import List, Dict
+from typing import List, Dict, Optional
 import logging
 import pickle
-import os
 import json
 from pathlib import Path
 from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
 log = logging.getLogger("workbench")


-# Enumerated Proximity Types (distance or similarity)
 class ProximityType(Enum):
     DISTANCE = "distance"
     SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
         df: pd.DataFrame,
         id_column: str,
         features: List[str],
-        target: str = None,
-        track_columns: List[str] = None,
+        target: Optional[str] = None,
+        track_columns: Optional[List[str]] = None,
         n_neighbors: int = 10,
     ):
         """
         Initialize the Proximity class.

         Args:
-            df (pd.DataFrame): DataFrame containing data for neighbor computations.
-            id_column (str): Name of the column used as the identifier.
-            features (List[str]): List of feature column names to be used for neighbor computations.
-            target (str, optional): Name of the target column. Defaults to None.
-            track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
-            n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            track_columns: Additional columns to track in results. Defaults to None.
+            n_neighbors: Number of neighbors to compute. Defaults to 10.
         """
-        self.df = df.dropna(subset=features).copy()
         self.id_column = id_column
-        self.n_neighbors = min(n_neighbors, len(self.df) - 1)
         self.target = target
-        self.features = features
+        self.track_columns = track_columns or []
+        self.proximity_type = None
         self.scaler = None
         self.X = None
         self.nn = None
-        self.proximity_type = None
-        self.track_columns = track_columns or []

-        # Right now we only support numeric features, so remove any columns that are not numeric
-        non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
-        if non_numeric_features:
-            log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
-            self.features = [f for f in self.features if f not in non_numeric_features]
+        # Filter out non-numeric features
+        self.features = self._validate_features(df, features)
+
+        # Drop NaN rows and set up DataFrame
+        self.df = df.dropna(subset=self.features).copy()
+        self.n_neighbors = min(n_neighbors, len(self.df) - 1)

         # Build the proximity model
         self.build_proximity_model()

+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
+            return [f for f in features if f not in non_numeric]
+        return features
+
     def build_proximity_model(self) -> None:
-        """Standardize features and fit Nearest Neighbors model.
-        Note: This method can be overridden in subclasses for custom behavior."""
+        """Standardize features and fit Nearest Neighbors model."""
         self.proximity_type = ProximityType.DISTANCE
         self.scaler = StandardScaler()
         self.X = self.scaler.fit_transform(self.df[self.features])
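
The constructor refactor above validates the feature list before dropping NaN rows, so rows are now only dropped for missing values in features that will actually be used. A minimal sketch of the new behavior, using toy column names (the import path is taken from elsewhere in this diff):

```python
import pandas as pd
from workbench.algorithms.dataframe.proximity import Proximity

df = pd.DataFrame({
    "ID": [1, 2, 3, 4],
    "Feature1": [0.1, 0.2, 0.3, 0.4],
    "Feature2": [1.0, 2.0, 3.0, 4.0],
    "Color": ["red", "blue", "green", "red"],  # non-numeric
})

# "Color" is filtered out by _validate_features() with a log warning;
# only the numeric features participate in the NaN drop and the model build
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2", "Color"], n_neighbors=2)
print(prox.features)  # ['Feature1', 'Feature2']
```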
@@ -74,27 +77,60 @@ class Proximity:
         Compute nearest neighbors for all rows in the dataset.

         Returns:
-            pd.DataFrame: A DataFrame of neighbors and their distances.
+            DataFrame of neighbors and their distances.
         """
         distances, indices = self.nn.kneighbors(self.X)
-        results = []

-        for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            query_id = self.df.iloc[i][self.id_column]
-
-            # Process neighbors
-            for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip self (neighbor index == current row index)
-                if neighbor_idx == i:
-                    continue
-                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+        results = [
+            self._build_neighbor_result(
+                query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+            )
+            for i, (dists, nbrs) in enumerate(zip(distances, indices))
+            for neighbor_idx, dist in zip(nbrs, dists)
+            if neighbor_idx != i  # Skip self
+        ]

         return pd.DataFrame(results)

     def neighbors(
+        self,
+        id_or_ids,
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
+        include_self: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Return neighbors for ID(s) from the existing dataset.
+
+        Args:
+            id_or_ids: Single ID or list of IDs to look up
+            n_neighbors: Number of neighbors to return (default: 5)
+            radius: If provided, find all neighbors within this radius
+            include_self: Whether to include self in results (if present)
+
+        Returns:
+            DataFrame containing neighbors and distances
+        """
+        # Normalize to list
+        ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
+
+        # Validate IDs exist
+        missing_ids = set(ids) - set(self.df[self.id_column])
+        if missing_ids:
+            raise ValueError(f"IDs not found in dataset: {missing_ids}")
+
+        # Filter to requested IDs and preserve order
+        query_df = self.df[self.df[self.id_column].isin(ids)]
+        query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
+
+        # Use the core implementation
+        return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
+
+    def find_neighbors(
         self,
         query_df: pd.DataFrame,
-        radius: float = None,
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
         include_self: bool = True,
     ) -> pd.DataFrame:
         """
@@ -102,63 +138,63 @@ class Proximity:

         Args:
             query_df: DataFrame containing query points
+            n_neighbors: Number of neighbors to return (default: 5)
             radius: If provided, find all neighbors within this radius
             include_self: Whether to include self in results (if present)

         Returns:
             DataFrame containing neighbors and distances
-
-        Note: The query DataFrame must include the feature columns. The id_column is optional.
         """
-        # Check if all required features are present
+        # Validate features
         missing = set(self.features) - set(query_df.columns)
         if missing:
             raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")

-        # Check if id_column is present
         id_column_present = self.id_column in query_df.columns

-        # None of the features can be NaNs, so report rows with NaNs and then drop them
-        rows_with_nan = query_df[self.features].isna().any(axis=1)
-
-        # Print the ID column for rows with NaNs
-        if rows_with_nan.any():
-            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
-            log.warning(query_df.loc[rows_with_nan, self.id_column])
-
-        # Drop rows with NaNs in feature columns and reassign to query_df
-        query_df = query_df.dropna(subset=self.features)
+        # Handle NaN rows
+        query_df = self._handle_nan_rows(query_df, id_column_present)

-        # Transform the query features using the model's scaler
+        # Transform query features
         X_query = self.scaler.transform(query_df[self.features])

-        # Get neighbors using either radius or k-nearest neighbors
+        # Get neighbors
         if radius is not None:
             distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
         else:
-            distances, indices = self.nn.kneighbors(X_query)
+            distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)

         # Build results
-        all_results = []
+        results = []
         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            # Use the ID from the query DataFrame if available, otherwise use the row index
             query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"

             for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip if the neighbor is the query itself and include_self is False
                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+                # Skip if neighbor is self and include_self is False
                 if not include_self and neighbor_id == query_id:
                     continue

-                all_results.append(
-                    self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
-                )
+                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+
+        results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
+        return results_df
+
+    def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
+        """Drop rows with NaN values in feature columns and log warnings."""
+        rows_with_nan = query_df[self.features].isna().any(axis=1)
+
+        if rows_with_nan.any():
+            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
+            if id_column_present:
+                log.warning(query_df.loc[rows_with_nan, self.id_column])

-        return pd.DataFrame(all_results)
+        return query_df.dropna(subset=self.features)

     def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
         """
-        Internal: Build a result dictionary for a single neighbor.
+        Build a result dictionary for a single neighbor.

         Args:
             query_id: ID of the query point
@@ -169,27 +205,30 @@ class Proximity:
             Dictionary containing neighbor information
         """
         neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+        neighbor_row = self.df.iloc[neighbor_idx]

-        # Basic neighbor info
-        neighbor_info = {
+        # Start with basic info
+        result = {
             self.id_column: query_id,
             "neighbor_id": neighbor_id,
             "distance": distance,
         }

-        # Determine which additional columns to include
-        relevant_cols = [self.target, "prediction"] if self.target else []
-        relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
-        relevant_cols += ["outlier"]
+        # Columns to automatically include if they exist
+        auto_include = (
+            ([self.target, "prediction"] if self.target else [])
+            + self.track_columns
+            + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
+        )

-        # Add user-specified columns
-        relevant_cols += self.track_columns
+        # Add values for existing columns
+        for col in auto_include:
+            if col in self.df.columns:
+                result[col] = neighbor_row[col]

-        # Add values for each relevant column that exists in the dataframe
-        for col in filter(lambda c: c in self.df.columns, relevant_cols):
-            neighbor_info[col] = self.df.iloc[neighbor_idx][col]
-
-        return neighbor_info
+        # Truncate very small distances to zero
+        result["distance"] = 0.0 if distance < 1e-7 else distance
+        return result

     def serialize(self, directory: str) -> None:
         """
@@ -198,8 +237,8 @@ class Proximity:
         Args:
             directory: Directory path to save the model components
         """
-        # Create directory if it doesn't exist
-        os.makedirs(directory, exist_ok=True)
+        dir_path = Path(directory)
+        dir_path.mkdir(parents=True, exist_ok=True)

         # Save metadata
         metadata = {
@@ -210,17 +249,16 @@ class Proximity:
             "n_neighbors": self.n_neighbors,
         }

-        with open(os.path.join(directory, "metadata.json"), "w") as f:
-            json.dump(metadata, f)
+        (dir_path / "metadata.json").write_text(json.dumps(metadata))

-        # Save the DataFrame
-        self.df.to_pickle(os.path.join(directory, "df.pkl"))
+        # Save DataFrame
+        self.df.to_pickle(dir_path / "df.pkl")

-        # Save the scaler and nearest neighbors model
-        with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
+        # Save models
+        with open(dir_path / "scaler.pkl", "wb") as f:
             pickle.dump(self.scaler, f)

-        with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
+        with open(dir_path / "nn_model.pkl", "wb") as f:
             pickle.dump(self.nn, f)

         log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@ class Proximity:
             directory: Directory path containing the serialized model components

         Returns:
-            Proximity: A new Proximity instance
+            A new Proximity instance
         """
-        directory_path = Path(directory)
-        if not directory_path.exists() or not directory_path.is_dir():
+        dir_path = Path(directory)
+        if not dir_path.is_dir():
             raise ValueError(f"Directory {directory} does not exist or is not a directory")

         # Load metadata
-        with open(os.path.join(directory, "metadata.json"), "r") as f:
-            metadata = json.load(f)
+        metadata = json.loads((dir_path / "metadata.json").read_text())

         # Load DataFrame
-        df_path = os.path.join(directory, "df.pkl")
-        if not os.path.exists(df_path):
+        df_path = dir_path / "df.pkl"
+        if not df_path.exists():
             raise FileNotFoundError(f"DataFrame file not found at {df_path}")
         df = pd.read_pickle(df_path)

-        # Create instance but skip _prepare_data
+        # Create instance without calling __init__
         instance = cls.__new__(cls)
         instance.df = df
         instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@ class Proximity:
         instance.track_columns = metadata["track_columns"]
         instance.n_neighbors = metadata["n_neighbors"]

-        # Load scaler and nn model
-        with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
+        # Load models
+        with open(dir_path / "scaler.pkl", "rb") as f:
             instance.scaler = pickle.load(f)

-        with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
+        with open(dir_path / "nn_model.pkl", "rb") as f:
             instance.nn = pickle.load(f)

-        # Load X from scaler transform
+        # Restore X
         instance.X = instance.scaler.transform(instance.df[instance.features])
+        instance.proximity_type = ProximityType.DISTANCE

         log.info(f"Proximity model deserialized from {directory}")
         return instance
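
Serialization has moved entirely to `pathlib` (hence the dropped `os` import), and deserialization now restores `proximity_type` explicitly instead of leaving it unset. A round-trip sketch, assuming the classmethod shown above is named `deserialize()` (the def line sits outside this hunk) and using an illustrative directory path:

```python
prox.serialize("/tmp/prox_model")  # writes metadata.json, df.pkl, scaler.pkl, nn_model.pkl
restored = Proximity.deserialize("/tmp/prox_model")
print(restored.neighbors(1))       # X and proximity_type are rebuilt, ready to query
```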
@@ -294,10 +332,10 @@ if __name__ == "__main__":
    print(prox.all_neighbors())

    # Test the neighbors method
-    print(prox.neighbors(query_df=df.iloc[[0]]))
+    print(prox.neighbors(1))

    # Test the neighbors method with radius
-    print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
+    print(prox.neighbors(1, radius=2.0))

    # Test with data that isn't in the 'train' dataframe
    query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
        "Feature3": [2.31],
    }
    query_df = pd.DataFrame(query_data)
-    print(prox.neighbors(query_df=query_df))
+    print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()

    # Test with Features list
    prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
    print(prox.all_neighbors())

    # Test the neighbors method
-    print(prox.neighbors(query_df=df.iloc[0:2]))
+    print(prox.neighbors(["a", "b"]))

    # Time neighbors with all IDs versus calling all_neighbors
    import time

    start_time = time.time()
-    prox_df = prox.neighbors(query_df=df, include_self=False)
+    prox_df = prox.find_neighbors(query_df=df, include_self=False)
    end_time = time.time()
    print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
    start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":

    # Test querying without the id_column
    df_no_id = df.drop(columns=["foo_id"])
-    print(prox.neighbors(query_df=df_no_id, include_self=False))
+    print(prox.find_neighbors(query_df=df_no_id, include_self=False))

    # Test duplicate IDs
    data = {
@@ -379,6 +417,9 @@ if __name__ == "__main__":

    fs = FeatureSet("abalone_features")
    model = Model("abalone-regression")
+    features = model.features()
    df = fs.pull_dataframe()
-    prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
-    print(prox.neighbors(query_df=df[0:2]))
+    prox = Proximity(
+        df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+    )
+    print(prox.find_neighbors(query_df=df[0:2]))
@@ -19,7 +19,7 @@ from typing import List, Tuple
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "target": "udm_asy_res_free_percent",
-    "features": ['vsa_estate6', 'naromatom', 'mollogp', 'fr_nh2', 'mp', 'c2sp2', 'xch_3d', 'axp_6d', 'bcut2d_mrhi', 'fr_benzene', 'mz', 'slogp_vsa6', 'fr_halogen', 'bcut2d_mwhi', 'vsa_estate4', 'slogp_vsa3', 'estate_vsa5', 'minestateindex', 'axp_3dv', 'estate_vsa3', 'vsa_estate9', 'molwt', 'hallkieralpha', 'fpdensitymorgan1', 'peoe_vsa13', 'xpc_5d', 'bcut2d_chghi', 'peoe_vsa8', 'axp_0dv', 'axp_2d', 'chi2v', 'bcut2d_logphi', 'axp_5d', 'peoe_vsa2', 'estate_vsa6', 'qed', 'numrotatablebonds', 'xc_3dv', 'peoe_vsa3', 'balabanj', 'slogp_vsa5', 'mv', 'vsa_estate2', 'bcut2d_mwlow', 'xch_7d', 'chi3n', 'vsa_estate8', 'estate_vsa4', 'xp_7dv', 'fr_nh1', 'vsa_estate3', 'fr_ketone_topliss', 'minpartialcharge', 'phi', 'peoe_vsa10', 'vsa_estate7', 'estate_vsa7', 'tpsa', 'kappa3', 'kappa2', 'bcut2d_logplow', 'xch_6d', 'maxpartialcharge', 'vsa_estate1', 'peoe_vsa9', 'axp_1d', 'fr_ar_n', 'chi2n', 'vsa_estate5', 'xp_4dv', 'slogp_vsa10', 'num_stereobonds', 'peoe_vsa11', 'bcut2d_chglo', 'chi1v', 'peoe_vsa7', 'bertzct', 'axp_2dv', 'estate_vsa2', 'smr_vsa9', 'peoe_vsa6', 'num_s_centers', 'num_r_centers', 'xch_7dv', 'xc_5d', 'axp_4dv', 'xc_5dv', 'mi', 'xc_3d', 'fpdensitymorgan2', 'xp_0dv', 'nhohcount', 'numatomstereocenters', 'mse', 'smr_vsa3', 'peoe_vsa12', 'nocount', 'fpdensitymorgan3', 'minabsestateindex', 'bcut2d_mrlow', 'axp_5dv', 'sz', 'vsa_estate10', 'axp_3d', 'xch_6dv', 'xch_4d', 'xc_6d', 'estate_vsa8', 'mpe', 'smr_vsa7', 'numhdonors', 'smr_vsa1', 'xp_5d', 'fr_para_hydroxylation', 'chi3v', 'xpc_6dv', 'nbase', 'heavyatommolwt', 'avgipc', 'maxestateindex', 'smr_vsa6', 'fr_bicyclic', 'xc_4dv', 'xp_7d', 'smr_vsa5', 'xpc_4d', 'smr_vsa4', 'peoe_vsa4', 'numheteroatoms', 'fr_nhpyrrole', 'axp_4d', 'smr_vsa10', 'xp_6d', 'sps', 'mare', 'slogp_vsa2', 'axp_0d', 'slogp_vsa4', 'fr_al_oh', 'numheterocycles', 'labuteasa', 'xp_3d', 'chi4n', 'fractioncsp3', 'maxabspartialcharge', 'fr_al_oh_notert', 'peoe_vsa1', 'axp_7dv', 'slogp_vsa11', 'peoe_vsa5', 'xpc_5dv', 'xpc_6d', 'xp_2d', 'xp_3dv', 'fr_ndealkylation1', 'axp_7d', 'estate_vsa9', 'molmr', 'num_stereocenters', 'si', 'estate_vsa1', 'xc_6dv', 'chi0v', 'fr_oxazole', 'axp_6dv', 'xp_6dv', 'xp_4d', 'numaliphaticheterocycles', 'fr_imine', 'fr_imidazole', 'xp_5dv', 'fr_piperdine', 'slogp_vsa7', 'chi1', 'c1sp2', 'numaromaticheterocycles', 'xpc_4dv', 'c3sp2', 'fr_aniline', 'fr_piperzine', 'axp_1dv', 'xch_4dv', 'chi4v', 'chi1n', 'minabspartialcharge', 'slogp_vsa1', 'fr_nh0', 'chi0n', 'c2sp3', 'xc_4d', 'xch_5dv', 'peoe_vsa14', 'xch_5d', 'numsaturatedrings', 'fr_pyridine', 'kappa1', 'slogp_vsa8', 'xp_2dv', 'fr_ar_coo', 'numvalenceelectrons'],
+    "features": ['naromatom', 'minabspartialcharge', 'bcut2d_mrhi', 'smr_vsa10', 'vsa_estate2', 'minpartialcharge', 'xpc_5d', 'sps', 'xc_3dv', 'smr_vsa7', 'bcut2d_logplow', 'mollogp', 'vsa_estate1', 'num_s_centers', 'vsa_estate4', 'peoe_vsa13', 'fr_nh2', 'bertzct', 'estate_vsa4', 'vsa_estate9', 'smr_vsa3', 'fr_nh1', 'molwt', 'estate_vsa5', 'slogp_vsa5', 'maxpartialcharge', 'estate_vsa1', 'fr_hoccn', 'xc_5d', 'nbase', 'chi1v', 'peoe_vsa10', 'tpsa', 'vsa_estate3', 'chi2v', 'estate_vsa8', 'numheteroatoms', 'estate_vsa2', 'peoe_vsa1', 'labuteasa', 'axp_4d', 'xch_7dv', 'chi0n', 'num_r_centers', 'vsa_estate8', 'minabsestateindex', 'bcut2d_chglo', 'bcut2d_mwhi', 'fr_nh0', 'chi4n', 'estate_vsa9', 'smr_vsa5', 'peoe_vsa2', 'peoe_vsa7', 'peoe_vsa9', 'kappa3', 'slogp_vsa3', 'fr_arn', 'estate_vsa3', 'avgipc', 'axp_5d', 'xpc_6d', 'c2sp2', 'peoe_vsa5', 'vsa_estate5', 'balabanj', 'maxabspartialcharge', 'fr_aniline', 'fr_piperdine', 'vsa_estate6', 'bcut2d_mwlow', 'numsaturatedheterocycles', 'vsa_estate10', 'smr_vsa1', 'estate_vsa6', 'smr_vsa6', 'fpdensitymorgan1', 'peoe_vsa3', 'peoe_vsa8', 'smr_vsa9', 'slogp_vsa2', 'nocount', 'fpdensitymorgan3', 'axp_6d', 'bcut2d_mrlow', 'bcut2d_logphi', 'axp_4dv', 'fpdensitymorgan2', 'mp', 'xp_5d', 'fr_nhpyrrole', 'mz', 'mv', 'vsa_estate7', 'axp_7dv', 'mi', 'c1sp2', 'xpc_6dv', 'slogp_vsa10', 'xp_7d', 'axp_3dv', 'peoe_vsa4', 'peoe_vsa6', 'axp_2dv', 'xch_5dv', 'qed', 'estate_vsa7', 'numaromaticrings', 'chi1n', 'axp_0d', 'axp_6dv', 'numrotatablebonds', 'hallkieralpha', 'c1sp3', 'xc_4dv', 'kappa2', 'bcut2d_chghi', 'xch_7d', 'axp_0dv', 'slogp_vsa7', 'axp_7d', 'minestateindex', 'axp_2d', 'axp_1d', 'chi0', 'fractioncsp3', 'slogp_vsa6', 'axp_1dv', 'chi2n', 'xp_6dv', 'maxestateindex', 'xpc_4d', 'numaliphaticheterocycles', 'chi1', 'phi', 'chi3n', 'xc_4d', 'xc_3d', 'peoe_vsa12', 'xp_6d', 'chi3v', 'axp_3d', 'axp_5dv', 'fr_benzene', 'slogp_vsa4', 'fr_pyridine', 'fr_aryl_methyl', 'xp_5dv', 'c3sp3', 'xp_7dv', 'slogp_vsa1', 'peoe_vsa11', 'mse', 'xc_5dv', 'xpc_5dv', 'xc_6dv', 'xp_0dv', 'xch_5d', 'c3sp2', 'numatomstereocenters', 'numhacceptors', 'fr_imidazole', 'numsaturatedrings', 'xpc_4dv', 'chi0v', 'numheterocycles', 'xch_6dv', 'estate_vsa10', 'chi4v', 'mare', 'numhdonors', 'xch_6d', 'xp_4d', 'fr_ar_n', 'numunspecifiedatomstereocenters', 'numspiroatoms', 'xch_4dv', 'fr_morpholine', 'fr_methoxy', 'mm', 'fr_piperzine'],
     "compressed_features": [],
     "train_all_data": True,
     "hyperparameters": {},
@@ -93,6 +93,33 @@ def get_custom_script_path(package: str, script_name: str) -> Path:
     return script_path


+def proximity_model_local(model: "Model", filtered: bool = True):
+    """Create a Proximity Model for this Model
+
+    Args:
+        model (Model): The model to create the proximity model from
+        filtered (bool, optional): Use filtered training data for the Proximity Model (default: True)
+
+    Returns:
+        Proximity: The proximity model
+    """
+    from workbench.algorithms.dataframe.proximity import Proximity  # noqa: F401 (avoid circular import)
+    from workbench.api import Model, FeatureSet  # noqa: F401 (avoid circular import)
+
+    # Get Feature and Target Columns from the existing given Model
+    features = model.features()
+    target = model.target()
+
+    # Create the Proximity Model from our FeatureSet
+    fs = FeatureSet(model.get_input())
+    if filtered:
+        df = fs.view("training").pull_dataframe()
+    else:
+        df = fs.pull_dataframe()
+    id_column = fs.id_column
+    return Proximity(df, id_column, features, target, track_columns=features)
+
+
 def proximity_model(model: "Model", prox_model_name: str, track_columns: list = None) -> "Model":
     """Create a proximity model based on the given model

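The new `proximity_model_local()` helper builds a `Proximity` instance straight from a model's backing FeatureSet, with the model's features doubling as `track_columns`, and never deploys anything to SageMaker. A call sketch (the model name is borrowed from the test code elsewhere in this diff):

```python
from workbench.api import Model

model = Model("abalone-regression")
prox = proximity_model_local(model)                       # filtered "training" view
prox_full = proximity_model_local(model, filtered=False)  # full FeatureSet
first_id = prox.df[prox.id_column].iloc[0]
print(prox.neighbors(first_id))
```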
@@ -386,6 +386,106 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[Dict[st
     return metrics_dict, predictions_df


+def leave_one_out_inference(workbench_model: Any) -> pd.DataFrame:
+    """
+    Performs leave-one-out cross-validation (parallelized).
+    For datasets > 1000 rows, first identifies the 1000 worst predictions via 10-fold CV,
+    then performs true leave-one-out on those samples.
+    Each model trains on ALL data except one sample.
+    """
+    from workbench.api import FeatureSet
+    from joblib import Parallel, delayed
+    from tqdm import tqdm
+
+    def train_and_predict_one(model_params, is_classifier, X, y, train_idx, val_idx):
+        """Train on train_idx, predict on val_idx."""
+        model = xgb.XGBClassifier(**model_params) if is_classifier else xgb.XGBRegressor(**model_params)
+        model.fit(X[train_idx], y[train_idx])
+        return model.predict(X[val_idx])[0]
+
+    # Load model and get params
+    model_artifact_uri = workbench_model.model_data_url()
+    loaded_model = xgboost_model_from_s3(model_artifact_uri)
+    if loaded_model is None:
+        log.error("No XGBoost model found in the artifact.")
+        return pd.DataFrame()
+
+    if isinstance(loaded_model, (xgb.XGBClassifier, xgb.XGBRegressor)):
+        is_classifier = isinstance(loaded_model, xgb.XGBClassifier)
+        model_params = loaded_model.get_params()
+    elif isinstance(loaded_model, xgb.Booster):
+        log.warning("Deprecated: Loaded model is a Booster, wrapping in sklearn model.")
+        is_classifier = workbench_model.model_type.value == "classifier"
+        model_params = {"enable_categorical": True}
+    else:
+        log.error(f"Unexpected model type: {type(loaded_model)}")
+        return pd.DataFrame()
+
+    # Load and prepare data
+    fs = FeatureSet(workbench_model.get_input())
+    df = fs.view("training").pull_dataframe()
+    id_col = fs.id_column
+    target_col = workbench_model.target()
+    feature_cols = workbench_model.features()
+
+    # Convert string features to categorical
+    for col in feature_cols:
+        if df[col].dtype in ["object", "string"]:
+            df[col] = df[col].astype("category")
+
+    # Determine which samples to run LOO on
+    if len(df) > 1000:
+        log.important(f"Dataset has {len(df)} rows. Running 10-fold CV to identify top 1000 worst predictions...")
+        _, predictions_df = cross_fold_inference(workbench_model, nfolds=10)
+        predictions_df["residual_abs"] = np.abs(predictions_df[target_col] - predictions_df["prediction"])
+        worst_samples = predictions_df.nlargest(1000, "residual_abs")
+        worst_ids = worst_samples[id_col].values
+        loo_indices = df[df[id_col].isin(worst_ids)].index.values
+        log.important(f"Running leave-one-out CV on 1000 worst samples. Each model trains on {len(df)-1} rows...")
+    else:
+        log.important(f"Running leave-one-out CV on all {len(df)} samples...")
+        loo_indices = df.index.values
+
+    # Prepare full dataset for training
+    X_full = df[feature_cols].values
+    y_full = df[target_col].values
+
+    # Encode target if classifier
+    label_encoder = LabelEncoder() if is_classifier else None
+    if label_encoder:
+        y_full = label_encoder.fit_transform(y_full)
+
+    # Generate LOO splits
+    splits = []
+    for loo_idx in loo_indices:
+        train_idx = np.delete(np.arange(len(X_full)), loo_idx)
+        val_idx = np.array([loo_idx])
+        splits.append((train_idx, val_idx))
+
+    # Parallel execution
+    predictions = Parallel(n_jobs=4)(
+        delayed(train_and_predict_one)(model_params, is_classifier, X_full, y_full, train_idx, val_idx)
+        for train_idx, val_idx in tqdm(splits, desc="LOO CV")
+    )
+
+    # Build results dataframe
+    predictions_array = np.array(predictions)
+    if label_encoder:
+        predictions_array = label_encoder.inverse_transform(predictions_array.astype(int))
+
+    predictions_df = pd.DataFrame(
+        {
+            id_col: df.loc[loo_indices, id_col].values,
+            target_col: df.loc[loo_indices, target_col].values,
+            "prediction": predictions_array,
+        }
+    )
+
+    predictions_df["residual_abs"] = np.abs(predictions_df[target_col] - predictions_df["prediction"])
+
+    return predictions_df
+
+
 if __name__ == "__main__":
     """Exercise the Model Utilities"""
     from workbench.api import Model, FeatureSet
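
Like `cross_fold_inference()`, the new `leave_one_out_inference()` takes a Workbench model and returns a per-row predictions DataFrame, here with a `residual_abs` column for ranking. A call sketch (the model name is borrowed from the test block elsewhere in this diff):

```python
from workbench.api import Model

model = Model("abalone-regression")
loo_df = leave_one_out_inference(model)
print(loo_df.sort_values("residual_abs", ascending=False).head(10))  # worst predictions first
```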
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.183
+Version: 0.8.185
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT