workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as possibly problematic.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +12 -11
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +14 -12
- workbench/api/feature_set.py +117 -11
- workbench/api/meta.py +0 -1
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +52 -21
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +49 -11
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +7 -7
- workbench/core/artifacts/data_capture_core.py +8 -1
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +323 -205
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +133 -101
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +60 -44
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +277 -0
- workbench/model_scripts/chemprop/chemprop.template +774 -0
- workbench/model_scripts/chemprop/generated_model_script.py +774 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +15 -12
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +277 -0
- workbench/model_scripts/xgb_model/xgb_model.template +367 -399
- workbench/repl/workbench_shell.py +18 -14
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +122 -6
- workbench/scripts/training_test.py +85 -0
- workbench/themes/dark/custom.css +59 -0
- workbench/themes/dark/plotly.json +5 -5
- workbench/themes/light/custom.css +153 -40
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +59 -0
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/mol_descriptors.py +18 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +25 -27
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +274 -87
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +159 -34
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/theme_manager.py +95 -30
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -220
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +16 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -3
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +48 -80
- workbench/web_interface/components/plugins/scatter_plot.py +192 -92
- workbench/web_interface/components/settings_menu.py +184 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
@@ -81,6 +81,8 @@ Usage:
 import logging
 from typing import Optional, Tuple
 import pandas as pd
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Mol
 from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -90,6 +92,14 @@ log = logging.getLogger("workbench")
 RDLogger.DisableLog("rdApp.warning")
 
 
+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 class MolStandardizer:
     """
     Streamlined molecular standardizer for ADMET preprocessing
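The timer() helper added above is the standard contextlib timing idiom: setup before yield, report after the block finishes. A minimal usage sketch (the wrapped standardize() call is illustrative; any block of work can be timed this way):

    with timer("standardize"):  # prints "standardize: <elapsed>s" when the block exits
        result = standardize(test_data)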
@@ -116,6 +126,7 @@ class MolStandardizer:
     Pipeline:
     1. Cleanup (remove Hs, disconnect metals, normalize)
     2. Get largest fragment (optional - only if remove_salts=True)
+    2a. Extract salt information BEFORE further modifications
     3. Neutralize charges
     4. Canonicalize tautomer (optional)
 
@@ -130,18 +141,24 @@ class MolStandardizer:
 
         try:
             # Step 1: Cleanup
-
-            if …
+            cleaned_mol = rdMolStandardize.Cleanup(mol, self.params)
+            if cleaned_mol is None:
                 return None, None
 
+            # If not doing any transformations, return early
+            if not self.remove_salts and not self.canonicalize_tautomer:
+                return cleaned_mol, None
+
             salt_smiles = None
+            mol = cleaned_mol
 
             # Step 2: Fragment handling (conditional based on remove_salts)
             if self.remove_salts:
-                # Get parent molecule
-                parent_mol = rdMolStandardize.FragmentParent(…
+                # Get parent molecule
+                parent_mol = rdMolStandardize.FragmentParent(cleaned_mol, self.params)
                 if parent_mol:
-
+                    # Extract salt BEFORE any modifications to parent
+                    salt_smiles = self._extract_salt(cleaned_mol, parent_mol)
                     mol = parent_mol
                 else:
                     return None, None
@@ -153,7 +170,7 @@ class MolStandardizer:
             if mol is None:
                 return None, salt_smiles
 
-            # Step 4: Canonicalize tautomer
+            # Step 4: Canonicalize tautomer (LAST STEP)
             if self.canonicalize_tautomer:
                 mol = self.tautomer_enumerator.Canonicalize(mol)
 
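For reference, self.tautomer_enumerator here is RDKit's rdMolStandardize.TautomerEnumerator; running canonicalization as the last step keeps salt extraction operating on untautomerized fragments. A minimal standalone sketch (the example molecule is illustrative, not from this diff):

    from rdkit import Chem
    from rdkit.Chem.MolStandardize import rdMolStandardize

    enumerator = rdMolStandardize.TautomerEnumerator()
    mol = Chem.MolFromSmiles("Oc1ccccn1")  # 2-hydroxypyridine
    canon = enumerator.Canonicalize(mol)   # canonical tautomer (typically the 2-pyridone form)
    print(Chem.MolToSmiles(canon))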
@@ -172,13 +189,22 @@ class MolStandardizer:
         - Mixtures: multiple large neutral organic fragments
 
         Args:
-            orig_mol: Original molecule (before FragmentParent)
-            parent_mol: Parent molecule (after FragmentParent)
+            orig_mol: Original molecule (after Cleanup, before FragmentParent)
+            parent_mol: Parent molecule (after FragmentParent, before tautomerization)
 
         Returns:
             SMILES string of salt components or None if no salts/mixture detected
         """
         try:
+            # Quick atom count check
+            if orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms():
+                return None
+
+            # Quick heavy atom difference check
+            heavy_diff = orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()
+            if heavy_diff <= 0:
+                return None
+
             # Get all fragments from original molecule
             orig_frags = Chem.GetMolFrags(orig_mol, asMols=True)
 
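An illustrative sketch of the parent/salt split these new early-exit checks short-circuit. Cleanup, FragmentParent, GetMolFrags, and GetNumAtoms are standard RDKit calls; the SMILES-comparison salt pick is a simplification of the module's _extract_salt logic, not a copy of it:

    from rdkit import Chem
    from rdkit.Chem.MolStandardize import rdMolStandardize

    mol = Chem.MolFromSmiles("CCO.CC")  # two-fragment mixture from the tests below
    cleaned = rdMolStandardize.Cleanup(mol)
    parent = rdMolStandardize.FragmentParent(cleaned)  # keeps the largest fragment (CCO)
    if cleaned.GetNumAtoms() != parent.GetNumAtoms():  # same quick check as above
        parent_smiles = Chem.MolToSmiles(parent)
        salts = [Chem.MolToSmiles(f) for f in Chem.GetMolFrags(cleaned, asMols=True)
                 if Chem.MolToSmiles(f) != parent_smiles]
        print(parent_smiles, salts)  # e.g. "CCO" with ["CC"] left over as the salt part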
@@ -268,7 +294,7 @@ def standardize(
     if "orig_smiles" not in result.columns:
         result["orig_smiles"] = result[smiles_column]
 
-    # Initialize standardizer
+    # Initialize standardizer
     standardizer = MolStandardizer(canonicalize_tautomer=canonicalize_tautomer, remove_salts=extract_salts)
 
     def process_smiles(smiles: str) -> pd.Series:
@@ -286,6 +312,11 @@ def standardize(
             log.error("Encountered missing or empty SMILES string")
             return pd.Series({"smiles": None, "salt": None})
 
+        # Early check for unreasonably long SMILES
+        if len(smiles) > 1000:
+            log.error(f"SMILES too long ({len(smiles)} chars): {smiles[:50]}...")
+            return pd.Series({"smiles": None, "salt": None})
+
         # Parse molecule
         mol = Chem.MolFromSmiles(smiles)
         if mol is None:
@@ -299,7 +330,9 @@ def standardize(
         if std_mol is not None:
             # Check if molecule is reasonable
             if std_mol.GetNumAtoms() == 0 or std_mol.GetNumAtoms() > 200:  # Arbitrary limits
-                log.error(f"…
+                log.error(f"Rejecting molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Original SMILES: {smiles}")
+                return pd.Series({"smiles": None, "salt": salt_smiles})
 
         if std_mol is None:
             return pd.Series(
@@ -325,8 +358,11 @@ def standardize(
 
 
 if __name__ == "__main__":
-
-
+
+    # Pandas display options for better readability
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+    pd.set_option("display.max_colwidth", 100)
 
     # Test with DataFrame including various salt forms
     test_data = pd.DataFrame(
@@ -362,67 +398,53 @@ if __name__ == "__main__":
     )
 
     # General test
+    print("Testing standardization with full dataset...")
     standardize(test_data)
 
     # Remove the last two rows to avoid errors with None and INVALID
     test_data = test_data.iloc[:-2].reset_index(drop=True)
 
     # Test WITHOUT salt removal (keeps full molecule)
-    print("\nStandardization KEEPING salts (extract_salts=False):")
-    print("This preserves the full molecule including counterions")
+    print("\nStandardization KEEPING salts (extract_salts=False) Tautomerization: True")
     result_keep = standardize(test_data, extract_salts=False, canonicalize_tautomer=True)
-
-    print(result_keep[…
+    display_order = ["compound_id", "orig_smiles", "smiles", "salt"]
+    print(result_keep[display_order])
 
     # Test WITH salt removal
     print("\n" + "=" * 70)
     print("Standardization REMOVING salts (extract_salts=True):")
-    print("This extracts parent molecule and records salt information")
     result_remove = standardize(test_data, extract_salts=True, canonicalize_tautomer=True)
-    print(result_remove[…
+    print(result_remove[display_order])
 
-    # Test …
+    # Test with problematic cases specifically
     print("\n" + "=" * 70)
-    print("…
-
-
+    print("Testing specific problematic cases:")
+    problem_cases = pd.DataFrame(
+        {
+            "smiles": [
+                "CC(=O)O.CCN",  # Should extract CC(=O)O as salt
+                "CCO.CC",  # Should return CC as salt
+            ],
+            "compound_id": ["TEST_C002", "TEST_C005"],
+        }
+    )
+
+    problem_result = standardize(problem_cases, extract_salts=True, canonicalize_tautomer=True)
+    print(problem_result[display_order])
+
+    # Performance test with larger dataset
+    from workbench.api import DataSource
 
-    # Show the difference for salt-containing molecules
-    print("\n" + "=" * 70)
-    print("Comparison showing differences:")
-    for idx, row in result_keep.iterrows():
-        keep_smiles = row["smiles"]
-        remove_smiles = result_remove.loc[idx, "smiles"]
-        no_taut_smiles = result_no_taut.loc[idx, "smiles"]
-        salt = result_remove.loc[idx, "salt"]
-
-        # Show differences when they exist
-        if keep_smiles != remove_smiles or keep_smiles != no_taut_smiles:
-            print(f"\n{row['compound_id']} ({row['orig_smiles']}):")
-            if keep_smiles != no_taut_smiles:
-                print(f" With salt + taut: {keep_smiles}")
-                print(f" With salt, no taut: {no_taut_smiles}")
-            if keep_smiles != remove_smiles:
-                print(f" Parent only + taut: {remove_smiles}")
-            if salt:
-                print(f" Extracted salt: {salt}")
-
-    # Summary statistics
     print("\n" + "=" * 70)
-    print("Summary:")
-    print(f"Total molecules: {len(result_remove)}")
-    print(f"Molecules with salts: {result_remove['salt'].notna().sum()}")
-    unique_salts = result_remove["salt"].dropna().unique()
-    print(f"Unique salts found: {unique_salts[:5].tolist()}")
 
-    # Get a real dataset from Workbench and time the standardization
     ds = DataSource("aqsol_data")
-    df = ds.pull_dataframe()[["id", "smiles"]]
-
-
-
-
-
-
-
-
+    df = ds.pull_dataframe()[["id", "smiles"]][:1000]
+
+    for tautomer in [True, False]:
+        for extract in [True, False]:
+            print(f"Performance test with AQSol dataset: tautomer={tautomer} extract_salts={extract}:")
+            start_time = time.time()
+            std_df = standardize(df, canonicalize_tautomer=tautomer, extract_salts=extract)
+            elapsed = time.time() - start_time
+            mol_per_sec = len(df) / elapsed
+            print(f"{elapsed:.2f}s ({mol_per_sec:.0f} mol/s)")
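Taken together, the revised __main__ block documents the standardize() contract this module exports: the input DataFrame needs a SMILES column ("smiles" by default), and the output carries orig_smiles, the standardized smiles, and a salt column when extract_salts=True. A hedged sketch of a minimal call (column names taken from the tests above):

    df = pd.DataFrame({"smiles": ["CC(=O)O.CCN"], "compound_id": ["C001"]})
    out = standardize(df, extract_salts=True, canonicalize_tautomer=True)
    print(out[["compound_id", "orig_smiles", "smiles", "salt"]])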
@@ -0,0 +1,194 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from typing import List, Optional
+import logging
+
+# Workbench Imports
+from workbench.algorithms.dataframe.proximity import Proximity
+from workbench.algorithms.dataframe.projection_2d import Projection2D
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class FeatureSpaceProximity(Proximity):
+    """Proximity computations for numeric feature spaces using Euclidean distance."""
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: Optional[str] = None,
+        include_all_columns: bool = False,
+    ):
+        """
+        Initialize the FeatureSpaceProximity class.
+
+        Args:
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
+        """
+        # Validate and filter features before calling parent init
+        self._raw_features = features
+        super().__init__(
+            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+
+    def _prepare_data(self) -> None:
+        """Filter out non-numeric features and drop NaN rows."""
+        # Validate features
+        self.features = self._validate_features(self.df, self._raw_features)
+
+        # Drop NaN rows for the features we're using
+        self.df = self.df.dropna(subset=self.features).copy()
+
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
+
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features using the fitted scaler."""
+        return self.scaler.transform(df[self.features])
+
+    def _project_2d(self) -> None:
+        """Project the numeric features to 2D for visualization."""
+        if len(self.features) >= 2:
+            self.df = Projection2D().fit_transform(self.df, features=self.features)
+
+
+# Testing the FeatureSpaceProximity class
+if __name__ == "__main__":
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame
+    data = {
+        "ID": [1, 2, 3, 4, 5],
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+    }
+    df = pd.DataFrame(data)
+
+    # Test the FeatureSpaceProximity class
+    features = ["Feature1", "Feature2", "Feature3"]
+    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
+
+    # Test the neighbors method with radius
+    print(prox.neighbors(1, radius=2.0))
+
+    # Test with Features list
+    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
+
+    # Create a sample DataFrame
+    data = {
+        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+
+    # Test with String Ids
+    prox = FeatureSpaceProximity(
+        df,
+        id_column="id",
+        features=["Feature1", "Feature2"],
+        target="target",
+        include_all_columns=True,
+    )
+    print(prox.neighbors(["a", "b"]))
+
+    # Test duplicate IDs
+    data = {
+        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    print(df.equals(prox.df))
+
+    # Test on real data from Workbench
+    from workbench.api import FeatureSet, Model
+
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
+    df = fs.pull_dataframe()
+    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+    print("\n" + "=" * 80)
+    print("Testing Neighbors...")
+    print("=" * 80)
+    test_id = df[fs.id_column].tolist()[0]
+    print(f"\nNeighbors for ID {test_id}:")
+    print(prox.neighbors(test_id))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct)
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct)
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
+    print(gradients_1pct)
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(gradients_5pct)
+
+    # Test proximity_stats
+    print("\n" + "=" * 80)
+    print("Testing proximity_stats...")
+    print("=" * 80)
+    stats = prox.proximity_stats()
+    print(stats)
+
+    # Plot the distance distribution using pandas
+    print("\n" + "=" * 80)
+    print("Plotting distance distribution...")
+    print("=" * 80)
+    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+
+    # Visualize the 2D projection
+    print("\n" + "=" * 80)
+    print("Visualizing 2D Projection...")
+    print("=" * 80)
+    from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
+    from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
+
+    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
+    unit_test.run()
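The new FeatureSpaceProximity class is built around a scale-then-nearest-neighbors pattern: StandardScaler keeps any single wide-range feature from dominating the Euclidean distance. A self-contained sketch of that core pattern, independent of the workbench Proximity base class (the data is illustrative):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import NearestNeighbors

    df = pd.DataFrame({"f1": [0.1, 0.2, 0.3, 0.4], "f2": [100.0, 90.0, 80.0, 70.0]})
    scaler = StandardScaler()
    X = scaler.fit_transform(df[["f1", "f2"]])
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    distances, indices = nn.kneighbors(X)  # each row's nearest neighbors (itself first)
    print(distances, indices)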
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "…
+    "include_all_columns": "{{include_all_columns}}",
 }
 
 from io import StringIO
@@ -18,7 +18,7 @@ import os
 import pandas as pd
 
 # Local Imports
-from …
+from feature_space_proximity import FeatureSpaceProximity
 
 
 # Function to check if dataframe is empty
@@ -61,7 +61,7 @@ if __name__ == "__main__":
     id_column = TEMPLATE_PARAMS["id_column"]
     features = TEMPLATE_PARAMS["features"]
    target = TEMPLATE_PARAMS["target"]  # Can be None for unsupervised models
-
+    include_all_columns = TEMPLATE_PARAMS["include_all_columns"]  # Defaults to False
 
     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -73,26 +73,24 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
     check_dataframe(all_df, "training_df")
 
-    # Create the …
-    model = …
+    # Create the FeatureSpaceProximity model
+    model = FeatureSpaceProximity(all_df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns)
 
     # Now serialize the model
     model.serialize(args.model_dir)
 
+
 # Model loading and prediction functions
 def model_fn(model_dir):
 
     # Deserialize the model
-    model = …
+    model = FeatureSpaceProximity.deserialize(model_dir)
     return model
 
 
@@ -14,7 +14,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 
@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
     # Create and train the Regression/Confidence model
     # model = BayesianRidge()
     model = BayesianRidge(
-        alpha_1=1e-6,
-
+        alpha_1=1e-6,
+        alpha_2=1e-6,  # Noise precision
+        lambda_1=1e-6,
+        lambda_2=1e-6,  # Weight precision
         fit_intercept=True,
     )
 
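For context on the now-explicit hyperparameters: in scikit-learn's BayesianRidge, alpha_1/alpha_2 parameterize the Gamma prior over the noise precision (alpha) and lambda_1/lambda_2 the prior over the weight precision (lambda); 1e-6 keeps both priors nearly uninformative. A hedged sketch on synthetic data, also showing the per-sample predictive std BayesianRidge can emit:

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 3))
    y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

    model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6, fit_intercept=True)
    model.fit(X, y)
    y_pred, y_std = model.predict(X, return_std=True)  # predictive mean and std
    print(y_pred[:3], y_std[:3])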
@@ -4,13 +4,10 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
 from scipy.optimize import minimize
+from scipy.stats import spearmanr
 
 from io import StringIO
 import json
@@ -23,7 +20,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
     "train_all_data": "{{train_all_data}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 
@@ -47,7 +44,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +87,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -172,16 +166,14 @@
     cv_residuals = np.array(cv_residuals)
     cv_uncertainties = np.array(cv_uncertainties)
 
-
     # Optimize calibration parameters: σ_cal = a * σ_uc + b
     def neg_log_likelihood(params):
         a, b = params
         sigma_cal = a * cv_uncertainties + b
         sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
-        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal…
+        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))
 
-
-    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
     cal_a, cal_b = result.x
 
     print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
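The calibration above fits (a, b) so that σ_cal = a·σ_uc + b maximizes the Gaussian likelihood of the cross-validation residuals, i.e. it minimizes Σ_i [0.5·log(2π·σ_cal_i²) + 0.5·r_i²/σ_cal_i²]. A self-contained sketch on synthetic data (the true a=2.0, b=0.05 are illustrative and should be roughly recovered):

    import numpy as np
    from scipy.optimize import minimize

    rng = np.random.default_rng(0)
    sigma_uc = rng.uniform(0.1, 1.0, size=500)           # uncalibrated ensemble spread
    residuals = rng.normal(scale=2.0 * sigma_uc + 0.05)  # noise actually scales as 2*sigma_uc + 0.05

    def neg_log_likelihood(params):
        a, b = params
        sigma_cal = np.maximum(a * sigma_uc + b, 1e-8)  # prevent division by zero
        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * residuals**2 / sigma_cal**2)

    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
    print(result.x)  # expect roughly [2.0, 0.05]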
@@ -205,7 +197,9 @@
     result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)
 
     # Compute uncalibrated uncertainty
-    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(…
+    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+        axis=1
+    )
 
     # Apply calibration to uncertainty
     result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
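A toy illustration of the aggregation above: per-member ensemble predictions live in m_* columns, the point estimate is their row-wise mean, and the uncalibrated uncertainty is their row-wise std (the values are illustrative):

    import pandas as pd

    result_df = pd.DataFrame({"m_0": [1.0, 2.0], "m_1": [1.2, 1.8], "m_2": [0.8, 2.2]})
    member_cols = [c for c in result_df.columns if c.startswith("m_")]
    result_df["prediction"] = result_df[member_cols].mean(axis=1)
    result_df["prediction_std_uc"] = result_df[member_cols].std(axis=1)
    print(result_df)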
@@ -224,11 +218,16 @@
     # Report Performance Metrics
     rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
     mae = mean_absolute_error(result_df[target], result_df["prediction"])
+    medae = median_absolute_error(result_df[target], result_df["prediction"])
     r2 = r2_score(result_df[target], result_df["prediction"])
-
-
-    print(f"…
-    print(f"…
+    spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
+    support = len(result_df)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")
 
     # Now save the models
     for name, model in models.items():
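A quick sanity check of the expanded metric set on toy data; root_mean_squared_error requires scikit-learn >= 1.4, and spearmanr comes from scipy as imported above (the arrays are illustrative):

    import numpy as np
    from scipy.stats import spearmanr
    from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error

    y_true = np.array([3.0, -0.5, 2.0, 7.0])
    y_pred = np.array([2.5, 0.0, 2.1, 7.8])
    print(f"rmse: {root_mean_squared_error(y_true, y_pred):.3f}")
    print(f"mae: {mean_absolute_error(y_true, y_pred):.3f}")
    print(f"medae: {median_absolute_error(y_true, y_pred):.3f}")
    print(f"r2: {r2_score(y_true, y_pred):.3f}")
    print(f"spearmanr: {spearmanr(y_true, y_pred).correlation:.3f}")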
@@ -352,4 +351,4 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df.reindex(sorted(df.columns), axis=1)
 
     # All done, return the DataFrame
-    return df
\ No newline at end of file
+    return df