workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +12 -11
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +14 -12
- workbench/api/feature_set.py +117 -11
- workbench/api/meta.py +0 -1
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +52 -21
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +49 -11
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +7 -7
- workbench/core/artifacts/data_capture_core.py +8 -1
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +323 -205
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +133 -101
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +60 -44
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +277 -0
- workbench/model_scripts/chemprop/chemprop.template +774 -0
- workbench/model_scripts/chemprop/generated_model_script.py +774 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +15 -12
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +277 -0
- workbench/model_scripts/xgb_model/xgb_model.template +367 -399
- workbench/repl/workbench_shell.py +18 -14
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +122 -6
- workbench/scripts/training_test.py +85 -0
- workbench/themes/dark/custom.css +59 -0
- workbench/themes/dark/plotly.json +5 -5
- workbench/themes/light/custom.css +153 -40
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +59 -0
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/mol_descriptors.py +18 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +25 -27
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +274 -87
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +159 -34
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/theme_manager.py +95 -30
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -220
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +16 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -3
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +48 -80
- workbench/web_interface/components/plugins/scatter_plot.py +192 -92
- workbench/web_interface/components/settings_menu.py +184 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
workbench/utils/chem_utils/mol_descriptors.py (the same change also appears in workbench/model_scripts/custom_models/chem_info/mol_descriptors.py):

@@ -91,6 +91,8 @@ import logging
 import pandas as pd
 import numpy as np
 import re
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Descriptors, rdCIPLabeler
 from rdkit.ML.Descriptors import MoleculeDescriptors

@@ -101,6 +103,14 @@ logger = logging.getLogger("workbench")
 logger.setLevel(logging.DEBUG)
 
 
+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 def compute_stereochemistry_features(mol):
     """
     Compute stereochemistry descriptors using modern RDKit methods.
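The timer helper added above is a plain stdlib context manager. A minimal usage sketch (the label string and the timed block are arbitrary stand-ins):

    import time
    from contextlib import contextmanager

    @contextmanager
    def timer(name):
        start = time.time()
        yield
        print(f"{name}: {time.time() - start:.2f}s")

    # Wraps any block and prints its wall-clock duration on exit
    with timer("descriptor computation"):
        time.sleep(0.1)  # stand-in for a call like compute_descriptors(df)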
@@ -280,9 +290,11 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
             descriptor_values.append([np.nan] * len(all_descriptors))
 
     # Create RDKit features DataFrame
-    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames()
+    rdkit_features_df = pd.DataFrame(descriptor_values, columns=calc.GetDescriptorNames())
 
     # Add RDKit features to result
+    # Remove any columns from result that exist in rdkit_features_df
+    result = result.drop(columns=result.columns.intersection(rdkit_features_df.columns))
     result = pd.concat([result, rdkit_features_df], axis=1)
 
     # Compute Mordred descriptors

@@ -299,7 +311,7 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
 
         # Compute Mordred descriptors
         valid_mols = [mol if mol is not None else Chem.MolFromSmiles("C") for mol in molecules]
-        mordred_df = calc.pandas(valid_mols, nproc=1)  #
+        mordred_df = calc.pandas(valid_mols, nproc=1)  # Endpoint multiprocessing will fail with nproc>1
 
         # Replace values for invalid molecules with NaN
         for i, mol in enumerate(molecules):

@@ -310,10 +322,9 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
         for col in mordred_df.columns:
             mordred_df[col] = pd.to_numeric(mordred_df[col], errors="coerce")
 
-        # Set index to match result DataFrame
-        mordred_df.index = result.index
-
         # Add Mordred features to result
+        # Remove any columns from result that exist in mordred
+        result = result.drop(columns=result.columns.intersection(mordred_df.columns))
         result = pd.concat([result, mordred_df], axis=1)
 
         # Compute stereochemistry features if requested

@@ -326,9 +337,10 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
             stereo_features.append(stereo_dict)
 
         # Create stereochemistry DataFrame
-        stereo_df = pd.DataFrame(stereo_features
+        stereo_df = pd.DataFrame(stereo_features)
 
         # Add stereochemistry features to result
+        result = result.drop(columns=result.columns.intersection(stereo_df.columns))
         result = pd.concat([result, stereo_df], axis=1)
 
         logger.info(f"Added {len(stereo_df.columns)} stereochemistry descriptors")

@@ -357,7 +369,6 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
 
 
 if __name__ == "__main__":
-    import time
     from mol_standardize import standardize
     from workbench.api import DataSource
 
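A pattern repeated in the hunks above is dropping intersecting columns before each pd.concat. The reason: pd.concat(axis=1) keeps duplicate column names rather than overwriting them, so recomputing descriptors over an existing result would silently double up columns. A small self-contained illustration:

    import pandas as pd

    a = pd.DataFrame({"x": [1], "y": [2]})
    b = pd.DataFrame({"y": [3], "z": [4]})

    # Naive concat keeps both "y" columns
    print(pd.concat([a, b], axis=1).columns.tolist())  # ['x', 'y', 'y', 'z']

    # Dropping the intersection first keeps column names unique
    a = a.drop(columns=a.columns.intersection(b.columns))
    print(pd.concat([a, b], axis=1).columns.tolist())  # ['x', 'y', 'z']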
workbench/utils/chem_utils/mol_standardize.py (the same change also appears in workbench/model_scripts/custom_models/chem_info/mol_standardize.py):

@@ -81,6 +81,8 @@ Usage:
 import logging
 from typing import Optional, Tuple
 import pandas as pd
+import time
+from contextlib import contextmanager
 from rdkit import Chem
 from rdkit.Chem import Mol
 from rdkit.Chem.MolStandardize import rdMolStandardize

@@ -90,6 +92,14 @@ log = logging.getLogger("workbench")
 RDLogger.DisableLog("rdApp.warning")
 
 
+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
 class MolStandardizer:
     """
     Streamlined molecular standardizer for ADMET preprocessing

@@ -116,6 +126,7 @@ class MolStandardizer:
         Pipeline:
         1. Cleanup (remove Hs, disconnect metals, normalize)
         2. Get largest fragment (optional - only if remove_salts=True)
+        2a. Extract salt information BEFORE further modifications
         3. Neutralize charges
        4. Canonicalize tautomer (optional)
 

@@ -130,18 +141,24 @@ class MolStandardizer:
 
         try:
             # Step 1: Cleanup
-
-            if
+            cleaned_mol = rdMolStandardize.Cleanup(mol, self.params)
+            if cleaned_mol is None:
                 return None, None
 
+            # If not doing any transformations, return early
+            if not self.remove_salts and not self.canonicalize_tautomer:
+                return cleaned_mol, None
+
             salt_smiles = None
+            mol = cleaned_mol
 
             # Step 2: Fragment handling (conditional based on remove_salts)
             if self.remove_salts:
-                # Get parent molecule
-                parent_mol = rdMolStandardize.FragmentParent(
+                # Get parent molecule
+                parent_mol = rdMolStandardize.FragmentParent(cleaned_mol, self.params)
                 if parent_mol:
-
+                    # Extract salt BEFORE any modifications to parent
+                    salt_smiles = self._extract_salt(cleaned_mol, parent_mol)
                     mol = parent_mol
                 else:
                     return None, None

@@ -153,7 +170,7 @@ class MolStandardizer:
             if mol is None:
                 return None, salt_smiles
 
-            # Step 4: Canonicalize tautomer
+            # Step 4: Canonicalize tautomer (LAST STEP)
             if self.canonicalize_tautomer:
                 mol = self.tautomer_enumerator.Canonicalize(mol)
 

@@ -172,13 +189,22 @@ class MolStandardizer:
         - Mixtures: multiple large neutral organic fragments
 
         Args:
-            orig_mol: Original molecule (before FragmentParent)
-            parent_mol: Parent molecule (after FragmentParent)
+            orig_mol: Original molecule (after Cleanup, before FragmentParent)
+            parent_mol: Parent molecule (after FragmentParent, before tautomerization)
 
         Returns:
             SMILES string of salt components or None if no salts/mixture detected
         """
         try:
+            # Quick atom count check
+            if orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms():
+                return None
+
+            # Quick heavy atom difference check
+            heavy_diff = orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()
+            if heavy_diff <= 0:
+                return None
+
             # Get all fragments from original molecule
             orig_frags = Chem.GetMolFrags(orig_mol, asMols=True)
 
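The two new guards short-circuit the common no-salt case before any fragment comparison is done. A minimal sketch of what they test, reusing one of the salt cases that appears in the __main__ block further down:

    from rdkit import Chem

    orig = Chem.MolFromSmiles("CC(=O)O.CCN")  # two fragments after Cleanup
    parent = Chem.MolFromSmiles("CC(=O)O")    # largest fragment only

    # Identical atom counts would mean nothing was stripped -> return None early
    print(orig.GetNumAtoms() == parent.GetNumAtoms())  # False (7 vs 4 heavy atoms)

    # A positive heavy-atom difference means salt extraction should proceed
    print(orig.GetNumHeavyAtoms() - parent.GetNumHeavyAtoms())  # 3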
@@ -268,7 +294,7 @@ def standardize(
     if "orig_smiles" not in result.columns:
         result["orig_smiles"] = result[smiles_column]
 
-    # Initialize standardizer
+    # Initialize standardizer
     standardizer = MolStandardizer(canonicalize_tautomer=canonicalize_tautomer, remove_salts=extract_salts)
 
     def process_smiles(smiles: str) -> pd.Series:

@@ -286,6 +312,11 @@ def standardize(
             log.error("Encountered missing or empty SMILES string")
             return pd.Series({"smiles": None, "salt": None})
 
+        # Early check for unreasonably long SMILES
+        if len(smiles) > 1000:
+            log.error(f"SMILES too long ({len(smiles)} chars): {smiles[:50]}...")
+            return pd.Series({"smiles": None, "salt": None})
+
         # Parse molecule
         mol = Chem.MolFromSmiles(smiles)
         if mol is None:

@@ -299,7 +330,9 @@ def standardize(
         if std_mol is not None:
             # Check if molecule is reasonable
             if std_mol.GetNumAtoms() == 0 or std_mol.GetNumAtoms() > 200:  # Arbitrary limits
-                log.error(f"
+                log.error(f"Rejecting molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Original SMILES: {smiles}")
+                return pd.Series({"smiles": None, "salt": salt_smiles})
 
         if std_mol is None:
             return pd.Series(

@@ -325,8 +358,11 @@ def standardize(
 
 
 if __name__ == "__main__":
-
-
+
+    # Pandas display options for better readability
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+    pd.set_option("display.max_colwidth", 100)
 
     # Test with DataFrame including various salt forms
     test_data = pd.DataFrame(

@@ -362,67 +398,53 @@ if __name__ == "__main__":
     )
 
     # General test
+    print("Testing standardization with full dataset...")
     standardize(test_data)
 
     # Remove the last two rows to avoid errors with None and INVALID
     test_data = test_data.iloc[:-2].reset_index(drop=True)
 
     # Test WITHOUT salt removal (keeps full molecule)
-    print("\nStandardization KEEPING salts (extract_salts=False):")
-    print("This preserves the full molecule including counterions")
+    print("\nStandardization KEEPING salts (extract_salts=False) Tautomerization: True")
     result_keep = standardize(test_data, extract_salts=False, canonicalize_tautomer=True)
-
-    print(result_keep[
+    display_order = ["compound_id", "orig_smiles", "smiles", "salt"]
+    print(result_keep[display_order])
 
     # Test WITH salt removal
     print("\n" + "=" * 70)
     print("Standardization REMOVING salts (extract_salts=True):")
-    print("This extracts parent molecule and records salt information")
     result_remove = standardize(test_data, extract_salts=True, canonicalize_tautomer=True)
-    print(result_remove[
+    print(result_remove[display_order])
 
-    # Test
+    # Test with problematic cases specifically
     print("\n" + "=" * 70)
-    print("
-
-
+    print("Testing specific problematic cases:")
+    problem_cases = pd.DataFrame(
+        {
+            "smiles": [
+                "CC(=O)O.CCN",  # Should extract CC(=O)O as salt
+                "CCO.CC",  # Should return CC as salt
+            ],
+            "compound_id": ["TEST_C002", "TEST_C005"],
+        }
+    )
+
+    problem_result = standardize(problem_cases, extract_salts=True, canonicalize_tautomer=True)
+    print(problem_result[display_order])
+
+    # Performance test with larger dataset
+    from workbench.api import DataSource
 
-    # Show the difference for salt-containing molecules
-    print("\n" + "=" * 70)
-    print("Comparison showing differences:")
-    for idx, row in result_keep.iterrows():
-        keep_smiles = row["smiles"]
-        remove_smiles = result_remove.loc[idx, "smiles"]
-        no_taut_smiles = result_no_taut.loc[idx, "smiles"]
-        salt = result_remove.loc[idx, "salt"]
-
-        # Show differences when they exist
-        if keep_smiles != remove_smiles or keep_smiles != no_taut_smiles:
-            print(f"\n{row['compound_id']} ({row['orig_smiles']}):")
-            if keep_smiles != no_taut_smiles:
-                print(f"  With salt + taut: {keep_smiles}")
-                print(f"  With salt, no taut: {no_taut_smiles}")
-            if keep_smiles != remove_smiles:
-                print(f"  Parent only + taut: {remove_smiles}")
-            if salt:
-                print(f"  Extracted salt: {salt}")
-
-    # Summary statistics
     print("\n" + "=" * 70)
-    print("Summary:")
-    print(f"Total molecules: {len(result_remove)}")
-    print(f"Molecules with salts: {result_remove['salt'].notna().sum()}")
-    unique_salts = result_remove["salt"].dropna().unique()
-    print(f"Unique salts found: {unique_salts[:5].tolist()}")
 
-    # Get a real dataset from Workbench and time the standardization
     ds = DataSource("aqsol_data")
-    df = ds.pull_dataframe()[["id", "smiles"]]
-
-
-
-
-
-
-
-
+    df = ds.pull_dataframe()[["id", "smiles"]][:1000]
+
+    for tautomer in [True, False]:
+        for extract in [True, False]:
+            print(f"Performance test with AQSol dataset: tautomer={tautomer} extract_salts={extract}:")
+            start_time = time.time()
+            std_df = standardize(df, canonicalize_tautomer=tautomer, extract_salts=extract)
+            elapsed = time.time() - start_time
+            mol_per_sec = len(df) / elapsed
+            print(f"{elapsed:.2f}s ({mol_per_sec:.0f} mol/s)")
workbench/utils/chem_utils/fingerprints.py:

@@ -17,18 +17,28 @@ log = logging.getLogger("workbench")
 
 def fingerprints_to_matrix(fingerprints, dtype=np.uint8):
     """
-    Convert
+    Convert fingerprints to numpy matrix.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → matrix of 0s and 1s
+    - Count vectors: "0,3,0,1,5,..." → matrix of counts (or binary if dtype=np.bool_)
 
     Args:
-        fingerprints: pandas Series or list of
-        dtype: numpy data type (uint8 is default
+        fingerprints: pandas Series or list of fingerprints
+        dtype: numpy data type (uint8 is default; np.bool_ for Jaccard computations)
 
     Returns:
         dense numpy array of shape (n_molecules, n_bits)
     """
-
-
-
+    # Auto-detect format based on first fingerprint
+    sample = str(fingerprints.iloc[0] if hasattr(fingerprints, "iloc") else fingerprints[0])
+    if "," in sample:
+        # Count vector format: comma-separated integers
+        matrix = np.array([list(map(int, fp.split(","))) for fp in fingerprints], dtype=dtype)
+    else:
+        # Bitstring format: each character is a bit
+        matrix = np.array([list(fp) for fp in fingerprints], dtype=dtype)
+    return matrix
 
 
 def project_fingerprints(df: pd.DataFrame, projection: str = "UMAP") -> pd.DataFrame:
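A quick sketch of the auto-detection above, with the detection logic copied inline so the example stands alone (in the package it would be imported from the fingerprints module):

    import numpy as np
    import pandas as pd

    def fingerprints_to_matrix(fingerprints, dtype=np.uint8):
        # Same detection logic as the hunk above
        sample = str(fingerprints.iloc[0] if hasattr(fingerprints, "iloc") else fingerprints[0])
        if "," in sample:
            return np.array([list(map(int, fp.split(","))) for fp in fingerprints], dtype=dtype)
        return np.array([list(fp) for fp in fingerprints], dtype=dtype)

    bits = pd.Series(["1011", "0110"])          # bitstring format
    counts = pd.Series(["0,3,0,1", "2,0,0,5"])  # count-vector format

    print(fingerprints_to_matrix(bits))                    # [[1 0 1 1] [0 1 1 0]]
    print(fingerprints_to_matrix(counts))                  # [[0 3 0 1] [2 0 0 5]]
    print(fingerprints_to_matrix(counts, dtype=np.bool_))  # counts collapse to True/False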
workbench/utils/chem_utils/vis.py:

@@ -2,34 +2,18 @@
 
 import logging
 import base64
-import re
 from typing import Optional, Tuple
 from rdkit import Chem
 from rdkit.Chem import AllChem, Draw
 from rdkit.Chem.Draw import rdMolDraw2D
 
+# Workbench Imports
+from workbench.utils.color_utils import is_dark
+
 # Set up the logger
 log = logging.getLogger("workbench")
 
 
-def _is_dark(color: str) -> bool:
-    """Determine if an rgba color is dark based on RGB average.
-
-    Args:
-        color: Color in rgba(...) format
-
-    Returns:
-        True if the color is dark, False otherwise
-    """
-    match = re.match(r"rgba?\((\d+),\s*(\d+),\s*(\d+)", color)
-    if not match:
-        log.warning(f"Invalid color format: {color}, defaulting to dark")
-        return True  # Default to dark mode on error
-
-    r, g, b = map(int, match.groups())
-    return (r + g + b) / 3 < 128
-
-
 def _rgba_to_tuple(rgba: str) -> Tuple[float, float, float, float]:
     """Convert rgba string to normalized tuple (R, G, B, A).
 
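The brightness check now lives in workbench.utils.color_utils. Judging from the new except-clause below and the updated tests further down, the behavioral change is that an unparseable color now raises ValueError instead of silently returning True; _configure_draw_options catches that and falls back to dark mode. The expected contract, as a sketch:

    from workbench.utils.color_utils import is_dark

    print(is_dark("rgba(0, 0, 0, 1)"))        # True
    print(is_dark("rgba(255, 255, 255, 1)"))  # False
    is_dark("invalid_color")                  # raises ValueError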
@@ -75,7 +59,13 @@ def _configure_draw_options(options: Draw.MolDrawOptions, background: str) -> No
         options: RDKit drawing options object
         background: Background color string
     """
-
+    try:
+        if is_dark(background):
+            rdMolDraw2D.SetDarkMode(options)
+        # Light backgrounds use RDKit defaults (no action needed)
+    except ValueError:
+        # Default to dark mode if color format is invalid
+        log.warning(f"Invalid color format: {background}, defaulting to dark mode")
         rdMolDraw2D.SetDarkMode(options)
     options.setBackgroundColour(_rgba_to_tuple(background))
 

@@ -137,7 +127,7 @@ def svg_from_smiles(
     drawer.DrawMolecule(mol)
     drawer.FinishDrawing()
 
-    # Encode SVG
+    # Encode SVG as base64 data URI
     svg = drawer.GetDrawingText()
     encoded_svg = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
     return f"data:image/svg+xml;base64,{encoded_svg}"

@@ -222,7 +212,7 @@ if __name__ == "__main__":
     # Test 6: Color parsing functions
     print("\n6. Testing color utility functions...")
     test_colors = [
-        ("invalid_color",
+        ("invalid_color", None, (0.25, 0.25, 0.25, 1.0)),  # Should raise ValueError
         ("rgba(255, 255, 255, 1)", False, (1.0, 1.0, 1.0, 1.0)),
         ("rgba(0, 0, 0, 1)", True, (0.0, 0.0, 0.0, 1.0)),
         ("rgba(64, 64, 64, 0.5)", True, (0.251, 0.251, 0.251, 0.5)),

@@ -230,12 +220,20 @@ if __name__ == "__main__":
     ]
 
     for color, expected_dark, expected_tuple in test_colors:
-
-
-
-
-
+        try:
+            is_dark_result = is_dark(color)
+            if expected_dark is None:
+                print(f"  ✗ is_dark('{color[:20]}...'): Expected ValueError but got {is_dark_result}")
+            else:
+                dark_status = "✓" if is_dark_result == expected_dark else "✗"
+                print(f"  {dark_status} is_dark('{color[:20]}...'): {is_dark_result} == {expected_dark}")
+        except ValueError:
+            if expected_dark is None:
+                print(f"  ✓ is_dark('{color[:20]}...'): Correctly raised ValueError")
+            else:
+                print(f"  ✗ is_dark('{color[:20]}...'): Unexpected ValueError")
 
+        tuple_result = _rgba_to_tuple(color)
         # Check tuple values with tolerance for floating point
         tuple_match = all(abs(a - b) < 0.01 for a, b in zip(tuple_result, expected_tuple))
         tuple_status = "✓" if tuple_match else "✗"
workbench/utils/chemprop_utils.py (new file):

@@ -0,0 +1,141 @@
+"""ChemProp utilities for Workbench models."""
+
+import logging
+import os
+from typing import Any, Tuple
+
+import pandas as pd
+
+from workbench.utils.aws_utils import pull_s3_data
+from workbench.utils.metrics_utils import compute_metrics_from_predictions
+from workbench.utils.model_utils import safe_extract_tarfile
+
+log = logging.getLogger("workbench")
+
+
+def download_and_extract_model(s3_uri: str, model_dir: str) -> None:
+    """Download model artifact from S3 and extract it.
+
+    Args:
+        s3_uri: S3 URI to the model artifact (model.tar.gz)
+        model_dir: Directory to extract model artifacts to
+    """
+    import awswrangler as wr
+
+    log.info(f"Downloading model from {s3_uri}...")
+
+    # Download to temp file
+    local_tar_path = os.path.join(model_dir, "model.tar.gz")
+    wr.s3.download(path=s3_uri, local_file=local_tar_path)
+
+    # Extract using safe extraction
+    log.info(f"Extracting to {model_dir}...")
+    safe_extract_tarfile(local_tar_path, model_dir)
+
+    # Cleanup tar file
+    os.unlink(local_tar_path)
+
+
+def load_chemprop_model_artifacts(model_dir: str) -> Tuple[Any, dict]:
+    """Load ChemProp MPNN model and artifacts from an extracted model directory.
+
+    Args:
+        model_dir: Directory containing extracted model artifacts
+
+    Returns:
+        Tuple of (MPNN model, artifacts_dict).
+        artifacts_dict contains 'label_encoder' and 'feature_metadata' if present.
+    """
+    import joblib
+    from chemprop import models
+
+    model_path = os.path.join(model_dir, "chemprop_model.pt")
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"No chemprop_model.pt found in {model_dir}")
+
+    model = models.MPNN.load_from_file(model_path)
+    model.eval()
+
+    # Load additional artifacts
+    artifacts = {}
+
+    label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(label_encoder_path):
+        artifacts["label_encoder"] = joblib.load(label_encoder_path)
+
+    feature_metadata_path = os.path.join(model_dir, "feature_metadata.joblib")
+    if os.path.exists(feature_metadata_path):
+        artifacts["feature_metadata"] = joblib.load(feature_metadata_path)
+
+    return model, artifacts
+
+
+def pull_cv_results(workbench_model: Any) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Pull cross-validation results from AWS training artifacts.
+
+    This retrieves the validation predictions saved during model training and
+    computes metrics directly from them.
+
+    Note:
+        - Regression: Supports both single-target and multi-target models
+        - Classification: Only single-target is supported (with any number of classes)
+
+    Args:
+        workbench_model: Workbench model object
+
+    Returns:
+        Tuple of:
+        - DataFrame with computed metrics
+        - DataFrame with validation predictions
+    """
+
+    # Get the validation predictions from S3
+    s3_path = f"{workbench_model.model_training_path}/validation_predictions.csv"
+    predictions_df = pull_s3_data(s3_path)
+
+    if predictions_df is None:
+        raise ValueError(f"No validation predictions found at {s3_path}")
+
+    log.info(f"Pulled {len(predictions_df)} validation predictions from {s3_path}")
+
+    # Get target and class labels
+    target = workbench_model.target()
+    class_labels = workbench_model.class_labels()
+
+    # If single target just use the "prediction" column
+    if isinstance(target, str):
+        metrics_df = compute_metrics_from_predictions(predictions_df, target, class_labels)
+        return metrics_df, predictions_df
+
+    # Multi-target regression
+    metrics_list = []
+    for t in target:
+        # Prediction will be {target}_pred in multi-target case
+        pred_col = f"{t}_pred"
+
+        # Drop NaNs for this target
+        target_preds_df = predictions_df.dropna(subset=[t, pred_col])
+        metrics_df = compute_metrics_from_predictions(target_preds_df, t, class_labels, prediction_col=pred_col)
+        metrics_df.insert(0, "target", t)
+        metrics_list.append(metrics_df)
+    metrics_df = pd.concat(metrics_list, ignore_index=True) if metrics_list else pd.DataFrame()
+
+    return metrics_df, predictions_df
+
+
+if __name__ == "__main__":
+
+    # Tests for the ChemProp utilities
+    from workbench.api import Model
+
+    # Initialize Workbench model
+    model_name = "open-admet-chemprop-mt"
+    print(f"Loading Workbench model: {model_name}")
+    model = Model(model_name)
+    print(f"Model Framework: {model.model_framework}")
+
+    # Pull CV results
+    metrics_df, predictions_df = pull_cv_results(model)
+    print("\nTraining Metrics:")
+    print(metrics_df.to_string(index=False))
+    print(f"\nSample Predictions:\n{predictions_df.head().to_string(index=False)}")
workbench/utils/config_manager.py:

@@ -4,16 +4,13 @@ import os
 import sys
 import platform
 import logging
-import importlib.resources as resources  # noqa: F401 Python 3.9 compatibility
 from typing import Any, Dict
+from importlib.resources import files, as_file
 
 # Workbench imports
 from workbench.utils.license_manager import LicenseManager
 from workbench_bridges.utils.execution_environment import running_as_service
 
-# Python 3.9 compatibility
-from workbench.utils.resource_utils import get_resource_path
-
 
 class FatalConfigError(Exception):
     """Exception raised for errors in the configuration."""

@@ -172,8 +169,7 @@ class ConfigManager:
         Returns:
             str: The open source API key.
         """
-
-        with get_resource_path("workbench.resources", "open_source_api.key") as open_source_key_path:
+        with as_file(files("workbench.resources").joinpath("open_source_api.key")) as open_source_key_path:
            with open(open_source_key_path, "r") as key_file:
                return key_file.read().strip()
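The replacement drops the resource_utils compatibility shim in favor of the stdlib importlib.resources API (files/as_file, both available since Python 3.9). The general pattern, shown with the resource used above:

    from importlib.resources import files, as_file

    # files() returns a Traversable; as_file() materializes it as a real
    # filesystem path, even when the package is installed as a zip
    resource = files("workbench.resources").joinpath("open_source_api.key")
    with as_file(resource) as path:
        key = path.read_text().strip()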
workbench/utils/endpoint_utils.py:

@@ -7,9 +7,7 @@ from typing import Union, Optional
 import pandas as pd
 
 # Workbench Imports
-from workbench.api
-from workbench.api.model import Model
-from workbench.api.endpoint import Endpoint
+from workbench.api import FeatureSet, Model, Endpoint
 
 # Set up the log
 log = logging.getLogger("workbench")

@@ -77,7 +75,7 @@ def internal_model_data_url(endpoint_config_name: str, session: boto3.Session) -
         return None
 
 
-def
+def get_training_data(end: Endpoint) -> pd.DataFrame:
     """Code to get the training data from the FeatureSet used to train the Model
 
     Args:

@@ -100,7 +98,7 @@ def fs_training_data(end: Endpoint) -> pd.DataFrame:
     return train_df
 
 
-def
+def get_evaluation_data(end: Endpoint) -> pd.DataFrame:
     """Code to get the evaluation data from the FeatureSet NOT used for training
 
     Args:

@@ -178,11 +176,11 @@ if __name__ == "__main__":
     print(model_data_url)
 
     # Get the training data
-    my_train_df =
+    my_train_df = get_training_data(my_endpoint)
     print(my_train_df)
 
     # Get the evaluation data
-    my_eval_df =
+    my_eval_df = get_evaluation_data(my_endpoint)
     print(my_eval_df)
 
     # Backtrack to the FeatureSet