workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,450 @@
+"""
+mol_standardize.py - Molecular Standardization for ADMET Preprocessing
+Following the ChEMBL structure standardization pipeline
+
+Purpose:
+    Standardizes chemical structures to ensure consistent molecular representations
+    for ADMET modeling. Handles tautomers, salts, charges, and structural variations
+    that can cause the same compound to be represented differently.
+
+Standardization Pipeline:
+    1. Cleanup
+       - Removes explicit hydrogens
+       - Disconnects metal atoms from organic fragments
+       - Normalizes functional groups (e.g., nitro, sulfoxide representations)
+
+    2. Fragment Parent Selection (optional, controlled by extract_salts parameter)
+       - Identifies and keeps the largest organic fragment
+       - Removes salts, solvents, and counterions
+       - Example: [Na+].CC(=O)[O-] → CC(=O)O (keeps acetate, removes sodium)
+
+    3. Charge Neutralization (optional, controlled by extract_salts parameter)
+       - Neutralizes charges where possible
+       - Only applied when extract_salts=True (following the ChEMBL pipeline)
+       - Skipped when extract_salts=False to preserve ionic character
+       - Example: CC(=O)[O-] → CC(=O)O
+
+    4. Tautomer Canonicalization (optional, default=True)
+       - Generates the canonical tautomer form for consistency
+       - Example: Oc1ccccn1 → O=c1cccc[nH]1 (2-hydroxypyridine → 2-pyridone)
+
+Output DataFrame Columns:
+    - orig_smiles: Original input SMILES (preserved for traceability)
+    - smiles: Standardized molecule (with or without salts based on extract_salts)
+    - salt: Removed salt/counterion as SMILES (only populated if extract_salts=True)
+
+Salt Handling:
+    Salt forms can dramatically affect properties like solubility.
+    This module offers two modes for handling salts:
+
+    When extract_salts=True (default, ChEMBL standard):
+        - Removes salts/counterions to get the parent molecule
+        - Neutralizes charges on the parent
+        - Records removed salts in the 'salt' column
+        Input: [Na+].CC(=O)[O-] → Parent: CC(=O)O, Salt: [Na+]
+
+    When extract_salts=False (preserve full salt form):
+        - Keeps all fragments including salts/counterions
+        - Preserves ionic charges (no neutralization)
+
+Mixture Detection:
+    The module detects and logs potential mixtures (vs true salt forms):
+    - Multiple large neutral organic fragments indicate a mixture
+    - Mixtures are logged but NOT recorded in the salt column
+    - True salts (small/charged fragments) are properly extracted
+
+Downstream modeling options:
+    1. Use the parent only (standard approach for most ADMET properties)
+    2. Include the salt as a categorical or computed feature
+    3. Model parent + salt effects hierarchically
+    4. Use the full salt form for properties like solubility/formulation
+
+References:
+    - "ChEMBL Structure Pipeline" (Bento et al., 2020)
+      https://doi.org/10.1186/s13321-020-00456-1
+    - "Standardization and Validation with the RDKit" (Greg Landrum, RSC Open Science 2021)
+      https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/Standardization%20and%20Validation%20with%20the%20RDKit.ipynb
+
+Usage:
+    from mol_standardize import standardize
+
+    # Basic usage (removes salts by default, ChEMBL standard)
+    df_std = standardize(df)  # the 'smiles' column is auto-detected
+
+    # Keep salts in the molecule (preserve ionic forms)
+    df_std = standardize(df, extract_salts=False)
+
+    # Without tautomer canonicalization (faster, less aggressive)
+    df_std = standardize(df, canonicalize_tautomer=False)
+"""
+
+import logging
+from typing import Optional, Tuple
+import pandas as pd
+import time
+from contextlib import contextmanager
+from rdkit import Chem
+from rdkit.Chem import Mol
+from rdkit.Chem.MolStandardize import rdMolStandardize
+from rdkit import RDLogger
+
+log = logging.getLogger("workbench")
+RDLogger.DisableLog("rdApp.warning")
+
+
+# Helper context manager for timing
+@contextmanager
+def timer(name):
+    start = time.time()
+    yield
+    print(f"{name}: {time.time() - start:.2f}s")
+
+
+class MolStandardizer:
+    """
+    Streamlined molecular standardizer for ADMET preprocessing.
+    Uses the ChEMBL standardization pipeline with RDKit.
+    """
+
+    def __init__(self, canonicalize_tautomer: bool = True, remove_salts: bool = True):
+        """
+        Initialize the standardizer with ChEMBL defaults
+
+        Args:
+            canonicalize_tautomer: Whether to canonicalize tautomers (default True)
+            remove_salts: Whether to remove salts/counterions (default True)
+        """
+        self.canonicalize_tautomer = canonicalize_tautomer
+        self.remove_salts = remove_salts
+        self.params = rdMolStandardize.CleanupParameters()
+        self.tautomer_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
+
+    def standardize(self, mol: Mol) -> Tuple[Optional[Mol], Optional[str]]:
+        """
+        Main standardization pipeline for ADMET
+
+        Pipeline:
+            1. Cleanup (remove Hs, disconnect metals, normalize)
+            2. Get largest fragment (optional - only if remove_salts=True)
+               2a. Extract salt information BEFORE further modifications
+            3. Neutralize charges
+            4. Canonicalize tautomer (optional)
+
+        Args:
+            mol: RDKit molecule object
+
+        Returns:
+            Tuple of (standardized molecule or None if failed, salt SMILES or None)
+        """
+        if mol is None:
+            return None, None
+
+        try:
+            # Step 1: Cleanup
+            cleaned_mol = rdMolStandardize.Cleanup(mol, self.params)
+            if cleaned_mol is None:
+                return None, None
+
+            # If not doing any transformations, return early
+            if not self.remove_salts and not self.canonicalize_tautomer:
+                return cleaned_mol, None
+
+            salt_smiles = None
+            mol = cleaned_mol
+
+            # Step 2: Fragment handling (conditional based on remove_salts)
+            if self.remove_salts:
+                # Get the parent molecule
+                parent_mol = rdMolStandardize.FragmentParent(cleaned_mol, self.params)
+                if parent_mol:
+                    # Extract the salt BEFORE any modifications to the parent
+                    salt_smiles = self._extract_salt(cleaned_mol, parent_mol)
+                    mol = parent_mol
+                else:
+                    return None, None
+            # If not removing salts, keep the full molecule intact
+
+            # Step 3: Neutralize charges (skip if keeping salts to preserve ionic forms)
+            if self.remove_salts:
+                mol = rdMolStandardize.ChargeParent(mol, self.params, skipStandardize=True)
+                if mol is None:
+                    return None, salt_smiles
+
+            # Step 4: Canonicalize tautomer (LAST STEP)
+            if self.canonicalize_tautomer:
+                mol = self.tautomer_enumerator.Canonicalize(mol)
+
+            return mol, salt_smiles
+
+        except Exception as e:
+            log.warning(f"Standardization failed: {e}")
+            return None, None
+
+    def _extract_salt(self, orig_mol: Mol, parent_mol: Mol) -> Optional[str]:
+        """
+        Extract salt/counterion by comparing the original and parent molecules.
+
+        Detects and handles mixtures vs true salt forms:
+        - True salts: small (<= 6 heavy atoms) or charged fragments
+        - Mixtures: multiple large neutral organic fragments
+
+        Args:
+            orig_mol: Original molecule (after Cleanup, before FragmentParent)
+            parent_mol: Parent molecule (after FragmentParent, before tautomerization)
+
+        Returns:
+            SMILES string of salt components or None if no salts/mixture detected
+        """
+        try:
+            # Quick atom count check
+            if orig_mol.GetNumAtoms() == parent_mol.GetNumAtoms():
+                return None
+
+            # Quick heavy atom difference check
+            heavy_diff = orig_mol.GetNumHeavyAtoms() - parent_mol.GetNumHeavyAtoms()
+            if heavy_diff <= 0:
+                return None
+
+            # Get all fragments from the original molecule
+            orig_frags = Chem.GetMolFrags(orig_mol, asMols=True)
+
+            # If only one fragment, no salt
+            if len(orig_frags) <= 1:
+                return None
+
+            # Get the canonical SMILES of the parent for comparison
+            parent_smiles = Chem.MolToSmiles(parent_mol, canonical=True)
+
+            # Separate fragments into salts vs potential mixture components
+            salt_frags = []
+            mixture_frags = []
+
+            for frag in orig_frags:
+                frag_smiles = Chem.MolToSmiles(frag, canonical=True)
+
+                # Skip the parent fragment
+                if frag_smiles == parent_smiles:
+                    continue
+
+                # Classify the fragment as a salt or a mixture component
+                num_heavy = frag.GetNumHeavyAtoms()
+                has_charge = any(atom.GetFormalCharge() != 0 for atom in frag.GetAtoms())
+
+                # More nuanced classification
+                if has_charge and num_heavy <= 10:  # Small charged fragment - likely a salt
+                    salt_frags.append(frag_smiles)
+                elif not has_charge and num_heavy <= 6:  # Small neutral - could be solvent/salt
+                    salt_frags.append(frag_smiles)
+                else:
+                    # Large neutral fragment - likely part of a mixture
+                    mixture_frags.append(frag_smiles)
+
+            # Check if this looks like a mixture
+            if mixture_frags:
+                # Log mixture detection
+                total_frags = len(orig_frags)
+                log.warning(
+                    f"Mixture detected: {total_frags} total fragments, "
+                    f"{len(mixture_frags)} large neutral organics. "
+                    f"Removing: {'.'.join(mixture_frags + salt_frags)}"
+                )
+                # Return None for mixtures - don't pollute the salt column
+                return None
+
+            # Return actual salts only
+            return ".".join(salt_frags) if salt_frags else None
+
+        except Exception as e:
+            log.info(f"Salt extraction failed: {e}")
+            return None
+
+
+def standardize(
+    df: pd.DataFrame,
+    canonicalize_tautomer: bool = True,
+    extract_salts: bool = True,
+) -> pd.DataFrame:
+    """
+    Standardize molecules in a DataFrame for ADMET modeling
+
+    Args:
+        df: Input DataFrame with a SMILES column
+        canonicalize_tautomer: Whether to canonicalize tautomers (default: True)
+        extract_salts: Whether to remove and extract salts (default: True)
+            If False, keeps the full molecule with salts/counterions intact,
+            skipping charge neutralization to preserve ionic character
+
+    Returns:
+        DataFrame with:
+        - orig_smiles: Original SMILES (preserved)
+        - smiles: Standardized SMILES (working column for downstream)
+        - salt: Removed salt/counterion SMILES (only if extract_salts=True);
+          None for mixtures or when no true salts are present
+    """
+
+    # Check for the smiles column (any capitalization)
+    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+    if smiles_column is None:
+        raise ValueError("Input DataFrame must have a 'smiles' column")
+
+    # Copy the input DataFrame to avoid modifying the original
+    result = df.copy()
+
+    # Preserve the original SMILES if not already saved
+    if "orig_smiles" not in result.columns:
+        result["orig_smiles"] = result[smiles_column]
+
+    # Initialize the standardizer
+    standardizer = MolStandardizer(canonicalize_tautomer=canonicalize_tautomer, remove_salts=extract_salts)
+
+    def process_smiles(smiles: str) -> pd.Series:
+        """
+        Process a single SMILES string through the standardization pipeline
+
+        Args:
+            smiles: Input SMILES string
+
+        Returns:
+            Series with the standardized SMILES and extracted salt (if applicable)
+        """
+        # Handle missing values
+        if pd.isna(smiles) or smiles == "":
+            log.error("Encountered missing or empty SMILES string")
+            return pd.Series({"smiles": None, "salt": None})
+
+        # Early check for unreasonably long SMILES
+        if len(smiles) > 1000:
+            log.error(f"SMILES too long ({len(smiles)} chars): {smiles[:50]}...")
+            return pd.Series({"smiles": None, "salt": None})
+
+        # Parse the molecule
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            log.error(f"Invalid SMILES: {smiles}")
+            return pd.Series({"smiles": None, "salt": None})
+
+        # Full standardization with optional salt removal
+        std_mol, salt_smiles = standardizer.standardize(mol)
+
+        # After standardization, validate the result
+        if std_mol is not None:
+            # Check if the molecule is a reasonable size
+            if std_mol.GetNumAtoms() == 0 or std_mol.GetNumAtoms() > 200:  # Arbitrary limits
+                log.error(f"Rejecting molecule size: {std_mol.GetNumAtoms()} atoms")
+                log.error(f"Original SMILES: {smiles}")
+                return pd.Series({"smiles": None, "salt": salt_smiles})
+
+        if std_mol is None:
+            return pd.Series(
+                {
+                    "smiles": None,
+                    "salt": salt_smiles,  # May have extracted a salt even if full standardization failed
+                }
+            )
+
+        # Convert back to SMILES
+        return pd.Series(
+            {"smiles": Chem.MolToSmiles(std_mol, canonical=True), "salt": salt_smiles if extract_salts else None}
+        )
+
+    # Process the molecules
+    processed = result[smiles_column].apply(process_smiles)
+
+    # Update the DataFrame with the processed results
+    for col in ["smiles", "salt"]:
+        result[col] = processed[col]
+
+    return result
+
+
+if __name__ == "__main__":
+
+    # Pandas display options for better readability
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+    pd.set_option("display.max_colwidth", 100)
+
+    # Test with a DataFrame including various salt forms
+    test_data = pd.DataFrame(
+        {
+            "smiles": [
+                # Organic salts
+                "[Na+].CC(=O)[O-]",  # Sodium acetate
+                "CC(=O)O.CCN",  # Acetic acid + ethylamine (acid-base pair)
+                # Tautomers
+                "CC(=O)CC(C)=O",  # Acetylacetone - tautomer
+                "c1ccc(O)nc1",  # 2-hydroxypyridine/2-pyridone - tautomer
+                # Multi-fragment
+                "CCO.CC",  # Ethanol + methane mixture
+                # Simple organics
+                "CC(C)(C)c1ccccc1",  # tert-butylbenzene
+                # Carbonate salts
+                "[Na+].[Na+].[O-]C([O-])=O",  # Sodium carbonate
+                "[Li+].[Li+].[O-]C([O-])=O",  # Lithium carbonate
+                "[K+].[K+].[O-]C([O-])=O",  # Potassium carbonate
+                "[Mg++].[O-]C([O-])=O",  # Magnesium carbonate
+                "[Ca++].[O-]C([O-])=O",  # Calcium carbonate
+                # Drug salts
+                "CC(C)NCC(O)c1ccc(O)c(O)c1.Cl",  # Isoproterenol HCl
+                "CN1CCC[C@H]1c2cccnc2.[Cl-]",  # Nicotine HCl
+                # Tautomer with salt
+                "c1ccc(O)nc1.Cl",  # 2-hydroxypyridine with HCl
+                # Edge cases
+                None,  # Missing value
+                "INVALID",  # Invalid SMILES
+            ],
+            "compound_id": [f"C{i:03d}" for i in range(1, 17)],
+        }
+    )
+
+    # General test
+    print("Testing standardization with the full dataset...")
+    standardize(test_data)
+
+    # Remove the last two rows to avoid errors with None and INVALID
+    test_data = test_data.iloc[:-2].reset_index(drop=True)
+
+    # Test WITHOUT salt removal (keeps the full molecule)
+    print("\nStandardization KEEPING salts (extract_salts=False) Tautomerization: True")
+    result_keep = standardize(test_data, extract_salts=False, canonicalize_tautomer=True)
+    display_order = ["compound_id", "orig_smiles", "smiles", "salt"]
+    print(result_keep[display_order])
+
+    # Test WITH salt removal
+    print("\n" + "=" * 70)
+    print("Standardization REMOVING salts (extract_salts=True):")
+    result_remove = standardize(test_data, extract_salts=True, canonicalize_tautomer=True)
+    print(result_remove[display_order])
+
+    # Test specific problematic cases
+    print("\n" + "=" * 70)
+    print("Testing specific problematic cases:")
+    problem_cases = pd.DataFrame(
+        {
+            "smiles": [
+                "CC(=O)O.CCN",  # Should extract CC(=O)O as salt
+                "CCO.CC",  # Should return CC as salt
+            ],
+            "compound_id": ["TEST_C002", "TEST_C005"],
+        }
+    )
+
+    problem_result = standardize(problem_cases, extract_salts=True, canonicalize_tautomer=True)
+    print(problem_result[display_order])
+
+    # Performance test with a larger dataset
+    from workbench.api import DataSource
+
+    print("\n" + "=" * 70)
+
+    ds = DataSource("aqsol_data")
+    df = ds.pull_dataframe()[["id", "smiles"]][:1000]
+
+    for tautomer in [True, False]:
+        for extract in [True, False]:
+            print(f"Performance test with AQSol dataset: tautomer={tautomer} extract_salts={extract}:")
+            start_time = time.time()
+            std_df = standardize(df, canonicalize_tautomer=tautomer, extract_salts=extract)
+            elapsed = time.time() - start_time
+            mol_per_sec = len(df) / elapsed
+            print(f"{elapsed:.2f}s ({mol_per_sec:.0f} mol/s)")
@@ -7,13 +7,13 @@
 #
 import argparse
 import os
-import joblib
 from io import StringIO
 import pandas as pd
 import json

 # Local imports
-from local_utils import compute_molecular_descriptors
+from mol_standardize import standardize
+from mol_descriptors import compute_descriptors


 # TRAINING SECTION
@@ -32,15 +32,12 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # This model doesn't get trained; it's just a feature creation 'model'
-
-    # SageMaker seems to get upset if we don't save a model, so we'll create a placeholder model
-    placeholder_model = {}
-    joblib.dump(placeholder_model, os.path.join(args.model_dir, "model.joblib"))
+    # So we don't need to do anything here


 # Model loading and prediction functions
 def model_fn(model_dir):
-    return joblib.load(os.path.join(model_dir, "model.joblib"))
+    return None


 def input_fn(input_data, content_type):
@@ -78,6 +75,7 @@ def output_fn(output_df, accept_type):
 # Prediction function
 def predict_fn(df, model):

-    # Compute the Molecular Descriptors
-    df = compute_molecular_descriptors(df)
+    # Standardize the molecule (extract salts) and then compute descriptors
+    df = standardize(df, extract_salts=True)
+    df = compute_descriptors(df)
     return df
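The net effect of these hunks: the descriptor "model" is now a pure transform. Nothing is trained or persisted (model_fn returns None), and inference chains standardization into descriptor computation. A rough local sketch of that path, assuming compute_descriptors (from the new mol_descriptors module) returns the input frame with descriptor columns appended:

import pandas as pd
from mol_standardize import standardize  # local imports, as in the script above
from mol_descriptors import compute_descriptors

def predict_fn(df: pd.DataFrame, model=None) -> pd.DataFrame:
    # Standardize first (salts extracted), then featurize the clean parent SMILES
    df = standardize(df, extract_salts=True)
    return compute_descriptors(df)

print(predict_fn(pd.DataFrame({"smiles": ["[Na+].CC(=O)[O-]", "c1ccccc1O"]})))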
@@ -15,7 +15,7 @@ import pandas as pd
 import json

 # Local imports
-from local_utils import compute_morgan_fingerprints
+from fingerprints import compute_morgan_fingerprints


 # TRAINING SECTION
@@ -0,0 +1,194 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from typing import List, Optional
+import logging
+
+# Workbench Imports
+from workbench.algorithms.dataframe.proximity import Proximity
+from workbench.algorithms.dataframe.projection_2d import Projection2D
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class FeatureSpaceProximity(Proximity):
+    """Proximity computations for numeric feature spaces using Euclidean distance."""
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: Optional[str] = None,
+        include_all_columns: bool = False,
+    ):
+        """
+        Initialize the FeatureSpaceProximity class.
+
+        Args:
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
+        """
+        # Validate and filter features before calling the parent init
+        self._raw_features = features
+        super().__init__(
+            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+
+    def _prepare_data(self) -> None:
+        """Filter out non-numeric features and drop NaN rows."""
+        # Validate features
+        self.features = self._validate_features(self.df, self._raw_features)
+
+        # Drop NaN rows for the features we're using
+        self.df = self.df.dropna(subset=self.features).copy()
+
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+
+    def _build_model(self) -> None:
+        """Standardize features and fit the Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
+
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features using the fitted scaler."""
+        return self.scaler.transform(df[self.features])
+
+    def _project_2d(self) -> None:
+        """Project the numeric features to 2D for visualization."""
+        if len(self.features) >= 2:
+            self.df = Projection2D().fit_transform(self.df, features=self.features)
+
+
+# Testing the FeatureSpaceProximity class
+if __name__ == "__main__":
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame
+    data = {
+        "ID": [1, 2, 3, 4, 5],
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+    }
+    df = pd.DataFrame(data)
+
+    # Test the FeatureSpaceProximity class
+    features = ["Feature1", "Feature2", "Feature3"]
+    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
+
+    # Test the neighbors method with a radius
+    print(prox.neighbors(1, radius=2.0))
+
+    # Test with a single-feature list
+    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
+
+    # Create a sample DataFrame
+    data = {
+        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+
+    # Test with string IDs
+    prox = FeatureSpaceProximity(
+        df,
+        id_column="id",
+        features=["Feature1", "Feature2"],
+        target="target",
+        include_all_columns=True,
+    )
+    print(prox.neighbors(["a", "b"]))
+
+    # Test duplicate IDs
+    data = {
+        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    print(df.equals(prox.df))
+
+    # Test on real data from Workbench
+    from workbench.api import FeatureSet, Model
+
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
+    df = fs.pull_dataframe()
+    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+    print("\n" + "=" * 80)
+    print("Testing Neighbors...")
+    print("=" * 80)
+    test_id = df[fs.id_column].tolist()[0]
+    print(f"\nNeighbors for ID {test_id}:")
+    print(prox.neighbors(test_id))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct)
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct)
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
+    print(gradients_1pct)
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(gradients_5pct)
+
+    # Test proximity_stats
+    print("\n" + "=" * 80)
+    print("Testing proximity_stats...")
+    print("=" * 80)
+    stats = prox.proximity_stats()
+    print(stats)
+
+    # Plot the distance distribution using pandas
+    print("\n" + "=" * 80)
+    print("Plotting distance distribution...")
+    print("=" * 80)
+    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+
+    # Visualize the 2D projection
+    print("\n" + "=" * 80)
+    print("Visualizing 2D Projection...")
+    print("=" * 80)
+    from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
+    from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
+
+    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
+    unit_test.run()
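Condensing the self-test above, typical usage of the new class looks roughly like this sketch (column names here are illustrative, not from the package; the methods shown mirror the self-test):

import pandas as pd
from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity

df = pd.DataFrame(
    {
        "id": ["a", "b", "c", "d"],
        "f1": [0.1, 0.2, 0.3, 0.4],
        "f2": [1.0, 0.9, 0.8, 0.7],
        "target": [3.1, 2.9, 1.0, 0.5],
    }
)

prox = FeatureSpaceProximity(df, id_column="id", features=["f1", "f2"], target="target")
print(prox.neighbors("a", n_neighbors=2))  # nearest rows in scaled feature space
print(prox.isolated(top_percent=5.0))  # most isolated rows by neighbor distance
print(prox.target_gradients(top_percent=5.0, min_delta=1.0))  # close in X, far apart in target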