PyPI - workbench - Versions diffs - 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl - Mend

workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

workbench/algorithms/dataframe/proximity.py +143 -102
workbench/algorithms/graph/light/proximity_graph.py +2 -1
workbench/api/compound.py +1 -1
workbench/api/endpoint.py +12 -0
workbench/api/feature_set.py +4 -4
workbench/api/meta.py +5 -2
workbench/api/model.py +16 -12
workbench/api/monitor.py +1 -16
workbench/core/artifacts/artifact.py +11 -3
workbench/core/artifacts/data_capture_core.py +355 -0
workbench/core/artifacts/endpoint_core.py +168 -78
workbench/core/artifacts/feature_set_core.py +72 -13
workbench/core/artifacts/model_core.py +50 -15
workbench/core/artifacts/monitor_core.py +33 -248
workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
workbench/core/cloud_platform/aws/aws_meta.py +12 -5
workbench/core/cloud_platform/aws/aws_session.py +4 -4
workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
workbench/core/transforms/features_to_model/features_to_model.py +9 -4
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
workbench/core/views/training_view.py +49 -53
workbench/core/views/view.py +51 -1
workbench/core/views/view_utils.py +4 -4
workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
workbench/model_scripts/pytorch_model/pytorch.template +19 -20
workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
workbench/model_scripts/script_generation.py +7 -2
workbench/model_scripts/uq_models/mapie.template +492 -0
workbench/model_scripts/uq_models/requirements.txt +1 -0
workbench/model_scripts/xgb_model/xgb_model.template +31 -40
workbench/repl/workbench_shell.py +11 -6
workbench/scripts/lambda_launcher.py +63 -0
workbench/scripts/ml_pipeline_batch.py +137 -0
workbench/scripts/ml_pipeline_sqs.py +186 -0
workbench/scripts/monitor_cloud_watch.py +20 -100
workbench/utils/aws_utils.py +4 -3
workbench/utils/chem_utils/__init__.py +0 -0
workbench/utils/chem_utils/fingerprints.py +134 -0
workbench/utils/chem_utils/misc.py +194 -0
workbench/utils/chem_utils/mol_descriptors.py +483 -0
workbench/utils/chem_utils/mol_standardize.py +450 -0
workbench/utils/chem_utils/mol_tagging.py +348 -0
workbench/utils/chem_utils/projections.py +209 -0
workbench/utils/chem_utils/salts.py +256 -0
workbench/utils/chem_utils/sdf.py +292 -0
workbench/utils/chem_utils/toxicity.py +250 -0
workbench/utils/chem_utils/vis.py +253 -0
workbench/utils/cloudwatch_handler.py +1 -1
workbench/utils/cloudwatch_utils.py +137 -0
workbench/utils/config_manager.py +3 -7
workbench/utils/endpoint_utils.py +5 -7
workbench/utils/license_manager.py +2 -6
workbench/utils/model_utils.py +76 -30
workbench/utils/monitor_utils.py +44 -62
workbench/utils/pandas_utils.py +3 -3
workbench/utils/shap_utils.py +10 -2
workbench/utils/workbench_logging.py +0 -3
workbench/utils/workbench_sqs.py +1 -1
workbench/utils/xgboost_model_utils.py +283 -145
workbench/web_interface/components/plugins/dashboard_status.py +3 -1
workbench/web_interface/components/plugins/generated_compounds.py +1 -1
workbench/web_interface/components/plugins/scatter_plot.py +3 -3
{workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
{workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
{workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
workbench/model_scripts/quant_regression/quant_regression.template +0 -279
workbench/model_scripts/quant_regression/requirements.txt +0 -1
workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
workbench/utils/chem_utils.py +0 -1556
workbench/utils/execution_environment.py +0 -211
workbench/utils/fast_inference.py +0 -167
workbench/utils/resource_utils.py +0 -39
{workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
{workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0

workbench/utils/chem_utils/fingerprints.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""Molecular fingerprint computation utilities"""
+import logging
+import pandas as pd
+# Molecular Descriptor Imports
+from rdkit import Chem
+from rdkit.Chem import rdFingerprintGenerator
+from rdkit.Chem.MolStandardize import rdMolStandardize
+# Set up the logger
+log = logging.getLogger("workbench")
+def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
+    """Compute and add Morgan fingerprints to the DataFrame.
+    Args:
+        df (pd.DataFrame): Input DataFrame containing SMILES strings.
+        radius (int): Radius for the Morgan fingerprint.
+        n_bits (int): Number of bits for the fingerprint.
+        counts (bool): Count simulation for the fingerprint.
+    Returns:
+        pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.
+    Note:
+        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
+    """
+    delete_mol_column = False
+    # Check for the SMILES column (case-insensitive)
+    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+    if smiles_column is None:
+        raise ValueError("Input DataFrame must have a 'smiles' column")
+    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
+    if "molecule" in df.columns and df["molecule"].dtype == "string":
+        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
+        del df["molecule"]
+    # Convert SMILES to RDKit molecule objects (vectorized)
+    if "molecule" not in df.columns:
+        log.info("Converting SMILES to RDKit Molecules...")
+        delete_mol_column = True
+        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
+        # Make sure our molecules are not None
+        failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
+        if failed_smiles:
+            log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
+        df = df.dropna(subset=["molecule"])
+    # If we have fragments in our compounds, get the largest fragment before computing fingerprints
+    largest_frags = df["molecule"].apply(
+        lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
+    )
+    # Create a Morgan fingerprint generator
+    if counts:
+        n_bits *= 4  # Multiply by 4 to simulate counts
+    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)
+    # Compute Morgan fingerprints (vectorized)
+    fingerprints = largest_frags.apply(
+        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
+    )
+    # Add the fingerprints to the DataFrame
+    df["fingerprint"] = fingerprints
+    # Drop the intermediate 'molecule' column if it was added
+    if delete_mol_column:
+        del df["molecule"]
+    return df
+if __name__ == "__main__":
+    print("Running molecular fingerprint tests...")
+    print("Note: This requires molecular_screening module to be available")
+    # Test molecules
+    test_molecules = {
+        "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+        "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
+        "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
+        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
+        "benzene": "c1ccccc1",
+        "butene_e": "C/C=C/C",  # E-butene
+        "butene_z": "C/C=C\\C",  # Z-butene
+    }
+    # Test 1: Morgan Fingerprints
+    print("\n1. Testing Morgan fingerprint generation...")
+    test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
+    fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
+    print("   Fingerprint generation results:")
+    for _, row in fp_df.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        fp_len = len(fp) if fp != "N/A" else 0
+        print(f"   {row['name']:15} → {fp_len} bits")
+    # Test 2: Different fingerprint parameters
+    print("\n2. Testing different fingerprint parameters...")
+    # Test with counts enabled
+    fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
+    print("   With count simulation (256 bits * 4):")
+    for _, row in fp_counts_df.iterrows():
+        fp = row.get("fingerprint", "N/A")
+        fp_len = len(fp) if fp != "N/A" else 0
+        print(f"   {row['name']:15} → {fp_len} bits")
+    # Test 3: Edge cases
+    print("\n3. Testing edge cases...")
+    # Invalid SMILES
+    invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
+    try:
+        fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+        print(f"   ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
+    except Exception as e:
+        print(f"   ✓ Invalid SMILES properly raised error: {type(e).__name__}")
+    # Test with pre-existing molecule column
+    mol_df = test_df.copy()
+    mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
+    fp_with_mol = compute_morgan_fingerprints(mol_df)
+    print(f"   ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")
+    print("\n✅ All fingerprint tests completed!")

workbench/utils/chem_utils/misc.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""Miscellaneous processing functions for molecular data."""
+import logging
+import numpy as np
+import pandas as pd
+from typing import List, Optional
+# Set up the logger
+log = logging.getLogger("workbench")
+def geometric_mean(series: pd.Series) -> float:
+    """Computes the geometric mean manually to avoid using scipy."""
+    return np.exp(np.log(series).mean())
+def rollup_experimental_data(
+    df: pd.DataFrame, id: str, time: str, target: str, use_gmean: bool = False
+) -> pd.DataFrame:
+    """
+    Rolls up a dataset by selecting the largest time per unique ID and averaging the target value
+    if multiple records exist at that time. Supports both arithmetic and geometric mean.
+    Parameters:
+        df (pd.DataFrame): Input dataframe.
+        id (str): Column representing the unique molecule ID.
+        time (str): Column representing the time.
+        target (str): Column representing the target value.
+        use_gmean (bool): Whether to use the geometric mean instead of the arithmetic mean.
+    Returns:
+        pd.DataFrame: Rolled-up dataframe with all original columns retained.
+    """
+    # Find the max time per unique ID
+    max_time_df = df.groupby(id)[time].transform("max")
+    filtered_df = df[df[time] == max_time_df]
+    # Define aggregation function
+    agg_func = geometric_mean if use_gmean else np.mean
+    # Perform aggregation on all columns
+    agg_dict = {col: "first" for col in df.columns if col not in [target, id, time]}
+    agg_dict[target] = lambda x: agg_func(x) if len(x) > 1 else x.iloc[0]  # Apply mean or gmean
+    rolled_up_df = filtered_df.groupby([id, time]).agg(agg_dict).reset_index()
+    return rolled_up_df
+def micromolar_to_log(series_µM: pd.Series) -> pd.Series:
+    """
+    Convert a pandas Series of concentrations in µM (micromolar) to their logarithmic values (log10).
+    Parameters:
+    series_uM (pd.Series): Series of concentrations in micromolar.
+    Returns:
+    pd.Series: Series of logarithmic values (log10).
+    """
+    # Replace 0 or negative values with a small number to avoid log errors
+    adjusted_series = series_µM.clip(lower=1e-9)  # Alignment with another project
+    series_mol_per_l = adjusted_series * 1e-6  # Convert µM/L to mol/L
+    log_series = np.log10(series_mol_per_l)
+    return log_series
+def log_to_micromolar(log_series: pd.Series) -> pd.Series:
+    """
+    Convert a pandas Series of logarithmic values (log10) back to concentrations in µM (micromolar).
+    Parameters:
+    log_series (pd.Series): Series of logarithmic values (log10).
+    Returns:
+    pd.Series: Series of concentrations in micromolar.
+    """
+    series_mol_per_l = 10**log_series  # Convert log10 back to mol/L
+    series_µM = series_mol_per_l * 1e6  # Convert mol/L to µM
+    return series_µM
+def feature_resolution_issues(df: pd.DataFrame, features: List[str], show_cols: Optional[List[str]] = None) -> None:
+    """
+    Identify and print groups in a DataFrame where the given features have more than one unique SMILES,
+    sorted by group size (largest number of unique SMILES first).
+    Args:
+        df (pd.DataFrame): Input DataFrame containing SMILES strings.
+        features (List[str]): List of features to check.
+        show_cols (Optional[List[str]]): Columns to display; defaults to all columns.
+    """
+    # Check for the 'smiles' column (case-insensitive)
+    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+    if smiles_column is None:
+        raise ValueError("Input DataFrame must have a 'smiles' column")
+    show_cols = show_cols if show_cols is not None else df.columns.tolist()
+    # Drop duplicates to keep only unique SMILES for each feature combination
+    unique_df = df.drop_duplicates(subset=[smiles_column] + features)
+    # Find groups with more than one unique SMILES
+    group_counts = unique_df.groupby(features).size()
+    collision_groups = group_counts[group_counts > 1].sort_values(ascending=False)
+    # Print each group in order of size (largest first)
+    for group, count in collision_groups.items():
+        # Get the rows for this group
+        if isinstance(group, tuple):
+            group_mask = (unique_df[features] == group).all(axis=1)
+        else:
+            group_mask = unique_df[features[0]] == group
+        group_df = unique_df[group_mask]
+        print(f"Feature Group (unique SMILES: {count}):")
+        print(group_df[show_cols])
+        print("\n")
+if __name__ == "__main__":
+    print("Running molecular processing and transformation tests...")
+    print("Note: This requires the molecular_filters module to be available")
+    # Test 1: Concentration conversions
+    print("\n1. Testing concentration conversions...")
+    # Test micromolar to log
+    test_conc = pd.Series([1.0, 10.0, 100.0, 1000.0, 0.001])
+    log_values = micromolar_to_log(test_conc)
+    back_to_uM = log_to_micromolar(log_values)
+    print("   µM → log10 → µM:")
+    for orig, log_val, back in zip(test_conc, log_values, back_to_uM):
+        print(f"   {orig:8.3f} µM → {log_val:6.2f} → {back:8.3f} µM")
+    # Test 2: Geometric mean
+    print("\n2. Testing geometric mean...")
+    test_series = pd.Series([2, 4, 8, 16])
+    geo_mean = geometric_mean(test_series)
+    arith_mean = np.mean(test_series)
+    print(f"   Series: {list(test_series)}")
+    print(f"   Arithmetic mean: {arith_mean:.2f}")
+    print(f"   Geometric mean: {geo_mean:.2f}")
+    # Test 3: Experimental data rollup
+    print("\n3. Testing experimental data rollup...")
+    # Create test data with multiple timepoints and replicates
+    test_data = pd.DataFrame(
+        {
+            "compound_id": ["A", "A", "A", "B", "B", "C", "C", "C"],
+            "time": [1, 2, 2, 1, 2, 1, 1, 2],
+            "activity": [10, 20, 22, 5, 8, 100, 110, 200],
+            "assay": ["kinase", "kinase", "kinase", "kinase", "kinase", "cell", "cell", "cell"],
+        }
+    )
+    # Rollup with arithmetic mean
+    rolled_arith = rollup_experimental_data(test_data, "compound_id", "time", "activity", use_gmean=False)
+    print("   Arithmetic mean rollup:")
+    print(rolled_arith[["compound_id", "time", "activity"]])
+    # Rollup with geometric mean
+    rolled_geo = rollup_experimental_data(test_data, "compound_id", "time", "activity", use_gmean=True)
+    print("\n   Geometric mean rollup:")
+    print(rolled_geo[["compound_id", "time", "activity"]])
+    # Test 4: Feature resolution issues
+    print("\n4. Testing feature resolution identification...")
+    # Create data with some duplicate features but different SMILES
+    resolution_df = pd.DataFrame(
+        {
+            "smiles": ["CCO", "C(C)O", "CC(C)O", "CCC(C)O", "CCCO"],
+            "assay_id": ["A1", "A1", "A2", "A2", "A3"],
+            "value": [1.0, 1.5, 2.0, 2.2, 3.0],
+        }
+    )
+    print("   Checking for feature collisions in 'assay_id':")
+    feature_resolution_issues(resolution_df, ["assay_id"], show_cols=["smiles", "assay_id", "value"])
+    # Test 7: Edge cases
+    print("\n7. Testing edge cases...")
+    # Zero and negative concentrations
+    edge_conc = pd.Series([0, -1, 1e-10])
+    edge_log = micromolar_to_log(edge_conc)
+    print("   Edge concentration handling:")
+    for c, l in zip(edge_conc, edge_log):
+        print(f"      {c:6.2e} µM → {l:6.2f}")
+    print("\n✅ All molecular processing tests completed!")

workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl

workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl