workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic. Click here for more details.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
- workbench/algorithms/dataframe/proximity.py +261 -235
- workbench/algorithms/graph/light/proximity_graph.py +10 -8
- workbench/api/__init__.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +11 -0
- workbench/api/feature_set.py +11 -8
- workbench/api/meta.py +5 -2
- workbench/api/model.py +16 -15
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +256 -118
- workbench/core/artifacts/feature_set_core.py +265 -16
- workbench/core/artifacts/model_core.py +107 -60
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +42 -32
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/chemprop/chemprop.template +852 -0
- workbench/model_scripts/chemprop/generated_model_script.py +852 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
- workbench/model_scripts/pytorch_model/pytorch.template +370 -187
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +17 -9
- workbench/model_scripts/uq_models/generated_model_script.py +605 -0
- workbench/model_scripts/uq_models/mapie.template +605 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
- workbench/model_scripts/xgb_model/xgb_model.template +44 -46
- workbench/repl/workbench_shell.py +28 -14
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +760 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +95 -34
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +526 -0
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +371 -156
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +9 -7
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Requirements for ChemProp model scripts
|
|
2
|
+
# Note: These are the local dev requirements. The Docker images have their own requirements.txt
|
|
3
|
+
chemprop==2.2.1
|
|
4
|
+
rdkit==2025.9.1
|
|
5
|
+
torch>=2.0.0
|
|
6
|
+
lightning>=2.0.0
|
|
7
|
+
pandas>=2.0.0
|
|
8
|
+
numpy>=1.24.0
|
|
9
|
+
scikit-learn>=1.3.0
|
|
10
|
+
awswrangler>=3.0.0
|
|
11
|
+
joblib>=1.3.0
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Molecular fingerprint computation utilities"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# Molecular Descriptor Imports
|
|
7
|
+
from rdkit import Chem
|
|
8
|
+
from rdkit.Chem import rdFingerprintGenerator
|
|
9
|
+
from rdkit.Chem.MolStandardize import rdMolStandardize
|
|
10
|
+
|
|
11
|
+
# Set up the logger
|
|
12
|
+
log = logging.getLogger("workbench")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
    """Compute Morgan fingerprints and return them in a new "fingerprint" column.

    The caller's DataFrame is never modified: all work happens on a copy.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings in a column named
            "smiles" (any capitalization). An existing "molecule" column is reused,
            unless it has the pandas "string" dtype (serialized molecules), in which
            case it is discarded and rebuilt from the SMILES.
        radius (int): Radius for the Morgan fingerprint.
        n_bits (int): Number of bits for the fingerprint (multiplied by 4 when counts is True).
        counts (bool): Whether to use count simulation for the fingerprint.

    Returns:
        pd.DataFrame: A copy of the input with the Morgan fingerprints added as bit strings.
            When the molecules are built here, rows whose SMILES fail to parse are dropped
            and the temporary "molecule" column is removed again.

    Raises:
        ValueError: If the DataFrame has no "smiles" column.

    Note:
        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
    """
    # Check for the SMILES column (case-insensitive); non-string labels can never match
    smiles_column = next((col for col in df.columns if isinstance(col, str) and col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Work on a copy so the temporary "molecule" column never leaks into the caller's frame
    # (previously it was added in place and then only removed from the post-dropna copy)
    df = df.copy()
    delete_mol_column = False

    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
    if "molecule" in df.columns and df["molecule"].dtype == "string":
        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
        del df["molecule"]

    # Convert SMILES to RDKit molecule objects
    if "molecule" not in df.columns:
        log.info("Converting SMILES to RDKit Molecules...")
        delete_mol_column = True
        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)

        # Make sure our molecules are not None
        failed_smiles = df.loc[df["molecule"].isnull(), smiles_column].tolist()
        if failed_smiles:
            log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
        # .copy() gives an independent frame, so the column assignment below is not made on a slice
        df = df.dropna(subset=["molecule"]).copy()

    # If we have fragments in our compounds, get the largest fragment before computing fingerprints
    # (one chooser is reused for every row instead of constructing one per molecule)
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()
    largest_frags = df["molecule"].apply(lambda mol: fragment_chooser.choose(mol) if mol else None)

    # Create a Morgan fingerprint generator
    if counts:
        n_bits *= 4  # Multiply by 4 to simulate counts
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)

    # Compute Morgan fingerprints; rows without a usable molecule get pd.NA
    fingerprints = largest_frags.apply(
        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
    )

    # Add the fingerprints to the DataFrame
    df["fingerprint"] = fingerprints

    # Drop the intermediate 'molecule' column if it was added
    if delete_mol_column:
        del df["molecule"]
    return df
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
if __name__ == "__main__":
    # Manual smoke test: prints fingerprint lengths for a handful of molecules
    print("Running molecular fingerprint tests...")
    print("Note: This requires molecular_screening module to be available")

    def _print_bit_lengths(frame):
        """Print one line per row with the molecule name and its fingerprint length."""
        for _, record in frame.iterrows():
            bits = record.get("fingerprint", "N/A")
            if bits != "N/A":
                bit_count = len(bits)
            else:
                bit_count = 0
            print(f" {record['name']:15} → {bit_count} bits")

    # Test molecules (name -> SMILES)
    test_molecules = {
        "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
        "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
        "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O",  # With stereochemistry
        "sodium_acetate": "CC(=O)[O-].[Na+]",  # Salt
        "benzene": "c1ccccc1",
        "butene_e": "C/C=C/C",  # E-butene
        "butene_z": "C/C=C\\C",  # Z-butene
    }
    test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules)})

    # Test 1: Morgan Fingerprints (plain bit vectors)
    print("\n1. Testing Morgan fingerprint generation...")
    plain_fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
    print(" Fingerprint generation results:")
    _print_bit_lengths(plain_fp_df)

    # Test 2: Different fingerprint parameters (count simulation enabled)
    print("\n2. Testing different fingerprint parameters...")
    counted_fp_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
    print(" With count simulation (256 bits * 4):")
    _print_bit_lengths(counted_fp_df)

    # Test 3: Edge cases
    print("\n3. Testing edge cases...")

    # Invalid SMILES: either outcome (handled or raised) is reported as a pass
    bad_smiles_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
    try:
        bad_result = compute_morgan_fingerprints(bad_smiles_df.copy())
        print(f" ✓ Invalid SMILES handled: {len(bad_result)} valid molecules")
    except Exception as err:
        print(f" ✓ Invalid SMILES properly raised error: {type(err).__name__}")

    # Pre-existing molecule column: molecules are built up front and passed in
    with_mol_df = test_df.copy()
    with_mol_df["molecule"] = with_mol_df["SMILES"].apply(Chem.MolFromSmiles)
    with_mol_result = compute_morgan_fingerprints(with_mol_df)
    print(f" ✓ Pre-existing molecule column handled: {len(with_mol_result)} fingerprints generated")

    print("\n✅ All fingerprint tests completed!")
|
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
"""
|
|
2
|
+
mol_descriptors.py - Molecular descriptor computation for ADMET modeling
|
|
3
|
+
|
|
4
|
+
Purpose:
|
|
5
|
+
Computes comprehensive molecular descriptors for ADMET (Absorption, Distribution,
|
|
6
|
+
Metabolism, Excretion, Toxicity) property prediction. Combines RDKit's full
|
|
7
|
+
descriptor set with selected Mordred descriptors and custom stereochemistry features.
|
|
8
|
+
|
|
9
|
+
Descriptor Categories:
|
|
10
|
+
1. RDKit Descriptors (~220 descriptors)
|
|
11
|
+
- Constitutional (MW, heavy atom count, rotatable bonds)
|
|
12
|
+
- Topological (Balaban J, Kappa indices, Chi indices)
|
|
13
|
+
- Geometric (radius of gyration, spherocity)
|
|
14
|
+
- Electronic (HOMO/LUMO estimates, partial charges)
|
|
15
|
+
- Lipophilicity (LogP, MolLogP)
|
|
16
|
+
- Pharmacophore (H-bond donors/acceptors, aromatic rings)
|
|
17
|
+
- ADMET-specific (TPSA, QED, Lipinski descriptors)
|
|
18
|
+
|
|
19
|
+
2. Mordred Descriptors (~80 descriptors from 5 ADMET-relevant modules)
|
|
20
|
+
- AcidBase module: pH-dependent properties (nAcid, nBase)
|
|
21
|
+
- Aromatic module: CYP metabolism features (nAromAtom, nAromBond)
|
|
22
|
+
- Constitutional module: Structural complexity (~40 descriptors including nSpiro, nBridgehead)
|
|
23
|
+
- Chi module: Molecular connectivity indices (~42 descriptors, Chi0-Chi4 variants)
|
|
24
|
+
- CarbonTypes module: Carbon hybridization states for metabolism (~20 descriptors)
|
|
25
|
+
|
|
26
|
+
3. Stereochemistry Features (10 custom descriptors)
|
|
27
|
+
- Stereocenter counts (R/S, defined/undefined)
|
|
28
|
+
- Stereobond counts (E/Z, defined/undefined)
|
|
29
|
+
- Stereochemical complexity and coverage metrics
|
|
30
|
+
- Critical for distinguishing drug enantiomers/diastereomers
|
|
31
|
+
|
|
32
|
+
Pipeline Integration:
|
|
33
|
+
This module expects standardized SMILES from mol_standardize.py:
|
|
34
|
+
|
|
35
|
+
1. Standardize structures (mol_standardize.py)
|
|
36
|
+
↓
|
|
37
|
+
2. Compute descriptors (this module)
|
|
38
|
+
↓
|
|
39
|
+
3. Feature selection/ML modeling
|
|
40
|
+
|
|
41
|
+
Output:
|
|
42
|
+
Returns input DataFrame with added descriptor columns:
|
|
43
|
+
- ~220 RDKit descriptors
|
|
44
|
+
- ~85 Mordred descriptors (from 5 modules)
|
|
45
|
+
- 10 stereochemistry descriptors
|
|
46
|
+
Total: ~310 descriptors
|
|
47
|
+
|
|
48
|
+
Invalid molecules receive NaN values for all descriptors.
|
|
49
|
+
|
|
50
|
+
Performance Notes:
|
|
51
|
+
- RDKit descriptors: Fast, vectorized computation
|
|
52
|
+
- Mordred descriptors: Moderate speed
|
|
53
|
+
- Stereochemistry: Moderate speed, requires CIP labeling
|
|
54
|
+
- Memory: <1GB per 10,000 molecules with all descriptors
|
|
55
|
+
|
|
56
|
+
Special Considerations:
|
|
57
|
+
- Ipc descriptor excluded due to potential overflow issues
|
|
58
|
+
- Molecules failing descriptor calculation get NaN (not dropped)
|
|
59
|
+
- Stereochemistry features optional for non-chiral datasets
|
|
60
|
+
- Salt information from standardization not included in descriptors
|
|
61
|
+
(use separately as categorical feature if needed)
|
|
62
|
+
- Feature selection recommended due to descriptor redundancy
|
|
63
|
+
|
|
64
|
+
Example Usage:
|
|
65
|
+
import pandas as pd
|
|
66
|
+
from mol_standardize import standardize
from mol_descriptors import compute_descriptors

# Standard pipeline
df = pd.read_csv("molecules.csv")
df = standardize(df) # Standardize first
|
|
72
|
+
df = compute_descriptors(df) # Then compute descriptors
|
|
73
|
+
|
|
74
|
+
# For achiral molecules (faster)
|
|
75
|
+
df = compute_descriptors(df, include_stereo=False)
|
|
76
|
+
|
|
77
|
+
# Custom SMILES column
|
|
78
|
+
df = compute_descriptors(df.rename(columns={'canonical_smiles': 'smiles'})) # the SMILES column must be named "smiles" (any capitalization)
|
|
79
|
+
|
|
80
|
+
# The resulting DataFrame is ready for ML modeling
|
|
81
|
+
X = df.select_dtypes(include=[np.number]) # All numeric descriptors
|
|
82
|
+
y = df['activity'] # Your target variable
|
|
83
|
+
|
|
84
|
+
References:
|
|
85
|
+
- RDKit descriptors: https://www.rdkit.org/docs/GettingStartedInPython.html#descriptors
|
|
86
|
+
- Mordred: https://github.com/mordred-descriptor/mordred
|
|
87
|
+
- Stereochemistry in drug discovery: https://doi.org/10.1021/acs.jmedchem.0c00915
|
|
88
|
+
"""
|
|
89
|
+
|
|
90
|
+
import logging
|
|
91
|
+
import pandas as pd
|
|
92
|
+
import numpy as np
|
|
93
|
+
import re
|
|
94
|
+
import time
|
|
95
|
+
from contextlib import contextmanager
|
|
96
|
+
from rdkit import Chem
|
|
97
|
+
from rdkit.Chem import Descriptors, rdCIPLabeler
|
|
98
|
+
from rdkit.ML.Descriptors import MoleculeDescriptors
|
|
99
|
+
from mordred import Calculator as MordredCalculator
|
|
100
|
+
from mordred import AcidBase, Aromatic, Constitutional, Chi, CarbonTypes
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# Module logger, obtained under the "workbench" logger name.
# NOTE(review): setLevel(DEBUG) runs at import time and changes the level of that named
# logger for every module that uses it - confirm this override is intended.
logger = logging.getLogger("workbench")
logger.setLevel(logging.DEBUG)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# Helper context manager for timing
|
|
108
|
+
@contextmanager
def timer(name):
    """Context manager that prints how long the enclosed block took.

    Args:
        name: Label printed in front of the elapsed time.

    Yields:
        None. On exit, "<name>: <seconds>s" (two decimals) is printed to stdout,
        including when the enclosed block raises; the exception then propagates.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by wall-clock adjustments
    start = time.perf_counter()
    try:
        yield
    finally:
        # finally: report the elapsed time even if the timed block failed
        print(f"{name}: {time.perf_counter() - start:.2f}s")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _nan_stereo_features():
    """Build the stereochemistry feature dict with every descriptor set to NaN."""
    feature_names = (
        "num_stereocenters",
        "num_unspecified_stereocenters",
        "num_defined_stereocenters",
        "num_r_centers",
        "num_s_centers",
        "num_stereobonds",
        "num_e_bonds",
        "num_z_bonds",
        "stereo_complexity",
        "frac_defined_stereo",
    )
    return dict.fromkeys(feature_names, np.nan)


def compute_stereochemistry_features(mol):
    """
    Count the stereocenters and stereobonds of a molecule using RDKit.

    Args:
        mol: RDKit molecule, or None.

    Returns:
        dict with 10 stereochemistry descriptors. Every value is NaN when mol is None
        or when the calculation raises (the failure is logged as a warning).
    """
    if mol is None:
        return _nan_stereo_features()

    try:
        # Candidate stereogenic elements are collected first, then CIP labels are
        # assigned so the R/S and E/Z codes can be read from the "_CIPCode" property
        candidates = Chem.FindPotentialStereo(mol)
        tally = dict.fromkeys(
            ("centers_set", "centers_open", "R", "S", "bonds_set", "bonds_open", "E", "Z"), 0
        )
        rdCIPLabeler.AssignCIPLabels(mol)

        for item in candidates:
            if item.type == Chem.StereoType.Atom_Tetrahedral:
                if item.specified != Chem.StereoSpecified.Specified:
                    tally["centers_open"] += 1
                    continue
                tally["centers_set"] += 1
                center_atom = mol.GetAtomWithIdx(item.centeredOn)
                if center_atom.HasProp("_CIPCode"):
                    label = center_atom.GetProp("_CIPCode")
                    if label in ("R", "S"):
                        tally[label] += 1

            elif item.type == Chem.StereoType.Bond_Double:
                if item.specified != Chem.StereoSpecified.Specified:
                    tally["bonds_open"] += 1
                    continue
                tally["bonds_set"] += 1
                stereo_bond = mol.GetBondWithIdx(item.centeredOn)
                if stereo_bond.HasProp("_CIPCode"):
                    label = stereo_bond.GetProp("_CIPCode")
                    if label in ("E", "Z"):
                        tally[label] += 1

        # Derived totals
        n_centers = tally["centers_set"] + tally["centers_open"]
        n_bonds = tally["bonds_set"] + tally["bonds_open"]
        n_elements = n_centers + n_bonds

        # A molecule without stereo elements counts as fully defined
        defined_fraction = 1.0
        if n_elements > 0:
            defined_fraction = (tally["centers_set"] + tally["bonds_set"]) / n_elements

        return {
            "num_stereocenters": n_centers,
            "num_unspecified_stereocenters": tally["centers_open"],
            "num_defined_stereocenters": tally["centers_set"],
            "num_r_centers": tally["R"],
            "num_s_centers": tally["S"],
            "num_stereobonds": n_bonds,
            "num_e_bonds": tally["E"],
            "num_z_bonds": tally["Z"],
            "stereo_complexity": n_elements,  # total number of stereogenic elements
            "frac_defined_stereo": defined_fraction,
        }

    except Exception as e:
        logger.warning(f"Stereochemistry calculation failed: {e}")
        return _nan_stereo_features()
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_stereo: bool = True) -> pd.DataFrame:
    """
    Compute all molecular descriptors for ADMET modeling.

    Args:
        df: Input DataFrame with SMILES in a column named "smiles" (any capitalization)
        include_mordred: Whether to compute Mordred descriptors (default True)
        include_stereo: Whether to compute stereochemistry features (default True)

    Returns:
        Copy of the input with all descriptor columns added. The input index is kept,
        existing columns that clash with a descriptor name are replaced, and all column
        names are lowercased/sanitized. Rows whose SMILES is missing, empty or cannot be
        parsed get NaN for every descriptor.

    Raises:
        ValueError: If the DataFrame has no "smiles" column.

    Example:
        df = standardize(df)  # First standardize
        df = compute_descriptors(df)  # Then compute descriptors with stereo
        df = compute_descriptors(df, include_stereo=False)  # Without stereo
        df = compute_descriptors(df, include_mordred=False)  # RDKit only
    """

    # Check for the smiles column (any capitalization); non-string labels can never match
    smiles_column = next((col for col in df.columns if isinstance(col, str) and col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    result = df.copy()

    # Create molecule objects (None marks a missing/empty SMILES or a parse failure)
    logger.info("Creating molecule objects...")
    molecules = [
        None if pd.isna(smiles) or smiles == "" else Chem.MolFromSmiles(smiles) for smiles in result[smiles_column]
    ]

    # Compute RDKit descriptors
    logger.info("Computing RDKit Descriptors...")

    # De-duplicate while keeping RDKit's own ordering (a set would shuffle the output
    # columns from run to run), and drop Ipc due to its overflow issue
    # See: https://github.com/rdkit/rdkit/issues/1527
    all_descriptors = [name for name in dict.fromkeys(x[0] for x in Descriptors._descList) if name != "Ipc"]
    rdkit_calc = MoleculeDescriptors.MolecularDescriptorCalculator(all_descriptors)

    nan_row = [np.nan] * len(all_descriptors)
    descriptor_values = []
    for mol in molecules:
        if mol is None:
            descriptor_values.append(nan_row)
            continue
        try:
            descriptor_values.append(rdkit_calc.CalcDescriptors(mol))
        except Exception as e:
            logger.warning(f"RDKit descriptor calculation failed: {e}")
            descriptor_values.append(nan_row)

    # index=result.index keeps the features on the same rows as the input: concat(axis=1)
    # aligns on index labels, so a fresh RangeIndex would misalign filtered/re-indexed input
    rdkit_features_df = pd.DataFrame(
        descriptor_values, columns=rdkit_calc.GetDescriptorNames(), index=result.index
    )

    # Remove any columns from result that exist in rdkit_features_df, then add the features
    result = result.drop(columns=result.columns.intersection(rdkit_features_df.columns))
    result = pd.concat([result, rdkit_features_df], axis=1)

    # Compute Mordred descriptors
    mordred_count = 0
    if include_mordred:
        logger.info("Computing Mordred descriptors from relevant modules...")
        mordred_calc = MordredCalculator()

        # Register 5 ADMET-focused modules (avoiding overlap with RDKit)
        for module in (AcidBase, Aromatic, Constitutional, Chi, CarbonTypes):
            mordred_calc.register(module)

        # Invalid entries are replaced by a placeholder molecule so the batch call has
        # one input per row; their outputs are blanked out right after
        placeholder_filled = [mol if mol is not None else Chem.MolFromSmiles("C") for mol in molecules]
        mordred_df = mordred_calc.pandas(placeholder_filled, nproc=1)  # Endpoint multiprocessing will fail with nproc>1
        mordred_df.index = result.index  # same row-alignment reason as for the RDKit features

        # Replace values for invalid molecules with NaN
        for position, mol in enumerate(molecules):
            if mol is None:
                mordred_df.iloc[position] = np.nan

        # Handle Mordred's special error values (anything non-numeric becomes NaN)
        for col in mordred_df.columns:
            mordred_df[col] = pd.to_numeric(mordred_df[col], errors="coerce")

        # Remove any columns from result that exist in mordred, then add the features
        result = result.drop(columns=result.columns.intersection(mordred_df.columns))
        result = pd.concat([result, mordred_df], axis=1)
        mordred_count = len(mordred_df.columns)

    # Compute stereochemistry features if requested
    stereo_count = 0
    if include_stereo:
        logger.info("Computing Stereochemistry Descriptors...")
        stereo_df = pd.DataFrame(
            [compute_stereochemistry_features(mol) for mol in molecules], index=result.index
        )

        # Add stereochemistry features to result
        result = result.drop(columns=result.columns.intersection(stereo_df.columns))
        result = pd.concat([result, stereo_df], axis=1)
        stereo_count = len(stereo_df.columns)

        logger.info(f"Added {stereo_count} stereochemistry descriptors")

    # Log summary
    valid_count = sum(mol is not None for mol in molecules)
    total_descriptors = len(result.columns) - len(df.columns)
    logger.info(f"Computed {total_descriptors} descriptors for {valid_count}/{len(df)} valid molecules")

    # Log descriptor breakdown
    rdkit_count = len(rdkit_features_df.columns)
    logger.info(f"Descriptor breakdown: RDKit={rdkit_count}, Mordred={mordred_count}, Stereo={stereo_count}")

    # Sanitize column names for AWS Athena compatibility
    # - Must be lowercase, no special characters except underscore, no spaces
    result.columns = [re.sub(r"_+", "_", re.sub(r"[^a-z0-9_]", "_", str(col).lower())) for col in result.columns]

    # Drop duplicate columns if any exist after sanitization (the first occurrence is kept)
    if result.columns.duplicated().any():
        logger.warning("Duplicate column names after sanitization - dropping duplicates!")
        result = result.loc[:, ~result.columns.duplicated()]

    return result
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
if __name__ == "__main__":
    # Manual test harness: a few basic checks on small hand-written inputs, followed by
    # timing runs on a dataset pulled through the Workbench DataSource API.
    # These imports are only needed when the module is run as a script.
    from mol_standardize import standardize
    from workbench.api import DataSource

    # Configure pandas display
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_colwidth", 100)
    pd.set_option("display.width", 1200)

    # Test data - stereochemistry examples
    stereo_test_data = pd.DataFrame(
        {
            "smiles": [
                "CC(=O)Oc1ccccc1C(=O)O",  # Aspirin
                "C[C@H](N)C(=O)O",  # L-Alanine
                "C[C@@H](N)C(=O)O",  # D-Alanine
                "C/C=C/C=C/C",  # E,E-hexadiene
                "CC(F)(Cl)Br",  # Unspecified chiral
                "",
                "INVALID",  # Invalid cases
            ],
            "name": ["Aspirin", "L-Alanine", "D-Alanine", "E,E-hexadiene", "Unspecified", "Empty", "Invalid"],
        }
    )

    # Test data - salt handling examples
    salt_test_data = pd.DataFrame(
        {
            "smiles": [
                "CC(=O)O",  # Acetic acid
                "[Na+].CC(=O)[O-]",  # Sodium acetate
                "CC(C)NCC(O)c1ccc(O)c(O)c1.Cl",  # Drug HCl salt
                "Oc1ccccn1",  # Tautomer 1
                "O=c1cccc[nH]1",  # Tautomer 2
            ],
            "compound_id": [f"C{i:03d}" for i in range(1, 6)],
        }
    )

    def run_basic_tests():
        """Run basic functionality tests.

        Computes descriptors for the stereochemistry examples and prints the stereocenter
        counts of the first four molecules, then runs standardize() with extract_salts=True
        on the salt examples and prints each compound with its salt, if any.
        """
        print("=" * 80)
        print("BASIC FUNCTIONALITY TESTS")
        print("=" * 80)

        # Test stereochemistry
        result = compute_descriptors(stereo_test_data, include_stereo=True)

        print("\nStereochemistry features (selected molecules):")
        # Rows are read back by position, matching the order of stereo_test_data
        for idx, name in enumerate(stereo_test_data["name"][:4]):
            print(
                f"{name:15} - centers: {result.iloc[idx]['num_stereocenters']:.0f}, "
                f"R/S: {result.iloc[idx]['num_r_centers']:.0f}/"
                f"{result.iloc[idx]['num_s_centers']:.0f}"
            )

        # Test salt handling
        print("\nSalt extraction test:")
        # presumably standardize() adds the "salt" column read below - verify in mol_standardize
        std_result = standardize(salt_test_data, extract_salts=True)
        for _, row in std_result.iterrows():
            salt_info = f" → salt: {row['salt']}" if pd.notna(row["salt"]) else ""
            print(f"{row['compound_id']}: {row['smiles'][:30]}{salt_info}")

    def run_performance_tests():
        """Run performance timing tests.

        Times standardize() and compute_descriptors() in several configurations on up to
        1000 molecules from the "aqsol_data" DataSource, then times the full pipeline.

        Returns:
            List of (name, elapsed seconds, molecules per second) tuples, one per configuration.
        """
        print("\n" + "=" * 80)
        print("PERFORMANCE TESTS on real world molecules")
        print("=" * 80)

        # Get a real dataset from Workbench
        ds = DataSource("aqsol_data")
        df = ds.pull_dataframe()[["id", "smiles"]][:1000]  # Limit to 1000 for testing
        n_mols = df.shape[0]
        print(f"Pulled {n_mols} molecules from DataSource 'aqsol_data'")

        # Test configurations: (label, function to time, keyword arguments)
        configs = [
            ("Standardize (full)", standardize, {"extract_salts": True, "canonicalize_tautomer": True}),
            ("Standardize (minimal)", standardize, {"extract_salts": False, "canonicalize_tautomer": False}),
            ("Descriptors (all)", compute_descriptors, {"include_mordred": True, "include_stereo": True}),
            ("Descriptors (RDKit only)", compute_descriptors, {"include_mordred": False, "include_stereo": False}),
        ]

        results = []
        for name, func, params in configs:
            # Each configuration is timed on the same unmodified input frame
            start = time.time()
            _ = func(df, **params)
            elapsed = time.time() - start
            throughput = n_mols / elapsed
            results.append((name, elapsed, throughput))
            print(f"{name:25} {elapsed:6.2f}s ({throughput:6.1f} mol/s)")

        # Full pipeline test: the descriptor step runs on the standardized output
        print("\nFull pipeline (standardize + all descriptors):")
        start = time.time()
        std_data = standardize(df)
        standardize_time = time.time() - start
        print(f" Standardize: {standardize_time:.2f}s ({n_mols / standardize_time:.1f} mol/s)")
        start = time.time()
        _ = compute_descriptors(std_data)
        descriptor_time = time.time() - start
        print(f" Descriptors: {descriptor_time:.2f}s ({n_mols / descriptor_time:.1f} mol/s)")
        pipeline_time = standardize_time + descriptor_time
        print(f" Total: {pipeline_time:.2f}s ({n_mols / pipeline_time:.1f} mol/s)")

        return results

    # Run tests
    run_basic_tests()
    timing_results = run_performance_tests()

    print("\n✅ All tests completed!")
|