synkit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkit/Chem/Fingerprint/__init__.py +0 -0
- synkit/Chem/Fingerprint/fp_calculator.py +122 -0
- synkit/Chem/Fingerprint/smiles_featurizer.py +185 -0
- synkit/Chem/Fingerprint/transformation_fp.py +79 -0
- synkit/Chem/Molecule/__init__.py +0 -0
- synkit/Chem/Molecule/standardize.py +137 -0
- synkit/Chem/Reaction/__init__.py +0 -0
- synkit/Chem/Reaction/balance_check.py +162 -0
- synkit/Chem/Reaction/cleanning.py +59 -0
- synkit/Chem/Reaction/deionize.py +289 -0
- synkit/Chem/Reaction/neutralize.py +256 -0
- synkit/Chem/Reaction/reagent.py +102 -0
- synkit/Chem/Reaction/standardize.py +157 -0
- synkit/Chem/Reaction/tautomerize.py +168 -0
- synkit/Graph/Cluster/__init__.py +0 -0
- synkit/Graph/Cluster/morphism.py +83 -0
- synkit/Graph/Feature/__init__.py +0 -0
- synkit/Graph/Feature/graph_descriptors.py +325 -0
- synkit/Graph/Feature/graph_fps.py +97 -0
- synkit/Graph/Feature/graph_signature.py +236 -0
- synkit/Graph/Feature/hash_fps.py +130 -0
- synkit/Graph/Feature/morgan_fps.py +87 -0
- synkit/Graph/Feature/path_fps.py +82 -0
- synkit/Graph/__init.py +0 -0
- synkit/IO/__init__.py +0 -0
- synkit/IO/chem_converter.py +231 -0
- synkit/IO/data_io.py +277 -0
- synkit/IO/data_process.py +49 -0
- synkit/IO/debug.py +78 -0
- synkit/IO/dg_to_gml.py +124 -0
- synkit/IO/gml_to_nx.py +119 -0
- synkit/IO/graph_to_mol.py +110 -0
- synkit/IO/mol_to_graph.py +282 -0
- synkit/IO/nx_to_gml.py +200 -0
- synkit/IO/parse_rule.py +172 -0
- synkit/IO/smiles_to_id.py +119 -0
- synkit/ITS/_misc.py +280 -0
- synkit/ITS/aam_validator.py +254 -0
- synkit/ITS/its_builder.py +94 -0
- synkit/ITS/its_construction.py +213 -0
- synkit/ITS/normalize_aam.py +183 -0
- synkit/ITS/partial_expand.py +170 -0
- synkit/Reactor/__init__.py +0 -0
- synkit/Reactor/core_engine.py +164 -0
- synkit/Reactor/inference.py +73 -0
- synkit/Reactor/multi_step.py +227 -0
- synkit/Reactor/multi_step_aam.py +82 -0
- synkit/Reactor/reagent.py +95 -0
- synkit/Reactor/rule_apply.py +81 -0
- synkit/Vis/__init__.py +0 -0
- synkit/Vis/chemical_graph_visualizer.py +378 -0
- synkit/Vis/chemical_reaction_visualizer.py +133 -0
- synkit/Vis/chemical_space.py +83 -0
- synkit/Vis/embedding.py +92 -0
- synkit/Vis/graph_visualizer.py +286 -0
- synkit/Vis/pdf_writer.py +143 -0
- synkit/Vis/rsmi_to_fig.py +169 -0
- synkit/__init__.py +0 -0
- synkit/_misc.py +181 -0
- synkit-0.0.1.dist-info/METADATA +148 -0
- synkit-0.0.1.dist-info/RECORD +63 -0
- synkit-0.0.1.dist-info/WHEEL +4 -0
- synkit-0.0.1.dist-info/licenses/LICENSE +21 -0
|
File without changes
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from drfp import DrfpEncoder
|
|
4
|
+
from joblib import Parallel, delayed
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from synkit.IO.debug import setup_logging
|
|
7
|
+
from synkit.Chem.Fingerprint.transformation_fp import TransformationFP
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FPCalculator:
    """
    Calculate fingerprint vectors for the SMILES strings stored in a DataFrame column.

    Attributes:
    - data (pd.DataFrame): DataFrame containing SMILES strings and potentially other data.
    - smiles_column (str): Name of the column containing the SMILES strings.
    - fp_type (str): Type of fingerprint to calculate; must be one of `_VALID_FP_TYPES`.
    - n_jobs (int): Number of parallel jobs handed to joblib.
    - verbose (int): Verbosity level handed to joblib.
    - save_path (Optional[str]): Path to save the resulting DataFrame as CSV.
      If None (or empty), the DataFrame is not saved.
    - logger: Object returned by `setup_logging()`; used to report the save location.
    """

    # Fingerprint names accepted by the constructor.
    # NOTE(review): "pharm2D" is spelled with a capital D here, while
    # SmilesFeaturizer.featurize_smiles compares against lowercase "pharm2d";
    # confirm which spelling is intended.
    _VALID_FP_TYPES = (
        "drfp",
        "avalon",
        "maccs",
        "torsion",
        "pharm2D",
        "ecfp2",
        "ecfp4",
        "ecfp6",
        "fcfp2",
        "fcfp4",
        "fcfp6",
    )

    def __init__(
        self,
        data: pd.DataFrame,
        smiles_column: str = "smiles",
        fp_type: str = "drfp",
        n_jobs: int = 2,
        verbose: int = 0,
        save_path: Optional[str] = None,
    ):
        """
        Store the configuration and validate the requested fingerprint type.

        Raises:
        ValueError: If `fp_type` is not one of the supported fingerprint names.
        """
        self.data = data
        self.smiles_column = smiles_column
        self.fp_type = fp_type
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.save_path = save_path
        self.logger = setup_logging()
        self._validate_fp_type(fp_type)

    def _validate_fp_type(self, fp_type: str) -> None:
        """
        Check that `fp_type` is a supported fingerprint name.

        Parameters:
        - fp_type (str): Fingerprint name to check.

        Raises:
        ValueError: If `fp_type` is not in the list of supported names.
        """
        valid_fps = self._VALID_FP_TYPES
        if fp_type not in valid_fps:
            raise ValueError(
                f"Unsupported fingerprint type '{fp_type}'. Currently supported: {', '.join(valid_fps)}."
            )

    @staticmethod
    def calculate_drfp(smiles: str) -> np.ndarray:
        """
        Calculate the fingerprint vector for a given SMILES string using the DrfpEncoder.

        Parameters:
        - smiles (str): The SMILES string passed to `DrfpEncoder.encode`.

        Returns:
        - np.ndarray: The first element of the encoder output.
        """
        return DrfpEncoder.encode(smiles)[0]

    @staticmethod
    def smiles_to_vec(reaction_smiles: str, fp_type: str) -> np.ndarray:
        """
        Convert a reaction SMILES string to a fingerprint vector of the requested type.

        Parameters:
        - reaction_smiles (str): The SMILES string to encode. For every type other
          than 'drfp' it is split on '>>' by `TransformationFP.fit`.
        - fp_type (str): Type of fingerprint to calculate.

        Returns:
        - np.ndarray: The fingerprint vector.
        """
        if fp_type == "drfp":
            return FPCalculator.calculate_drfp(reaction_smiles)
        # Fourth positional argument is `abs`: absolute product-minus-reactant difference.
        return TransformationFP.fit(reaction_smiles, ">>", fp_type, True)

    def fit(self) -> pd.DataFrame:
        """
        Calculate the fingerprints for all SMILES strings in the dataset.

        One column per fingerprint position is appended to the data, named
        '<fp_type>_<i>'. The result is written to `save_path` as CSV when it is set.

        Returns:
        - pd.DataFrame: The original columns followed by the fingerprint columns.

        Raises:
        ValueError: If the SMILES column specified does not exist in the DataFrame.
        """
        if self.smiles_column not in self.data.columns:
            raise ValueError(
                f"Column '{self.smiles_column}' does not exist in the DataFrame."
            )

        if len(self.data) == 0:
            # Nothing to encode; avoids indexing fps[0] on an empty result.
            fps = []
        else:
            fps = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(FPCalculator.smiles_to_vec)(smiles, self.fp_type)
                for smiles in self.data[self.smiles_column]
            )

        if fps:
            columns = [f"{self.fp_type}_{i}" for i in range(len(fps[0]))]
            # Reuse the input index so the concat below pairs each fingerprint
            # with its source row even when the index is not 0..n-1.
            fps_df = pd.DataFrame(fps, columns=columns, index=self.data.index)
        else:
            fps_df = pd.DataFrame(index=self.data.index)
        result_data = pd.concat([self.data, fps_df], axis=1)

        if self.save_path:
            result_data.to_csv(self.save_path, index=False)
            self.logger.info(f"Data saved to {self.save_path}")

        return result_data
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from rdkit import Chem, DataStructs
|
|
3
|
+
from rdkit.Chem import AllChem, MACCSkeys
|
|
4
|
+
from rdkit.Chem.AtomPairs import Pairs, Torsions
|
|
5
|
+
from rdkit.Avalon import pyAvalonTools as fpAvalon
|
|
6
|
+
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D, Generate
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SmilesFeaturizer:
    """
    Helpers that turn a SMILES string (or an RDKit Mol) into RDKit fingerprints.
    """

    def __init__(self):
        """
        Initializes the SmilesFeaturizer class without any specific parameters for fingerprint generation.
        """
        pass

    @staticmethod
    def smiles_to_mol(smiles: str) -> Chem.Mol:
        """
        Converts a SMILES string to an RDKit Mol object.

        Parameters:
        - smiles (str): The SMILES string to be converted.

        Returns:
        - Chem.Mol: The corresponding RDKit Mol object.

        Raises:
        ValueError: If RDKit cannot parse the SMILES string.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("Invalid SMILES string provided.")
        return mol

    @staticmethod
    def get_maccs_keys(mol: Chem.Mol):
        """
        Generates MACCS keys fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - RDKit ExplicitBitVect: The MACCS keys fingerprint of the Mol object.
        """
        return MACCSkeys.GenMACCSKeys(mol)

    @staticmethod
    def get_avalon_fp(mol: Chem.Mol, nBits: int = 1024):
        """
        Generates Avalon fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.
        - nBits (int): The number of bits in the generated fingerprint.

        Returns:
        - RDKit ExplicitBitVect: The Avalon fingerprint of the Mol object.
        """
        return fpAvalon.GetAvalonFP(mol, nBits)

    @staticmethod
    def get_ecfp(
        mol: Chem.Mol, radius: int, nBits: int = 2048, useFeatures: bool = False
    ):
        """
        Generates Extended-Connectivity Fingerprints (ECFP) or
        Feature-Class Fingerprints (FCFP) from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.
        - radius (int): The radius of the fingerprint.
        - nBits (int): The number of bits in the generated fingerprint.
        - useFeatures (bool): Whether to use atom features instead of atom identities.

        Returns:
        - RDKit ExplicitBitVect: The ECFP or FCFP fingerprint of the Mol object.
        """
        return AllChem.GetMorganFingerprintAsBitVect(
            mol, radius, nBits=nBits, useFeatures=useFeatures
        )

    @staticmethod
    def get_rdk_fp(
        mol: Chem.Mol, maxPath: int, fpSize: int = 2048, nBitsPerHash: int = 2
    ):
        """
        Generates RDKit fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.
        - maxPath (int): The maximum path length (in bonds) to be included.
        - fpSize (int): The size of the fingerprint.
        - nBitsPerHash (int): The number of bits per hash.

        Returns:
        - RDKit ExplicitBitVect: The RDKit fingerprint of the Mol object.
        """
        return Chem.RDKFingerprint(
            mol, maxPath=maxPath, fpSize=fpSize, nBitsPerHash=nBitsPerHash
        )

    @staticmethod
    def mol_to_ap(mol: Chem.Mol):
        """
        Generates an Atom Pair fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - The object returned by `Pairs.GetAtomPairFingerprint` (an RDKit sparse
          vector, not a NumPy array).
        """
        return Pairs.GetAtomPairFingerprint(mol)

    @staticmethod
    def mol_to_torsion(mol: Chem.Mol):
        """
        Generates a Topological Torsion fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - The object returned by `Torsions.GetTopologicalTorsionFingerprintAsIntVect`
          (an RDKit sparse vector, not a NumPy array).
        """
        return Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)

    @staticmethod
    def mol_to_pharm2d(mol: Chem.Mol):
        """
        Generates a 2D Pharmacophore fingerprint from an RDKit Mol object
        using the Gobbi pharmacophore factory.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - The object returned by `Generate.Gen2DFingerprint` (an RDKit bit vector,
          not a NumPy array).
        """
        return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)

    @staticmethod
    def _numeric_suffix(fingerprint_type: str, prefix_len: int) -> int:
        """Return the integer that follows the first `prefix_len` characters of the name."""
        try:
            return int(fingerprint_type[prefix_len:])
        except ValueError:
            # Missing or non-numeric suffix, e.g. "ecfp" or "rdkx".
            raise ValueError(
                f"Unsupported fingerprint type: {fingerprint_type}"
            ) from None

    @classmethod
    def featurize_smiles(
        cls, smiles: str, fingerprint_type: str, convert_to_array: bool = True, **kwargs
    ) -> np.ndarray:
        """
        Featurizes a SMILES string into the specified type of fingerprint.

        Supported names: 'maccs', 'avalon', 'ecfp<N>', 'fcfp<N>', 'rdk<N>', 'ap',
        'torsion' and 'pharm2d'. For 'ecfp<N>'/'fcfp<N>' the number N is passed
        directly as the Morgan radius, and for 'rdk<N>' as `maxPath`.
        NOTE(review): 'ecfp4' conventionally denotes diameter 4 (radius 2);
        here it yields radius 4 — confirm this is intended.

        Parameters:
        - smiles (str): The SMILES string to be featurized.
        - fingerprint_type (str): The type of fingerprint to generate.
        - convert_to_array (bool): Whether to convert the fingerprint to a NumPy array.
          Defaults to True. Ignored for 'ap', 'torsion' and 'pharm2d', which are always
          returned as the raw RDKit object.
        - **kwargs: Additional keyword arguments for the fingerprint function
          (only `nBits` is used for 'ecfp'/'fcfp').

        Returns:
        - np.ndarray or RDKit fingerprint object: The requested fingerprint, either as a
          NumPy array or as the RDKit object, depending on `convert_to_array` and the type.

        Raises:
        ValueError: If the SMILES is invalid or the fingerprint type is not supported.
        """
        mol = cls.smiles_to_mol(smiles)
        if fingerprint_type == "maccs":
            fp = cls.get_maccs_keys(mol)
        elif fingerprint_type == "avalon":
            fp = cls.get_avalon_fp(mol, **kwargs)
        elif fingerprint_type.startswith(("ecfp", "fcfp")):
            radius = cls._numeric_suffix(fingerprint_type, 4)
            useFeatures = fingerprint_type.startswith("fcfp")
            nBits = kwargs.get("nBits", 2048)
            fp = cls.get_ecfp(mol, radius, nBits=nBits, useFeatures=useFeatures)
        elif fingerprint_type.startswith("rdk"):
            maxPath = cls._numeric_suffix(fingerprint_type, 3)
            fp = cls.get_rdk_fp(mol, maxPath, **kwargs)
        elif fingerprint_type == "ap":
            # Previously guarded by a second, unreachable "avalon" test.
            return cls.mol_to_ap(mol)
        elif fingerprint_type == "torsion":
            return cls.mol_to_torsion(mol)
        elif fingerprint_type == "pharm2d":
            # NOTE(review): the original code had a bit-string conversion for
            # 'pharm2d' after this early return, so it could never run; the raw
            # fingerprint is still returned here to keep existing behaviour.
            return cls.mol_to_pharm2d(mol)
        else:
            raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")

        if not convert_to_array:
            return fp
        # ConvertToNumpyArray resizes the destination to the fingerprint length.
        ar = np.zeros((1,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, ar)
        return ar
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import Union, Any
|
|
3
|
+
from rdkit.DataStructs import cDataStructs
|
|
4
|
+
from synkit.Chem.Fingerprint.smiles_featurizer import SmilesFeaturizer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TransformationFP:
    """
    A class for handling the transformation of chemical reactions into reaction fingerprints
    based on SMILES strings.
    """

    def __init__(self) -> None:
        """
        Initializes the TransformationFP object. Currently, this constructor does not
        perform any operations.
        """
        pass

    @staticmethod
    def convert_arr2vec(arr: np.ndarray) -> cDataStructs.ExplicitBitVect:
        """
        Converts a numpy array to a RDKit ExplicitBitVect.

        Every non-zero entry sets the corresponding bit. Reaction fingerprints can
        contain counts above 1 or negative values, which would not form a valid
        '0'/'1' bit string if written out directly.

        Parameters:
        - arr (np.ndarray): The input array (one entry per bit).

        Returns:
        - cDataStructs.ExplicitBitVect: The converted bit vector.
        """
        bits = (np.asarray(arr) != 0).astype(np.uint8)
        bit_string = "".join(bits.astype(str))
        return cDataStructs.CreateFromBitString(bit_string)

    @staticmethod
    def _sum_component_fps(side: str, fp_type: str, **kwargs: Any):
        """Sum the fingerprints of all '.'-separated molecules on one side of a reaction."""
        total = None
        for component in side.split("."):
            fp = SmilesFeaturizer.featurize_smiles(component, fp_type, **kwargs)
            if total is None:
                total = fp
            else:
                total += fp
        return total

    @staticmethod
    def fit(
        reaction_smiles: str,
        symbols: str,
        fp_type: str,
        abs: bool,
        return_array: bool = True,
        **kwargs: Any,
    ) -> Union[np.ndarray, cDataStructs.ExplicitBitVect]:
        """
        Generates a reaction fingerprint for a given reaction represented by a SMILES string.

        The fingerprints of all molecules on each side are summed and the reactant
        sum is subtracted from the product sum.

        Parameters:
        - reaction_smiles (str): The SMILES string of the reaction, separated by `symbols`.
        - symbols (str): The symbol used to separate reactants and products in the SMILES string.
        - fp_type (str): The type of fingerprint to generate (e.g., 'maccs', 'ecfp4').
        - abs (bool): Whether to take the absolute value of the reaction fingerprint difference.
        - return_array (bool): Whether to return the reaction fingerprint as a numpy array or as a bit vector.
        - **kwargs: Forwarded to `SmilesFeaturizer.featurize_smiles`.

        Returns:
        - Union[np.ndarray, cDataStructs.ExplicitBitVect]: The reaction fingerprint either as an array
        or a bit vector, depending on the value of `return_array`.

        Raises:
        ValueError: If `reaction_smiles` does not contain `symbols` exactly once.
        """
        sides = reaction_smiles.split(symbols)
        if len(sides) != 2:
            raise ValueError(
                f"Expected exactly one '{symbols}' separator in reaction SMILES: {reaction_smiles!r}"
            )
        react, prod = sides

        react_fps = TransformationFP._sum_component_fps(react, fp_type, **kwargs)
        prod_fps = TransformationFP._sum_component_fps(prod, fp_type, **kwargs)

        reaction_fp = np.subtract(prod_fps, react_fps)
        if abs:
            reaction_fp = np.abs(reaction_fp)
        if return_array:
            return reaction_fp
        return TransformationFP.convert_arr2vec(reaction_fp)
|
|
File without changes
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
from rdkit import Chem
|
|
2
|
+
from rdkit.Chem import rdmolops
|
|
3
|
+
from rdkit.Chem.MolStandardize import rdMolStandardize
|
|
4
|
+
from rdkit.Chem.SaltRemover import SaltRemover
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def normalize_molecule(mol: Chem.Mol) -> Chem.Mol:
    """
    Run RDKit's `rdMolStandardize.Normalizer` on a molecule.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object to be normalized.

    Returns:
    - Chem.Mol: The result of `Normalizer.normalize(mol)`.
    """
    return rdMolStandardize.Normalizer().normalize(mol)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def canonicalize_tautomer(mol: Chem.Mol) -> Chem.Mol:
    """
    Return the canonical tautomer of a molecule via RDKit's `TautomerEnumerator`.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The result of `TautomerEnumerator.Canonicalize(mol)`.
    """
    return rdMolStandardize.TautomerEnumerator().Canonicalize(mol)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def salts_remover(mol: Chem.Mol) -> Chem.Mol:
    """
    Strip salt fragments from a molecule with a default-constructed RDKit `SaltRemover`.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The result of `SaltRemover.StripMol(mol)`.
    """
    return SaltRemover().StripMol(mol)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def uncharge_molecule(mol: Chem.Mol) -> Chem.Mol:
    """
    Neutralize a molecule with RDKit's `rdMolStandardize.Uncharger`.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The result of `Uncharger.uncharge(mol)`.
    """
    return rdMolStandardize.Uncharger().uncharge(mol)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def fragments_remover(mol: Chem.Mol) -> Chem.Mol:
    """
    Keep only the fragment of a molecule that has the most atoms.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The largest fragment (the first one on ties), or None when
      `Chem.GetMolFrags` yields no fragments.
    """
    fragment_mols = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
    if not fragment_mols:
        return None
    return max(fragment_mols, key=lambda fragment: fragment.GetNumAtoms())
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def remove_explicit_hydrogens(mol: Chem.Mol) -> Chem.Mol:
    """
    Remove hydrogens from a molecule by delegating to `Chem.RemoveHs`.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The molecule returned by `Chem.RemoveHs(mol)`.
    """
    stripped = Chem.RemoveHs(mol)
    return stripped
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def remove_radicals_and_add_hydrogens(mol: Chem.Mol) -> Chem.Mol:
    """
    Replace radical electrons with explicit hydrogens.

    For each atom carrying radical electrons, the explicit-H count is raised by
    the number of radical electrons and the radical count is set to zero.
    Hydrogens are then added with `rdmolops.AddHs` and removed again through
    `remove_explicit_hydrogens`.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: Mol object with radicals removed.
    """
    # Work on the H-stripped molecule returned by RemoveHs.
    working = Chem.RemoveHs(mol)
    for atom in working.GetAtoms():
        n_radicals = atom.GetNumRadicalElectrons()
        if n_radicals <= 0:
            continue
        atom.SetNumExplicitHs(atom.GetNumExplicitHs() + n_radicals)
        atom.SetNumRadicalElectrons(0)
    hydrogenated = rdmolops.AddHs(working)
    return remove_explicit_hydrogens(hydrogenated)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def remove_isotopes(mol: Chem.Mol) -> Chem.Mol:
    """
    Reset the isotope label of every atom to 0.

    The atoms of the passed molecule are modified directly and the same
    object is returned.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The input Mol object with isotopes removed.
    """
    atoms = mol.GetAtoms()
    for current_atom in atoms:
        current_atom.SetIsotope(0)
    return mol
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def clear_stereochemistry(mol: Chem.Mol) -> Chem.Mol:
    """
    Strip stereochemical information from a molecule.

    `Chem.RemoveStereochemistry` is applied to the passed molecule and that
    same object is returned.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The input Mol object with stereochemistry cleared.
    """
    target = mol
    Chem.RemoveStereochemistry(target)
    return target
|
|
File without changes
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
from rdkit import Chem
|
|
2
|
+
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
|
|
3
|
+
|
|
4
|
+
from joblib import Parallel, delayed
|
|
5
|
+
from typing import List, Dict, Union, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BalanceReactionCheck:
    """
    A class to check the balance of chemical reactions given in SMILES format.
    It supports parallel execution and maintains the input format in the output.
    """

    def __init__(
        self,
        n_jobs: int = 4,
        verbose: int = 0,
    ):
        """
        Initializes the checker with the joblib settings used by `dicts_balance_check`.

        Parameters:
        - n_jobs (int): The number of parallel jobs to run for balance checking.
        - verbose (int): The verbosity level of joblib parallel execution.
        """
        self.n_jobs = n_jobs
        self.verbose = verbose

    @staticmethod
    def get_combined_molecular_formula(smiles: str) -> str:
        """
        Computes the molecular formula for the molecule(s) represented by a SMILES string.

        Parameters:
        - smiles (str): The SMILES string of the molecule.

        Returns:
        - str: The molecular formula, or an empty string if the molecule is invalid.
        """
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return ""
        return CalcMolFormula(mol)

    @staticmethod
    def parse_input(
        input_data: Union[str, List[Union[str, Dict[str, str]]]],
        rsmi_column: str = "reactions",
    ) -> List[Dict[str, str]]:
        """
        Parses the input data into a standardized list containing
        dictionaries for each reaction.

        Parameters:
        - input_data (Union[str, List[Union[str, Dict[str, str]]]]):
        A single reaction SMILES, or a list (or tuple) whose items are reaction
        SMILES strings or dictionaries containing the `rsmi_column` key.
        - rsmi_column (str): The key under which the reaction SMILES is stored.

        Returns:
        - List[Dict[str, str]]: A list of dictionaries with reaction SMILES strings.
          Dictionary items are passed through as the same objects.

        Raises:
        ValueError: If the input, or one of its items, has an unsupported type or
        a dictionary item lacks the `rsmi_column` key.
        """
        if isinstance(input_data, str):
            return [{rsmi_column: input_data}]
        if not isinstance(input_data, (list, tuple)):
            # Previously such input was silently turned into an empty result.
            raise ValueError("Unsupported input type")

        standardized_input = []
        for item in input_data:
            if isinstance(item, str):
                standardized_input.append({rsmi_column: item})
            elif isinstance(item, dict) and rsmi_column in item:
                standardized_input.append(item)
            else:
                raise ValueError("Unsupported input type")
        return standardized_input

    @staticmethod
    def parse_reaction(reaction_smiles: str) -> Tuple[str, str]:
        """
        Splits a reaction SMILES string into its reactant and product parts.

        Parameters:
        - reaction_smiles (str): A SMILES string representing a chemical reaction.

        Returns:
        - Tuple[str, str]: The SMILES text before and after the '>>' separator.

        Raises:
        ValueError: If the string does not contain '>>' exactly once.
        """
        sides = reaction_smiles.split(">>")
        if len(sides) != 2:
            raise ValueError(
                f"Expected exactly one '>>' separator in reaction SMILES: {reaction_smiles!r}"
            )
        reactants_smiles, products_smiles = sides
        return reactants_smiles, products_smiles

    @staticmethod
    def rsmi_balance_check(reaction_smiles: str) -> bool:
        """
        Checks if a reaction SMILES string is balanced by comparing the molecular
        formulas of both sides.

        Parameters:
        - reaction_smiles (str): A SMILES string representing a chemical reaction.

        Returns:
        - bool: True if both sides yield the same non-empty formula, False otherwise.
        """
        reactants_smiles, products_smiles = BalanceReactionCheck.parse_reaction(
            reaction_smiles
        )
        reactants_formula = BalanceReactionCheck.get_combined_molecular_formula(
            reactants_smiles
        )
        products_formula = BalanceReactionCheck.get_combined_molecular_formula(
            products_smiles
        )
        # An empty formula marks an unparsable (or atom-free) side; two such
        # sides must not compare equal and be reported as balanced.
        if not reactants_formula or not products_formula:
            return False
        return reactants_formula == products_formula

    @staticmethod
    def dict_balance_check(
        reaction_dict: Dict[str, str], rsmi_column: str
    ) -> Dict[str, Union[bool, str]]:
        """
        Checks if a single reaction (in SMILES format) is balanced, maintaining
        the input format.

        Parameters:
        - reaction_dict (Dict[str, str]): A dictionary containing the
        reaction SMILES string.
        - rsmi_column (str): The key under which the reaction SMILES is stored.

        Returns:
        - Dict[str, Union[bool, str]]: A new dictionary with a leading 'balanced'
        entry followed by the original reaction data.
        """
        reaction_smiles = reaction_dict[rsmi_column]
        balance = BalanceReactionCheck.rsmi_balance_check(reaction_smiles)
        result = {"balanced": balance, **reaction_dict}
        # A 'balanced' key already present in the input must not override the
        # freshly computed value.
        result["balanced"] = balance
        return result

    def dicts_balance_check(
        self,
        input_data: Union[str, List[Union[str, Dict[str, str]]]],
        rsmi_column: str = "reactions",
    ) -> Tuple[List[Dict[str, Union[bool, str]]], List[Dict[str, Union[bool, str]]]]:
        """
        Checks the balance of all reactions in the input data.

        Parameters:
        - input_data: See `parse_input`.
        - rsmi_column (str): The key under which the reaction SMILES is stored.

        Returns:
        - Tuple[List[Dict[str, Union[bool, str]]], List[Dict[str, Union[bool, str]]]]:
        Two lists containing dictionaries of balanced and unbalanced reactions,
        respectively.
        """
        reactions = self.parse_input(input_data, rsmi_column)
        results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self.dict_balance_check)(reaction, rsmi_column)
            for reaction in reactions
        )

        balanced_reactions = []
        unbalanced_reactions = []
        for reaction in results:
            if reaction["balanced"]:
                balanced_reactions.append(reaction)
            else:
                unbalanced_reactions.append(reaction)

        return balanced_reactions, unbalanced_reactions
|