synkit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. synkit/Chem/Fingerprint/__init__.py +0 -0
  2. synkit/Chem/Fingerprint/fp_calculator.py +122 -0
  3. synkit/Chem/Fingerprint/smiles_featurizer.py +185 -0
  4. synkit/Chem/Fingerprint/transformation_fp.py +79 -0
  5. synkit/Chem/Molecule/__init__.py +0 -0
  6. synkit/Chem/Molecule/standardize.py +137 -0
  7. synkit/Chem/Reaction/__init__.py +0 -0
  8. synkit/Chem/Reaction/balance_check.py +162 -0
  9. synkit/Chem/Reaction/cleanning.py +59 -0
  10. synkit/Chem/Reaction/deionize.py +289 -0
  11. synkit/Chem/Reaction/neutralize.py +256 -0
  12. synkit/Chem/Reaction/reagent.py +102 -0
  13. synkit/Chem/Reaction/standardize.py +157 -0
  14. synkit/Chem/Reaction/tautomerize.py +168 -0
  15. synkit/Graph/Cluster/__init__.py +0 -0
  16. synkit/Graph/Cluster/morphism.py +83 -0
  17. synkit/Graph/Feature/__init__.py +0 -0
  18. synkit/Graph/Feature/graph_descriptors.py +325 -0
  19. synkit/Graph/Feature/graph_fps.py +97 -0
  20. synkit/Graph/Feature/graph_signature.py +236 -0
  21. synkit/Graph/Feature/hash_fps.py +130 -0
  22. synkit/Graph/Feature/morgan_fps.py +87 -0
  23. synkit/Graph/Feature/path_fps.py +82 -0
  24. synkit/Graph/__init.py +0 -0
  25. synkit/IO/__init__.py +0 -0
  26. synkit/IO/chem_converter.py +231 -0
  27. synkit/IO/data_io.py +277 -0
  28. synkit/IO/data_process.py +49 -0
  29. synkit/IO/debug.py +78 -0
  30. synkit/IO/dg_to_gml.py +124 -0
  31. synkit/IO/gml_to_nx.py +119 -0
  32. synkit/IO/graph_to_mol.py +110 -0
  33. synkit/IO/mol_to_graph.py +282 -0
  34. synkit/IO/nx_to_gml.py +200 -0
  35. synkit/IO/parse_rule.py +172 -0
  36. synkit/IO/smiles_to_id.py +119 -0
  37. synkit/ITS/_misc.py +280 -0
  38. synkit/ITS/aam_validator.py +254 -0
  39. synkit/ITS/its_builder.py +94 -0
  40. synkit/ITS/its_construction.py +213 -0
  41. synkit/ITS/normalize_aam.py +183 -0
  42. synkit/ITS/partial_expand.py +170 -0
  43. synkit/Reactor/__init__.py +0 -0
  44. synkit/Reactor/core_engine.py +164 -0
  45. synkit/Reactor/inference.py +73 -0
  46. synkit/Reactor/multi_step.py +227 -0
  47. synkit/Reactor/multi_step_aam.py +82 -0
  48. synkit/Reactor/reagent.py +95 -0
  49. synkit/Reactor/rule_apply.py +81 -0
  50. synkit/Vis/__init__.py +0 -0
  51. synkit/Vis/chemical_graph_visualizer.py +378 -0
  52. synkit/Vis/chemical_reaction_visualizer.py +133 -0
  53. synkit/Vis/chemical_space.py +83 -0
  54. synkit/Vis/embedding.py +92 -0
  55. synkit/Vis/graph_visualizer.py +286 -0
  56. synkit/Vis/pdf_writer.py +143 -0
  57. synkit/Vis/rsmi_to_fig.py +169 -0
  58. synkit/__init__.py +0 -0
  59. synkit/_misc.py +181 -0
  60. synkit-0.0.1.dist-info/METADATA +148 -0
  61. synkit-0.0.1.dist-info/RECORD +63 -0
  62. synkit-0.0.1.dist-info/WHEEL +4 -0
  63. synkit-0.0.1.dist-info/licenses/LICENSE +21 -0
File without changes
@@ -0,0 +1,122 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from drfp import DrfpEncoder
4
+ from joblib import Parallel, delayed
5
+ from typing import Optional
6
+ from synkit.IO.debug import setup_logging
7
+ from synkit.Chem.Fingerprint.transformation_fp import TransformationFP
8
+
9
+
10
class FPCalculator:
    """
    Calculate fingerprint vectors for the reaction SMILES stored in a DataFrame column.

    Attributes:
    - data (pd.DataFrame): DataFrame containing SMILES strings and potentially other data.
    - smiles_column (str): Name of the column containing the SMILES strings.
    - fp_type (str): Type of fingerprint to calculate; 'drfp' or one of the
      molecular fingerprint names accepted by `_validate_fp_type`.
    - n_jobs (int): Number of parallel jobs to run for performance enhancement.
    - verbose (int): Verbosity level of parallel computation.
    - save_path (Optional[str]): Path to save the resulting DataFrame. If None, the DataFrame is not saved.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        smiles_column: str = "smiles",
        fp_type: str = "drfp",
        n_jobs: int = 2,
        verbose: int = 0,
        save_path: Optional[str] = None,
    ):
        self.data = data
        self.smiles_column = smiles_column
        self.fp_type = fp_type
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.save_path = save_path
        self.logger = setup_logging()
        self._validate_fp_type(fp_type)

    def _validate_fp_type(self, fp_type: str) -> None:
        """
        Ensure the requested fingerprint type is one this class knows about.

        Parameters:
        - fp_type (str): Fingerprint type name to validate.

        Raises:
        ValueError: If `fp_type` is not in the list of supported names.
        """
        valid_fps = [
            "drfp",
            "avalon",
            "maccs",
            "torsion",
            "pharm2D",
            "ecfp2",
            "ecfp4",
            "ecfp6",
            "fcfp2",
            "fcfp4",
            "fcfp6",
        ]
        if fp_type not in valid_fps:
            raise ValueError(
                f"Unsupported fingerprint type '{fp_type}'. Currently supported: {', '.join(valid_fps)}."
            )

    @staticmethod
    def calculate_drfp(smiles: str) -> np.ndarray:
        """
        Calculate the fingerprint vector for a given SMILES string using the DrfpEncoder.

        Parameters:
        - smiles (str): A reaction SMILES string.

        Returns:
        - np.ndarray: The first (and only) fingerprint returned by `DrfpEncoder.encode`.
        """
        return DrfpEncoder.encode(smiles)[0]

    @staticmethod
    def smiles_to_vec(reaction_smiles: str, fp_type: str) -> np.ndarray:
        """
        Convert a reaction SMILES string to a fingerprint vector of the specified type.

        Parameters:
        - reaction_smiles (str): A reaction SMILES string ('reactants>>products').
        - fp_type (str): Type of fingerprint to calculate.

        Returns:
        - np.ndarray: A numpy array representing the fingerprint vector.

        Notes:
        'drfp' is handled by `calculate_drfp`; every other type is delegated to
        `TransformationFP.fit` with '>>' as separator and absolute differences,
        which raises for types it cannot handle.
        """
        if fp_type == "drfp":
            return FPCalculator.calculate_drfp(reaction_smiles)
        return TransformationFP.fit(reaction_smiles, ">>", fp_type, True)

    def fit(self) -> pd.DataFrame:
        """
        Calculates the fingerprints for all SMILES strings in the dataset according to the specified fingerprint type.

        Returns:
        - pd.DataFrame: The original DataFrame with one added column per fingerprint
          position, named '<fp_type>_<i>'. If the DataFrame has no rows, a copy of it
          is returned without fingerprint columns (the vector length is unknown).

        Raises:
        ValueError: If the SMILES column specified does not exist in the DataFrame.
        """
        if self.smiles_column not in self.data.columns:
            raise ValueError(
                f"Column '{self.smiles_column}' does not exist in the DataFrame."
            )

        if len(self.data) == 0:
            # Nothing to featurize; indexing fps[0] below would fail on an empty result.
            result_data = self.data.copy()
        else:
            fps = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(FPCalculator.smiles_to_vec)(smiles, self.fp_type)
                for smiles in self.data[self.smiles_column]
            )

            # Reuse the input index: concat(axis=1) aligns on index labels, so a
            # fresh RangeIndex would misalign rows of a filtered/re-indexed frame.
            fps_df = pd.DataFrame(
                fps,
                columns=[f"{self.fp_type}_{i}" for i in range(len(fps[0]))],
                index=self.data.index,
            )
            result_data = pd.concat([self.data, fps_df], axis=1)

        if self.save_path:
            result_data.to_csv(self.save_path, index=False)
            self.logger.info(f"Data saved to {self.save_path}")

        return result_data
@@ -0,0 +1,185 @@
1
+ import numpy as np
2
+ from rdkit import Chem, DataStructs
3
+ from rdkit.Chem import AllChem, MACCSkeys
4
+ from rdkit.Chem.AtomPairs import Pairs, Torsions
5
+ from rdkit.Avalon import pyAvalonTools as fpAvalon
6
+ from rdkit.Chem.Pharm2D import Gobbi_Pharm2D, Generate
7
+
8
+
9
class SmilesFeaturizer:
    """
    Collection of helpers that turn a SMILES string into an RDKit fingerprint.
    """

    def __init__(self):
        """
        Initializes the SmilesFeaturizer class without any specific parameters for fingerprint generation.
        """
        pass

    @staticmethod
    def smiles_to_mol(smiles: str) -> Chem.Mol:
        """
        Converts a SMILES string to an RDKit Mol object.

        Parameters:
        - smiles (str): The SMILES string to be converted.

        Returns:
        - Chem.Mol: The corresponding RDKit Mol object.

        Raises:
        ValueError: If RDKit cannot parse the SMILES string.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("Invalid SMILES string provided.")
        return mol

    @staticmethod
    def get_maccs_keys(mol: Chem.Mol):
        """
        Generates MACCS keys fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - RDKit ExplicitBitVect: The MACCS keys fingerprint of the Mol object.
        """
        return MACCSkeys.GenMACCSKeys(mol)

    @staticmethod
    def get_avalon_fp(mol: Chem.Mol, nBits: int = 1024):
        """
        Generates Avalon fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.
        - nBits (int): The number of bits in the generated fingerprint.

        Returns:
        - RDKit ExplicitBitVect: The Avalon fingerprint of the Mol object.
        """
        return fpAvalon.GetAvalonFP(mol, nBits)

    @staticmethod
    def get_ecfp(
        mol: Chem.Mol, radius: int, nBits: int = 2048, useFeatures: bool = False
    ):
        """
        Generates Extended-Connectivity Fingerprints (ECFP) or
        Feature-Class Fingerprints (FCFP) from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.
        - radius (int): The radius of the fingerprint.
        - nBits (int): The number of bits in the generated fingerprint.
        - useFeatures (bool): Whether to use atom features instead of atom identities.

        Returns:
        - RDKit ExplicitBitVect: The ECFP or FCFP fingerprint of the Mol object.
        """
        return AllChem.GetMorganFingerprintAsBitVect(
            mol, radius, nBits=nBits, useFeatures=useFeatures
        )

    @staticmethod
    def get_rdk_fp(
        mol: Chem.Mol, maxPath: int, fpSize: int = 2048, nBitsPerHash: int = 2
    ):
        """
        Generates RDKit fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.
        - maxPath (int): The maximum path length (in bonds) to be included.
        - fpSize (int): The size of the fingerprint.
        - nBitsPerHash (int): The number of bits per hash.

        Returns:
        - RDKit ExplicitBitVect: The RDKit fingerprint of the Mol object.
        """
        return Chem.RDKFingerprint(
            mol, maxPath=maxPath, fpSize=fpSize, nBitsPerHash=nBitsPerHash
        )

    @staticmethod
    def mol_to_ap(mol: Chem.Mol):
        """
        Generates an Atom Pair fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - The object returned by `Pairs.GetAtomPairFingerprint` (an RDKit sparse
          vector, not a NumPy array).
        """
        return Pairs.GetAtomPairFingerprint(mol)

    @staticmethod
    def mol_to_torsion(mol: Chem.Mol):
        """
        Generates a Topological Torsion fingerprint from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - The object returned by `Torsions.GetTopologicalTorsionFingerprintAsIntVect`
          (an RDKit sparse vector, not a NumPy array).
        """
        return Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)

    @staticmethod
    def mol_to_pharm2d(mol: Chem.Mol):
        """
        Generates a 2D Pharmacophore fingerprint (Gobbi definitions) from an RDKit Mol object.

        Parameters:
        - mol (Chem.Mol): The Mol object to be featurized.

        Returns:
        - The RDKit bit vector returned by `Generate.Gen2DFingerprint`.
        """
        return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)

    @classmethod
    def featurize_smiles(
        cls, smiles: str, fingerprint_type: str, convert_to_array: bool = True, **kwargs
    ) -> np.ndarray:
        """
        Featurizes a SMILES string into the specified type of fingerprint, optionally converting it to a NumPy array.

        Parameters:
        - smiles (str): The SMILES string to be featurized.
        - fingerprint_type (str): The type of fingerprint to generate (matched
          case-insensitively): 'maccs', 'avalon', 'ecfp<N>', 'fcfp<N>', 'rdk<N>',
          'ap', 'torsion' or 'pharm2d'.
        - convert_to_array (bool): Whether to convert the fingerprint to a NumPy array. Defaults to True.
        - **kwargs: Additional keyword arguments for the fingerprint function.

        Returns:
        - np.ndarray or RDKit vector: The requested fingerprint, as an int8 NumPy
          array when `convert_to_array` is True, otherwise as the RDKit object.
          'ap' and 'torsion' always return the raw RDKit sparse vector.

        Raises:
        ValueError: If the SMILES is invalid or the fingerprint type is unsupported.
        """
        mol = cls.smiles_to_mol(smiles)
        # Case-insensitive dispatch so spellings such as 'pharm2D' are accepted.
        kind = fingerprint_type.lower()
        if kind == "maccs":
            fp = cls.get_maccs_keys(mol)
        elif kind == "avalon":
            fp = cls.get_avalon_fp(mol, **kwargs)
        elif kind.startswith(("ecfp", "fcfp")):
            # NOTE(review): the trailing digit is used directly as the Morgan radius;
            # ECFP4 conventionally denotes diameter 4 (radius 2) - confirm intent.
            radius = int(kind[4])
            useFeatures = kind.startswith("fcfp")
            nBits = kwargs.get("nBits", 2048)
            fp = cls.get_ecfp(mol, radius, nBits=nBits, useFeatures=useFeatures)
        elif kind.startswith("rdk"):
            maxPath = int(kind[3])
            fp = cls.get_rdk_fp(mol, maxPath, **kwargs)
        elif kind == "ap":
            # This branch was previously guarded by a duplicate 'avalon' test
            # and could never run.
            return cls.mol_to_ap(mol)
        elif kind == "torsion":
            return cls.mol_to_torsion(mol)
        elif kind == "pharm2d":
            fp = cls.mol_to_pharm2d(mol)
        else:
            raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")

        if not convert_to_array:
            return fp
        if kind == "pharm2d":
            # Decode the '0'/'1' bit string; cast to a signed dtype so the result
            # matches the int8 arrays produced for the other fingerprint types.
            bits = np.frombuffer(fp.ToBitString().encode(), "u1") - ord("0")
            return bits.astype(np.int8)
        ar = np.zeros((1,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, ar)
        return ar
@@ -0,0 +1,79 @@
1
+ import numpy as np
2
+ from typing import Union, Any
3
+ from rdkit.DataStructs import cDataStructs
4
+ from synkit.Chem.Fingerprint.smiles_featurizer import SmilesFeaturizer
5
+
6
+
7
class TransformationFP:
    """
    A class for handling the transformation of chemical reactions into reaction fingerprints
    based on SMILES strings.
    """

    def __init__(self) -> None:
        """
        Initializes the TransformationFP object. Currently, this constructor does not
        perform any operations.
        """
        pass

    @staticmethod
    def convert_arr2vec(arr: np.ndarray) -> cDataStructs.ExplicitBitVect:
        """
        Converts a numpy array to a RDKit ExplicitBitVect.

        Parameters:
        - arr (np.ndarray): The input array. Any non-zero entry sets the
          corresponding bit.

        Returns:
        - cDataStructs.ExplicitBitVect: The converted bit vector.
        """
        # Binarize first: raw values such as 2 or -1 would otherwise be written
        # verbatim into the bit string and corrupt it.
        bit_string = "".join("1" if value else "0" for value in np.asarray(arr).ravel())
        return cDataStructs.CreateFromBitString(bit_string)

    @staticmethod
    def _side_fingerprint(side_smiles: str, fp_type: str, **kwargs: Any):
        """Sum the fingerprints of every '.'-separated molecule on one side of a reaction."""
        total = None
        for smiles in side_smiles.split("."):
            fp = SmilesFeaturizer.featurize_smiles(smiles, fp_type, **kwargs)
            if total is None:
                total = fp
            else:
                total += fp
        return total

    @staticmethod
    def fit(
        reaction_smiles: str,
        symbols: str,
        fp_type: str,
        abs: bool,
        return_array: bool = True,
        **kwargs: Any,
    ) -> Union[np.ndarray, cDataStructs.ExplicitBitVect]:
        """
        Generates a reaction fingerprint for a given reaction represented by a SMILES string.

        The fingerprint is the element-wise difference between the summed product
        fingerprints and the summed reactant fingerprints.

        Parameters:
        - reaction_smiles (str): The SMILES string of the reaction, separated by `symbols`.
        - symbols (str): The symbol used to separate reactants and products in the SMILES string.
        - fp_type (str): The type of fingerprint to generate (e.g., 'maccs', 'ecfp4').
        - abs (bool): Whether to take the absolute value of the reaction fingerprint difference.
        - return_array (bool): Whether to return the reaction fingerprint as a numpy array or as a bit vector.
        - **kwargs: Forwarded to `SmilesFeaturizer.featurize_smiles`.

        Returns:
        - Union[np.ndarray, cDataStructs.ExplicitBitVect]: The reaction fingerprint either as an array
        or a bit vector, depending on the value of `return_array`.

        Raises:
        ValueError: If `reaction_smiles` does not split into exactly two sides.
        """
        sides = reaction_smiles.split(symbols)
        if len(sides) != 2:
            raise ValueError(
                f"Expected exactly one '{symbols}' separator in reaction SMILES, "
                f"got {len(sides) - 1}."
            )
        react, prod = sides

        react_fps = TransformationFP._side_fingerprint(react, fp_type, **kwargs)
        prod_fps = TransformationFP._side_fingerprint(prod, fp_type, **kwargs)

        reaction_fp = np.subtract(prod_fps, react_fps)
        # `abs` is the caller-facing flag and shadows the builtin here, hence np.abs.
        if abs:
            reaction_fp = np.abs(reaction_fp)
        if return_array:
            return reaction_fp
        return TransformationFP.convert_arr2vec(reaction_fp)
File without changes
@@ -0,0 +1,137 @@
1
+ from rdkit import Chem
2
+ from rdkit.Chem import rdmolops
3
+ from rdkit.Chem.MolStandardize import rdMolStandardize
4
+ from rdkit.Chem.SaltRemover import SaltRemover
5
+
6
+
7
def normalize_molecule(mol: Chem.Mol) -> Chem.Mol:
    """
    Run RDKit's MolStandardize Normalizer on a molecule.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object to be normalized.

    Returns:
    - Chem.Mol: The Mol object returned by `Normalizer.normalize`.
    """
    return rdMolStandardize.Normalizer().normalize(mol)
19
+
20
+
21
def canonicalize_tautomer(mol: Chem.Mol) -> Chem.Mol:
    """
    Pick the canonical tautomer of a molecule with RDKit's TautomerEnumerator.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The Mol object returned by `TautomerEnumerator.Canonicalize`.
    """
    return rdMolStandardize.TautomerEnumerator().Canonicalize(mol)
33
+
34
+
35
def salts_remover(mol: Chem.Mol) -> Chem.Mol:
    """
    Strip salt fragments from a molecule with a default-configured RDKit SaltRemover.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The Mol object returned by `SaltRemover.StripMol`.
    """
    return SaltRemover().StripMol(mol)
47
+
48
+
49
def uncharge_molecule(mol: Chem.Mol) -> Chem.Mol:
    """
    Neutralize a molecule by applying RDKit's MolStandardize Uncharger.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The Mol object returned by `Uncharger.uncharge`.
    """
    return rdMolStandardize.Uncharger().uncharge(mol)
61
+
62
+
63
def fragments_remover(mol: Chem.Mol) -> Chem.Mol:
    """
    Keep only the fragment with the most atoms and discard the rest.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The largest fragment (the first one on ties), or None when
      the molecule has no fragments.
    """
    pieces = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
    if not pieces:
        return None
    return max(pieces, key=lambda frag: frag.GetNumAtoms())
75
+
76
+
77
def remove_explicit_hydrogens(mol: Chem.Mol) -> Chem.Mol:
    """
    Strip explicit hydrogens from a molecule via `Chem.RemoveHs`.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The Mol object returned by `Chem.RemoveHs`.
    """
    stripped = Chem.RemoveHs(mol)
    return stripped
88
+
89
+
90
def remove_radicals_and_add_hydrogens(mol: Chem.Mol) -> Chem.Mol:
    """
    Quench radicals by converting each radical electron into an explicit hydrogen.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: Mol object with radical electron counts set to zero, returned
      after a final `Chem.RemoveHs` pass.
    """
    # Work on the hydrogen-stripped molecule returned by RemoveHs.
    heavy = Chem.RemoveHs(mol)
    for atom in heavy.GetAtoms():
        unpaired = atom.GetNumRadicalElectrons()
        if unpaired <= 0:
            continue
        # One explicit H per radical electron, then clear the radical count.
        atom.SetNumExplicitHs(atom.GetNumExplicitHs() + unpaired)
        atom.SetNumRadicalElectrons(0)
    hydrogenated = rdmolops.AddHs(heavy)
    return Chem.RemoveHs(hydrogenated)
109
+
110
+
111
def remove_isotopes(mol: Chem.Mol) -> Chem.Mol:
    """
    Reset the isotope label of every atom to 0.

    The molecule is modified in place and the same object is returned.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The input Mol object with all isotope labels cleared.
    """
    atoms = mol.GetAtoms()
    for current in atoms:
        current.SetIsotope(0)
    return mol
124
+
125
+
126
def clear_stereochemistry(mol: Chem.Mol) -> Chem.Mol:
    """
    Drop all stereochemical information via `Chem.RemoveStereochemistry`.

    The molecule is modified in place and the same object is returned.

    Parameters:
    - mol (Chem.Mol): RDKit Mol object.

    Returns:
    - Chem.Mol: The input Mol object with stereochemistry cleared.
    """
    target = mol
    Chem.RemoveStereochemistry(target)
    return target
File without changes
@@ -0,0 +1,162 @@
1
+ from rdkit import Chem
2
+ from rdkit.Chem.rdMolDescriptors import CalcMolFormula
3
+
4
+ from joblib import Parallel, delayed
5
+ from typing import List, Dict, Union, Tuple
6
+
7
+
8
class BalanceReactionCheck:
    """
    A class to check the balance of chemical reactions given in SMILES format.
    It supports parallel execution and maintains the input format in the output.
    """

    def __init__(
        self,
        n_jobs: int = 4,
        verbose: int = 0,
    ):
        """
        Initializes the checker with the parallel-execution settings.

        Parameters:
        - n_jobs (int): The number of parallel jobs to run for balance checking.
        - verbose (int): The verbosity level of joblib parallel execution.
        """

        self.n_jobs = n_jobs
        self.verbose = verbose

    @staticmethod
    def get_combined_molecular_formula(smiles: str) -> str:
        """
        Computes the molecular formula for the molecule(s) represented by a SMILES string.

        Parameters:
        - smiles (str): The SMILES string of the molecule.

        Returns:
        - str: The molecular formula, or an empty string if the molecule is invalid.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return ""
        return CalcMolFormula(mol)

    @staticmethod
    def parse_input(
        input_data: Union[str, List[Union[str, Dict[str, str]]]],
        rsmi_column: str = "reactions",
    ) -> List[Dict[str, str]]:
        """
        Parses the input data into a standardized list containing
        dictionaries for each reaction.

        Parameters:
        - input_data (Union[str, List[Union[str, Dict[str, str]]]]):
        A single reaction SMILES, a list of reaction SMILES, or a list of
        dictionaries holding the reaction under `rsmi_column`.
        - rsmi_column (str): The key under which the reaction SMILES is stored.

        Returns:
        - List[Dict[str, str]]: A list of dictionaries with reaction SMILES strings.
        List items that are neither a string nor a dictionary containing
        `rsmi_column` are skipped.

        Raises:
        ValueError: If `input_data` is neither a string nor a list.
        """
        if isinstance(input_data, str):
            return [{rsmi_column: input_data}]
        if not isinstance(input_data, list):
            raise ValueError("Unsupported input type")

        standardized_input = []
        for item in input_data:
            if isinstance(item, str):
                standardized_input.append({rsmi_column: item})
            elif isinstance(item, dict) and rsmi_column in item:
                standardized_input.append(item)
        return standardized_input

    @staticmethod
    def parse_reaction(reaction_smiles: str) -> Tuple[str, str]:
        """
        Splits a reaction SMILES string into its reactant and product sides.

        Parameters:
        - reaction_smiles (str): A SMILES string representing a chemical reaction.

        Returns:
        - Tuple[str, str]: The reactant-side and product-side SMILES strings
        (each may still contain several '.'-separated molecules).

        Raises:
        ValueError: If the string does not contain exactly one '>>'.
        """
        reactants_smiles, products_smiles = reaction_smiles.split(">>")
        return reactants_smiles, products_smiles

    @staticmethod
    def rsmi_balance_check(reaction_smiles: str) -> bool:
        """
        Checks if a reaction SMILES string is balanced.

        Parameters:
        - reaction_smiles (str): A SMILES string representing a chemical reaction.

        Returns:
        - bool: True if both sides yield the same non-empty molecular formula,
        False otherwise (including when a side cannot be parsed).
        """
        reactants_smiles, products_smiles = BalanceReactionCheck.parse_reaction(
            reaction_smiles
        )
        reactants_formula = BalanceReactionCheck.get_combined_molecular_formula(
            reactants_smiles
        )
        products_formula = BalanceReactionCheck.get_combined_molecular_formula(
            products_smiles
        )
        # An empty formula means the side was invalid or empty; two such sides
        # must not compare equal and be reported as balanced.
        if not reactants_formula or not products_formula:
            return False
        return reactants_formula == products_formula

    @staticmethod
    def dict_balance_check(
        reaction_dict: Dict[str, str], rsmi_column: str
    ) -> Dict[str, Union[bool, str]]:
        """
        Checks if a single reaction (in SMILES format) is balanced, maintaining
        the input format.

        Parameters:
        - reaction_dict (Dict[str, str]): A dictionary containing the
        reaction SMILES string.
        - rsmi_column (str): The key under which the reaction SMILES is stored.

        Returns:
        - Dict[str, Union[bool, str]]: A new dictionary with a 'balanced' entry
        followed by the original reaction data.
        """
        reaction_smiles = reaction_dict[rsmi_column]
        balance = BalanceReactionCheck.rsmi_balance_check(reaction_smiles)
        result = {"balanced": balance, **reaction_dict}
        # The computed verdict must win over any 'balanced' key already present
        # in the input; re-assigning keeps the key order unchanged.
        result["balanced"] = balance
        return result

    def dicts_balance_check(
        self,
        input_data: Union[str, List[Union[str, Dict[str, str]]]],
        rsmi_column: str = "reactions",
    ) -> Tuple[List[Dict[str, Union[bool, str]]], List[Dict[str, Union[bool, str]]]]:
        """
        Checks the balance of all reactions in the input data.

        Parameters:
        - input_data (Union[str, List[Union[str, Dict[str, str]]]]): Reactions in
        any form accepted by `parse_input`.
        - rsmi_column (str): The key under which the reaction SMILES is stored.

        Returns:
        - Tuple[List[Dict[str, Union[bool, str]]], List[Dict[str, Union[bool, str]]]]:
        Two lists containing dictionaries of balanced and unbalanced reactions,
        respectively.
        """

        reactions = self.parse_input(input_data, rsmi_column)
        results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self.dict_balance_check)(reaction, rsmi_column)
            for reaction in reactions
        )

        balanced_reactions = [reaction for reaction in results if reaction["balanced"]]
        unbalanced_reactions = [
            reaction for reaction in results if not reaction["balanced"]
        ]

        return balanced_reactions, unbalanced_reactions