synkit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkit/Chem/Fingerprint/__init__.py +0 -0
- synkit/Chem/Fingerprint/fp_calculator.py +122 -0
- synkit/Chem/Fingerprint/smiles_featurizer.py +185 -0
- synkit/Chem/Fingerprint/transformation_fp.py +79 -0
- synkit/Chem/Molecule/__init__.py +0 -0
- synkit/Chem/Molecule/standardize.py +137 -0
- synkit/Chem/Reaction/__init__.py +0 -0
- synkit/Chem/Reaction/balance_check.py +162 -0
- synkit/Chem/Reaction/cleanning.py +59 -0
- synkit/Chem/Reaction/deionize.py +289 -0
- synkit/Chem/Reaction/neutralize.py +256 -0
- synkit/Chem/Reaction/reagent.py +102 -0
- synkit/Chem/Reaction/standardize.py +157 -0
- synkit/Chem/Reaction/tautomerize.py +168 -0
- synkit/Graph/Cluster/__init__.py +0 -0
- synkit/Graph/Cluster/morphism.py +83 -0
- synkit/Graph/Feature/__init__.py +0 -0
- synkit/Graph/Feature/graph_descriptors.py +325 -0
- synkit/Graph/Feature/graph_fps.py +97 -0
- synkit/Graph/Feature/graph_signature.py +236 -0
- synkit/Graph/Feature/hash_fps.py +130 -0
- synkit/Graph/Feature/morgan_fps.py +87 -0
- synkit/Graph/Feature/path_fps.py +82 -0
- synkit/Graph/__init.py +0 -0
- synkit/IO/__init__.py +0 -0
- synkit/IO/chem_converter.py +231 -0
- synkit/IO/data_io.py +277 -0
- synkit/IO/data_process.py +49 -0
- synkit/IO/debug.py +78 -0
- synkit/IO/dg_to_gml.py +124 -0
- synkit/IO/gml_to_nx.py +119 -0
- synkit/IO/graph_to_mol.py +110 -0
- synkit/IO/mol_to_graph.py +282 -0
- synkit/IO/nx_to_gml.py +200 -0
- synkit/IO/parse_rule.py +172 -0
- synkit/IO/smiles_to_id.py +119 -0
- synkit/ITS/_misc.py +280 -0
- synkit/ITS/aam_validator.py +254 -0
- synkit/ITS/its_builder.py +94 -0
- synkit/ITS/its_construction.py +213 -0
- synkit/ITS/normalize_aam.py +183 -0
- synkit/ITS/partial_expand.py +170 -0
- synkit/Reactor/__init__.py +0 -0
- synkit/Reactor/core_engine.py +164 -0
- synkit/Reactor/inference.py +73 -0
- synkit/Reactor/multi_step.py +227 -0
- synkit/Reactor/multi_step_aam.py +82 -0
- synkit/Reactor/reagent.py +95 -0
- synkit/Reactor/rule_apply.py +81 -0
- synkit/Vis/__init__.py +0 -0
- synkit/Vis/chemical_graph_visualizer.py +378 -0
- synkit/Vis/chemical_reaction_visualizer.py +133 -0
- synkit/Vis/chemical_space.py +83 -0
- synkit/Vis/embedding.py +92 -0
- synkit/Vis/graph_visualizer.py +286 -0
- synkit/Vis/pdf_writer.py +143 -0
- synkit/Vis/rsmi_to_fig.py +169 -0
- synkit/__init__.py +0 -0
- synkit/_misc.py +181 -0
- synkit-0.0.1.dist-info/METADATA +148 -0
- synkit-0.0.1.dist-info/RECORD +63 -0
- synkit-0.0.1.dist-info/WHEEL +4 -0
- synkit-0.0.1.dist-info/licenses/LICENSE +21 -0
synkit/IO/data_io.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import pickle
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy import ndarray
|
|
6
|
+
from joblib import dump, load
|
|
7
|
+
from typing import List, Dict, Any, Generator
|
|
8
|
+
from synkit.IO.debug import setup_logging
|
|
9
|
+
|
|
10
|
+
logger = setup_logging()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def save_database(database: list[dict], pathname: str = "./Data/database.json") -> None:
    """
    Save a database (a list of dictionaries) to a JSON file.

    Parameters:
    - database: The database to be saved. Every item must be a dictionary.
    - pathname: The path where the database will be saved.
    Defaults to './Data/database.json'.

    Raises:
    - TypeError: If any item of the database is not a dictionary.
    - ValueError: If the file cannot be opened or written. The underlying
    OS error is attached as the exception's ``__cause__``.
    """
    if not all(isinstance(item, dict) for item in database):
        raise TypeError("Database should be a list of dictionaries.")

    try:
        with open(pathname, "w") as f:
            json.dump(database, f)
    except IOError as e:
        # Chain explicitly so callers can still inspect the original OS error.
        raise ValueError(f"Error writing to file {pathname}: {e}") from e
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def load_database(pathname: str = "./Data/database.json") -> List[Dict]:
    """
    Load a database (a list of dictionaries) from a JSON file.

    Parameters:
    - pathname: The path from where the database will be loaded.
    Defaults to './Data/database.json'.

    Returns:
    - List[Dict]: The loaded database (whatever JSON value the file holds;
    its shape is not validated here).

    Raises:
    - ValueError: If there is an error reading the file. The underlying
    OS error is attached as the exception's ``__cause__``. Malformed JSON
    also surfaces as a ValueError (json.JSONDecodeError is a subclass).
    """
    try:
        with open(pathname, "r") as f:
            return json.load(f)
    except IOError as e:
        # Chain explicitly so callers can still inspect the original OS error.
        raise ValueError(f"Error reading from file {pathname}: {e}") from e
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def save_to_pickle(data: List[Dict[str, Any]], filename: str) -> None:
    """
    Serialize a list of dictionaries into a pickle file.

    Parameters:
    - data (List[Dict[str, Any]]): The records to serialize.
    - filename (str): Destination path; the file is opened in binary write
    mode, so existing content is replaced.
    """
    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(data)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def load_from_pickle(filename: str) -> List[Any]:
    """
    Read back the first pickled object stored in a file.

    Parameters:
    - filename (str): Path of the pickle file to read.

    Returns:
    - List[Any]: The object produced by unpickling the file.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, "rb") as source:
        contents = pickle.load(source)
    return contents
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def load_gml_as_text(gml_file_path):
    """
    Read a GML file and hand back its content as one string.

    Parameters:
    - gml_file_path (str): Location of the GML file.

    Returns:
    - str: The complete text of the file, or None when the file is missing
    or any other error occurs (a message is printed in both cases).
    """
    text = None
    try:
        with open(gml_file_path, "r") as gml_handle:
            text = gml_handle.read()
    except FileNotFoundError:
        print(f"File not found: {gml_file_path}")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return text
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def save_text_as_gml(gml_text, file_path):
    """
    Write a GML string to disk.

    Parameters:
    - gml_text (str): GML content to write.
    - file_path (str): Destination path; opened in text write mode.

    Returns:
    - bool: True when the text was written, False when any exception was
    raised (a message is printed in both cases).
    """
    succeeded = False
    try:
        with open(file_path, "w") as gml_handle:
            gml_handle.write(gml_text)
        print(f"GML text successfully saved to {file_path}")
        succeeded = True
    except Exception as exc:
        print(f"An error occurred while saving the GML text: {exc}")
    return succeeded
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def save_compressed(array: ndarray, filename: str) -> None:
    """
    Store a single NumPy array in a compressed .npz archive.

    The array is written under the archive key 'array', which is the key
    load_compressed looks for.

    Parameters:
    - array (ndarray): The NumPy array to store.
    - filename (str): Target path, expected to end in '.npz'.

    Returns:
    - None
    """
    named_arrays = {"array": array}
    np.savez_compressed(filename, **named_arrays)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def load_compressed(filename: str) -> ndarray:
    """
    Read the array stored under the key 'array' in a .npz archive.

    Parameters:
    - filename (str): Path of the .npz file to open.

    Returns:
    - ndarray: The array stored under 'array'.

    Raises:
    - KeyError: If the archive has no entry named 'array'.
    """
    with np.load(filename) as archive:
        if "array" not in archive:
            raise KeyError(
                "The .npz file does not contain an array with the key 'array'."
            )
        return archive["array"]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def save_model(model: Any, filename: str) -> None:
    """
    Persist a model object with joblib and log where it went.

    Parameters:
    - model (Any): The object to persist (typically a fitted estimator).
    - filename (str): Destination path handed to joblib's dump.
    """
    dump(model, filename)
    message = f"Model saved successfully to {filename}"
    logger.info(message)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def load_model(filename: str) -> Any:
    """
    Restore a model object previously written with joblib.

    Parameters:
    - filename (str): Path handed to joblib's load.

    Returns:
    - Any: The object reconstructed by joblib.
    """
    restored = load(filename)
    message = f"Model loaded successfully from {filename}"
    logger.info(message)
    return restored
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def save_dict_to_json(data: dict, file_path: str) -> None:
    """
    Write a dictionary to a JSON file, indented by four spaces.

    Parameters:
    -----------
    data : dict
        The dictionary to serialize.

    file_path : str
        Destination path (conventionally ending in .json). The file is
        opened in write mode, so existing content is replaced.

    Returns:
    --------
    None
    """
    with open(file_path, "w") as out_handle:
        json.dump(data, out_handle, indent=4)

    logger.info(f"Dictionary successfully saved to {file_path}")
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def load_dict_from_json(file_path: str) -> dict:
    """
    Read a JSON file and return the decoded content.

    Parameters:
    -----------
    file_path : str
        Path of the JSON file to read (conventionally ending in .json).

    Returns:
    --------
    dict
        The decoded JSON content. When opening, decoding or logging raises
        any exception, the exception is logged at error level and None is
        returned instead.
    """
    try:
        with open(file_path, "r") as in_handle:
            loaded = json.load(in_handle)
        logger.info(f"Dictionary successfully loaded from {file_path}")
    except Exception as exc:
        logger.error(exc)
        return None
    return loaded
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def load_from_pickle_generator(file_path: str) -> Generator[Any, None, None]:
    """
    A generator that yields items from a pickle file that holds one or more
    consecutively pickled batches, where each batch is an iterable (for
    example a list of dictionaries).

    Parameters:
    - file_path (str): The path to the pickle file to load.

    Yields:
    - Any: One item at a time from each batch, in file order.

    Note: unpickling can execute arbitrary code, so only read files that
    come from a trusted source.
    """
    with open(file_path, "rb") as file:
        while True:
            # Only the load itself signals end-of-file. Keeping the yield loop
            # outside the try stops an EOFError raised while iterating a batch
            # (or thrown in by the consumer) from being mistaken for the end
            # of the file.
            try:
                batch_items = pickle.load(file)
            except EOFError:
                break
            for item in batch_items:
                yield item
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def collect_data(num_batches: int, temp_dir: str, file_template: str) -> List[Any]:
    """
    Gather the items of several batch pickle files into one flat list.

    Parameters:
    - num_batches (int): How many batch files to read; indices run from 0
    to num_batches - 1.
    - temp_dir (str): Directory that contains the batch files.
    - file_template (str): File-name template; it is filled with the batch
    index through str.format.

    Returns:
    List[Any]: All items from all batch files, in batch order.
    """
    collected_data: List[Any] = []
    for batch_index in range(num_batches):
        batch_name = file_template.format(batch_index)
        batch_path = os.path.join(temp_dir, batch_name)
        collected_data.extend(load_from_pickle_generator(batch_path))
    return collected_data
|
synkit/IO/data_process.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from typing import List, Dict, Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def merge_dicts(
    list1: List[Dict[str, Any]],
    list2: List[Dict[str, Any]],
    key: str,
    intersection: bool = True,
) -> List[Dict[str, Any]]:
    """
    Merges two lists of dictionaries based on a specified key, with an option to
    either merge only dictionaries with matching key values (intersection) or
    all dictionaries (union).

    When both lists contain an entry for the same key value, the fields of the
    entry from list2 override those of list1. If a list contains several
    entries with the same key value, the last one is used for lookups.

    Parameters:
    - list1 (List[Dict[str, Any]]): The first list of dictionaries.
    - list2 (List[Dict[str, Any]]): The second list of dictionaries.
    - key (str): The key used to match and merge dictionaries from both lists.
    Every dictionary in both lists must contain it.
    - intersection (bool): If True, only merge dictionaries with matching key values;
    if False, merge all dictionaries, combining those with matching key values.

    Returns:
    - List[Dict[str, Any]]: A list of dictionaries with merged contents from both
    input lists according to the specified merging strategy. In intersection
    mode the result follows the order of list1. In union mode it lists the key
    values of list1 in first-seen order, followed by the key values that only
    occur in list2.

    Raises:
    - KeyError: If any dictionary in either list lacks `key`.
    """
    # Both indexes are always built so a missing key fails fast in either mode.
    dict1 = {item[key]: item for item in list1}
    dict2 = {item[key]: item for item in list2}

    if intersection:
        # Intersection of keys: only keys present in both dictionaries are merged
        return [
            {**item1, **dict2[item1[key]]} for item1 in list1 if item1[key] in dict2
        ]

    # Union of keys: walk the indexes in insertion order instead of a set so the
    # output order is deterministic across runs.
    merged_list = [
        {**item1, **dict2[k]} if k in dict2 else item1 for k, item1 in dict1.items()
    ]
    merged_list.extend(item2 for k, item2 in dict2.items() if k not in dict1)
    return merged_list
|
synkit/IO/debug.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import warnings
|
|
4
|
+
from rdkit import rdBase
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def setup_logging(log_level: str = "INFO", log_filename: str = None) -> logging.Logger:
    """
    Configures logging to either the console or a file based on provided parameters.

    The root logger is reconfigured: every handler currently attached to it is
    removed and closed before the new configuration is applied.

    Parameters
    ----------
    log_level : str, optional
        Logging level to set. Defaults to 'INFO'. Options include 'DEBUG', 'INFO',
        'WARNING', 'ERROR', 'CRITICAL'. Matching is case-insensitive.
    log_filename : str, optional
        If provided, logs are appended to this file; its parent directory is
        created when the path has one. Defaults to None, which logs to console.

    Returns
    -------
    logging.Logger
        Configured logger instance (the root logger).

    Raises
    ------
    ValueError
        If an invalid log level is provided.
    """
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    numeric_level = getattr(logging, log_level.upper(), None)

    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {log_level}")

    logger = logging.getLogger()
    # Detach AND close the old handlers; merely clearing the list would leave
    # previously opened log files open.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    if log_filename:
        # A bare file name has an empty dirname, and os.makedirs("") raises.
        log_dir = os.path.dirname(log_filename)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        logging.basicConfig(
            level=numeric_level, format=log_format, filename=log_filename, filemode="a"
        )
    else:
        logging.basicConfig(level=numeric_level, format=log_format)

    return logger
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def configure_warnings_and_logs(
    ignore_warnings: bool = False, disable_rdkit_logs: bool = False
) -> None:
    """
    Switches Python warnings and the RDKit error/warning log channels on or off.

    Parameters
    ----------
    ignore_warnings : bool, optional
        True installs a blanket "ignore" warnings filter; False resets the
        warnings filters. Default is False.
    disable_rdkit_logs : bool, optional
        True disables the RDKit 'rdApp.error' and 'rdApp.warning' channels;
        False enables them. Default is False.

    Usage
    -----
    Handy for reducing noise in production or test runs. Use with care while
    developing, since suppressed messages can hide real problems.
    """
    if ignore_warnings:
        warnings.filterwarnings("ignore")
    else:
        warnings.resetwarnings()

    # Pick the toggle once, then apply it to both channels in the same order
    # (error first, then warning).
    toggle_channel = rdBase.DisableLog if disable_rdkit_logs else rdBase.EnableLog
    for channel in ("rdApp.error", "rdApp.warning"):
        toggle_channel(channel)
|
synkit/IO/dg_to_gml.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import regex
|
|
2
|
+
from synkit.IO.debug import setup_logging
|
|
3
|
+
from synkit.Chem.Reaction.standardize import Standardize
|
|
4
|
+
from mod import DGVertexMapper, smiles, Rule
|
|
5
|
+
|
|
6
|
+
logger = setup_logging()
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DGToGML:
    """
    Helpers that turn the edges of a `mod` graph object into atom-mapped
    reaction SMILES and then into a GML rule string plus a `mod` Rule.
    """

    def __init__(self) -> None:
        # Reaction standardizer used by fit() to compare reaction SMILES.
        self.standardizer = Standardize()
        pass

    @staticmethod
    def getReactionSmiles(dg):
        """
        Build atom-mapped reaction SMILES for every edge of `dg`.

        Parameters:
        - dg: Object exposing `.vertices` (each with a `.graph` that provides
        `smilesWithIds`) and `.edges`; presumably a mod derivation graph —
        TODO confirm.

        Returns:
        - dict: Maps each edge to a sorted list of the distinct reaction
        SMILES strings ("educts>>products") generated for it.
        """
        origSmiles = {}
        for v in dg.vertices:
            s = v.graph.smilesWithIds
            # Mark every original class label ":N]" as ":oN]" so it can be told
            # apart from the final labels written below.
            s = regex.sub(":([0-9]+)]", ":o\\1]", s)
            origSmiles[v.graph] = s

        res = {}
        for e in dg.edges:
            vms = DGVertexMapper(e, rightLimit=1, leftLimit=1)
            # vms = DGVertexMapper(e)
            eductSmiles = [origSmiles[g] for g in vms.left]

            # Educt side: replace each placeholder label with the id of the
            # corresponding vertex in the mapper's left graph.
            for ev in vms.left.vertices:
                s = eductSmiles[ev.graphIndex]
                s = s.replace(f":o{ev.vertex.id}]", f":{ev.id}]")
                eductSmiles[ev.graphIndex] = s

            # A set, so identical reaction SMILES from different maps collapse.
            strs = set()
            for vm in DGVertexMapper(e, rightLimit=1, leftLimit=1):
                # for vm in DGVertexMapper(e):
                # Start from fresh placeholder SMILES for every vertex map.
                productSmiles = [origSmiles[g] for g in vms.right]
                # Product vertices that are the image of an educt vertex get
                # that educt vertex's label.
                for ev in vms.left.vertices:
                    pv = vm.map[ev]
                    if not pv:
                        continue
                    s = productSmiles[pv.graphIndex]
                    s = s.replace(f":o{pv.vertex.id}]", f":{ev.id}]")
                    productSmiles[pv.graphIndex] = s
                # Product vertices without a preimage get new labels, numbered
                # upwards from the number of left vertices.
                count = vms.left.numVertices
                for pv in vms.right.vertices:
                    ev = vm.map.inverse(pv)
                    if ev:
                        continue
                    s = productSmiles[pv.graphIndex]
                    s = s.replace(f":o{pv.vertex.id}]", f":{count}]")
                    count += 1
                    productSmiles[pv.graphIndex] = s
                left = ".".join(eductSmiles)
                right = ".".join(productSmiles)
                s = f"{left}>>{right}"
                # Every placeholder must have been replaced by now.
                assert ":o" not in s
                strs.add(s)
            res[e] = list(sorted(strs))
        return res

    @staticmethod
    def parseReactionSmiles(line: str) -> "tuple[str, Rule]":
        """
        Convert one reaction SMILES into a GML rule.

        Parameters:
        - line (str): Reaction SMILES of the form "left>>right", where each
        side holds '.'-separated molecule SMILES.

        Returns:
        - tuple: The GML rule text and the Rule built from it with
        `Rule.fromGMLString(..., add=False)`.
        """
        sLeft, sRight = line.split(">>")
        ssLeft = sLeft.split(".")
        ssRight = sRight.split(".")
        mLeft = [smiles(s, add=False) for s in ssLeft]
        mRight = [smiles(s, add=False) for s in ssRight]

        def printGraph(g):
            # Render the nodes and edges of one molecule graph as GML lines,
            # using the graph's external ids as node ids.
            extFromInt = {}
            for iExt in range(g.minExternalId, g.maxExternalId + 1):
                v = g.getVertexFromExternalId(iExt)
                if not v.isNull():
                    extFromInt[v] = iExt
            s = ""
            for v in g.vertices:
                # Every vertex is expected to carry an external id.
                assert v in extFromInt
                s += '\t\tnode [ id %d label "%s" ]\n' % (extFromInt[v], v.stringLabel)
            for e in g.edges:
                s += '\t\tedge [ source %d target %d label "%s" ]\n' % (
                    extFromInt[e.source],
                    extFromInt[e.target],
                    e.stringLabel,
                )
            return s

        # Only "left" and "right" sections are written; no "context" section.
        s = "rule [\n\tleft [\n"
        for m in mLeft:
            s += printGraph(m)
        s += "\t]\n\tright [\n"
        for m in mRight:
            s += printGraph(m)
        s += "\t]\n]\n"
        return s, Rule.fromGMLString(s, add=False)

    def fit(self, dg, origSmiles):
        """
        Matches the original SMILES to a list of generated reaction SMILES and
        returns the parsed reaction.

        Parameters:
        - dg: The graph object passed on to getReactionSmiles.
        - origSmiles (str): The original reaction SMILES string to match.

        Returns:
        - The result of parseReactionSmiles (GML text and Rule) for the first
        generated reaction SMILES whose standardized form equals the
        standardized `origSmiles`; None when nothing matches or when any
        exception is raised (the exception is logged).
        """
        try:
            res = DGToGML.getReactionSmiles(dg)
            # Flatten the per-edge lists into a single list of candidates.
            smiles_list = [value for values in res.values() for value in values]

            # NOTE(review): the two positional True flags are defined by
            # Standardize.fit, which is not visible here — confirm their meaning.
            smiles_standard = [
                self.standardizer.fit(rsmi, True, True) for rsmi in smiles_list
            ]
            origSmiles_standard = self.standardizer.fit(origSmiles, True, True)

            for index, value in enumerate(smiles_standard):
                if value == origSmiles_standard:
                    # Parse the un-standardized candidate at the same index.
                    return self.parseReactionSmiles(smiles_list[index])

            return None
        except Exception as e:
            logger.error(f"An error occurred: {e}")
            return None
|
synkit/IO/gml_to_nx.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import networkx as nx
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
from synkit.ITS.its_construction import ITSConstruction
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class GMLToNX:
    """
    Parser that turns GML-like rule text into NetworkX graphs for the
    'left', 'right' and 'context' sections of a rule.
    """

    def __init__(self, gml_text: str):
        """
        Store the GML-like text and create one empty graph per rule section
        ('left', 'context', 'right').
        """
        self.gml_text = gml_text
        self.graphs = {name: nx.Graph() for name in ("left", "context", "right")}

    def _parse_element(self, line: str, current_section: str):
        """
        Read one node or edge line and add it to the graph of the given
        section. Nodes get 'element', 'charge' and 'atom_map' attributes;
        edges get an 'order' attribute (0 for an unknown bond label).
        """
        label_to_order = {"-": 1, ":": 1.5, "=": 2, "#": 3}
        tokens = line.split()

        def value_after(keyword):
            # The value of a GML field is the token right after its keyword.
            return tokens[tokens.index(keyword) + 1]

        if "node" in line:
            node_id = int(value_after("id"))
            atom_label = value_after("label").strip('"')
            element, charge = self._extract_element_and_charge(atom_label)
            self.graphs[current_section].add_node(
                node_id, element=element, charge=charge, atom_map=node_id
            )
        elif "edge" in line:
            source = int(value_after("source"))
            target = int(value_after("target"))
            bond_label = value_after("label").strip('"')
            self.graphs[current_section].add_edge(
                source, target, order=label_to_order.get(bond_label, 0)
            )

    def _extract_element_and_charge(self, label: str) -> Tuple[str, int]:
        """
        Split a node label such as 'C', 'O-' or 'Fe2+' into element symbol and
        signed charge. Labels that do not match the pattern give ('X', 0);
        a label without a sign has charge 0.
        """
        parsed = re.match(r"([A-Za-z*]+)(\d+)?([+-])?$", label)
        if parsed is None:
            return ("X", 0)
        element, magnitude, sign = parsed.groups()
        if not sign:
            return element, 0
        # A sign without digits means a single charge.
        amount = int(magnitude) if magnitude else 1
        return element, (amount if sign == "+" else -amount)

    def _synchronize_nodes_and_edges(self):
        """
        Copy everything in 'context' into both 'left' and 'right'.
        Context nodes are added (or their attributes overwrite those of an
        existing node); context edges are added only where missing. Nothing
        is removed from 'left' or 'right'.
        """
        context = self.graphs["context"]

        for node, ndata in context.nodes(data=True):
            for side in ("left", "right"):
                side_graph = self.graphs[side]
                if node in side_graph:
                    # Existing node: context attributes take precedence.
                    side_graph.nodes[node].update(ndata)
                else:
                    side_graph.add_node(node, **ndata)

        for s, t, edata in context.edges(data=True):
            for side in ("left", "right"):
                side_graph = self.graphs[side]
                if not side_graph.has_edge(s, t):
                    side_graph.add_edge(s, t, **edata)

    def transform(self) -> Tuple[nx.Graph, nx.Graph, nx.Graph]:
        """
        Parse the stored text and return the graphs in the order
        (left, right, context). After parsing, the 'context' entry is
        replaced by the graph returned from ITSConstruction.ITSGraph.
        """
        section_names = ("left", "context", "right")
        current_section = None
        for raw_line in self.gml_text.split("\n"):
            line = raw_line.strip()
            if line.startswith("rule") or line == "]":
                continue
            if any(name in line for name in section_names):
                # Section header: the section name is the text before '['.
                current_section = line.split("[")[0].strip()
            elif line.startswith(("node", "edge")):
                self._parse_element(line, current_section)

        self._synchronize_nodes_and_edges()

        left_graph = self.graphs["left"]
        right_graph = self.graphs["right"]

        # NOTE(review): this ITS graph gets the left-hand node attributes
        # copied onto it but is never stored or returned; the 'context' graph
        # below comes from a second, separate ITSGraph call. Confirm whether
        # `its_graph` was meant to be the returned context.
        its_graph = ITSConstruction.ITSGraph(left_graph, right_graph)
        for n in its_graph.nodes():
            if n in left_graph.nodes:
                its_graph.nodes[n].update(left_graph.nodes[n])

        self.graphs["context"] = ITSConstruction.ITSGraph(left_graph, right_graph)

        return self.graphs["left"], self.graphs["right"], self.graphs["context"]
|