molbuilder 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- molbuilder/__init__.py +8 -0
- molbuilder/__main__.py +6 -0
- molbuilder/atomic/__init__.py +4 -0
- molbuilder/atomic/bohr.py +235 -0
- molbuilder/atomic/quantum_atom.py +334 -0
- molbuilder/atomic/quantum_numbers.py +196 -0
- molbuilder/atomic/wavefunctions.py +297 -0
- molbuilder/bonding/__init__.py +4 -0
- molbuilder/bonding/covalent.py +442 -0
- molbuilder/bonding/lewis.py +347 -0
- molbuilder/bonding/vsepr.py +433 -0
- molbuilder/cli/__init__.py +1 -0
- molbuilder/cli/demos.py +516 -0
- molbuilder/cli/menu.py +127 -0
- molbuilder/cli/wizard.py +831 -0
- molbuilder/core/__init__.py +6 -0
- molbuilder/core/bond_data.py +170 -0
- molbuilder/core/constants.py +51 -0
- molbuilder/core/element_properties.py +183 -0
- molbuilder/core/elements.py +181 -0
- molbuilder/core/geometry.py +232 -0
- molbuilder/gui/__init__.py +2 -0
- molbuilder/gui/app.py +286 -0
- molbuilder/gui/canvas3d.py +115 -0
- molbuilder/gui/dialogs.py +117 -0
- molbuilder/gui/event_handler.py +118 -0
- molbuilder/gui/sidebar.py +105 -0
- molbuilder/gui/toolbar.py +71 -0
- molbuilder/io/__init__.py +1 -0
- molbuilder/io/json_io.py +146 -0
- molbuilder/io/mol_sdf.py +169 -0
- molbuilder/io/pdb.py +184 -0
- molbuilder/io/smiles_io.py +47 -0
- molbuilder/io/xyz.py +103 -0
- molbuilder/molecule/__init__.py +2 -0
- molbuilder/molecule/amino_acids.py +919 -0
- molbuilder/molecule/builders.py +257 -0
- molbuilder/molecule/conformations.py +70 -0
- molbuilder/molecule/functional_groups.py +484 -0
- molbuilder/molecule/graph.py +712 -0
- molbuilder/molecule/peptides.py +13 -0
- molbuilder/molecule/stereochemistry.py +6 -0
- molbuilder/process/__init__.py +3 -0
- molbuilder/process/conditions.py +260 -0
- molbuilder/process/costing.py +316 -0
- molbuilder/process/purification.py +285 -0
- molbuilder/process/reactor.py +297 -0
- molbuilder/process/safety.py +476 -0
- molbuilder/process/scale_up.py +427 -0
- molbuilder/process/solvent_systems.py +204 -0
- molbuilder/reactions/__init__.py +3 -0
- molbuilder/reactions/functional_group_detect.py +728 -0
- molbuilder/reactions/knowledge_base.py +1716 -0
- molbuilder/reactions/reaction_types.py +102 -0
- molbuilder/reactions/reagent_data.py +1248 -0
- molbuilder/reactions/retrosynthesis.py +1430 -0
- molbuilder/reactions/synthesis_route.py +377 -0
- molbuilder/reports/__init__.py +158 -0
- molbuilder/reports/cost_report.py +206 -0
- molbuilder/reports/molecule_report.py +279 -0
- molbuilder/reports/safety_report.py +296 -0
- molbuilder/reports/synthesis_report.py +283 -0
- molbuilder/reports/text_formatter.py +170 -0
- molbuilder/smiles/__init__.py +4 -0
- molbuilder/smiles/parser.py +487 -0
- molbuilder/smiles/tokenizer.py +291 -0
- molbuilder/smiles/writer.py +375 -0
- molbuilder/visualization/__init__.py +1 -0
- molbuilder/visualization/bohr_viz.py +166 -0
- molbuilder/visualization/molecule_viz.py +368 -0
- molbuilder/visualization/quantum_viz.py +434 -0
- molbuilder/visualization/theme.py +12 -0
- molbuilder-1.0.0.dist-info/METADATA +360 -0
- molbuilder-1.0.0.dist-info/RECORD +78 -0
- molbuilder-1.0.0.dist-info/WHEEL +5 -0
- molbuilder-1.0.0.dist-info/entry_points.txt +2 -0
- molbuilder-1.0.0.dist-info/licenses/LICENSE +21 -0
- molbuilder-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1430 @@
|
|
|
1
|
+
"""Retrosynthetic analysis engine using beam search.
|
|
2
|
+
|
|
3
|
+
Given a target molecule, this module works backwards from the product to
|
|
4
|
+
identify commercially available starting materials, applying known reaction
|
|
5
|
+
templates in reverse (disconnection approach). A beam search explores the
|
|
6
|
+
most promising disconnections at each level, producing a retrosynthesis
|
|
7
|
+
tree that can later be converted into a forward synthesis route.
|
|
8
|
+
|
|
9
|
+
Key public function
|
|
10
|
+
-------------------
|
|
11
|
+
retrosynthesis(mol, max_depth, beam_width) -> RetrosynthesisTree
|
|
12
|
+
|
|
13
|
+
Supporting helpers
|
|
14
|
+
------------------
|
|
15
|
+
is_purchasable(smiles) -> bool
|
|
16
|
+
get_purchasable(smiles) -> Precursor | None
|
|
17
|
+
score_disconnection(template, precursors, target_mol) -> float
|
|
18
|
+
format_tree(tree) -> str
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import math
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
|
|
26
|
+
from molbuilder.molecule.graph import Molecule
|
|
27
|
+
from molbuilder.smiles.parser import parse
|
|
28
|
+
from molbuilder.smiles.writer import to_smiles
|
|
29
|
+
from molbuilder.reactions.reaction_types import ReactionTemplate, ReactionCategory
|
|
30
|
+
from molbuilder.reactions.knowledge_base import (
|
|
31
|
+
REACTION_TEMPLATES,
|
|
32
|
+
lookup_by_functional_group,
|
|
33
|
+
find_reactions_producing,
|
|
34
|
+
)
|
|
35
|
+
from molbuilder.reactions.functional_group_detect import (
|
|
36
|
+
detect_functional_groups,
|
|
37
|
+
FunctionalGroup,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# =====================================================================
|
|
42
|
+
# Purchasable starting materials database (~200 entries)
|
|
43
|
+
# =====================================================================
|
|
44
|
+
|
|
45
|
+
# Each entry maps a canonical SMILES to (common_name, cost_per_kg_usd).
|
|
46
|
+
# Organised roughly by functional-group class so the table is easy to
|
|
47
|
+
# extend. Costs are representative order-of-magnitude estimates for
|
|
48
|
+
# bulk laboratory quantities and are NOT authoritative pricing data.
|
|
49
|
+
|
|
50
|
+
PURCHASABLE_MATERIALS: dict[str, tuple[str, float]] = {
    # Maps a SMILES string to (common_name, cost_per_kg_usd).
    #
    # Every key must be unique: Python silently keeps only the LAST value
    # for a repeated key in a dict literal, so a duplicated SMILES would
    # overwrite the earlier name/cost without any warning.

    # --- simple hydrocarbons / gases ---
    "C": ("methane", 0.50),
    "CC": ("ethane", 1.00),
    "CCC": ("propane", 1.20),
    "CCCC": ("n-butane", 1.50),
    "C=C": ("ethylene", 1.50),
    "CC=C": ("propylene", 2.00),
    "C=CC=C": ("1,3-butadiene", 3.00),
    "C#C": ("acetylene", 2.50),
    "CC#C": ("propyne", 5.00),
    "C1CC1": ("cyclopropane", 8.00),
    "C1CCC1": ("cyclobutane", 15.00),
    "C1CCCC1": ("cyclopentane", 4.00),
    "C1CCCCC1": ("cyclohexane", 3.00),

    # --- alkyl halides ---
    "CCl": ("chloromethane", 2.00),
    "CBr": ("bromomethane", 5.00),
    "CI": ("iodomethane", 12.00),
    "CCCl": ("chloroethane", 3.00),
    "CCBr": ("bromoethane", 6.00),
    "CCI": ("iodoethane", 15.00),
    "CCCCl": ("1-chloropropane", 4.00),
    "CCCBr": ("1-bromopropane", 7.00),
    "CCCI": ("1-iodopropane", 18.00),
    "CCCCCl": ("1-chlorobutane", 5.00),
    "CCCCBr": ("1-bromobutane", 8.00),
    "CCCCI": ("1-iodobutane", 20.00),
    "CC(C)Cl": ("2-chloropropane", 5.00),
    "CC(C)Br": ("2-bromopropane", 8.00),
    "CC(C)(C)Cl": ("tert-butyl chloride", 6.00),
    "CC(C)(C)Br": ("tert-butyl bromide", 10.00),
    "C(Cl)(Cl)Cl": ("chloroform", 2.50),
    "C(Cl)Cl": ("dichloromethane", 2.00),
    "ClC=C": ("vinyl chloride", 2.00),
    "BrC=C": ("vinyl bromide", 8.00),
    "ClCC=C": ("allyl chloride", 4.00),
    "BrCC=C": ("allyl bromide", 7.00),

    # --- alcohols ---
    "CO": ("methanol", 1.00),
    "CCO": ("ethanol", 2.00),
    "CCCO": ("1-propanol", 3.00),
    "CC(C)O": ("2-propanol", 2.50),
    "CCCCO": ("1-butanol", 3.50),
    "CC(C)(C)O": ("tert-butanol", 4.00),
    "CCCCCO": ("1-pentanol", 5.00),
    "CCCCCCO": ("1-hexanol", 6.00),
    "OCC=C": ("allyl alcohol", 5.00),
    "OC1CCCCC1": ("cyclohexanol", 5.00),
    "OCCO": ("ethylene glycol", 2.00),
    "OCC(O)CO": ("glycerol", 2.50),
    "OC(C)(C)C": ("tert-butanol (alt)", 4.00),

    # --- water and simple inorganics ---
    "O": ("water", 0.01),
    "[NH3]": ("ammonia", 0.80),
    "N": ("ammonia (SMILES variant)", 0.80),
    "Cl": ("hydrochloric acid", 0.50),
    "O=C=O": ("carbon dioxide", 0.30),
    "S": ("hydrogen sulfide", 1.00),

    # --- aldehydes ---
    "C=O": ("formaldehyde", 1.50),
    "CC=O": ("acetaldehyde", 3.00),
    "CCC=O": ("propanal", 5.00),
    "CCCC=O": ("butanal", 6.00),
    "CCCCC=O": ("pentanal", 8.00),
    "O=CC=O": ("glyoxal", 5.00),

    # --- ketones ---
    "CC(C)=O": ("acetone", 1.50),
    "CCC(C)=O": ("methyl ethyl ketone", 3.00),
    "CCC(CC)=O": ("3-pentanone", 5.00),
    "CCCC(C)=O": ("2-pentanone", 5.00),
    "O=C1CCCCC1": ("cyclohexanone", 4.00),
    "C=CC(C)=O": ("methyl vinyl ketone", 6.00),

    # --- carboxylic acids ---
    "OC=O": ("formic acid", 2.00),
    "CC(O)=O": ("acetic acid", 1.50),
    "CCC(O)=O": ("propionic acid", 3.00),
    "CCCC(O)=O": ("butyric acid", 4.00),
    "CCCCC(O)=O": ("valeric acid", 6.00),
    "OC(=O)C=C": ("acrylic acid", 3.00),
    "OC(=O)CC(O)=O": ("malonic acid", 5.00),
    "OC(=O)CCC(O)=O": ("succinic acid", 4.00),
    "OC(=O)CCCCC(O)=O": ("adipic acid", 4.50),
    "OC(=O)C(O)=O": ("oxalic acid", 3.50),

    # --- esters ---
    "COC(C)=O": ("methyl acetate", 3.00),
    "CCOC(C)=O": ("ethyl acetate", 2.50),
    "CCOC(=O)CC": ("ethyl propanoate", 4.00),
    "CCOC(=O)OCC": ("diethyl carbonate", 5.00),

    # --- ethers ---
    "COC": ("dimethyl ether", 2.00),
    "CCOCC": ("diethyl ether", 3.00),
    "C1CCOC1": ("tetrahydrofuran", 4.00),
    "C1COCCO1": ("1,4-dioxane", 5.00),
    "COC=C": ("methyl vinyl ether", 6.00),
    # Diglyme is bis(2-methoxyethyl) ether: methyl groups on BOTH ends.
    # The key used to end in an ethyl group ("...OCC"), which is a
    # different compound.
    "COCCOCCOC": ("diglyme", 8.00),

    # --- amines ---
    "CN": ("methylamine", 3.00),
    "CCN": ("ethylamine", 4.00),
    "CCCN": ("propylamine", 5.00),
    "CCCCN": ("butylamine", 6.00),
    "CNC": ("dimethylamine", 4.00),
    "CN(C)C": ("trimethylamine", 5.00),
    "CCN(CC)CC": ("triethylamine", 6.00),
    "NCC=C": ("allylamine", 7.00),
    "NC1CCCCC1": ("cyclohexylamine", 8.00),
    "NCCN": ("ethylenediamine", 5.00),
    "NCCCN": ("1,3-diaminopropane", 7.00),
    "NCCCCN": ("1,4-diaminobutane", 8.00),

    # --- amides ---
    "NC=O": ("formamide", 3.00),
    "CC(N)=O": ("acetamide", 4.00),
    "CN(C)C=O": ("dimethylformamide", 3.50),

    # --- nitriles ---
    "C#N": ("hydrogen cyanide", 2.00),
    "CC#N": ("acetonitrile", 3.00),
    "CCC#N": ("propionitrile", 5.00),
    "CCCC#N": ("butyronitrile", 7.00),

    # --- aromatics ---
    "c1ccccc1": ("benzene", 2.50),
    "Cc1ccccc1": ("toluene", 2.50),
    "CCc1ccccc1": ("ethylbenzene", 3.50),
    "C=Cc1ccccc1": ("styrene", 4.00),
    "c1ccc(cc1)C": ("toluene (alt)", 2.50),
    "Oc1ccccc1": ("phenol", 3.00),
    "Nc1ccccc1": ("aniline", 4.00),
    "Clc1ccccc1": ("chlorobenzene", 3.50),
    "Brc1ccccc1": ("bromobenzene", 5.00),
    "Ic1ccccc1": ("iodobenzene", 10.00),
    "OC(=O)c1ccccc1": ("benzoic acid", 3.50),
    "O=Cc1ccccc1": ("benzaldehyde", 5.00),
    "CC(=O)c1ccccc1": ("acetophenone", 5.00),
    "c1ccc2ccccc2c1": ("naphthalene", 4.00),
    "c1ccncc1": ("pyridine", 4.00),
    "C1=COC=C1": ("furan", 5.00),
    "c1cc[nH]c1": ("pyrrole", 6.00),

    # --- aromatic halides ---
    "Fc1ccccc1": ("fluorobenzene", 6.00),
    "Clc1ccc(Cl)cc1": ("1,4-dichlorobenzene", 4.00),

    # --- amino acids (common L-forms, simplified SMILES) ---
    "NCC(O)=O": ("glycine", 5.00),
    "CC(N)C(O)=O": ("alanine", 8.00),
    "CC(C)C(N)C(O)=O": ("valine", 15.00),
    "CC(CC)C(N)C(O)=O": ("isoleucine", 20.00),
    "CCCC(N)C(O)=O": ("leucine (linear approx)", 18.00),
    "NC(=O)CC(N)C(O)=O": ("asparagine", 15.00),
    "OC(=O)CC(N)C(O)=O": ("aspartic acid", 12.00),
    "OC(=O)CCC(N)C(O)=O": ("glutamic acid", 12.00),
    "NCCCCC(N)C(O)=O": ("lysine", 20.00),
    "NC(N)=NCCCC(N)C(O)=O": ("arginine", 25.00),

    # --- thiols ---
    "CS": ("methanethiol", 4.00),
    "CCS": ("ethanethiol", 5.00),
    "CCCS": ("1-propanethiol", 7.00),

    # --- acid chlorides ---
    "CC(Cl)=O": ("acetyl chloride", 4.00),
    "ClC(Cl)=O": ("phosgene", 3.00),
    "CCC(Cl)=O": ("propanoyl chloride", 6.00),
    "OC(Cl)=O": ("chloroformic acid", 5.00),

    # --- acid anhydrides ---
    "CC(=O)OC(C)=O": ("acetic anhydride", 3.00),

    # --- epoxides ---
    "C1CO1": ("ethylene oxide", 3.00),
    "CC1CO1": ("propylene oxide", 4.00),

    # --- miscellaneous building blocks ---
    "C(=O)O": ("formic acid (alt)", 2.00),
    "CCCCCCCCCCCC": ("dodecane", 4.00),
    "CCCCCCCC": ("octane", 3.00),
    "CCCCCC": ("hexane", 2.50),
    "CCCCC": ("pentane", 2.00),
    "CC(C)CC": ("isopentane", 2.50),
    "CC(C)C": ("isobutane", 2.00),
    "C=CC(=O)OC": ("methyl acrylate", 4.00),
    "C=CC(=O)OCC": ("ethyl acrylate", 5.00),
    "C=C(C)C(=O)OC": ("methyl methacrylate", 5.00),
    "C(CO)O": ("ethylene glycol (alt)", 2.00),
    "OCCCCO": ("1,4-butanediol", 4.00),
    "C(F)(F)F": ("fluoroform", 3.00),
    "C(Cl)(Cl)(Cl)Cl": ("carbon tetrachloride", 3.00),
    "C(F)(F)(F)Cl": ("chlorotrifluoromethane", 5.00),
    "CC(=O)OC=C": ("vinyl acetate", 4.00),
    "ClCCCl": ("1,2-dichloroethane", 2.00),
    "BrCCBr": ("1,2-dibromoethane", 6.00),
    "CCCCCCCCCCCCCCCCCC(O)=O": ("stearic acid", 4.00),
    "CCCCCCCC(O)=O": ("octanoic acid", 5.00),

    # --- sugars / polyols ---
    "OCC(O)C(O)C(O)C(O)CO": ("D-sorbitol", 3.50),
    "OCC(O)C(O)CO": ("erythritol", 6.00),

    # --- diacids / anhydrides ---
    "O=C1OC(=O)C=C1": ("maleic anhydride", 3.50),
    "OC(=O)C=CC(O)=O": ("maleic acid", 4.00),

    # --- phosphorus / sulfur reagents (simplified) ---
    "OP(O)(O)=O": ("phosphoric acid", 1.50),
    "OS(O)(=O)=O": ("sulfuric acid", 0.50),
    "OS(=O)=O": ("sulfurous acid", 2.00),

    # --- azides and nitro compounds ---
    "CN=[N+]=[N-]": ("methyl azide", 10.00),
    "C[N+](=O)[O-]": ("nitromethane", 5.00),
    "CC[N+](=O)[O-]": ("nitroethane", 7.00),
    "[O-][N+](=O)c1ccccc1": ("nitrobenzene", 5.00),

    # --- additional alcohols ---
    "CC(O)CC": ("2-butanol", 4.00),
    # Pentaerythritol is C(CH2OH)4: four hydroxymethyl arms on the central
    # carbon (the key previously listed only three).
    "C(CO)(CO)(CO)CO": ("pentaerythritol", 6.00),
    "OC(C)C": ("2-propanol (alt)", 2.50),

    # --- additional halides ---
    # NOTE(review): "FC(F)F" is the same molecule as "C(F)(F)F"
    # (fluoroform) above but carries a different cost; confirm which
    # figure is intended.
    "FC(F)F": ("trifluoromethane", 5.00),
    "C(F)(F)(F)C(F)(F)F": ("hexafluoroethane", 8.00),
    "ClC(Cl)=C": ("vinylidene chloride", 4.00),
    # "CC(Cl)(C)C" puts the chlorine on the quaternary carbon, i.e. it is
    # another spelling of tert-butyl chloride (it used to be mislabelled
    # "neopentyl chloride").  It is kept as an alternative notation and
    # the real neopentyl chloride (1-chloro-2,2-dimethylpropane) is
    # listed under its own SMILES.
    "CC(Cl)(C)C": ("tert-butyl chloride (alt)", 6.00),
    "CC(C)(C)CCl": ("neopentyl chloride", 7.00),
    "BrCCCBr": ("1,3-dibromopropane", 8.00),
    "BrCCCCBr": ("1,4-dibromobutane", 10.00),
    "ICCl": ("chloroiodomethane", 12.00),

    # --- additional aromatics ---
    "c1ccoc1": ("furan (aromatic)", 5.00),
    "c1ccsc1": ("thiophene", 5.00),
    "c1cnc2ccccc2c1": ("quinoline", 8.00),
    "c1ccc2c(c1)cccc2": ("naphthalene (alt)", 4.00),
    "OCc1ccccc1": ("benzyl alcohol", 5.00),
    "NCc1ccccc1": ("benzylamine", 7.00),
    "ClCc1ccccc1": ("benzyl chloride", 6.00),
    "BrCc1ccccc1": ("benzyl bromide", 8.00),
    "c1ccc(O)c(O)c1": ("catechol", 6.00),
    "c1cc(O)cc(O)c1": ("resorcinol", 7.00),
    "Oc1ccc(O)cc1": ("hydroquinone", 5.00),
    "CC(=O)Oc1ccccc1": ("phenyl acetate", 6.00),

    # --- heterocycles ---
    # Tetrahydrofuran ("C1CCOC1") and ethylene oxide ("C1CO1") are already
    # listed under ethers / epoxides; repeating the keys here would
    # overwrite those entries, so they are intentionally not duplicated.
    "C1CCNCC1": ("piperidine", 5.00),
    "C1CCNC1": ("pyrrolidine", 6.00),
    "C1CCOCC1": ("tetrahydropyran", 5.00),
    "C1CNCCN1": ("piperazine", 6.00),
    "c1c[nH]cn1": ("imidazole", 7.00),

    # --- additional carboxylic acid derivatives ---
    "CC(=O)NC": ("N-methylacetamide", 5.00),
    "O=C(Cl)c1ccccc1": ("benzoyl chloride", 7.00),
    "OC(=O)CCCCCC(O)=O": ("pimelic acid", 6.00),
    "OC(=O)c1ccc(C(O)=O)cc1": ("terephthalic acid", 5.00),
}
|
|
317
|
+
|
|
318
|
+
# Alias table (string -> string), intended to let generic names such as an
# alkyl halide class name be accepted as well.  It is currently empty.
# NOTE(review): no code in the visible part of this module reads this
# table -- confirm whether it is still needed or should be populated.
_PURCHASABLE_ALIASES: dict[str, str] = {}
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# =====================================================================
|
|
323
|
+
# Data structures
|
|
324
|
+
# =====================================================================
|
|
325
|
+
|
|
326
|
+
@dataclass
class Precursor:
    """Starting material consumed by a single reaction step.

    Fields, in constructor order:

    * ``smiles`` -- SMILES string identifying the precursor.
    * ``molecule`` -- the parsed ``Molecule``; ``None`` when the entry was
      looked up from the purchasable-materials database instead of parsed.
    * ``name`` -- human-readable label used for display.
    * ``cost_per_kg`` -- estimated price in USD per kilogram.
    """

    smiles: str
    molecule: Molecule | None
    name: str
    cost_per_kg: float
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
@dataclass
class Disconnection:
    """A single candidate retrosynthetic step for a target node.

    Fields, in constructor order:

    * ``template`` -- the ``ReactionTemplate`` being applied in reverse.
    * ``precursors`` -- the ``Precursor`` molecules this step yields.
    * ``score`` -- quality score between 0 and 100; higher is better.
    """

    template: ReactionTemplate
    precursors: list[Precursor]
    score: float
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
@dataclass
class RetroNode:
    """One molecule in the retrosynthetic search tree.

    For a molecule that is not purchasable, ``disconnections`` lists the
    candidate retrosynthetic steps and ``children`` contains the
    recursively expanded precursor nodes of the best of them.

    Fields, in constructor order:

    * ``smiles`` -- SMILES string of this molecule.
    * ``molecule`` -- the parsed ``Molecule`` object.
    * ``functional_groups`` -- functional groups detected on the molecule.
    * ``is_purchasable`` -- True when the molecule appears in
      ``PURCHASABLE_MATERIALS``.
    * ``disconnections`` -- candidate disconnections, best first.
    * ``best_disconnection`` -- the top-scoring disconnection, if any.
    * ``children`` -- expanded precursor nodes of the best disconnection.
    * ``depth`` -- distance from the root of the tree (root is 0).
    """

    smiles: str
    molecule: Molecule
    # Each mutable default uses a factory so that every node gets its own
    # list rather than sharing one across instances.
    functional_groups: list[FunctionalGroup] = field(default_factory=list)
    is_purchasable: bool = False
    disconnections: list[Disconnection] = field(default_factory=list)
    best_disconnection: Disconnection | None = None
    children: list[RetroNode] = field(default_factory=list)
    depth: int = 0
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
@dataclass
class RetrosynthesisTree:
    """Result container for a finished retrosynthesis run.

    Fields, in constructor order:

    * ``target`` -- root ``RetroNode`` of the tree (the target molecule).
    * ``max_depth`` -- maximum search depth that was used.
    * ``beam_width`` -- number of disconnections kept per level.
    * ``routes_found`` -- total count of complete routes that reach
      purchasable materials.
    """

    target: RetroNode
    max_depth: int
    beam_width: int
    routes_found: int
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# =====================================================================
|
|
425
|
+
# Purchasability checks
|
|
426
|
+
# =====================================================================
|
|
427
|
+
|
|
428
|
+
def is_purchasable(smiles: str) -> bool:
    """Tell whether *smiles* names a known purchasable material.

    The string is first looked up verbatim in ``PURCHASABLE_MATERIALS``.
    If that misses, it is parsed and re-serialised and the resulting
    string is looked up instead, so that minor notational differences
    still match.  Any error raised during the round-trip is treated as
    "not purchasable".
    """
    if smiles not in PURCHASABLE_MATERIALS:
        # Fall back to the parse -> write round-trip form.
        try:
            return to_smiles(parse(smiles)) in PURCHASABLE_MATERIALS
        except Exception:
            return False
    return True
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def get_purchasable(smiles: str) -> Precursor | None:
    """Look up *smiles* in the purchasable table and wrap it as a Precursor.

    Returns a ``Precursor`` (with ``molecule=None``) keyed by whichever
    spelling matched: the string as given, or failing that its parse ->
    write round-trip form.  Returns ``None`` when neither matches or when
    the round-trip raises.
    """
    def _build(key: str, record: tuple[str, float]) -> Precursor:
        label, price = record
        return Precursor(smiles=key, molecule=None, name=label,
                         cost_per_kg=price)

    direct_hit = PURCHASABLE_MATERIALS.get(smiles)
    if direct_hit is not None:
        return _build(smiles, direct_hit)

    # No verbatim match: retry with the round-tripped spelling.
    try:
        round_tripped = to_smiles(parse(smiles))
        second_hit = PURCHASABLE_MATERIALS.get(round_tripped)
        if second_hit is None:
            return None
        return _build(round_tripped, second_hit)
    except Exception:
        return None
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
# =====================================================================
|
|
467
|
+
# Scoring helpers
|
|
468
|
+
# =====================================================================
|
|
469
|
+
|
|
470
|
+
def _count_heavy_atoms(mol: Molecule) -> int:
    """Return how many atoms of *mol* have a symbol other than ``"H"``."""
    heavy = 0
    for atom in mol.atoms:
        if atom.symbol != "H":
            heavy += 1
    return heavy
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def _heavy_atom_count_from_smiles(smiles: str) -> int:
    """Parse *smiles* and count its non-hydrogen atoms.

    Any exception raised while parsing or counting is swallowed and
    reported as a count of 0.
    """
    try:
        heavy = _count_heavy_atoms(parse(smiles))
    except Exception:
        heavy = 0
    return heavy
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def score_disconnection(
    template: ReactionTemplate,
    precursors: list[Precursor],
    target_mol: Molecule,
) -> float:
    """Score a retrosynthetic disconnection from 0 (poor) to 100 (ideal).

    The score is a sum of several heuristic factors:

    1. **Yield expectation** (0--25 pts): scaled by the midpoint of
       ``template.typical_yield`` (a ``(low, high)`` pair in percent).
    2. **Precursor availability** (0--30 pts): scaled by the fraction of
       precursors for which ``is_purchasable`` is true.
    3. **Complexity reduction** (0--20 pts): awarded when the largest
       precursor has fewer heavy atoms than the target.  Precursors whose
       heavy-atom count is 0 are ignored, because the counting helper
       returns 0 when a SMILES cannot be parsed; if no precursor has a
       usable count, no points are awarded.
    4. **Strategic bond preference** (0--15 pts): templates whose name or
       named reaction matches a C-C bond-forming keyword score highest,
       followed by the COUPLING category, then CARBONYL / ADDITION.
    5. **Template category bonus** (0--10 pts): a fixed bonus per
       reaction category (2 pts for categories not listed).

    Parameters
    ----------
    template : ReactionTemplate
        The reaction template being applied in reverse.
    precursors : list[Precursor]
        Precursors produced by the disconnection; may be empty, in which
        case factors 2 and 3 contribute nothing.
    target_mol : Molecule
        The molecule being disconnected.

    Returns
    -------
    float
        The total, clamped to the range 0--100.
    """
    score = 0.0

    # --- 1. Yield expectation (0--25) ---
    lo, hi = template.typical_yield
    mid_yield = (lo + hi) / 2.0
    score += 25.0 * (mid_yield / 100.0)

    # --- 2. Precursor availability (0--30) ---
    if precursors:
        purchasable_count = sum(
            1 for p in precursors if is_purchasable(p.smiles)
        )
        score += 30.0 * (purchasable_count / len(precursors))

    # --- 3. Complexity reduction (0--20) ---
    target_heavy = _count_heavy_atoms(target_mol)
    if target_heavy > 0 and precursors:
        heavy_counts = [
            _heavy_atom_count_from_smiles(p.smiles) for p in precursors
        ]
        # A count of 0 signals a SMILES that failed to parse, not a
        # trivially simple molecule.  Leaving such zeros in would let a
        # disconnection made up solely of unparseable precursors earn the
        # full bonus, so only positive counts take part in the comparison.
        max_precursor_heavy = max(
            (count for count in heavy_counts if count > 0), default=0
        )
        if 0 < max_precursor_heavy < target_heavy:
            reduction = (target_heavy - max_precursor_heavy) / target_heavy
            # A 50 % reduction in size already earns the full 20 points.
            score += 20.0 * min(1.0, reduction * 2.0)
        # If the largest precursor is no simpler, no points here.

    # --- 4. Strategic bond preference (0--15) ---
    cc_keywords = ("coupling", "grignard", "aldol", "wittig", "suzuki",
                   "heck", "sonogashira", "stille", "negishi",
                   "horner", "claisen condensation", "michael",
                   "robinson")
    name_lower = template.name.lower()
    named_lower = (template.named_reaction or "").lower()
    if any(kw in name_lower or kw in named_lower for kw in cc_keywords):
        score += 15.0
    elif template.category == ReactionCategory.COUPLING:
        score += 12.0
    elif template.category in (ReactionCategory.CARBONYL,
                               ReactionCategory.ADDITION):
        score += 6.0

    # --- 5. Template category bonus (0--10) ---
    category_bonus = {
        ReactionCategory.COUPLING: 10.0,
        ReactionCategory.CARBONYL: 8.0,
        ReactionCategory.ADDITION: 6.0,
        ReactionCategory.SUBSTITUTION: 5.0,
        ReactionCategory.REDUCTION: 4.0,
        ReactionCategory.OXIDATION: 4.0,
        ReactionCategory.ELIMINATION: 3.0,
        ReactionCategory.REARRANGEMENT: 3.0,
        ReactionCategory.PROTECTION: 1.0,
        ReactionCategory.DEPROTECTION: 1.0,
    }
    score += category_bonus.get(template.category, 2.0)

    return min(100.0, max(0.0, score))
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
# =====================================================================
|
|
563
|
+
# Reverse transform: generate precursor SMILES from a template
|
|
564
|
+
# =====================================================================
|
|
565
|
+
|
|
566
|
+
def _generate_precursors_for_template(
    target_smiles: str,
    target_mol: Molecule,
    template: ReactionTemplate,
    fg: FunctionalGroup,
) -> list[Precursor]:
    """Generate precursor SMILES by conceptually reversing *template*.

    This is a simplification: no full subgraph transform is performed.
    Instead the target is handed to a category-specific heuristic helper
    that removes or rewrites the functional group the forward reaction
    would create:

    * REDUCTION -> ``_modify_fg_smiles(..., direction="oxidise")``
    * OXIDATION -> ``_modify_fg_smiles(..., direction="reduce")``
    * COUPLING / CARBONYL -> ``_split_at_fg`` (one precursor per fragment)
    * SUBSTITUTION -> ``_substitute_fg`` plus recognisable reagents
    * ELIMINATION -> ``_add_across_double_bond``
    * ADDITION -> ``_remove_addition`` plus recognisable reagents
    * PROTECTION / DEPROTECTION -> ``_toggle_protection``
    * REARRANGEMENT -> ``_reverse_rearrangement``
    * any other category -> ``_simplify_molecule``

    Parameters
    ----------
    target_smiles : str
        SMILES of the molecule being disconnected.
    target_mol : Molecule
        Parsed form of the same molecule.
    template : ReactionTemplate
        The reaction template to reverse; its ``category``, ``name`` and
        ``reagents`` are read here.
    fg : FunctionalGroup
        The functional group on the target that the template produces.

    Returns
    -------
    list[Precursor]
        The generated precursors; empty when the helper yields nothing.
    """
    cat = template.category

    def _wrap(smi: str, label: str) -> Precursor:
        # Every generated precursor is built the same way: unparsed
        # (molecule=None) and priced by the module's cost estimator.
        return Precursor(smiles=smi, molecule=None, name=label,
                         cost_per_kg=_estimate_cost(smi))

    # COUPLING / CARBONYL: the target is split, and each fragment becomes
    # its own precursor.
    if cat in (ReactionCategory.COUPLING, ReactionCategory.CARBONYL):
        return [
            _wrap(smi, f"fragment ({template.name})")
            for smi in _split_at_fg(target_smiles, target_mol, fg, template)
        ]

    # Every remaining category yields at most one transformed SMILES,
    # optionally followed by the template's reagents.
    with_reagents = False
    is_fallback = False
    if cat == ReactionCategory.REDUCTION:
        # The product was reduced, so the precursor is the oxidised form.
        precursor_smi = _modify_fg_smiles(
            target_smiles, target_mol, fg, template, direction="oxidise")
    elif cat == ReactionCategory.OXIDATION:
        # The product was oxidised, so the precursor is the reduced form.
        precursor_smi = _modify_fg_smiles(
            target_smiles, target_mol, fg, template, direction="reduce")
    elif cat == ReactionCategory.SUBSTITUTION:
        # Replace the produced FG with the one the template requires.
        precursor_smi = _substitute_fg(
            target_smiles, target_mol, fg, template)
        with_reagents = True
    elif cat == ReactionCategory.ELIMINATION:
        # The product is an alkene; precursor is an alkyl halide or alcohol.
        precursor_smi = _add_across_double_bond(
            target_smiles, target_mol, fg, template)
    elif cat == ReactionCategory.ADDITION:
        # The product has a new FG across a former double bond.
        precursor_smi = _remove_addition(
            target_smiles, target_mol, fg, template)
        with_reagents = True
    elif cat in (ReactionCategory.PROTECTION, ReactionCategory.DEPROTECTION):
        # Precursor is the unprotected / protected form.
        precursor_smi = _toggle_protection(
            target_smiles, target_mol, fg, template)
    elif cat == ReactionCategory.REARRANGEMENT:
        # Precursor is the pre-rearrangement skeleton.
        precursor_smi = _reverse_rearrangement(
            target_smiles, target_mol, fg, template)
    else:
        # Unknown category: fall back to a simple truncation.
        precursor_smi = _simplify_molecule(target_smiles, target_mol, fg)
        is_fallback = True

    precursors: list[Precursor] = []
    if precursor_smi:
        label = ("simplified precursor" if is_fallback
                 else f"precursor ({template.name})")
        precursors.append(_wrap(precursor_smi, label))

    if with_reagents:
        # Also add each reagent as a precursor if it is recognisable.
        for reagent in template.reagents:
            reagent_precursor = _reagent_to_precursor(reagent)
            if reagent_precursor is not None:
                precursors.append(reagent_precursor)

    return precursors
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
# =====================================================================
|
|
716
|
+
# Molecular transform helpers (heuristic / simplified)
|
|
717
|
+
# =====================================================================
|
|
718
|
+
|
|
719
|
+
def _modify_fg_smiles(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
    direction: str,
) -> str | None:
    """Derive a redox precursor SMILES from the target's functional group.

    ``direction == "oxidise"``:
        * ``alcohol``            -> first ``CO`` rewritten to ``C=O``
        * ``aldehyde``/``ketone`` -> target returned unchanged

    any other ``direction`` (treated as "reduce"):
        * ``aldehyde``/``ketone`` -> first ``C=O`` rewritten to ``CO``
        * ``carboxylic_acid``    -> target returned unchanged

    ``target_mol`` and ``template`` are accepted for signature parity with
    the other transform helpers but are not consulted here.

    Returns the precursor SMILES, or None when no rule applies or the
    string rewrite fails.
    """
    group = fg.name
    oxidising = direction == "oxidise"
    carbonyls = ("aldehyde", "ketone")
    try:
        if oxidising and group == "alcohol":
            return _replace_oh_with_carbonyl(target_smiles)
        if oxidising and group in carbonyls:
            # No string rewrite is attempted; the target stands in for
            # its own precursor.
            return target_smiles
        if not oxidising and group in carbonyls:
            return _replace_carbonyl_with_oh(target_smiles)
        if not oxidising and group == "carboxylic_acid":
            return target_smiles
    except Exception:
        # Best effort: any failure in the rewrite means "no precursor".
        pass
    return None
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
def _validate_smiles_transform(original: str, transformed: str) -> str | None:
    """Accept a string-rewritten SMILES only if it is usable.

    ``transformed`` is returned when all of the following hold:

    1. it differs from ``original``;
    2. ``parse`` accepts it without raising;
    3. the parsed molecule has at least one heavy atom.

    Otherwise None is returned, so a corrupt rewrite never reaches the
    retrosynthesis tree.
    """
    if transformed != original:
        try:
            if _count_heavy_atoms(parse(transformed)) >= 1:
                return transformed
        except Exception:
            # Any parser failure is treated as "invalid transform".
            pass
    return None
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def _replace_oh_with_carbonyl(smiles: str) -> str | None:
    """Rewrite the first ``CO`` substring to ``C=O`` (alcohol -> carbonyl).

    The rewrite is only attempted when the string contains ``CO`` and
    does not already contain ``C=O``.  The candidate is then passed
    through ``_validate_smiles_transform``; None is returned when the
    rewrite is not attempted or the candidate is rejected.
    """
    if "CO" not in smiles or "C=O" in smiles:
        return None
    oxidised = smiles.replace("CO", "C=O", 1)
    return _validate_smiles_transform(smiles, oxidised)
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
def _replace_carbonyl_with_oh(smiles: str) -> str | None:
    """Rewrite the first ``C=O`` substring to ``CO`` (carbonyl -> alcohol).

    Returns None when the string has no ``C=O`` or when the candidate
    is rejected by ``_validate_smiles_transform``.
    """
    if "C=O" not in smiles:
        return None
    reduced = smiles.replace("C=O", "CO", 1)
    return _validate_smiles_transform(smiles, reduced)
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def _split_at_fg(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> list[str]:
    """Split the target into fragment SMILES around the functional group.

    Used for coupling / carbonyl reactions where two components combine.
    The functional-group atoms are removed, the connected pieces that
    were attached to them are collected, and the two pieces with the
    most heavy atoms are serialised with ``_fragment_to_smiles``.

    Fallbacks:

    * fewer than two bonds leave the functional group: return the
      simplified molecule (or the target itself if that fails);
    * only one fragment serialises: add the first template reagent that
      maps to a purchasable material, or ``"C"`` if none does;
    * no fragment serialises: return ``[target_smiles]``.

    Returns
    -------
    list[str]
        One or two SMILES strings.
    """
    fg_atoms_set = set(fg.atoms)

    # Bonds that cross from a functional-group atom to the rest of the
    # molecule, stored as (inside atom, outside atom).
    break_bonds: list[tuple[int, int]] = [
        (a_idx, nb)
        for a_idx in fg.atoms
        for nb in target_mol.neighbors(a_idx)
        if nb not in fg_atoms_set
    ]

    if len(break_bonds) < 2:
        # Cannot split meaningfully -- return the whole thing simplified
        simp = _simplify_molecule(target_smiles, target_mol, fg)
        return [simp] if simp else [target_smiles]

    # Flood-fill from each outside atom without entering the functional
    # group; every fill yields one connected fragment.
    fragments: list[set[int]] = []
    visited_global: set[int] = set()
    for _, outside_atom in break_bonds:
        if outside_atom in visited_global:
            continue
        frag: set[int] = set()
        stack = [outside_atom]
        while stack:
            cur = stack.pop()
            if cur in frag or cur in fg_atoms_set:
                continue
            frag.add(cur)
            stack.extend(
                nb for nb in target_mol.neighbors(cur)
                if nb not in frag and nb not in fg_atoms_set
            )
        if frag:
            visited_global |= frag
            fragments.append(frag)

    # Keep the two largest fragments (by heavy-atom count).  The sort is
    # stable, so equally sized fragments stay in discovery order.
    fragments.sort(
        key=lambda atoms: sum(
            1 for idx in atoms if target_mol.atoms[idx].symbol != "H"
        ),
        reverse=True,
    )
    result_smiles: list[str] = []
    for frag in fragments[:2]:
        smi = _fragment_to_smiles(target_mol, frag)
        if smi:
            result_smiles.append(smi)

    # If we only got one fragment, add a simple reagent as the second
    if len(result_smiles) == 1:
        for reagent in template.reagents:
            rp = _reagent_to_precursor(reagent)
            if rp is not None:
                result_smiles.append(rp.smiles)
                break
        else:
            result_smiles.append("C")  # methane fallback

    if not result_smiles:
        result_smiles = [target_smiles]

    return result_smiles
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
def _fragment_to_smiles(mol: Molecule, atom_indices: set[int]) -> str:
    """Serialise a subset of *mol*'s atoms as an approximate SMILES.

    Hydrogens in ``atom_indices`` are dropped.  The remaining atoms are
    copied (in ascending index order) into a fresh ``Molecule`` together
    with every bond of *mol* whose two ends were both copied, and that
    sub-molecule is written with ``to_smiles``.  Hydrogens are not
    re-added; the SMILES writer is relied on for implicit H.

    Returns an empty string when no heavy atom is selected.  If
    ``to_smiles`` raises, the element symbols of the first six heavy
    atoms are concatenated as a last-resort string.
    """
    heavy_indices = [
        idx for idx in atom_indices if mol.atoms[idx].symbol != "H"
    ]
    heavy_indices.sort()
    if len(heavy_indices) == 0:
        return ""

    # Copy the selected heavy atoms, remembering old -> new indices.
    sub = Molecule(name="fragment")
    index_map: dict[int, int] = {}
    for src_idx in heavy_indices:
        src_atom = mol.atoms[src_idx]
        index_map[src_idx] = sub.add_atom(
            src_atom.symbol, src_atom.position.copy(), src_atom.hybridization
        )

    # Copy each bond that lies entirely inside the fragment, once.
    for bond in mol.bonds:
        if bond.atom_i not in index_map or bond.atom_j not in index_map:
            continue
        ni = index_map[bond.atom_i]
        nj = index_map[bond.atom_j]
        if sub.get_bond(ni, nj) is None:
            sub.add_bond(ni, nj, order=bond.order, rotatable=bond.rotatable)

    try:
        return to_smiles(sub)
    except Exception:
        # Fallback: concatenate symbols
        return "".join(mol.atoms[i].symbol for i in heavy_indices[:6])
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def _substitute_fg(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """For substitution reactions, swap the produced FG for the required one.

    E.g. if the template produces an alcohol from an alkyl halide, the
    precursor is the alkyl halide form.

    The swap is a plain first-occurrence substring replacement on the
    SMILES, looked up by ``(fg.name, required group)``; every candidate
    goes through ``_validate_smiles_transform``.  Only
    ``template.functional_group_required`` is read from the template;
    ``target_mol`` is unused.

    Returns the precursor SMILES, or None if no swap yields a valid
    string.
    """
    # (FG on the target, FG required by the template) ->
    # (substring to remove, substring to insert)
    swap_map = {
        ("alcohol", "alkyl_halide"): ("O", "Br"),
        ("ether", "alkyl_halide"): ("OC", "Br"),
        ("ether", "alcohol"): ("OC", "O"),
        ("primary_amine", "alkyl_halide"): ("N", "Br"),
        ("nitrile", "alkyl_halide"): ("C#N", "Br"),
        ("azide", "alkyl_halide"): ("N=[N+]=[N-]", "Br"),
    }

    fg_name = fg.name
    for req in template.functional_group_required:
        swap = swap_map.get((fg_name, req))
        if swap is None:
            continue
        old_frag, new_frag = swap
        if old_frag not in target_smiles:
            continue
        candidate = target_smiles.replace(old_frag, new_frag, 1)
        validated = _validate_smiles_transform(target_smiles, candidate)
        if validated is not None:
            return validated

    # Generic fallback: just return the target with a halide substitution
    if fg_name == "alcohol" and "O" in target_smiles:
        candidate = target_smiles.replace("O", "Br", 1)
        return _validate_smiles_transform(target_smiles, candidate)
    return None
|
|
956
|
+
|
|
957
|
+
|
|
958
|
+
def _add_across_double_bond(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """Reverse an elimination by adding H and Br across a double bond.

    Applies only when ``fg`` is an ``alkene`` and the SMILES contains
    ``C=C``: the first ``C=C`` is rewritten to ``CC(Br)`` and the result
    is validated.  ``target_mol`` and ``template`` are unused.

    Returns the alkyl bromide SMILES, or None when the rule does not
    apply or the candidate is rejected.
    """
    if fg.name != "alkene" or "C=C" not in target_smiles:
        return None
    bromide = target_smiles.replace("C=C", "CC(Br)", 1)
    return _validate_smiles_transform(target_smiles, bromide)
|
|
971
|
+
|
|
972
|
+
|
|
973
|
+
def _remove_addition(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """Reverse of addition: remove the added group to restore the alkene.

    Only applies when the template requires an ``alkene``.  Depending on
    ``fg.name`` the added group (``O`` for an alcohol; ``Br``/``Cl``/``I``
    for an ``alkyl_halide*``; the ``C1OC1`` ring for an epoxide) is
    removed by a first-occurrence substring rewrite and the candidate is
    validated.

    The rewrite consumes *both* carbons of the former double bond
    (``CC(X)`` or ``CCX`` -> ``C=C``) so the alkene keeps the carbon
    count of the target; this mirrors ``C=C`` -> ``CC(Br)`` used for the
    opposite direction.  A target with a single carbon next to the group
    (e.g. ``CO``) therefore has no alkene precursor.

    ``target_mol`` is unused.

    Returns the alkene SMILES, or None if no rewrite yields a valid
    string.
    """
    # The template required an alkene and produced the current FG
    if "alkene" not in template.functional_group_required:
        return None

    fg_name = fg.name
    if fg_name == "alcohol":
        leaving_groups: tuple[str, ...] = ("O",)
    elif fg_name.startswith("alkyl_halide"):
        leaving_groups = ("Br", "Cl", "I")
    else:
        leaving_groups = ()

    # Branched spelling first, then the linear one, for each group.
    patterns: list[str] = []
    for group in leaving_groups:
        patterns.append(f"CC({group})")
        patterns.append(f"CC{group}")
    if fg_name == "epoxide":
        patterns.append("C1OC1")

    for pattern in patterns:
        if pattern not in target_smiles:
            continue
        candidate = target_smiles.replace(pattern, "C=C", 1)
        result = _validate_smiles_transform(target_smiles, candidate)
        if result is not None:
            return result
    return None
|
|
1003
|
+
|
|
1004
|
+
|
|
1005
|
+
def _toggle_protection(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """Stand-in for switching between protected and deprotected forms.

    No structural edit is performed: the target SMILES is returned
    unchanged as its own precursor.  ``target_mol``, ``fg`` and
    ``template`` are accepted for signature parity with the other
    transform helpers and are not used.
    """
    return target_smiles
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
def _reverse_rearrangement(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """Stand-in for reversing a rearrangement.

    A rearrangement precursor is a structural isomer of the target, and
    deriving it would need subgraph matching that is not implemented
    here.  The target SMILES is therefore returned unchanged;
    ``target_mol``, ``fg`` and ``template`` are not used.
    """
    return target_smiles
|
|
1032
|
+
|
|
1033
|
+
|
|
1034
|
+
def _simplify_molecule(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
) -> str | None:
    """Strip the functional group and keep the biggest leftover piece.

    All atoms in ``fg.atoms`` and all hydrogens are discarded; among the
    remaining heavy atoms the largest connected component is found and
    serialised with ``_fragment_to_smiles``.  ``target_smiles`` is not
    used.

    Returns None when no heavy atom remains outside the functional
    group.
    """
    fg_set = set(fg.atoms)
    remaining = set(range(len(target_mol.atoms))) - fg_set
    heavy = {i for i in remaining if target_mol.atoms[i].symbol != "H"}
    if len(heavy) == 0:
        return None

    # Flood-fill over heavy, non-FG atoms; remember the biggest fill.
    visited: set[int] = set()
    best_comp: set[int] = set()
    for seed in heavy:
        if seed in visited:
            continue
        comp: set[int] = set()
        pending = [seed]
        while pending:
            cur = pending.pop()
            # `heavy` already excludes functional-group atoms.
            if cur in comp or cur not in heavy:
                continue
            comp.add(cur)
            pending.extend(
                nb for nb in target_mol.neighbors(cur)
                if nb not in comp and nb in heavy
            )
        visited.update(comp)
        if len(comp) > len(best_comp):
            best_comp = comp

    if not best_comp:
        return None

    return _fragment_to_smiles(target_mol, best_comp)
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
def _reagent_to_precursor(reagent_str: str) -> Precursor | None:
    """Resolve a reagent name to a purchasable material, if possible.

    The name is looked up (exact match) in a fixed name -> SMILES table
    and the SMILES is handed to ``get_purchasable``.

    Returns None when the name is not in the table or when
    ``get_purchasable`` has no entry for the mapped SMILES.
    """
    # NOTE(review): several hydride/oxidant reagents map to "O" and NaN3
    # maps to a methyl azide string -- presumably placeholders; confirm.
    reagent_map: dict[str, str] = {
        "NaOH": "O",
        "NaCN": "C#N",
        "NaN3": "CN=[N+]=[N-]",
        "NaOMe": "CO",
        "NaOEt": "CCO",
        "HBr": "Br",
        "HCl": "Cl",
        "H2O": "O",
        "MeOH": "CO",
        "EtOH": "CCO",
        "NaBH4": "O",
        "LiAlH4": "O",
        "H2": "O",
        "BH3*THF": "C1CCOC1",
        "mCPBA": "O",
        "Br2": "Br",
        "PCC": "O",
        "n-BuLi": "CCCC",
    }
    mapped = reagent_map.get(reagent_str)
    if mapped is None:
        return None
    return get_purchasable(mapped)
|
|
1108
|
+
|
|
1109
|
+
|
|
1110
|
+
def _estimate_cost(smiles: str) -> float:
    """Estimate a cost per kg for the molecule written as *smiles*.

    If *smiles* is a key of ``PURCHASABLE_MATERIALS`` the second element
    of its entry is returned.  Otherwise the string is parsed and the
    estimate is 10.0 per heavy atom with a floor of 5.0; if parsing or
    counting fails, a flat 50.0 is returned.
    """
    catalogue_entry = PURCHASABLE_MATERIALS.get(smiles)
    if catalogue_entry is not None:
        return catalogue_entry[1]

    # Rough estimate: $10/kg per heavy atom
    try:
        heavy_atoms = _count_heavy_atoms(parse(smiles))
        return max(5.0, heavy_atoms * 10.0)
    except Exception:
        return 50.0
|
|
1126
|
+
|
|
1127
|
+
|
|
1128
|
+
# =====================================================================
|
|
1129
|
+
# Template matching: which templates apply to a given FG?
|
|
1130
|
+
# =====================================================================
|
|
1131
|
+
|
|
1132
|
+
def _find_applicable_templates(
    fg: FunctionalGroup,
    all_fg_names: list[str],
) -> list[ReactionTemplate]:
    """Collect the reaction templates relevant to *fg*, without duplicates.

    Candidates are gathered in this order:

    1. templates returned by ``find_reactions_producing(fg.name)`` --
       reactions that produce the group, i.e. the primary
       retrosynthetic match;
    2. templates returned by ``lookup_by_functional_group(fg.name)`` --
       reactions that use the group as a handle;
    3. when ``fg.name`` starts with ``"alkyl_halide_"``, templates
       returned by ``lookup_by_functional_group("alkyl_halide")``.

    A candidate is kept only if its name has not been seen yet and
    ``tmpl.is_compatible(all_fg_names)`` is true.
    """
    results: list[ReactionTemplate] = []
    seen_names: set[str] = set()

    def _take(candidates) -> None:
        # Append each new, compatible template in the order offered.
        for tmpl in candidates:
            if tmpl.name in seen_names or not tmpl.is_compatible(all_fg_names):
                continue
            results.append(tmpl)
            seen_names.add(tmpl.name)

    _take(find_reactions_producing(fg.name))
    _take(lookup_by_functional_group(fg.name))

    # Specific halides also match templates keyed on the generic name.
    if fg.name.startswith("alkyl_halide_"):
        _take(lookup_by_functional_group("alkyl_halide"))

    return results
|
|
1170
|
+
|
|
1171
|
+
|
|
1172
|
+
# =====================================================================
|
|
1173
|
+
# Beam search retrosynthesis
|
|
1174
|
+
# =====================================================================
|
|
1175
|
+
|
|
1176
|
+
def _build_retro_node(
    smiles: str,
    mol: Molecule,
    depth: int,
    max_depth: int,
    beam_width: int,
    visited_smiles: set[str],
) -> RetroNode:
    """Build one node of the retrosynthesis tree.

    If the molecule is purchasable, the node is a leaf.  Otherwise,
    functional groups are detected, templates are matched, disconnections
    are scored, and the top *beam_width* disconnections are kept.  The
    best disconnection's precursors are then expanded recursively.

    Parameters
    ----------
    smiles, mol
        The molecule for this node, as SMILES and as a parsed object.
    depth, max_depth
        Current depth and the depth at which expansion stops.
    beam_width
        Number of scored disconnections stored on the node.
    visited_smiles
        SMILES strings on the path from the root to this node.  It is a
        single shared set: this call adds *smiles* while expanding and
        removes it again before returning (backtracking), so the same
        intermediate may be expanded in sibling branches while cycles
        along one path are still cut.

    Returns
    -------
    RetroNode
        The populated node.  Precursors whose SMILES cannot be parsed
        are skipped and get no child node.
    """
    node = RetroNode(
        smiles=smiles,
        molecule=mol,
        depth=depth,
    )

    # Check purchasability
    if is_purchasable(smiles):
        node.is_purchasable = True
        return node

    # Detect functional groups
    fgs = detect_functional_groups(mol)
    node.functional_groups = fgs

    # Depth limit, and cycle guard for the current path
    if depth >= max_depth or smiles in visited_smiles:
        return node

    visited_smiles.add(smiles)
    try:
        # Collect all FG names for compatibility check
        all_fg_names = [fg.name for fg in fgs]

        # Generate disconnections; each template is tried once, on the
        # first functional group that offers it.
        disconnections: list[Disconnection] = []
        seen_templates: set[str] = set()
        for fg in fgs:
            for tmpl in _find_applicable_templates(fg, all_fg_names):
                if tmpl.name in seen_templates:
                    continue
                seen_templates.add(tmpl.name)

                precursors = _generate_precursors_for_template(
                    smiles, mol, tmpl, fg)
                if not precursors:
                    continue

                score = score_disconnection(tmpl, precursors, mol)
                disconnections.append(Disconnection(
                    template=tmpl, precursors=precursors, score=score))

        # Sort by score (highest first) and keep top beam_width
        disconnections.sort(key=lambda d: d.score, reverse=True)
        node.disconnections = disconnections[:beam_width]
        if not node.disconnections:
            return node

        node.best_disconnection = node.disconnections[0]

        # Expand the precursors of the best disconnection
        for precursor in node.best_disconnection.precursors:
            purchasable = is_purchasable(precursor.smiles)
            child_mol = _safe_parse(precursor.smiles)
            if child_mol is None:
                continue
            if purchasable:
                child_node = RetroNode(
                    smiles=precursor.smiles,
                    molecule=child_mol,
                    is_purchasable=True,
                    depth=depth + 1,
                )
            else:
                child_node = _build_retro_node(
                    precursor.smiles, child_mol,
                    depth + 1, max_depth, beam_width,
                    visited_smiles,
                )
            node.children.append(child_node)

        return node
    finally:
        # Backtrack: this SMILES is no longer on the active path.
        visited_smiles.discard(smiles)
|
|
1271
|
+
|
|
1272
|
+
|
|
1273
|
+
def _safe_parse(smiles: str) -> Molecule | None:
    """Return ``parse(smiles)``, or None if parsing raises."""
    try:
        parsed = parse(smiles)
    except Exception:
        # Deliberate best effort: an unparseable SMILES is simply skipped.
        parsed = None
    return parsed
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
def _count_routes(node: RetroNode) -> int:
    """Count complete routes below *node*.

    A purchasable node counts as one route.  A non-purchasable node
    without children counts as zero.  Any other node contributes the
    product of its children's counts, so a single unresolved child
    makes the whole node count zero.
    """
    if node.is_purchasable:
        return 1

    # Empty product would be 1, but a childless dead end is 0 routes.
    total = 1 if node.children else 0
    for child in node.children:
        total *= _count_routes(child)
    return total
|
|
1296
|
+
|
|
1297
|
+
|
|
1298
|
+
def retrosynthesis(
    mol: Molecule,
    max_depth: int = 8,
    beam_width: int = 5,
) -> RetrosynthesisTree:
    """Run a retrosynthetic analysis of *mol*.

    The target is written as SMILES and expanded backwards with
    ``_build_retro_node``: purchasable molecules become leaves; for
    anything else, functional groups are detected, matching reaction
    templates are turned into scored disconnections, the best
    *beam_width* are stored on the node, and the precursors of the top
    one are expanded in turn until *max_depth* is reached.  Finally the
    complete routes in the tree are counted.

    Parameters
    ----------
    mol : Molecule
        The target molecule to analyse.
    max_depth : int
        Maximum number of retrosynthetic steps to explore.
    beam_width : int
        Number of disconnections to keep at each node.

    Returns
    -------
    RetrosynthesisTree
        The tree rooted at the target, together with the search
        parameters and the number of routes found.
    """
    root = _build_retro_node(
        to_smiles(mol), mol, 0, max_depth, beam_width, set())

    return RetrosynthesisTree(
        target=root,
        max_depth=max_depth,
        beam_width=beam_width,
        routes_found=_count_routes(root),
    )
|
|
1347
|
+
|
|
1348
|
+
|
|
1349
|
+
# =====================================================================
|
|
1350
|
+
# Tree formatting (ASCII text)
|
|
1351
|
+
# =====================================================================
|
|
1352
|
+
|
|
1353
|
+
def _format_node(node: RetroNode, indent: str, is_last: bool,
                 lines: list[str]) -> None:
    """Recursively format a node and its children as an ASCII tree.

    One line is appended to *lines* for the node itself -- *indent*, a
    connector, the SMILES and a status suffix -- followed by optional
    annotation lines and then the lines of every child.

    Parameters
    ----------
    node : RetroNode
        The node to render.
    indent : str
        Prefix inherited from the ancestors of *node*.
    is_last : bool
        Whether *node* is the last child of its parent; selects the
        connector and the continuation prefix for its descendants.
    lines : list[str]
        Output buffer, extended in place.
    """
    connector = "`-- " if is_last else "|-- "

    if node.is_purchasable:
        entry = PURCHASABLE_MATERIALS.get(node.smiles)
        name = entry[0] if entry else "purchasable"
        status = f" [AVAILABLE: {name}]"
    elif node.best_disconnection:
        best = node.best_disconnection
        status = f" <-- {best.template.name} (score={best.score:.1f})"
    elif node.depth > 0:
        status = " [no route found]"
    else:
        # An unresolved root is printed without a status suffix.
        status = ""

    lines.append(f"{indent}{connector}{node.smiles}{status}")

    # Continuation indent for children.  Both variants are as wide as
    # the 4-character connector so that every level lines up.
    child_indent = indent + ("    " if is_last else "|   ")

    # Show functional groups at the root
    if node.depth == 0 and node.functional_groups:
        fg_names = ", ".join(fg.name for fg in node.functional_groups)
        lines.append(f"{child_indent}FGs: {fg_names}")

    # Show alternative disconnections (briefly)
    if node.disconnections and len(node.disconnections) > 1:
        lines.append(f"{child_indent}({len(node.disconnections)} "
                     f"disconnection(s) evaluated)")

    # Recurse into children
    last_index = len(node.children) - 1
    for i, child in enumerate(node.children):
        _format_node(child, child_indent, i == last_index, lines)
|
|
1389
|
+
|
|
1390
|
+
|
|
1391
|
+
def format_tree(tree: RetrosynthesisTree) -> str:
    """Render a RetrosynthesisTree as an ASCII text diagram.

    The output starts with a five-line header -- a title, a rule of 58
    ``=`` characters, the target SMILES, a summary line with the search
    parameters and the number of routes found, and a second rule --
    followed by the tree itself as produced by ``_format_node`` for the
    root node.

    Parameters
    ----------
    tree : RetrosynthesisTree
        The retrosynthesis tree to format.

    Returns
    -------
    str
        Multi-line text; lines are joined with ``"\\n"`` and there is no
        trailing newline.
    """
    rule = "=" * 58
    lines: list[str] = [
        "Retrosynthetic Analysis",
        rule,
        f"Target: {tree.target.smiles}",
        f"Max depth: {tree.max_depth} "
        f"Beam width: {tree.beam_width} "
        f"Routes found: {tree.routes_found}",
        rule,
    ]

    _format_node(tree.target, "", True, lines)

    return "\n".join(lines)
|