molbuilder 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. molbuilder/__init__.py +8 -0
  2. molbuilder/__main__.py +6 -0
  3. molbuilder/atomic/__init__.py +4 -0
  4. molbuilder/atomic/bohr.py +235 -0
  5. molbuilder/atomic/quantum_atom.py +334 -0
  6. molbuilder/atomic/quantum_numbers.py +196 -0
  7. molbuilder/atomic/wavefunctions.py +297 -0
  8. molbuilder/bonding/__init__.py +4 -0
  9. molbuilder/bonding/covalent.py +442 -0
  10. molbuilder/bonding/lewis.py +347 -0
  11. molbuilder/bonding/vsepr.py +433 -0
  12. molbuilder/cli/__init__.py +1 -0
  13. molbuilder/cli/demos.py +516 -0
  14. molbuilder/cli/menu.py +127 -0
  15. molbuilder/cli/wizard.py +831 -0
  16. molbuilder/core/__init__.py +6 -0
  17. molbuilder/core/bond_data.py +170 -0
  18. molbuilder/core/constants.py +51 -0
  19. molbuilder/core/element_properties.py +183 -0
  20. molbuilder/core/elements.py +181 -0
  21. molbuilder/core/geometry.py +232 -0
  22. molbuilder/gui/__init__.py +2 -0
  23. molbuilder/gui/app.py +286 -0
  24. molbuilder/gui/canvas3d.py +115 -0
  25. molbuilder/gui/dialogs.py +117 -0
  26. molbuilder/gui/event_handler.py +118 -0
  27. molbuilder/gui/sidebar.py +105 -0
  28. molbuilder/gui/toolbar.py +71 -0
  29. molbuilder/io/__init__.py +1 -0
  30. molbuilder/io/json_io.py +146 -0
  31. molbuilder/io/mol_sdf.py +169 -0
  32. molbuilder/io/pdb.py +184 -0
  33. molbuilder/io/smiles_io.py +47 -0
  34. molbuilder/io/xyz.py +103 -0
  35. molbuilder/molecule/__init__.py +2 -0
  36. molbuilder/molecule/amino_acids.py +919 -0
  37. molbuilder/molecule/builders.py +257 -0
  38. molbuilder/molecule/conformations.py +70 -0
  39. molbuilder/molecule/functional_groups.py +484 -0
  40. molbuilder/molecule/graph.py +712 -0
  41. molbuilder/molecule/peptides.py +13 -0
  42. molbuilder/molecule/stereochemistry.py +6 -0
  43. molbuilder/process/__init__.py +3 -0
  44. molbuilder/process/conditions.py +260 -0
  45. molbuilder/process/costing.py +316 -0
  46. molbuilder/process/purification.py +285 -0
  47. molbuilder/process/reactor.py +297 -0
  48. molbuilder/process/safety.py +476 -0
  49. molbuilder/process/scale_up.py +427 -0
  50. molbuilder/process/solvent_systems.py +204 -0
  51. molbuilder/reactions/__init__.py +3 -0
  52. molbuilder/reactions/functional_group_detect.py +728 -0
  53. molbuilder/reactions/knowledge_base.py +1716 -0
  54. molbuilder/reactions/reaction_types.py +102 -0
  55. molbuilder/reactions/reagent_data.py +1248 -0
  56. molbuilder/reactions/retrosynthesis.py +1430 -0
  57. molbuilder/reactions/synthesis_route.py +377 -0
  58. molbuilder/reports/__init__.py +158 -0
  59. molbuilder/reports/cost_report.py +206 -0
  60. molbuilder/reports/molecule_report.py +279 -0
  61. molbuilder/reports/safety_report.py +296 -0
  62. molbuilder/reports/synthesis_report.py +283 -0
  63. molbuilder/reports/text_formatter.py +170 -0
  64. molbuilder/smiles/__init__.py +4 -0
  65. molbuilder/smiles/parser.py +487 -0
  66. molbuilder/smiles/tokenizer.py +291 -0
  67. molbuilder/smiles/writer.py +375 -0
  68. molbuilder/visualization/__init__.py +1 -0
  69. molbuilder/visualization/bohr_viz.py +166 -0
  70. molbuilder/visualization/molecule_viz.py +368 -0
  71. molbuilder/visualization/quantum_viz.py +434 -0
  72. molbuilder/visualization/theme.py +12 -0
  73. molbuilder-1.0.0.dist-info/METADATA +360 -0
  74. molbuilder-1.0.0.dist-info/RECORD +78 -0
  75. molbuilder-1.0.0.dist-info/WHEEL +5 -0
  76. molbuilder-1.0.0.dist-info/entry_points.txt +2 -0
  77. molbuilder-1.0.0.dist-info/licenses/LICENSE +21 -0
  78. molbuilder-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1430 @@
1
+ """Retrosynthetic analysis engine using beam search.
2
+
3
+ Given a target molecule, this module works backwards from the product to
4
+ identify commercially available starting materials, applying known reaction
5
+ templates in reverse (disconnection approach). A beam search explores the
6
+ most promising disconnections at each level, producing a retrosynthesis
7
+ tree that can later be converted into a forward synthesis route.
8
+
9
+ Key public function
10
+ -------------------
11
+ retrosynthesis(mol, max_depth, beam_width) -> RetrosynthesisTree
12
+
13
+ Supporting helpers
14
+ ------------------
15
+ is_purchasable(smiles) -> bool
16
+ get_purchasable(smiles) -> Precursor | None
17
+ score_disconnection(template, precursors, target_mol) -> float
18
+ format_tree(tree) -> str
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import math
24
+ from dataclasses import dataclass, field
25
+
26
+ from molbuilder.molecule.graph import Molecule
27
+ from molbuilder.smiles.parser import parse
28
+ from molbuilder.smiles.writer import to_smiles
29
+ from molbuilder.reactions.reaction_types import ReactionTemplate, ReactionCategory
30
+ from molbuilder.reactions.knowledge_base import (
31
+ REACTION_TEMPLATES,
32
+ lookup_by_functional_group,
33
+ find_reactions_producing,
34
+ )
35
+ from molbuilder.reactions.functional_group_detect import (
36
+ detect_functional_groups,
37
+ FunctionalGroup,
38
+ )
39
+
40
+
41
+ # =====================================================================
42
+ # Purchasable starting materials database (~200 entries)
43
+ # =====================================================================
44
+
45
+ # Each entry maps a canonical SMILES to (common_name, cost_per_kg_usd).
46
+ # Organised roughly by functional-group class so the table is easy to
47
+ # extend. Costs are representative order-of-magnitude estimates for
48
+ # bulk laboratory quantities and are NOT authoritative pricing data.
49
+
50
# Purchasable starting materials, keyed by the SMILES spelling that is looked
# up, with a (display name, cost in USD per kg) value.  Several molecules are
# listed under more than one spelling ("(alt)" entries); those spellings carry
# the same cost so the estimate does not depend on notation.  No key may be
# repeated: in a dict display a later duplicate silently replaces the earlier
# entry.
PURCHASABLE_MATERIALS: dict[str, tuple[str, float]] = {
    # --- simple hydrocarbons / gases ---
    "C": ("methane", 0.50),
    "CC": ("ethane", 1.00),
    "CCC": ("propane", 1.20),
    "CCCC": ("n-butane", 1.50),
    "C=C": ("ethylene", 1.50),
    "CC=C": ("propylene", 2.00),
    "C=CC=C": ("1,3-butadiene", 3.00),
    "C#C": ("acetylene", 2.50),
    "CC#C": ("propyne", 5.00),
    "C1CC1": ("cyclopropane", 8.00),
    "C1CCC1": ("cyclobutane", 15.00),
    "C1CCCC1": ("cyclopentane", 4.00),
    "C1CCCCC1": ("cyclohexane", 3.00),

    # --- alkyl halides ---
    "CCl": ("chloromethane", 2.00),
    "CBr": ("bromomethane", 5.00),
    "CI": ("iodomethane", 12.00),
    "CCCl": ("chloroethane", 3.00),
    "CCBr": ("bromoethane", 6.00),
    "CCI": ("iodoethane", 15.00),
    "CCCCl": ("1-chloropropane", 4.00),
    "CCCBr": ("1-bromopropane", 7.00),
    "CCCI": ("1-iodopropane", 18.00),
    "CCCCCl": ("1-chlorobutane", 5.00),
    "CCCCBr": ("1-bromobutane", 8.00),
    "CCCCI": ("1-iodobutane", 20.00),
    "CC(C)Cl": ("2-chloropropane", 5.00),
    "CC(C)Br": ("2-bromopropane", 8.00),
    "CC(C)(C)Cl": ("tert-butyl chloride", 6.00),
    "CC(C)(C)Br": ("tert-butyl bromide", 10.00),
    "C(Cl)(Cl)Cl": ("chloroform", 2.50),
    "C(Cl)Cl": ("dichloromethane", 2.00),
    "ClC=C": ("vinyl chloride", 2.00),
    "BrC=C": ("vinyl bromide", 8.00),
    "ClCC=C": ("allyl chloride", 4.00),
    "BrCC=C": ("allyl bromide", 7.00),

    # --- alcohols ---
    "CO": ("methanol", 1.00),
    "CCO": ("ethanol", 2.00),
    "CCCO": ("1-propanol", 3.00),
    "CC(C)O": ("2-propanol", 2.50),
    "CCCCO": ("1-butanol", 3.50),
    "CC(C)(C)O": ("tert-butanol", 4.00),
    "CCCCCO": ("1-pentanol", 5.00),
    "CCCCCCO": ("1-hexanol", 6.00),
    "OCC=C": ("allyl alcohol", 5.00),
    "OC1CCCCC1": ("cyclohexanol", 5.00),
    "OCCO": ("ethylene glycol", 2.00),
    "OCC(O)CO": ("glycerol", 2.50),
    "OC(C)(C)C": ("tert-butanol (alt)", 4.00),

    # --- water and simple inorganics ---
    "O": ("water", 0.01),
    "[NH3]": ("ammonia", 0.80),
    "N": ("ammonia (SMILES variant)", 0.80),
    "Cl": ("hydrochloric acid", 0.50),
    "O=C=O": ("carbon dioxide", 0.30),
    "S": ("hydrogen sulfide", 1.00),

    # --- aldehydes ---
    "C=O": ("formaldehyde", 1.50),
    "CC=O": ("acetaldehyde", 3.00),
    "CCC=O": ("propanal", 5.00),
    "CCCC=O": ("butanal", 6.00),
    "CCCCC=O": ("pentanal", 8.00),
    "O=CC=O": ("glyoxal", 5.00),

    # --- ketones ---
    "CC(C)=O": ("acetone", 1.50),
    "CCC(C)=O": ("methyl ethyl ketone", 3.00),
    "CCC(CC)=O": ("3-pentanone", 5.00),
    "CCCC(C)=O": ("2-pentanone", 5.00),
    "O=C1CCCCC1": ("cyclohexanone", 4.00),
    "C=CC(C)=O": ("methyl vinyl ketone", 6.00),

    # --- carboxylic acids ---
    "OC=O": ("formic acid", 2.00),
    "CC(O)=O": ("acetic acid", 1.50),
    "CCC(O)=O": ("propionic acid", 3.00),
    "CCCC(O)=O": ("butyric acid", 4.00),
    "CCCCC(O)=O": ("valeric acid", 6.00),
    "OC(=O)C=C": ("acrylic acid", 3.00),
    "OC(=O)CC(O)=O": ("malonic acid", 5.00),
    "OC(=O)CCC(O)=O": ("succinic acid", 4.00),
    "OC(=O)CCCCC(O)=O": ("adipic acid", 4.50),
    "OC(=O)C(O)=O": ("oxalic acid", 3.50),

    # --- esters ---
    "COC(C)=O": ("methyl acetate", 3.00),
    "CCOC(C)=O": ("ethyl acetate", 2.50),
    "CCOC(=O)CC": ("ethyl propanoate", 4.00),
    "CCOC(=O)OCC": ("diethyl carbonate", 5.00),

    # --- ethers ---
    "COC": ("dimethyl ether", 2.00),
    "CCOCC": ("diethyl ether", 3.00),
    "C1CCOC1": ("tetrahydrofuran", 4.00),
    "C1COCCO1": ("1,4-dioxane", 5.00),
    "COC=C": ("methyl vinyl ether", 6.00),
    # Diglyme is bis(2-methoxyethyl) ether.  The longer key below was
    # previously labelled "diglyme" but carries an ethyl terminus; it is kept
    # (with a correct name) so existing lookups still succeed.
    "COCCOCCOC": ("diglyme", 8.00),
    "COCCOCCOCC": ("diethylene glycol ethyl methyl ether", 8.00),

    # --- amines ---
    "CN": ("methylamine", 3.00),
    "CCN": ("ethylamine", 4.00),
    "CCCN": ("propylamine", 5.00),
    "CCCCN": ("butylamine", 6.00),
    "CNC": ("dimethylamine", 4.00),
    "CN(C)C": ("trimethylamine", 5.00),
    "CCN(CC)CC": ("triethylamine", 6.00),
    "NCC=C": ("allylamine", 7.00),
    "NC1CCCCC1": ("cyclohexylamine", 8.00),
    "NCCN": ("ethylenediamine", 5.00),
    "NCCCN": ("1,3-diaminopropane", 7.00),
    "NCCCCN": ("1,4-diaminobutane", 8.00),

    # --- amides ---
    "NC=O": ("formamide", 3.00),
    "CC(N)=O": ("acetamide", 4.00),
    "CN(C)C=O": ("dimethylformamide", 3.50),

    # --- nitriles ---
    "C#N": ("hydrogen cyanide", 2.00),
    "CC#N": ("acetonitrile", 3.00),
    "CCC#N": ("propionitrile", 5.00),
    "CCCC#N": ("butyronitrile", 7.00),

    # --- aromatics ---
    "c1ccccc1": ("benzene", 2.50),
    "Cc1ccccc1": ("toluene", 2.50),
    "CCc1ccccc1": ("ethylbenzene", 3.50),
    "C=Cc1ccccc1": ("styrene", 4.00),
    "c1ccc(cc1)C": ("toluene (alt)", 2.50),
    "Oc1ccccc1": ("phenol", 3.00),
    "Nc1ccccc1": ("aniline", 4.00),
    "Clc1ccccc1": ("chlorobenzene", 3.50),
    "Brc1ccccc1": ("bromobenzene", 5.00),
    "Ic1ccccc1": ("iodobenzene", 10.00),
    "OC(=O)c1ccccc1": ("benzoic acid", 3.50),
    "O=Cc1ccccc1": ("benzaldehyde", 5.00),
    "CC(=O)c1ccccc1": ("acetophenone", 5.00),
    "c1ccc2ccccc2c1": ("naphthalene", 4.00),
    "c1ccncc1": ("pyridine", 4.00),
    "C1=COC=C1": ("furan", 5.00),
    "c1cc[nH]c1": ("pyrrole", 6.00),

    # --- aromatic halides ---
    "Fc1ccccc1": ("fluorobenzene", 6.00),
    "Clc1ccc(Cl)cc1": ("1,4-dichlorobenzene", 4.00),

    # --- amino acids (common L-forms, simplified SMILES) ---
    "NCC(O)=O": ("glycine", 5.00),
    "CC(N)C(O)=O": ("alanine", 8.00),
    "CC(C)C(N)C(O)=O": ("valine", 15.00),
    "CC(CC)C(N)C(O)=O": ("isoleucine", 20.00),
    "CCCC(N)C(O)=O": ("leucine (linear approx)", 18.00),
    "NC(=O)CC(N)C(O)=O": ("asparagine", 15.00),
    "OC(=O)CC(N)C(O)=O": ("aspartic acid", 12.00),
    "OC(=O)CCC(N)C(O)=O": ("glutamic acid", 12.00),
    "NCCCCC(N)C(O)=O": ("lysine", 20.00),
    "NC(N)=NCCCC(N)C(O)=O": ("arginine", 25.00),

    # --- thiols ---
    "CS": ("methanethiol", 4.00),
    "CCS": ("ethanethiol", 5.00),
    "CCCS": ("1-propanethiol", 7.00),

    # --- acid chlorides ---
    "CC(Cl)=O": ("acetyl chloride", 4.00),
    "ClC(Cl)=O": ("phosgene", 3.00),
    "CCC(Cl)=O": ("propanoyl chloride", 6.00),
    "OC(Cl)=O": ("chloroformic acid", 5.00),

    # --- acid anhydrides ---
    "CC(=O)OC(C)=O": ("acetic anhydride", 3.00),

    # --- epoxides ---
    "C1CO1": ("ethylene oxide", 3.00),
    "CC1CO1": ("propylene oxide", 4.00),

    # --- miscellaneous building blocks ---
    "C(=O)O": ("formic acid (alt)", 2.00),
    "CCCCCCCCCCCC": ("dodecane", 4.00),
    "CCCCCCCC": ("octane", 3.00),
    "CCCCCC": ("hexane", 2.50),
    "CCCCC": ("pentane", 2.00),
    "CC(C)CC": ("isopentane", 2.50),
    "CC(C)C": ("isobutane", 2.00),
    "C=CC(=O)OC": ("methyl acrylate", 4.00),
    "C=CC(=O)OCC": ("ethyl acrylate", 5.00),
    "C=C(C)C(=O)OC": ("methyl methacrylate", 5.00),
    "C(CO)O": ("ethylene glycol (alt)", 2.00),
    "OCCCCO": ("1,4-butanediol", 4.00),
    "C(F)(F)F": ("fluoroform", 3.00),
    "C(Cl)(Cl)(Cl)Cl": ("carbon tetrachloride", 3.00),
    "C(F)(F)(F)Cl": ("chlorotrifluoromethane", 5.00),
    "CC(=O)OC=C": ("vinyl acetate", 4.00),
    "ClCCCl": ("1,2-dichloroethane", 2.00),
    "BrCCBr": ("1,2-dibromoethane", 6.00),
    "CCCCCCCCCCCCCCCCCC(O)=O": ("stearic acid", 4.00),
    "CCCCCCCC(O)=O": ("octanoic acid", 5.00),

    # --- sugars / polyols ---
    "OCC(O)C(O)C(O)C(O)CO": ("D-sorbitol", 3.50),
    "OCC(O)C(O)CO": ("erythritol", 6.00),

    # --- diacids / anhydrides ---
    "O=C1OC(=O)C=C1": ("maleic anhydride", 3.50),
    "OC(=O)C=CC(O)=O": ("maleic acid", 4.00),

    # --- phosphorus / sulfur reagents (simplified) ---
    "OP(O)(O)=O": ("phosphoric acid", 1.50),
    "OS(O)(=O)=O": ("sulfuric acid", 0.50),
    "OS(=O)=O": ("sulfurous acid", 2.00),

    # --- azides and nitro compounds ---
    "CN=[N+]=[N-]": ("methyl azide", 10.00),
    "C[N+](=O)[O-]": ("nitromethane", 5.00),
    "CC[N+](=O)[O-]": ("nitroethane", 7.00),
    "[O-][N+](=O)c1ccccc1": ("nitrobenzene", 5.00),

    # --- additional alcohols ---
    "CC(O)CC": ("2-butanol", 4.00),
    # Pentaerythritol has four CH2OH arms.  The three-arm key was previously
    # labelled "pentaerythritol"; it is kept under its correct name.
    "C(CO)(CO)(CO)CO": ("pentaerythritol", 6.00),
    "C(CO)(CO)CO": ("tris(hydroxymethyl)methane", 6.00),
    "OC(C)C": ("2-propanol (alt)", 2.50),

    # --- additional halides ---
    # Same molecule as "C(F)(F)F"; cost aligned with that entry.
    "FC(F)F": ("fluoroform (alt)", 3.00),
    "C(F)(F)(F)C(F)(F)F": ("hexafluoroethane", 8.00),
    "ClC(Cl)=C": ("vinylidene chloride", 4.00),
    # "CC(Cl)(C)C" is another spelling of tert-butyl chloride (chlorine on the
    # quaternary carbon); neopentyl chloride carries it on a CH2 group.
    "CC(Cl)(C)C": ("tert-butyl chloride (alt)", 6.00),
    "CC(C)(C)CCl": ("neopentyl chloride", 7.00),
    "BrCCCBr": ("1,3-dibromopropane", 8.00),
    "BrCCCCBr": ("1,4-dibromobutane", 10.00),
    "ICCl": ("chloroiodomethane", 12.00),

    # --- additional aromatics ---
    "c1ccoc1": ("furan (aromatic)", 5.00),
    "c1ccsc1": ("thiophene", 5.00),
    "c1cnc2ccccc2c1": ("quinoline", 8.00),
    "c1ccc2c(c1)cccc2": ("naphthalene (alt)", 4.00),
    "OCc1ccccc1": ("benzyl alcohol", 5.00),
    "NCc1ccccc1": ("benzylamine", 7.00),
    "ClCc1ccccc1": ("benzyl chloride", 6.00),
    "BrCc1ccccc1": ("benzyl bromide", 8.00),
    "c1ccc(O)c(O)c1": ("catechol", 6.00),
    "c1cc(O)cc(O)c1": ("resorcinol", 7.00),
    "Oc1ccc(O)cc1": ("hydroquinone", 5.00),
    "CC(=O)Oc1ccccc1": ("phenyl acetate", 6.00),

    # --- heterocycles ---
    # Tetrahydrofuran ("C1CCOC1") and ethylene oxide ("C1CO1") are listed once
    # above, under ethers and epoxides; repeating the keys here would
    # overwrite those entries.
    "C1CCNCC1": ("piperidine", 5.00),
    "C1CCNC1": ("pyrrolidine", 6.00),
    "C1CCOCC1": ("tetrahydropyran", 5.00),
    "C1CNCCN1": ("piperazine", 6.00),
    "c1c[nH]cn1": ("imidazole", 7.00),

    # --- additional carboxylic acid derivatives ---
    "CC(=O)NC": ("N-methylacetamide", 5.00),
    "O=C(Cl)c1ccccc1": ("benzoyl chloride", 7.00),
    "OC(=O)CCCCCC(O)=O": ("pimelic acid", 6.00),
    "OC(=O)c1ccc(C(O)=O)cc1": ("terephthalic acid", 5.00),
}

# String-to-string alias table, left empty here.  The original note says it is
# meant to "also accept alkyl halide generic name".
# NOTE(review): nothing in the visible part of this module populates or reads
# this mapping -- confirm whether it is still needed.
_PURCHASABLE_ALIASES: dict[str, str] = {}
320
+
321
+
322
+ # =====================================================================
323
+ # Data structures
324
+ # =====================================================================
325
+
326
@dataclass
class Precursor:
    """Starting material consumed by a single reaction step.

    Attributes
    ----------
    smiles : str
        SMILES string identifying the precursor.
    molecule : Molecule | None
        Parsed molecule object; None when the record was built from the
        purchasable-materials table without parsing.
    name : str
        Display name shown to the user.
    cost_per_kg : float
        Estimated price in USD per kilogram.
    """
    smiles: str
    molecule: Molecule | None
    name: str
    cost_per_kg: float
345
+
346
+
347
@dataclass
class Disconnection:
    """A single candidate retrosynthetic step for a target node.

    Attributes
    ----------
    template : ReactionTemplate
        Reaction template that is being applied backwards.
    precursors : list[Precursor]
        Precursor molecules this disconnection yields.
    score : float
        Heuristic quality on a 0-100 scale; larger means more attractive.
    """
    template: ReactionTemplate
    precursors: list[Precursor]
    score: float
363
+
364
+
365
@dataclass
class RetroNode:
    """One molecule in the retrosynthetic search tree.

    For a molecule that cannot be bought, ``disconnections`` lists the
    candidate retrosynthetic steps and ``children`` holds the recursively
    expanded precursor nodes of the best one.

    Attributes
    ----------
    smiles : str
        SMILES string of the molecule at this node.
    molecule : Molecule
        Parsed Molecule object for the node.
    functional_groups : list[FunctionalGroup]
        Functional groups detected on the molecule.
    is_purchasable : bool
        True when the molecule is listed in PURCHASABLE_MATERIALS.
    disconnections : list[Disconnection]
        Candidate disconnections, best first.
    best_disconnection : Disconnection | None
        Highest-scoring disconnection, or None if there is none.
    children : list[RetroNode]
        Expanded nodes for the precursors of the best disconnection.
    depth : int
        Distance from the root of the tree (the root has depth 0).
    """
    smiles: str
    molecule: Molecule
    # Every list field gets its own fresh list per instance.
    functional_groups: list[FunctionalGroup] = field(default_factory=list)
    is_purchasable: bool = False
    disconnections: list[Disconnection] = field(default_factory=list)
    best_disconnection: Disconnection | None = None
    children: list["RetroNode"] = field(default_factory=list)
    depth: int = 0
401
+
402
+
403
@dataclass
class RetrosynthesisTree:
    """Outcome of a full retrosynthesis run.

    Attributes
    ----------
    target : RetroNode
        Root node of the tree, i.e. the target molecule.
    max_depth : int
        Deepest search level that was allowed.
    beam_width : int
        Number of disconnections retained at each level.
    routes_found : int
        Count of complete routes that end in purchasable materials.
    """
    target: RetroNode
    max_depth: int
    beam_width: int
    routes_found: int
422
+
423
+
424
+ # =====================================================================
425
+ # Purchasability checks
426
+ # =====================================================================
427
+
428
def is_purchasable(smiles: str) -> bool:
    """Report whether *smiles* names a known purchasable material.

    The string is first looked up verbatim in PURCHASABLE_MATERIALS.  If
    that misses, it is parsed and re-serialised and the resulting spelling
    is looked up instead, which absorbs small notational differences.  Any
    error raised by that round trip counts as "not purchasable".
    """
    if smiles in PURCHASABLE_MATERIALS:
        return True
    try:
        # Round trip through the parser and writer for a second spelling.
        return to_smiles(parse(smiles)) in PURCHASABLE_MATERIALS
    except Exception:
        return False
444
+
445
+
446
def get_purchasable(smiles: str) -> Precursor | None:
    """Build a Precursor for *smiles* when it is purchasable, else return None.

    A verbatim table hit yields a Precursor carrying *smiles* itself.  When
    that misses, the parse/re-serialise spelling is tried; a hit there
    yields a Precursor carrying the re-serialised string.  Errors raised
    during that second attempt are swallowed and reported as None.
    """
    direct_hit = PURCHASABLE_MATERIALS.get(smiles)
    if direct_hit is not None:
        label, price = direct_hit
        return Precursor(smiles=smiles, molecule=None, name=label,
                         cost_per_kg=price)

    try:
        respelled = to_smiles(parse(smiles))
        respelled_hit = PURCHASABLE_MATERIALS.get(respelled)
        if respelled_hit is None:
            return None
        label, price = respelled_hit
        return Precursor(smiles=respelled, molecule=None, name=label,
                         cost_per_kg=price)
    except Exception:
        return None
464
+
465
+
466
+ # =====================================================================
467
+ # Scoring helpers
468
+ # =====================================================================
469
+
470
def _count_heavy_atoms(mol: Molecule) -> int:
    """Return the number of atoms in *mol* whose symbol is not ``"H"``."""
    heavy = 0
    for atom in mol.atoms:
        if atom.symbol != "H":
            heavy += 1
    return heavy
473
+
474
+
475
def _heavy_atom_count_from_smiles(smiles: str) -> int:
    """Parse *smiles* and count its non-hydrogen atoms.

    Returns 0 when parsing or counting raises, so a result of 0 does not
    distinguish "no heavy atoms" from "could not be parsed".
    """
    try:
        return _count_heavy_atoms(parse(smiles))
    except Exception:
        return 0
482
+
483
+
484
def score_disconnection(
    template: ReactionTemplate,
    precursors: list[Precursor],
    target_mol: Molecule,
) -> float:
    """Rate a retrosynthetic disconnection on a 0 (poor) to 100 (ideal) scale.

    Five heuristic contributions are added up and the total is clamped to
    the range [0, 100]:

    1. **Yield expectation** (0--25 pts): proportional to the midpoint of
       ``template.typical_yield``.
    2. **Precursor availability** (0--30 pts): proportional to the fraction
       of precursors that ``is_purchasable`` accepts.
    3. **Complexity reduction** (0--20 pts): awarded when the largest
       precursor has fewer heavy atoms than the target; a relative
       reduction of one half or more earns the full amount.
    4. **Strategic bond preference** (0--15 pts): 15 when the template name
       or named reaction mentions a C-C bond-forming keyword, otherwise 12
       for the COUPLING category, otherwise 6 for CARBONYL or ADDITION.
    5. **Template category bonus** (0--10 pts): fixed bonus per reaction
       category, 2 for categories not in the table.
    """
    total = 0.0

    # --- 1. Yield expectation (0--25) ---
    yield_low, yield_high = template.typical_yield
    yield_midpoint = (yield_low + yield_high) / 2.0
    total += 25.0 * (yield_midpoint / 100.0)

    # --- 2. Precursor availability (0--30) ---
    if precursors:
        available = 0
        for candidate in precursors:
            if is_purchasable(candidate.smiles):
                available += 1
        total += 30.0 * (available / len(precursors))

    # --- 3. Complexity reduction (0--20) ---
    # NOTE(review): _heavy_atom_count_from_smiles returns 0 on a parse
    # failure, so a set of precursors that all fail to parse is scored as a
    # maximal simplification here.
    target_size = _count_heavy_atoms(target_mol)
    if target_size > 0 and precursors:
        largest_precursor = max(
            _heavy_atom_count_from_smiles(candidate.smiles)
            for candidate in precursors
        )
        if largest_precursor < target_size:
            shrink = (target_size - largest_precursor) / target_size
            total += 20.0 * min(1.0, shrink * 2.0)

    # --- 4. Strategic bond preference (0--15) ---
    carbon_carbon_hints = (
        "coupling", "grignard", "aldol", "wittig", "suzuki",
        "heck", "sonogashira", "stille", "negishi",
        "horner", "claisen condensation", "michael",
        "robinson",
    )
    searched_texts = (
        template.name.lower(),
        (template.named_reaction or "").lower(),
    )
    forms_cc_bond = any(
        hint in text for hint in carbon_carbon_hints for text in searched_texts
    )
    category = template.category
    if forms_cc_bond:
        total += 15.0
    elif category == ReactionCategory.COUPLING:
        total += 12.0
    elif category in (ReactionCategory.CARBONYL, ReactionCategory.ADDITION):
        total += 6.0

    # --- 5. Template category bonus (0--10) ---
    bonus_by_category = {
        ReactionCategory.COUPLING: 10.0,
        ReactionCategory.CARBONYL: 8.0,
        ReactionCategory.ADDITION: 6.0,
        ReactionCategory.SUBSTITUTION: 5.0,
        ReactionCategory.REDUCTION: 4.0,
        ReactionCategory.OXIDATION: 4.0,
        ReactionCategory.ELIMINATION: 3.0,
        ReactionCategory.REARRANGEMENT: 3.0,
        ReactionCategory.PROTECTION: 1.0,
        ReactionCategory.DEPROTECTION: 1.0,
    }
    total += bonus_by_category.get(category, 2.0)

    return min(100.0, max(0.0, total))
560
+
561
+
562
+ # =====================================================================
563
+ # Reverse transform: generate precursor SMILES from a template
564
+ # =====================================================================
565
+
566
def _precursors_from_smiles(smiles: str | None, name: str) -> list[Precursor]:
    """Wrap *smiles* in a one-element Precursor list, or [] when it is falsy."""
    if not smiles:
        return []
    return [Precursor(
        smiles=smiles, molecule=None,
        name=name,
        cost_per_kg=_estimate_cost(smiles),
    )]


def _reagent_precursors(template: ReactionTemplate) -> list[Precursor]:
    """Return Precursors for the template reagents that can be recognised."""
    recognised: list[Precursor] = []
    for reagent in template.reagents:
        reagent_precursor = _reagent_to_precursor(reagent)
        if reagent_precursor is not None:
            recognised.append(reagent_precursor)
    return recognised


def _generate_precursors_for_template(
    target_smiles: str,
    target_mol: Molecule,
    template: ReactionTemplate,
    fg: FunctionalGroup,
) -> list[Precursor]:
    """Generate precursor SMILES by conceptually reversing *template*.

    This is a simplification: instead of a full subgraph transform, the
    target is modified according to the functional group the forward
    reaction would create.  The transform is chosen by ``template.category``:

    * REDUCTION / OXIDATION -- the functional group is swapped for its
      oxidised / reduced counterpart.
    * COUPLING / CARBONYL -- the target is split into fragments at the
      functional group; every fragment becomes a precursor.
    * SUBSTITUTION / ADDITION -- the functional group is replaced or
      removed, and recognisable template reagents are appended as
      additional precursors.
    * ELIMINATION -- a group is added back across the double bond.
    * PROTECTION / DEPROTECTION -- the protection state is toggled.
    * REARRANGEMENT -- the pre-rearrangement skeleton is restored.
    * anything else -- a simple truncation of the target is attempted.

    Parameters
    ----------
    target_smiles : str
        SMILES of the molecule being disconnected.
    target_mol : Molecule
        Parsed form of the same molecule.
    template : ReactionTemplate
        Reaction template applied in reverse.
    fg : FunctionalGroup
        The functional group on the target that the transform acts on.

    Returns
    -------
    list[Precursor]
        Newly built list of precursors; empty when the transform produced
        nothing usable.
    """
    cat = template.category
    step_name = f"precursor ({template.name})"

    # Redox: the precursor carries the oxidised / reduced functional group.
    if cat == ReactionCategory.REDUCTION:
        # The target is the reduced product, so its precursor is the
        # oxidised form.
        return _precursors_from_smiles(
            _modify_fg_smiles(
                target_smiles, target_mol, fg, template, direction="oxidise"),
            step_name,
        )

    if cat == ReactionCategory.OXIDATION:
        return _precursors_from_smiles(
            _modify_fg_smiles(
                target_smiles, target_mol, fg, template, direction="reduce"),
            step_name,
        )

    # Two-component reactions: every fragment is kept as returned by the
    # splitter, with no filtering.
    if cat in (ReactionCategory.COUPLING, ReactionCategory.CARBONYL):
        fragment_name = f"fragment ({template.name})"
        return [
            Precursor(
                smiles=fragment, molecule=None,
                name=fragment_name,
                cost_per_kg=_estimate_cost(fragment),
            )
            for fragment in _split_at_fg(target_smiles, target_mol, fg, template)
        ]

    # Substitution: swap the produced group for the required one, then add
    # the recognisable reagents.
    if cat == ReactionCategory.SUBSTITUTION:
        found = _precursors_from_smiles(
            _substitute_fg(target_smiles, target_mol, fg, template),
            step_name,
        )
        found.extend(_reagent_precursors(template))
        return found

    # Elimination: the target is an alkene; its precursor is an alkyl halide
    # or alcohol.
    if cat == ReactionCategory.ELIMINATION:
        return _precursors_from_smiles(
            _add_across_double_bond(target_smiles, target_mol, fg, template),
            step_name,
        )

    # Addition: undo the group added across a former double bond, then add
    # the recognisable reagents.
    if cat == ReactionCategory.ADDITION:
        found = _precursors_from_smiles(
            _remove_addition(target_smiles, target_mol, fg, template),
            step_name,
        )
        found.extend(_reagent_precursors(template))
        return found

    # Protection chemistry keeps the core structure; only the protection
    # state changes.
    if cat in (ReactionCategory.PROTECTION, ReactionCategory.DEPROTECTION):
        return _precursors_from_smiles(
            _toggle_protection(target_smiles, target_mol, fg, template),
            step_name,
        )

    if cat == ReactionCategory.REARRANGEMENT:
        return _precursors_from_smiles(
            _reverse_rearrangement(target_smiles, target_mol, fg, template),
            step_name,
        )

    # Fallback for any other category: try a simple truncation.
    return _precursors_from_smiles(
        _simplify_molecule(target_smiles, target_mol, fg),
        "simplified precursor",
    )
713
+
714
+
715
+ # =====================================================================
716
+ # Molecular transform helpers (heuristic / simplified)
717
+ # =====================================================================
718
+
719
def _modify_fg_smiles(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
    direction: str,
) -> str | None:
    """Heuristically rewrite a functional group for a redox transform.

    With ``direction == "oxidise"`` an alcohol is turned into its carbonyl
    form, while an aldehyde or ketone target is handed back unchanged.
    Every other *direction* value is treated as a reduction: an aldehyde or
    ketone is turned into an alcohol, while a carboxylic acid target is
    handed back unchanged.

    Only ``fg.name`` and *target_smiles* are consulted; *target_mol* and
    *template* are accepted but not read.  Exceptions raised by the rewrite
    helpers are swallowed.

    Returns the precursor SMILES, or None when no rule applies or the
    rewrite fails.
    """
    group = fg.name
    oxidising = direction == "oxidise"
    carbonyls = ("aldehyde", "ketone")
    try:
        if oxidising and group == "alcohol":
            # The target was reduced, so the precursor is the carbonyl.
            return _replace_oh_with_carbonyl(target_smiles)
        if oxidising:
            # Aldehyde/ketone: kept as-is (the template applies to it).
            return target_smiles if group in carbonyls else None
        if group in carbonyls:
            return _replace_carbonyl_with_oh(target_smiles)
        if group == "carboxylic_acid":
            return target_smiles
    except Exception:
        pass
    return None
753
+
754
+
755
def _validate_smiles_transform(original: str, transformed: str) -> str | None:
    """Accept a string-level SMILES rewrite only if the result is usable.

    *transformed* is returned when all of the following hold:

    1. it differs from *original*;
    2. it parses without raising;
    3. the parsed molecule has at least one heavy atom.

    Otherwise None is returned, which keeps corrupt SMILES out of the
    retrosynthesis tree.
    """
    if transformed == original:
        return None
    try:
        heavy_atoms = _count_heavy_atoms(parse(transformed))
    except Exception:
        return None
    return transformed if heavy_atoms >= 1 else None
775
+
776
+
777
+ def _replace_oh_with_carbonyl(smiles: str) -> str | None:
778
+ """Replace first C-OH with C=O (alcohol -> carbonyl).
779
+
780
+ Uses validation to prevent corrupt results from substring
781
+ collisions (e.g. 'COCO' should not become 'C=OCO').
782
+ """
783
+ if "CO" in smiles and "C=O" not in smiles:
784
+ candidate = smiles.replace("CO", "C=O", 1)
785
+ return _validate_smiles_transform(smiles, candidate)
786
+ return None
787
+
788
+
789
+ def _replace_carbonyl_with_oh(smiles: str) -> str | None:
790
+ """Replace first C=O with C-OH (carbonyl -> alcohol).
791
+
792
+ Uses validation to prevent corrupt results.
793
+ """
794
+ if "C=O" in smiles:
795
+ candidate = smiles.replace("C=O", "CO", 1)
796
+ return _validate_smiles_transform(smiles, candidate)
797
+ return None
798
+
799
+
800
def _split_at_fg(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> list[str]:
    """Split the target into fragment SMILES around a functional group.

    Used for coupling / carbonyl reactions where two components combine.
    The atoms in ``fg.atoms`` are removed, the connected pieces that were
    attached to them are collected, and the two pieces with the most
    heavy (non-H) atoms are serialised with ``_fragment_to_smiles``.

    Fallbacks, in order:
      * fewer than two bonds leave the functional group: return
        ``[_simplify_molecule(...)]`` if that yields something, otherwise
        ``[target_smiles]``;
      * exactly one fragment serialised: append the SMILES of the first
        template reagent that ``_reagent_to_precursor`` resolves, or
        ``"C"`` when none does;
      * nothing serialised: return ``[target_smiles]``.

    Returns a non-empty list of SMILES strings.
    """
    fg_atoms = set(fg.atoms)

    # One entry per bond leaving the functional group: the atom on the
    # outside end of that bond.
    boundary_atoms = [
        nb
        for a_idx in fg.atoms
        for nb in target_mol.neighbors(a_idx)
        if nb not in fg_atoms
    ]

    if len(boundary_atoms) < 2:
        # Cannot split meaningfully -- return the whole thing simplified.
        simp = _simplify_molecule(target_smiles, target_mol, fg)
        return [simp] if simp else [target_smiles]

    # Flood-fill from every boundary atom without stepping onto FG atoms.
    fragments: list[set[int]] = []
    claimed: set[int] = set()
    for seed in boundary_atoms:
        if seed in claimed:
            continue
        frag: set[int] = set()
        stack = [seed]
        while stack:
            cur = stack.pop()
            if cur in frag or cur in fg_atoms:
                continue
            frag.add(cur)
            stack.extend(
                nb
                for nb in target_mol.neighbors(cur)
                if nb not in frag and nb not in fg_atoms
            )
        if frag:
            claimed |= frag
            fragments.append(frag)

    def _heavy_count(atom_set: set[int]) -> int:
        """Number of non-hydrogen atoms in *atom_set*."""
        return sum(1 for idx in atom_set if target_mol.atoms[idx].symbol != "H")

    # Keep the two largest pieces.  Ranking by heavy atoms stops a lone
    # hydrogen neighbour from taking a slot and serialising to "".
    # list.sort is stable, so equally sized pieces keep discovery order.
    fragments.sort(key=_heavy_count, reverse=True)

    result_smiles: list[str] = []
    for frag in fragments[:2]:
        smi = _fragment_to_smiles(target_mol, frag)
        if smi:
            result_smiles.append(smi)

    # If we only got one fragment, add a simple reagent as the second.
    if len(result_smiles) == 1:
        for reagent in template.reagents:
            rp = _reagent_to_precursor(reagent)
            if rp is not None:
                result_smiles.append(rp.smiles)
                break
        else:
            result_smiles.append("C")  # methane fallback

    if not result_smiles:
        result_smiles = [target_smiles]

    return result_smiles
874
+
875
+
876
def _fragment_to_smiles(mol: Molecule, atom_indices: set[int]) -> str:
    """Serialise a subset of *mol*'s atoms as an approximate SMILES.

    Hydrogens are dropped.  The remaining atoms are copied, in ascending
    index order, into a fresh ``Molecule`` together with every bond whose
    two ends are both in the subset, and the result is written with
    ``to_smiles``.

    Returns ``""`` when the subset has no non-hydrogen atom.  If
    ``to_smiles`` raises, the symbols of the first six kept atoms are
    concatenated and returned instead.
    """
    kept = sorted(i for i in atom_indices if mol.atoms[i].symbol != "H")
    if not kept:
        return ""

    # Copy atoms, remembering where each one ends up in the sub-molecule.
    sub = Molecule(name="fragment")
    index_map: dict[int, int] = {}
    for src_idx in kept:
        src_atom = mol.atoms[src_idx]
        index_map[src_idx] = sub.add_atom(
            src_atom.symbol, src_atom.position.copy(), src_atom.hybridization)

    # Copy bonds that lie entirely inside the fragment, once each.
    for bond in mol.bonds:
        if bond.atom_i not in index_map or bond.atom_j not in index_map:
            continue
        end_a = index_map[bond.atom_i]
        end_b = index_map[bond.atom_j]
        if sub.get_bond(end_a, end_b) is None:
            sub.add_bond(end_a, end_b, order=bond.order,
                         rotatable=bond.rotatable)

    # No explicit hydrogens are added; the writer is left to deal with
    # implicit H.
    try:
        return to_smiles(sub)
    except Exception:
        # Fallback: concatenate symbols
        return "".join(mol.atoms[i].symbol for i in kept[:6])
913
+
914
+
915
def _substitute_fg(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """For substitution reactions, swap the produced FG for the required one.

    E.g. if the template produces an alcohol from an alkyl halide, the
    precursor is the alkyl halide form.

    For each entry of ``template.functional_group_required`` the pair
    ``(fg.name, required)`` is looked up in a small table of textual
    swaps.  The first swap whose result passes
    ``_validate_smiles_transform`` is returned.  If none succeeds and the
    group is an alcohol, the first ``"O"`` in the string is replaced by
    ``"Br"`` as a generic fallback.

    *target_mol* is not used.  Returns the precursor SMILES or None.

    NOTE(review): replacements act on the first textual occurrence, which
    is not necessarily the atom belonging to *fg*.
    """
    # (FG on the target, FG required by the template) -> (old text, new text)
    swap_map = {
        ("alcohol", "alkyl_halide"): ("O", "Br"),
        ("ether", "alkyl_halide"): ("OC", "Br"),
        ("ether", "alcohol"): ("OC", "O"),
        ("primary_amine", "alkyl_halide"): ("N", "Br"),
        ("nitrile", "alkyl_halide"): ("C#N", "Br"),
        ("azide", "alkyl_halide"): ("N=[N+]=[N-]", "Br"),
    }

    fg_name = fg.name
    for req in template.functional_group_required:
        swap = swap_map.get((fg_name, req))
        if swap is None:
            continue
        old_frag, new_frag = swap
        if old_frag in target_smiles:
            candidate = target_smiles.replace(old_frag, new_frag, 1)
            validated = _validate_smiles_transform(target_smiles, candidate)
            if validated is not None:
                return validated

    # Generic fallback: just return the target with a halide substitution
    if fg_name == "alcohol" and "O" in target_smiles:
        candidate = target_smiles.replace("O", "Br", 1)
        return _validate_smiles_transform(target_smiles, candidate)
    return None
956
+
957
+
958
def _add_across_double_bond(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """Reverse an elimination by adding H-Br across the first ``C=C``.

    Only applies when *fg* is named ``"alkene"`` and the SMILES text
    contains ``C=C``; that substring is rewritten once to ``CC(Br)`` and
    the candidate is returned if ``_validate_smiles_transform`` accepts
    it.  *target_mol* and *template* are not used.

    Returns the alkyl bromide SMILES, or None.
    """
    if fg.name != "alkene" or "C=C" not in target_smiles:
        return None
    brominated = target_smiles.replace("C=C", "CC(Br)", 1)
    return _validate_smiles_transform(target_smiles, brominated)
971
+
972
+
973
def _remove_addition(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
    template: ReactionTemplate,
) -> str | None:
    """Reverse an addition by rewriting the adduct text back to ``C=C``.

    Nothing is attempted unless ``"alkene"`` is among the template's
    required functional groups.  Depending on ``fg.name`` the following
    substrings are tried in order, each replaced once by ``C=C``:

      * ``alcohol``        -> ``CO``
      * ``alkyl_halide*``  -> ``CBr``, ``CCl``, ``CI``
      * ``epoxide``        -> ``C1OC1``

    The first candidate accepted by ``_validate_smiles_transform`` is
    returned; otherwise None.  *target_mol* is not used.

    NOTE(review): for the alcohol and halide cases the hetero-atom is
    textually replaced by a carbon, so the heavy-atom count is unchanged.
    """
    fg_name = fg.name
    if "alkene" not in template.functional_group_required:
        return None

    # Substrings that stand for "what the addition put on the alkene".
    patterns: list[str] = []
    if fg_name == "alcohol":
        patterns.append("CO")
    if fg_name.startswith("alkyl_halide"):
        patterns.extend(f"C{hal}" for hal in ("Br", "Cl", "I"))
    if fg_name == "epoxide":
        patterns.append("C1OC1")

    for pattern in patterns:
        if pattern not in target_smiles:
            continue
        candidate = target_smiles.replace(pattern, "C=C", 1)
        result = _validate_smiles_transform(target_smiles, candidate)
        if result is not None:
            return result
    return None
1003
+
1004
+
1005
+ def _toggle_protection(
1006
+ target_smiles: str,
1007
+ target_mol: Molecule,
1008
+ fg: FunctionalGroup,
1009
+ template: ReactionTemplate,
1010
+ ) -> str | None:
1011
+ """Toggle between protected and deprotected forms.
1012
+
1013
+ Simplification: for protection templates, just return the target
1014
+ since the core structure is essentially preserved.
1015
+ """
1016
+ return target_smiles
1017
+
1018
+
1019
+ def _reverse_rearrangement(
1020
+ target_smiles: str,
1021
+ target_mol: Molecule,
1022
+ fg: FunctionalGroup,
1023
+ template: ReactionTemplate,
1024
+ ) -> str | None:
1025
+ """Rough approximation for reversing a rearrangement.
1026
+
1027
+ Returns the target itself as a stand-in since rearrangement
1028
+ precursors are structural isomers that are hard to derive
1029
+ without full subgraph matching.
1030
+ """
1031
+ return target_smiles
1032
+
1033
+
1034
def _simplify_molecule(
    target_smiles: str,
    target_mol: Molecule,
    fg: FunctionalGroup,
) -> str | None:
    """Strip the functional group and keep the biggest piece left over.

    The atoms in ``fg.atoms`` and all hydrogens are discarded.  The
    remaining atoms are grouped into connected components, and the
    largest one (the first found, on ties) is serialised with
    ``_fragment_to_smiles``.  *target_smiles* is not used.

    Returns None when no non-hydrogen atom remains outside the group.
    """
    fg_set = set(fg.atoms)
    leftover = set(range(len(target_mol.atoms))) - fg_set
    heavy = {i for i in leftover if target_mol.atoms[i].symbol != "H"}
    if not heavy:
        return None

    # Depth-first flood fill restricted to the surviving heavy atoms.
    largest: set[int] = set()
    done: set[int] = set()
    for seed in heavy:
        if seed in done:
            continue
        component: set[int] = set()
        pending = [seed]
        while pending:
            atom = pending.pop()
            if atom in component:
                continue
            component.add(atom)
            pending.extend(
                nb
                for nb in target_mol.neighbors(atom)
                if nb not in component and nb in heavy
            )
        done |= component
        if len(component) > len(largest):
            largest = component

    if not largest:
        return None
    return _fragment_to_smiles(target_mol, largest)
1077
+
1078
+
1079
+ def _reagent_to_precursor(reagent_str: str) -> Precursor | None:
1080
+ """Try to match a reagent string to a purchasable material."""
1081
+ # Map common reagent names to SMILES
1082
+ reagent_map: dict[str, str] = {
1083
+ "NaOH": "O",
1084
+ "NaCN": "C#N",
1085
+ "NaN3": "CN=[N+]=[N-]",
1086
+ "NaOMe": "CO",
1087
+ "NaOEt": "CCO",
1088
+ "HBr": "Br",
1089
+ "HCl": "Cl",
1090
+ "H2O": "O",
1091
+ "MeOH": "CO",
1092
+ "EtOH": "CCO",
1093
+ "NaBH4": "O",
1094
+ "LiAlH4": "O",
1095
+ "H2": "O",
1096
+ "BH3*THF": "C1CCOC1",
1097
+ "mCPBA": "O",
1098
+ "Br2": "Br",
1099
+ "PCC": "O",
1100
+ "n-BuLi": "CCCC",
1101
+ }
1102
+ smi = reagent_map.get(reagent_str)
1103
+ if smi is not None:
1104
+ p = get_purchasable(smi)
1105
+ if p is not None:
1106
+ return p
1107
+ return None
1108
+
1109
+
1110
def _estimate_cost(smiles: str) -> float:
    """Return an estimated cost per kg for *smiles*.

    A hit in ``PURCHASABLE_MATERIALS`` wins: element ``[1]`` of the entry
    is returned.  Otherwise the string is parsed and the estimate is
    10.0 per heavy atom with a floor of 5.0.  If parsing or counting
    raises, a flat 50.0 is returned.
    """
    known = PURCHASABLE_MATERIALS.get(smiles)
    if known is not None:
        return known[1]

    # Rough estimate: $10/kg per heavy atom
    try:
        heavy_atoms = _count_heavy_atoms(parse(smiles))
        return max(5.0, heavy_atoms * 10.0)
    except Exception:
        return 50.0
1126
+
1127
+
1128
+ # =====================================================================
1129
+ # Template matching: which templates apply to a given FG?
1130
+ # =====================================================================
1131
+
1132
def _find_applicable_templates(
    fg: FunctionalGroup,
    all_fg_names: list[str],
) -> list[ReactionTemplate]:
    """Collect the reaction templates relevant to *fg*.

    Templates are gathered from up to three lookups, in this order:

      1. ``find_reactions_producing(fg.name)`` -- reactions that produce
         this group (the primary retrosynthetic match);
      2. ``lookup_by_functional_group(fg.name)`` -- reactions that
         require this group;
      3. ``lookup_by_functional_group("alkyl_halide")`` -- only when the
         group name starts with ``"alkyl_halide_"``.

    A template is kept if its name has not been seen yet and
    ``tmpl.is_compatible(all_fg_names)`` is truthy.  The result keeps
    lookup order and has no duplicate template names.
    """
    matched: list[ReactionTemplate] = []
    taken: set[str] = set()

    def _absorb(candidates) -> None:
        # Append each new, compatible template; compatibility is only
        # checked for names not already taken.
        for tmpl in candidates:
            if tmpl.name in taken or not tmpl.is_compatible(all_fg_names):
                continue
            matched.append(tmpl)
            taken.add(tmpl.name)

    _absorb(find_reactions_producing(fg.name))
    _absorb(lookup_by_functional_group(fg.name))
    if fg.name.startswith("alkyl_halide_"):
        # Specific halides also match templates filed under the generic name.
        _absorb(lookup_by_functional_group("alkyl_halide"))

    return matched
1170
+
1171
+
1172
+ # =====================================================================
1173
+ # Beam search retrosynthesis
1174
+ # =====================================================================
1175
+
1176
def _build_retro_node(
    smiles: str,
    mol: Molecule,
    depth: int,
    max_depth: int,
    beam_width: int,
    visited_smiles: set[str],
) -> RetroNode:
    """Build one node of the retrosynthesis tree.

    If the molecule is purchasable, the node is a leaf.  Otherwise,
    functional groups are detected, templates are matched, disconnections
    are scored, and the top *beam_width* disconnections are kept.  The
    best disconnection's precursors are then expanded recursively.

    Parameters
    ----------
    smiles, mol
        The molecule for this node, as SMILES text and parsed form.
    depth
        Distance of this node from the root.
    max_depth
        Nodes at this depth or deeper are not expanded.
    beam_width
        Number of scored disconnections stored on the node.
    visited_smiles
        SMILES of the nodes currently being expanded on the path from the
        root.  The set is mutated during the call and restored before
        returning, so a molecule recurring in a *sibling* branch is
        expanded again while a molecule recurring on its *own* path is
        cut off (which is what prevents infinite recursion).

    Returns
    -------
    RetroNode
        The populated node; precursors whose SMILES cannot be parsed are
        omitted from ``children``.
    """
    node = RetroNode(
        smiles=smiles,
        molecule=mol,
        depth=depth,
    )

    # Check purchasability
    if is_purchasable(smiles):
        node.is_purchasable = True
        return node

    # Detect functional groups
    fgs = detect_functional_groups(mol)
    node.functional_groups = fgs

    # Depth limit
    if depth >= max_depth:
        return node

    # Prevent infinite loops: refuse to re-expand a molecule that is
    # already being expanded higher up on this path.
    if smiles in visited_smiles:
        return node
    visited_smiles.add(smiles)

    try:
        # Collect all FG names for compatibility check
        all_fg_names = [fg.name for fg in fgs]

        # Generate disconnections; each template is tried once, with the
        # first functional group that offers it.
        disconnections: list[Disconnection] = []
        seen_templates: set[str] = set()
        for fg in fgs:
            for tmpl in _find_applicable_templates(fg, all_fg_names):
                if tmpl.name in seen_templates:
                    continue
                seen_templates.add(tmpl.name)

                precursors = _generate_precursors_for_template(
                    smiles, mol, tmpl, fg)
                if not precursors:
                    continue

                score = score_disconnection(tmpl, precursors, mol)
                disconnections.append(Disconnection(
                    template=tmpl, precursors=precursors, score=score))

        # Sort by score (highest first) and keep top beam_width
        disconnections.sort(key=lambda d: d.score, reverse=True)
        node.disconnections = disconnections[:beam_width]

        if node.disconnections:
            node.best_disconnection = node.disconnections[0]

            # Only the best disconnection is expanded further.
            for precursor in node.best_disconnection.precursors:
                purchasable = is_purchasable(precursor.smiles)
                child_mol = _safe_parse(precursor.smiles)
                if child_mol is None:
                    continue
                if purchasable:
                    child_node = RetroNode(
                        smiles=precursor.smiles,
                        molecule=child_mol,
                        is_purchasable=True,
                        depth=depth + 1,
                    )
                else:
                    child_node = _build_retro_node(
                        precursor.smiles, child_mol,
                        depth + 1, max_depth, beam_width,
                        visited_smiles,
                    )
                node.children.append(child_node)
    finally:
        # Backtrack: this molecule is no longer on the active path.
        visited_smiles.discard(smiles)

    return node
1271
+
1272
+
1273
def _safe_parse(smiles: str) -> Molecule | None:
    """Return ``parse(smiles)``, or None if ``parse`` raises."""
    try:
        parsed = parse(smiles)
    except Exception:
        parsed = None
    return parsed
1279
+
1280
+
1281
+ def _count_routes(node: RetroNode) -> int:
1282
+ """Count complete routes (paths from root to all-purchasable leaves)."""
1283
+ if node.is_purchasable:
1284
+ return 1
1285
+ if not node.children:
1286
+ return 0
1287
+ # A route is complete when all children are resolved
1288
+ child_counts = [_count_routes(c) for c in node.children]
1289
+ if all(c > 0 for c in child_counts):
1290
+ # Multiply: each combination of child routes is a complete route
1291
+ product = 1
1292
+ for c in child_counts:
1293
+ product *= c
1294
+ return product
1295
+ return 0
1296
+
1297
+
1298
def retrosynthesis(
    mol: Molecule,
    max_depth: int = 8,
    beam_width: int = 5,
) -> RetrosynthesisTree:
    """Perform retrosynthetic analysis on a target molecule.

    The target is converted to SMILES with ``to_smiles`` and handed to
    ``_build_retro_node`` at depth 0 with an empty visited set; that call
    builds the whole tree recursively.  The number of complete routes is
    then obtained from ``_count_routes`` and everything is wrapped in a
    ``RetrosynthesisTree``.

    Parameters
    ----------
    mol : Molecule
        The target molecule to analyse.
    max_depth : int
        Maximum number of retrosynthetic steps to explore.
    beam_width : int
        Number of disconnections to keep at each level.

    Returns
    -------
    RetrosynthesisTree
        The tree rooted at the target, together with the search settings
        and the number of complete routes found.
    """
    root = _build_retro_node(
        to_smiles(mol), mol, 0, max_depth, beam_width, set())

    return RetrosynthesisTree(
        target=root,
        max_depth=max_depth,
        beam_width=beam_width,
        routes_found=_count_routes(root),
    )
1347
+
1348
+
1349
+ # =====================================================================
1350
+ # Tree formatting (ASCII text)
1351
+ # =====================================================================
1352
+
1353
+ def _format_node(node: RetroNode, indent: str, is_last: bool,
1354
+ lines: list[str]) -> None:
1355
+ """Recursively format a node and its children as an ASCII tree."""
1356
+ connector = "`-- " if is_last else "|-- "
1357
+ status = ""
1358
+ if node.is_purchasable:
1359
+ entry = PURCHASABLE_MATERIALS.get(node.smiles)
1360
+ name = entry[0] if entry else "purchasable"
1361
+ status = f" [AVAILABLE: {name}]"
1362
+ elif node.best_disconnection:
1363
+ tmpl_name = node.best_disconnection.template.name
1364
+ score = node.best_disconnection.score
1365
+ status = f" <-- {tmpl_name} (score={score:.1f})"
1366
+ else:
1367
+ if node.depth > 0:
1368
+ status = " [no route found]"
1369
+
1370
+ lines.append(f"{indent}{connector}{node.smiles}{status}")
1371
+
1372
+ # Continuation indent for children
1373
+ child_indent = indent + (" " if is_last else "| ")
1374
+
1375
+ # Show functional groups at the root
1376
+ if node.depth == 0 and node.functional_groups:
1377
+ fg_names = ", ".join(fg.name for fg in node.functional_groups)
1378
+ lines.append(f"{child_indent}FGs: {fg_names}")
1379
+
1380
+ # Show alternative disconnections (briefly)
1381
+ if node.disconnections and len(node.disconnections) > 1:
1382
+ lines.append(f"{child_indent}({len(node.disconnections)} "
1383
+ f"disconnection(s) evaluated)")
1384
+
1385
+ # Recurse into children
1386
+ for i, child in enumerate(node.children):
1387
+ is_last_child = (i == len(node.children) - 1)
1388
+ _format_node(child, child_indent, is_last_child, lines)
1389
+
1390
+
1391
def format_tree(tree: RetrosynthesisTree) -> str:
    """Format a RetrosynthesisTree as an ASCII text diagram.

    Parameters
    ----------
    tree : RetrosynthesisTree
        The retrosynthesis tree to format.

    Returns
    -------
    str
        Multi-line ASCII text: a title, a rule of 58 ``=`` characters,
        the target SMILES, a line with max depth / beam width / routes
        found, a second rule, and then the tree itself as produced by
        ``_format_node`` starting from ``tree.target``.
    """
    rule = "=" * 58
    lines: list[str] = [
        "Retrosynthetic Analysis",
        rule,
        f"Target: {tree.target.smiles}",
        (
            f"Max depth: {tree.max_depth} "
            f"Beam width: {tree.beam_width} "
            f"Routes found: {tree.routes_found}"
        ),
        rule,
    ]

    # The root is rendered as the last (only) entry at the top level.
    _format_node(tree.target, "", True, lines)

    return "\n".join(lines)