molbuilder 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- molbuilder/__init__.py +8 -0
- molbuilder/__main__.py +6 -0
- molbuilder/atomic/__init__.py +4 -0
- molbuilder/atomic/bohr.py +235 -0
- molbuilder/atomic/quantum_atom.py +334 -0
- molbuilder/atomic/quantum_numbers.py +196 -0
- molbuilder/atomic/wavefunctions.py +297 -0
- molbuilder/bonding/__init__.py +4 -0
- molbuilder/bonding/covalent.py +442 -0
- molbuilder/bonding/lewis.py +347 -0
- molbuilder/bonding/vsepr.py +433 -0
- molbuilder/cli/__init__.py +1 -0
- molbuilder/cli/demos.py +516 -0
- molbuilder/cli/menu.py +127 -0
- molbuilder/cli/wizard.py +831 -0
- molbuilder/core/__init__.py +6 -0
- molbuilder/core/bond_data.py +170 -0
- molbuilder/core/constants.py +51 -0
- molbuilder/core/element_properties.py +183 -0
- molbuilder/core/elements.py +181 -0
- molbuilder/core/geometry.py +232 -0
- molbuilder/gui/__init__.py +2 -0
- molbuilder/gui/app.py +286 -0
- molbuilder/gui/canvas3d.py +115 -0
- molbuilder/gui/dialogs.py +117 -0
- molbuilder/gui/event_handler.py +118 -0
- molbuilder/gui/sidebar.py +105 -0
- molbuilder/gui/toolbar.py +71 -0
- molbuilder/io/__init__.py +1 -0
- molbuilder/io/json_io.py +146 -0
- molbuilder/io/mol_sdf.py +169 -0
- molbuilder/io/pdb.py +184 -0
- molbuilder/io/smiles_io.py +47 -0
- molbuilder/io/xyz.py +103 -0
- molbuilder/molecule/__init__.py +2 -0
- molbuilder/molecule/amino_acids.py +919 -0
- molbuilder/molecule/builders.py +257 -0
- molbuilder/molecule/conformations.py +70 -0
- molbuilder/molecule/functional_groups.py +484 -0
- molbuilder/molecule/graph.py +712 -0
- molbuilder/molecule/peptides.py +13 -0
- molbuilder/molecule/stereochemistry.py +6 -0
- molbuilder/process/__init__.py +3 -0
- molbuilder/process/conditions.py +260 -0
- molbuilder/process/costing.py +316 -0
- molbuilder/process/purification.py +285 -0
- molbuilder/process/reactor.py +297 -0
- molbuilder/process/safety.py +476 -0
- molbuilder/process/scale_up.py +427 -0
- molbuilder/process/solvent_systems.py +204 -0
- molbuilder/reactions/__init__.py +3 -0
- molbuilder/reactions/functional_group_detect.py +728 -0
- molbuilder/reactions/knowledge_base.py +1716 -0
- molbuilder/reactions/reaction_types.py +102 -0
- molbuilder/reactions/reagent_data.py +1248 -0
- molbuilder/reactions/retrosynthesis.py +1430 -0
- molbuilder/reactions/synthesis_route.py +377 -0
- molbuilder/reports/__init__.py +158 -0
- molbuilder/reports/cost_report.py +206 -0
- molbuilder/reports/molecule_report.py +279 -0
- molbuilder/reports/safety_report.py +296 -0
- molbuilder/reports/synthesis_report.py +283 -0
- molbuilder/reports/text_formatter.py +170 -0
- molbuilder/smiles/__init__.py +4 -0
- molbuilder/smiles/parser.py +487 -0
- molbuilder/smiles/tokenizer.py +291 -0
- molbuilder/smiles/writer.py +375 -0
- molbuilder/visualization/__init__.py +1 -0
- molbuilder/visualization/bohr_viz.py +166 -0
- molbuilder/visualization/molecule_viz.py +368 -0
- molbuilder/visualization/quantum_viz.py +434 -0
- molbuilder/visualization/theme.py +12 -0
- molbuilder-1.0.0.dist-info/METADATA +360 -0
- molbuilder-1.0.0.dist-info/RECORD +78 -0
- molbuilder-1.0.0.dist-info/WHEEL +5 -0
- molbuilder-1.0.0.dist-info/entry_points.txt +2 -0
- molbuilder-1.0.0.dist-info/licenses/LICENSE +21 -0
- molbuilder-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,728 @@
|
|
|
1
|
+
"""Functional group detection via subgraph pattern matching.
|
|
2
|
+
|
|
3
|
+
This module walks the atoms and bonds of a ``Molecule`` object and
|
|
4
|
+
identifies common organic functional groups by inspecting the local
|
|
5
|
+
neighbourhood of each atom. The detection is heuristic (based on
|
|
6
|
+
connectivity and bond order) rather than relying on SMARTS matching
|
|
7
|
+
so that it works with the graph representation already available in
|
|
8
|
+
the ``molbuilder.molecule.graph`` module.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from molbuilder.molecule.graph import Molecule, Hybridization
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# =====================================================================
|
|
17
|
+
# Data class
|
|
18
|
+
# =====================================================================
|
|
19
|
+
|
|
20
|
+
@dataclass
class FunctionalGroup:
    """One occurrence of a functional group found in a molecule.

    Attributes
    ----------
    name : str
        Human-readable group name (e.g. ``"alcohol"``, ``"ketone"``).
    smarts_like : str
        Simplified, display-only pattern string for the group.
    atoms : list[int]
        Indices of every atom taking part in the group.  Each instance
        gets its own fresh empty list by default.
    center : int
        Index of the primary (most characteristic) atom; ``-1`` when
        it has not been set.
    """

    name: str
    smarts_like: str
    atoms: list[int] = field(default_factory=list)
    center: int = -1

    def __repr__(self) -> str:
        # Compact form: only the name and the centre atom are shown.
        return "FunctionalGroup({!r}, center={})".format(self.name, self.center)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# =====================================================================
|
|
45
|
+
# Helper utilities
|
|
46
|
+
# =====================================================================
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Allowed valences per element, used to infer implicit hydrogens.
# Described as the same data as smiles.tokenizer.DEFAULT_VALENCE (with
# upper-cased keys); it is repeated here so that functional-group
# detection does not depend on the SMILES subpackage.
_STANDARD_VALENCE: dict[str, list[int]] = {
    "B": [3],
    "C": [4],
    "N": [3, 5],
    "O": [2],
    "P": [3, 5],
    "S": [2, 4, 6],
    "F": [1],
    "Cl": [1],
    "Br": [1],
    "I": [1],
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _element(mol: Molecule, idx: int) -> str:
    """Return the element symbol stored on atom *idx* of *mol*, unchanged."""
    atom = mol.atoms[idx]
    return atom.symbol
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _neighbors(mol: Molecule, idx: int) -> list[int]:
    """Return the indices of the atoms bonded to atom *idx*.

    Thin wrapper that delegates to ``mol.neighbors``.
    """
    bonded = mol.neighbors(idx)
    return bonded
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _neighbor_elements(mol: Molecule, idx: int) -> list[str]:
    """Return the element symbol of each neighbour of atom *idx*, in neighbour order."""
    symbols: list[str] = []
    for nbr in _neighbors(mol, idx):
        symbols.append(_element(mol, nbr))
    return symbols
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _bond_order(mol: Molecule, i: int, j: int) -> float:
    """Return the order of the bond joining atoms *i* and *j*.

    ``0.0`` is returned when ``mol.get_bond`` yields ``None``, i.e. the
    two atoms are not bonded.
    """
    bond = mol.get_bond(i, j)
    if bond is None:
        return 0.0
    return bond.order
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _sum_bond_orders(mol: Molecule, idx: int) -> int:
    """Return the total bond order over all bonds on atom *idx*.

    The individual orders are added up as floats and the total is
    truncated to an ``int`` once, at the end.  Truncating every bond
    separately would count a fractional bond (e.g. an aromatic bond of
    order 1.5) as a plain single bond, so an atom with two such bonds
    would appear to use 2 units of valence instead of 3 and
    ``_h_count`` would infer one hydrogen too many.

    Returns ``0`` for an atom without neighbours.
    """
    total = sum(_bond_order(mol, idx, nbr) for nbr in _neighbors(mol, idx))
    return int(total)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _h_count(mol: Molecule, idx: int) -> int:
    """Return the number of hydrogens on atom *idx*.

    Explicit ``H`` neighbours are counted first.  If there is at least
    one, that count is returned as-is and no implicit hydrogens are
    added on top.

    Otherwise the count is inferred from ``_STANDARD_VALENCE``: the
    first listed valence that is not smaller than the atom's current
    bond-order sum is chosen, and the unused remainder is taken to be
    implicit hydrogens.  This lets detection work on graphs that carry
    no hydrogen atoms at all.

    Returns ``0`` when the element has no entry in the valence table
    or when the bond-order sum exceeds every listed valence.
    """
    explicit = _neighbor_elements(mol, idx).count("H")
    if explicit:
        return explicit

    # No explicit hydrogen: fall back to the valence table.
    allowed = _STANDARD_VALENCE.get(_element(mol, idx))
    if allowed is None:
        return 0
    used = _sum_bond_orders(mol, idx)
    # Valences are tried in table order, so the first fit wins.
    return next((v - used for v in allowed if v >= used), 0)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _has_h(mol: Molecule, idx: int) -> bool:
    """Tell whether atom *idx* carries a hydrogen, explicit or implicit (see ``_h_count``)."""
    hydrogens = _h_count(mol, idx)
    return hydrogens >= 1
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _count_element_neighbors(mol: Molecule, idx: int, elem: str) -> int:
    """Return how many neighbours of atom *idx* are of element *elem* (any bond order)."""
    return _neighbor_elements(mol, idx).count(elem)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _double_bonded_to(mol: Molecule, idx: int, elem: str) -> list[int]:
    """List the neighbours of *idx* that are element *elem* and joined by a bond of order exactly 2.0.

    The result keeps neighbour order and is empty when nothing matches.
    """
    return [
        nbr
        for nbr in _neighbors(mol, idx)
        if _element(mol, nbr) == elem and _bond_order(mol, idx, nbr) == 2.0
    ]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _single_bonded_to(mol: Molecule, idx: int, elem: str) -> list[int]:
    """List the neighbours of *idx* that are element *elem* and joined by a bond of order exactly 1.0.

    The result keeps neighbour order and is empty when nothing matches.
    """
    return [
        nbr
        for nbr in _neighbors(mol, idx)
        if _element(mol, nbr) == elem and _bond_order(mol, idx, nbr) == 1.0
    ]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _triple_bonded_to(mol: Molecule, idx: int, elem: str) -> list[int]:
    """List the neighbours of *idx* that are element *elem* and joined by a bond of order exactly 3.0.

    The result keeps neighbour order and is empty when nothing matches.
    """
    return [
        nbr
        for nbr in _neighbors(mol, idx)
        if _element(mol, nbr) == elem and _bond_order(mol, idx, nbr) == 3.0
    ]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# =====================================================================
|
|
153
|
+
# Master dispatcher
|
|
154
|
+
# =====================================================================
|
|
155
|
+
|
|
156
|
+
def detect_functional_groups(mol: Molecule) -> list[FunctionalGroup]:
    """Run every individual detector on *mol* and collect the results.

    Returns a flat list with one ``FunctionalGroup`` per occurrence,
    grouped by detector in the fixed order listed below.  The
    detectors run independently of each other, so the same atom can
    show up in more than one reported group.
    """
    detectors = (
        _detect_carboxylic_acids,
        _detect_esters,
        _detect_amides,
        _detect_aldehydes,
        _detect_ketones,
        _detect_alcohols,
        _detect_amines,
        _detect_alkyl_halides,
        _detect_alkenes,
        _detect_alkynes,
        _detect_ethers,
        _detect_thiols,
        _detect_nitriles,
        _detect_nitro,
        _detect_aromatic_rings,
        _detect_epoxides,
        _detect_acid_chlorides,
        _detect_anhydrides,
        _detect_sulfoxides,
        _detect_sulfones,
        _detect_imines,
    )
    groups: list[FunctionalGroup] = []
    for detect in detectors:
        groups += detect(mol)
    return groups
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# =====================================================================
|
|
189
|
+
# Individual detectors
|
|
190
|
+
# =====================================================================
|
|
191
|
+
|
|
192
|
+
def _detect_alcohols(mol: Molecule) -> list[FunctionalGroup]:
    """Alcohol: hydroxyl O single-bonded to a non-carbonyl carbon.

    An oxygen is reported when it

    * is single-bonded to at least one carbon,
    * none of those carbons is itself double-bonded to an oxygen, and
    * it carries a hydrogen -- either an explicit ``H`` neighbour or an
      implicit one inferred by ``_h_count``.

    The second rule keeps the ``-OH`` of a carboxylic acid from being
    reported a second time as an alcohol, in the same way that
    ``_detect_ethers`` skips ester oxygens and ``_detect_amines`` skips
    amide nitrogens.

    Returns one ``FunctionalGroup`` per matching oxygen, centred on the
    oxygen.  ``atoms`` is ``[carbon, oxygen, hydrogen]`` when the
    hydrogen is an explicit atom and ``[carbon, oxygen]`` when it is
    implicit; the carbon is the first single-bonded one in neighbour
    order.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "O":
            continue

        carbons = _single_bonded_to(mol, idx, "C")
        if not carbons:
            continue
        # A hydroxyl sitting on a carbonyl carbon is a carboxylic acid.
        if any(_double_bonded_to(mol, c, "O") for c in carbons):
            continue

        explicit_h = [n for n in _neighbors(mol, idx) if _element(mol, n) == "H"]
        if not explicit_h and _h_count(mol, idx) < 1:
            continue  # no hydrogen on this oxygen (e.g. an ether O)

        # An implicit hydrogen has no atom index, so only an explicit
        # one is appended to the atom list.
        found.append(FunctionalGroup(
            name="alcohol", smarts_like="[C]-[OH]",
            atoms=[carbons[0], idx] + explicit_h[:1], center=idx,
        ))
    return found
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _detect_aldehydes(mol: Molecule) -> list[FunctionalGroup]:
    """Aldehyde: a carbon that is double-bonded to O and carries a hydrogen.

    The hydrogen may be explicit or implicit (``_has_h``).  The other
    substituents on the carbon are not inspected.  One group is
    reported per matching carbon, with ``atoms = [C, O]`` and the
    carbon as centre.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol == "C":
            carbonyl_os = _double_bonded_to(mol, idx, "O")
            if carbonyl_os and _has_h(mol, idx):
                found.append(FunctionalGroup(
                    name="aldehyde", smarts_like="[CX3H1](=O)",
                    atoms=[idx, carbonyl_os[0]], center=idx,
                ))
    return found
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _detect_ketones(mol: Molecule) -> list[FunctionalGroup]:
    """Ketone: C=O whose carbon has no hydrogen and at least two single-bonded carbons.

    One group is reported per matching carbonyl carbon, with
    ``atoms = [C, O, C', C'']`` (the first two carbon neighbours) and
    the carbonyl carbon as centre.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        carbonyl_os = _double_bonded_to(mol, idx, "O")
        # A hydrogen on the carbonyl carbon would make it an aldehyde.
        if not carbonyl_os or _has_h(mol, idx):
            continue
        carbons = _single_bonded_to(mol, idx, "C")
        if len(carbons) < 2:
            continue
        found.append(FunctionalGroup(
            name="ketone", smarts_like="[CX3](=O)([C])[C]",
            atoms=[idx, carbonyl_os[0]] + carbons[:2], center=idx,
        ))
    return found
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _detect_carboxylic_acids(mol: Molecule) -> list[FunctionalGroup]:
    """Carboxylic acid: a carbon with both a C=O and a single-bonded O that carries H.

    At most one group is reported per carbon, using the first
    single-bonded oxygen that has a hydrogen (explicit or implicit).
    ``atoms = [C, O(carbonyl), O(hydroxyl)]``, centred on the carbon.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        carbonyl_os = _double_bonded_to(mol, idx, "O")
        single_os = _single_bonded_to(mol, idx, "O")
        if not (carbonyl_os and single_os):
            continue
        # First single-bonded oxygen bearing a hydrogen, if any.
        hydroxyl = next((o for o in single_os if _has_h(mol, o)), None)
        if hydroxyl is not None:
            found.append(FunctionalGroup(
                name="carboxylic_acid",
                smarts_like="[CX3](=O)[OH]",
                atoms=[idx, carbonyl_os[0], hydroxyl], center=idx,
            ))
    return found
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _detect_esters(mol: Molecule) -> list[FunctionalGroup]:
    """Ester: C(=O)-O-C where the bridging oxygen carries no hydrogen.

    For every carbon that has both a C=O and a single-bonded O, the
    single-bonded oxygens are tried in order; the first one that has no
    hydrogen and is single-bonded to a further carbon yields one group.
    ``atoms = [C(carbonyl), O(carbonyl), O(bridge), C(other)]``,
    centred on the carbonyl carbon.  Whether that further carbon is
    itself a carbonyl carbon is not checked here.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        carbonyl_os = _double_bonded_to(mol, idx, "O")
        single_os = _single_bonded_to(mol, idx, "O")
        if not (carbonyl_os and single_os):
            continue
        for o_idx in single_os:
            if _has_h(mol, o_idx):
                continue
            # The bridging oxygen must lead on to a different carbon.
            far_carbons = [c for c in _single_bonded_to(mol, o_idx, "C") if c != idx]
            if far_carbons:
                found.append(FunctionalGroup(
                    name="ester",
                    smarts_like="[CX3](=O)[O][C]",
                    atoms=[idx, carbonyl_os[0], o_idx, far_carbons[0]],
                    center=idx,
                ))
                break
    return found
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _detect_amides(mol: Molecule) -> list[FunctionalGroup]:
    """Amide: a carbon with a C=O and a single-bonded N.

    One group is reported per matching carbon, using its first
    double-bonded O and first single-bonded N:
    ``atoms = [C, O, N]``, centred on the carbon.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol == "C":
            carbonyl_os = _double_bonded_to(mol, idx, "O")
            nitrogens = _single_bonded_to(mol, idx, "N")
            if carbonyl_os and nitrogens:
                found.append(FunctionalGroup(
                    name="amide", smarts_like="[CX3](=O)[NX3]",
                    atoms=[idx, carbonyl_os[0], nitrogens[0]], center=idx,
                ))
    return found
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _detect_amines(mol: Molecule) -> list[FunctionalGroup]:
    """Primary, secondary and tertiary amines (amide nitrogens excluded).

    A nitrogen is skipped when any carbon single-bonded to it has a
    C=O.  Otherwise it is classified from its number of carbon
    neighbours (any bond order) and its hydrogen count (``_h_count``):

    * 1 carbon, 2 hydrogens -> ``primary_amine``
    * 2 carbons, 1 hydrogen -> ``secondary_amine``
    * 3 carbons, 0 hydrogens -> ``tertiary_amine``

    Any other combination is not reported.  ``atoms`` holds the
    nitrogen followed by its single-bonded carbons; the nitrogen is the
    centre.
    """
    # (carbon neighbours, hydrogens) -> (group name, display pattern)
    classes = {
        (1, 2): ("primary_amine", "[NX3H2][C]"),
        (2, 1): ("secondary_amine", "[NX3H1]([C])[C]"),
        (3, 0): ("tertiary_amine", "[NX3]([C])([C])[C]"),
    }
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "N":
            continue
        carbons = _single_bonded_to(mol, idx, "C")
        # Nitrogen attached to a carbonyl carbon is an amide nitrogen.
        if any(_double_bonded_to(mol, c, "O") for c in carbons):
            continue

        key = (_count_element_neighbors(mol, idx, "C"), _h_count(mol, idx))
        match = classes.get(key)
        if match is None:
            continue
        group_name, pattern = match
        found.append(FunctionalGroup(
            name=group_name, smarts_like=pattern,
            atoms=[idx] + carbons,
            center=idx,
        ))
    return found
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _detect_alkyl_halides(mol: Molecule) -> list[FunctionalGroup]:
    """Alkyl halide: a carbon single-bonded to F, Cl, Br or I.

    One group is reported per carbon-halogen bond, named
    ``alkyl_halide_<symbol in lower case>``, with ``atoms = [C, X]``
    and the carbon as centre.  The kind of carbon is not checked.
    """
    halogens = {"F", "Cl", "Br", "I"}
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        for nbr in _neighbors(mol, idx):
            symbol = _element(mol, nbr)
            if symbol not in halogens:
                continue
            if _bond_order(mol, idx, nbr) != 1.0:
                continue
            found.append(FunctionalGroup(
                name=f"alkyl_halide_{symbol.lower()}",
                smarts_like=f"[C][{symbol}]",
                atoms=[idx, nbr], center=idx,
            ))
    return found
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _detect_alkenes(mol: Molecule) -> list[FunctionalGroup]:
    """Alkene: a C=C double bond.

    Each bond is reported once, as ``atoms = [low, high]`` (atom
    indices in ascending order) with the lower index as centre.
    """
    found: list[FunctionalGroup] = []
    reported: set[tuple[int, int]] = set()
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        for partner in _double_bonded_to(mol, idx, "C"):
            # The bond is met from both ends; key it on the sorted pair.
            bond = (idx, partner) if idx <= partner else (partner, idx)
            if bond in reported:
                continue
            reported.add(bond)
            found.append(FunctionalGroup(
                name="alkene", smarts_like="[C]=[C]",
                atoms=list(bond), center=bond[0],
            ))
    return found
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _detect_alkynes(mol: Molecule) -> list[FunctionalGroup]:
    """Alkyne: a C#C triple bond.

    Each bond is reported once, as ``atoms = [low, high]`` (atom
    indices in ascending order) with the lower index as centre.
    """
    found: list[FunctionalGroup] = []
    reported: set[tuple[int, int]] = set()
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        for partner in _triple_bonded_to(mol, idx, "C"):
            # The bond is met from both ends; key it on the sorted pair.
            bond = (idx, partner) if idx <= partner else (partner, idx)
            if bond in reported:
                continue
            reported.add(bond)
            found.append(FunctionalGroup(
                name="alkyne", smarts_like="[C]#[C]",
                atoms=list(bond), center=bond[0],
            ))
    return found
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _detect_ethers(mol: Molecule) -> list[FunctionalGroup]:
    """Ether: an oxygen single-bonded to exactly two carbons, neither of which has a C=O.

    Oxygens next to a carbonyl carbon are left out because that
    linkage is an ester.  ``atoms = [C, O, C]``, centred on the oxygen.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "O":
            continue
        carbons = _single_bonded_to(mol, idx, "C")
        if len(carbons) != 2:
            continue
        first, second = carbons
        # any() stops at the first carbonyl carbon, as a short-circuit would.
        if any(_double_bonded_to(mol, c, "O") for c in (first, second)):
            continue
        found.append(FunctionalGroup(
            name="ether", smarts_like="[C]-[O]-[C]",
            atoms=[first, idx, second], center=idx,
        ))
    return found
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _detect_thiols(mol: Molecule) -> list[FunctionalGroup]:
    """Thiol: a sulfur with a carbon neighbour and a hydrogen (explicit or implicit).

    The order of the S-C bond is not checked.  One group is reported
    per matching sulfur, with ``atoms = [C, S]`` (first carbon
    neighbour) and the sulfur as centre.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "S":
            continue
        carbons = [n for n in _neighbors(mol, idx) if _element(mol, n) == "C"]
        if carbons and _has_h(mol, idx):
            found.append(FunctionalGroup(
                name="thiol", smarts_like="[C]-[SH]",
                atoms=[carbons[0], idx], center=idx,
            ))
    return found
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def _detect_nitriles(mol: Molecule) -> list[FunctionalGroup]:
    """Nitrile: a C#N triple bond.

    One group is reported per triple-bonded nitrogen of each carbon,
    with ``atoms = [C, N]`` and the carbon as centre.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol == "C":
            found.extend(
                FunctionalGroup(
                    name="nitrile", smarts_like="[C]#[N]",
                    atoms=[idx, n_idx], center=idx,
                )
                for n_idx in _triple_bonded_to(mol, idx, "N")
            )
    return found
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _detect_nitro(mol: Molecule) -> list[FunctionalGroup]:
    """Nitro group: a nitrogen bonded to at least two oxygens, at least one of them as N=O.

    Only single- and double-bonded oxygens are counted.  ``atoms``
    holds the nitrogen, then its double-bonded oxygens, then its
    single-bonded oxygens; the nitrogen is the centre.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "N":
            continue
        double_os = _double_bonded_to(mol, idx, "O")
        single_os = _single_bonded_to(mol, idx, "O")
        oxygens = double_os + single_os
        if double_os and len(oxygens) >= 2:
            found.append(FunctionalGroup(
                name="nitro", smarts_like="[N](=O)[O]",
                atoms=[idx] + oxygens, center=idx,
            ))
    return found
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def _detect_aromatic_rings(mol: Molecule) -> list[FunctionalGroup]:
    """Simplified aromatic ring detection for 5- and 6-membered rings.

    For every C, N, O or S atom, ``_find_rings_of_size`` enumerates the
    six-membered and then the five-membered rings through that atom.
    Each distinct ring (compared via ``_canonicalise_ring``) is judged
    once by ``_ring_is_aromatic``.

    Every aromatic ring yields one group whose ``atoms`` follow the
    traversal order in which the ring was first found, centred on the
    first atom of that traversal.  ``smarts_like`` is always
    ``"c1ccccc1"``, whatever the ring size or composition.
    """
    ring_elements = ("C", "N", "O", "S")
    found: list[FunctionalGroup] = []
    judged: set[tuple[int, ...]] = set()

    for start in range(len(mol.atoms)):
        if _element(mol, start) not in ring_elements:
            continue
        # Six-membered candidates first, then five-membered ones.
        candidates = _find_rings_of_size(mol, start, 6) + _find_rings_of_size(mol, start, 5)
        for ring in candidates:
            key = _canonicalise_ring(ring)
            if key in judged:
                continue
            judged.add(key)
            if not _ring_is_aromatic(mol, ring):
                continue
            found.append(FunctionalGroup(
                name="aromatic_ring",
                smarts_like="c1ccccc1",
                atoms=list(ring), center=ring[0],
            ))
    return found
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def _find_rings_of_size(mol: Molecule, start: int, size: int) -> list[tuple[int, ...]]:
    """Return all simple rings of exactly *size* atoms that include *start*.

    Uses an iterative, depth-limited DFS over simple paths beginning at
    *start*.  To keep the cost manageable the search only steps onto
    C, N, O and S atoms; *start* itself is not checked, that is left to
    the caller.

    Each ring is returned as a tuple of atom indices in traversal
    order, beginning with *start*.  A ring is found once per traversal
    direction, so callers that need unique rings must de-duplicate
    (e.g. with ``_canonicalise_ring``).

    A ring needs at least three atoms, so ``size < 3`` returns an empty
    list.  Without that guard a size of 2 would report plain bonds as
    rings, and a size of 0 or less would never hit the depth limit and
    walk every simple path in the molecule.
    """
    if size < 3:
        return []

    results: list[tuple[int, ...]] = []
    allowed = {"C", "N", "O", "S"}
    # stack entries: (current_atom, path_so_far)
    stack: list[tuple[int, list[int]]] = [(start, [start])]
    while stack:
        current, path = stack.pop()
        if len(path) == size:
            # Depth limit reached: keep the path only if it closes on start.
            if start in _neighbors(mol, current):
                results.append(tuple(path))
            continue
        for nbr in _neighbors(mol, current):
            # "in path" also rejects start, which rules out early closure
            # into a ring smaller than *size*.
            if nbr in path or _element(mol, nbr) not in allowed:
                continue
            stack.append((nbr, path + [nbr]))
    return results
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def _canonicalise_ring(ring: tuple[int, ...]) -> tuple[int, ...]:
    """Return a canonical form of *ring* that is the same for every rotation and reflection.

    The ring is rotated so that its smallest index comes first, then
    read in both directions; the lexicographically smaller reading is
    returned.
    """
    pivot = ring.index(min(ring))
    clockwise = ring[pivot:] + ring[:pivot]
    # Same starting atom, walking the ring the other way round.
    counter = (ring[pivot],) + ring[:pivot][::-1] + ring[pivot + 1:][::-1]
    return clockwise if clockwise <= counter else counter
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def _ring_is_aromatic(mol: Molecule, ring: tuple[int, ...]) -> bool:
    """Heuristically decide whether *ring* is aromatic.

    *ring* lists the atom indices in ring order; the bond between the
    last and the first atom closes the ring.  The ring counts as
    aromatic when any one of these holds:

    - every ring bond has order >= 1.5 (explicit aromatic annotation);
    - every ring bond has order 1.0 or 2.0 and no two consecutive
      bonds, going round the ring, have the same order (alternating
      single/double);
    - every ring atom has ``Hybridization.SP2``.  Per the original
      note this is meant to catch aromatic SMILES input whose bonds
      are stored as order 1 (furan, thiophene, pyrrole, ...) --
      NOTE(review): parser behaviour not visible here, confirm.
    """
    size = len(ring)
    orders = [_bond_order(mol, ring[i], ring[(i + 1) % size]) for i in range(size)]

    # Rule 1: aromatic-annotated bonds throughout.
    if all(order >= 1.5 for order in orders):
        return True

    # Rule 2: strictly alternating single/double bonds.
    if all(order in (1.0, 2.0) for order in orders):
        following = orders[1:] + orders[:1]
        if all(this != nxt for this, nxt in zip(orders, following)):
            return True

    # Rule 3: every atom SP2-hybridized.
    for idx in ring:
        if mol.atoms[idx].hybridization != Hybridization.SP2:
            return False
    return True
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def _detect_epoxides(mol: Molecule) -> list[FunctionalGroup]:
    """Epoxide: a three-membered ring made of one O and two C atoms.

    For every oxygen single-bonded to at least two carbons, each pair
    of those carbons is checked for a bond of any order between them.
    A bonded pair closes the ring and yields one group with
    ``atoms = [C, O, C]``, centred on the oxygen.
    """
    found: list[FunctionalGroup] = []
    reported: set[tuple[int, ...]] = set()

    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "O":
            continue
        carbons = _single_bonded_to(mol, idx, "C")
        if len(carbons) < 2:
            continue
        # Walk every unordered pair of carbon neighbours.
        for pos, c1 in enumerate(carbons):
            for c2 in carbons[pos + 1:]:
                if not _bond_order(mol, c1, c2) > 0:
                    continue
                key = _canonicalise_ring((idx, c1, c2))
                if key in reported:
                    continue
                reported.add(key)
                found.append(FunctionalGroup(
                    name="epoxide",
                    smarts_like="C1OC1",
                    atoms=[c1, idx, c2], center=idx,
                ))
    return found
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def _detect_acid_chlorides(mol: Molecule) -> list[FunctionalGroup]:
    """Acid chloride (acyl chloride): a carbon with a C=O and a single-bonded Cl.

    One group is reported per matching carbon, with
    ``atoms = [C, O, Cl]`` (first match of each) and the carbon as
    centre.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol == "C":
            carbonyl_os = _double_bonded_to(mol, idx, "O")
            chlorines = _single_bonded_to(mol, idx, "Cl")
            if carbonyl_os and chlorines:
                found.append(FunctionalGroup(
                    name="acid_chloride", smarts_like="[CX3](=O)[Cl]",
                    atoms=[idx, carbonyl_os[0], chlorines[0]], center=idx,
                ))
    return found
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def _detect_anhydrides(mol: Molecule) -> list[FunctionalGroup]:
    """Acid anhydride: C(=O)-O-C(=O).

    An oxygen qualifies when at least two of its single-bonded carbons
    each carry a C=O.  The first two such carbons are used:
    ``atoms = [C1, O1, O(bridge), C2, O2]``, centred on the bridging
    oxygen.
    """
    found: list[FunctionalGroup] = []
    handled: set[int] = set()
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "O" or idx in handled:
            continue
        carbons = _single_bonded_to(mol, idx, "C")
        if len(carbons) < 2:
            continue
        # Keep only the carbons that are carbonyl carbons.
        acyl = [c for c in carbons if _double_bonded_to(mol, c, "O")]
        if len(acyl) < 2:
            continue
        c1, c2 = acyl[:2]
        o1 = _double_bonded_to(mol, c1, "O")[0]
        o2 = _double_bonded_to(mol, c2, "O")[0]
        handled.add(idx)
        found.append(FunctionalGroup(
            name="anhydride", smarts_like="[CX3](=O)[O][CX3](=O)",
            atoms=[c1, o1, idx, c2, o2], center=idx,
        ))
    return found
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _detect_sulfoxides(mol: Molecule) -> list[FunctionalGroup]:
    """Sulfoxide: a sulfur with exactly one S=O and at least two single-bonded carbons.

    ``atoms = [S, O, C, C]`` (first two carbons), centred on the
    sulfur.  A sulfur with two or more S=O bonds is not matched here.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "S":
            continue
        oxo = _double_bonded_to(mol, idx, "O")
        carbons = _single_bonded_to(mol, idx, "C")
        if len(oxo) != 1 or len(carbons) < 2:
            continue
        found.append(FunctionalGroup(
            name="sulfoxide", smarts_like="[SX3](=O)([C])[C]",
            atoms=[idx, oxo[0]] + carbons[:2], center=idx,
        ))
    return found
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
def _detect_sulfones(mol: Molecule) -> list[FunctionalGroup]:
    """Sulfone: a sulfur with at least two S=O and at least two single-bonded carbons.

    ``atoms = [S, O, O, C, C]`` (first two of each), centred on the
    sulfur.
    """
    found: list[FunctionalGroup] = []
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "S":
            continue
        oxo = _double_bonded_to(mol, idx, "O")
        carbons = _single_bonded_to(mol, idx, "C")
        if len(oxo) < 2 or len(carbons) < 2:
            continue
        found.append(FunctionalGroup(
            name="sulfone", smarts_like="[SX4](=O)(=O)([C])[C]",
            atoms=[idx] + oxo[:2] + carbons[:2], center=idx,
        ))
    return found
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def _detect_imines(mol: Molecule) -> list[FunctionalGroup]:
    """Imine: a C=N bond of order exactly 2.0 (so the C#N of a nitrile does not match).

    Each bond is reported once, with ``atoms`` holding the two atom
    indices in ascending order and the carbon as centre.
    """
    found: list[FunctionalGroup] = []
    reported: set[tuple[int, int]] = set()
    for idx, atom in enumerate(mol.atoms):
        if atom.symbol != "C":
            continue
        for n_idx in _double_bonded_to(mol, idx, "N"):
            bond = (idx, n_idx) if idx <= n_idx else (n_idx, idx)
            if bond not in reported:
                reported.add(bond)
                found.append(FunctionalGroup(
                    name="imine", smarts_like="[C]=[N]",
                    atoms=list(bond), center=idx,
                ))
    return found
|