molbuilder 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- molbuilder/__init__.py +8 -0
- molbuilder/__main__.py +6 -0
- molbuilder/atomic/__init__.py +4 -0
- molbuilder/atomic/bohr.py +235 -0
- molbuilder/atomic/quantum_atom.py +334 -0
- molbuilder/atomic/quantum_numbers.py +196 -0
- molbuilder/atomic/wavefunctions.py +297 -0
- molbuilder/bonding/__init__.py +4 -0
- molbuilder/bonding/covalent.py +442 -0
- molbuilder/bonding/lewis.py +347 -0
- molbuilder/bonding/vsepr.py +433 -0
- molbuilder/cli/__init__.py +1 -0
- molbuilder/cli/demos.py +516 -0
- molbuilder/cli/menu.py +127 -0
- molbuilder/cli/wizard.py +831 -0
- molbuilder/core/__init__.py +6 -0
- molbuilder/core/bond_data.py +170 -0
- molbuilder/core/constants.py +51 -0
- molbuilder/core/element_properties.py +183 -0
- molbuilder/core/elements.py +181 -0
- molbuilder/core/geometry.py +232 -0
- molbuilder/gui/__init__.py +2 -0
- molbuilder/gui/app.py +286 -0
- molbuilder/gui/canvas3d.py +115 -0
- molbuilder/gui/dialogs.py +117 -0
- molbuilder/gui/event_handler.py +118 -0
- molbuilder/gui/sidebar.py +105 -0
- molbuilder/gui/toolbar.py +71 -0
- molbuilder/io/__init__.py +1 -0
- molbuilder/io/json_io.py +146 -0
- molbuilder/io/mol_sdf.py +169 -0
- molbuilder/io/pdb.py +184 -0
- molbuilder/io/smiles_io.py +47 -0
- molbuilder/io/xyz.py +103 -0
- molbuilder/molecule/__init__.py +2 -0
- molbuilder/molecule/amino_acids.py +919 -0
- molbuilder/molecule/builders.py +257 -0
- molbuilder/molecule/conformations.py +70 -0
- molbuilder/molecule/functional_groups.py +484 -0
- molbuilder/molecule/graph.py +712 -0
- molbuilder/molecule/peptides.py +13 -0
- molbuilder/molecule/stereochemistry.py +6 -0
- molbuilder/process/__init__.py +3 -0
- molbuilder/process/conditions.py +260 -0
- molbuilder/process/costing.py +316 -0
- molbuilder/process/purification.py +285 -0
- molbuilder/process/reactor.py +297 -0
- molbuilder/process/safety.py +476 -0
- molbuilder/process/scale_up.py +427 -0
- molbuilder/process/solvent_systems.py +204 -0
- molbuilder/reactions/__init__.py +3 -0
- molbuilder/reactions/functional_group_detect.py +728 -0
- molbuilder/reactions/knowledge_base.py +1716 -0
- molbuilder/reactions/reaction_types.py +102 -0
- molbuilder/reactions/reagent_data.py +1248 -0
- molbuilder/reactions/retrosynthesis.py +1430 -0
- molbuilder/reactions/synthesis_route.py +377 -0
- molbuilder/reports/__init__.py +158 -0
- molbuilder/reports/cost_report.py +206 -0
- molbuilder/reports/molecule_report.py +279 -0
- molbuilder/reports/safety_report.py +296 -0
- molbuilder/reports/synthesis_report.py +283 -0
- molbuilder/reports/text_formatter.py +170 -0
- molbuilder/smiles/__init__.py +4 -0
- molbuilder/smiles/parser.py +487 -0
- molbuilder/smiles/tokenizer.py +291 -0
- molbuilder/smiles/writer.py +375 -0
- molbuilder/visualization/__init__.py +1 -0
- molbuilder/visualization/bohr_viz.py +166 -0
- molbuilder/visualization/molecule_viz.py +368 -0
- molbuilder/visualization/quantum_viz.py +434 -0
- molbuilder/visualization/theme.py +12 -0
- molbuilder-1.0.0.dist-info/METADATA +360 -0
- molbuilder-1.0.0.dist-info/RECORD +78 -0
- molbuilder-1.0.0.dist-info/WHEEL +5 -0
- molbuilder-1.0.0.dist-info/entry_points.txt +2 -0
- molbuilder-1.0.0.dist-info/licenses/LICENSE +21 -0
- molbuilder-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""SMILES string tokenizer.
|
|
2
|
+
|
|
3
|
+
Converts a SMILES string into a sequence of Token objects for the parser.
|
|
4
|
+
|
|
5
|
+
Handles:
|
|
6
|
+
- Organic subset atoms: B, C, N, O, P, S, F, Cl, Br, I (uppercase)
|
|
7
|
+
- Bracket atoms: [NH3+], [Fe], [13C], etc.
|
|
8
|
+
- Aromatic atoms: b, c, n, o, p, s (lowercase)
|
|
9
|
+
- Bond types: - (single), = (double), # (triple), : (aromatic); the characters / and backslash are also tokenized as bonds
|
|
10
|
+
- Branch notation: ( and )
|
|
11
|
+
- Ring closure digits: 0-9, and %nn for two-digit ring numbers
|
|
12
|
+
- Dot disconnection: .
|
|
13
|
+
- Hydrogen counts in brackets: [NH2], [CH3]
|
|
14
|
+
- Charges in brackets: +, -, +2, -1
|
|
15
|
+
- Isotopes in brackets: [13C], [2H]
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from enum import Enum, auto
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ===================================================================
|
|
25
|
+
# Token types and data class
|
|
26
|
+
# ===================================================================
|
|
27
|
+
|
|
28
|
+
class TokenType(Enum):
    """Lexical categories emitted by the SMILES tokenizer."""

    # An atom, written bare (organic subset / aromatic) or inside [...]
    ATOM = auto()
    # A bond character (any member of BOND_CHARS)
    BOND = auto()
    # Opening parenthesis that starts a branch
    BRANCH_OPEN = auto()
    # Closing parenthesis that ends a branch
    BRANCH_CLOSE = auto()
    # Ring-closure number (single digit, or two digits after '%')
    RING_DIGIT = auto()
    # The '.' separator between disconnected fragments
    DOT = auto()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class Token:
    """One lexical unit produced while scanning a SMILES string.

    Only ``type`` and ``value`` are required; the remaining fields carry
    the extra information that a bracket atom may specify and keep their
    defaults for every other token.

    Attributes
    ----------
    type : TokenType
        Which kind of token this is.
    value : str
        The text of the token: the element symbol for ATOM tokens, the
        bond character for BOND tokens, the digit string for RING_DIGIT
        tokens, and the literal character for branch and dot tokens.
    isotope : int | None
        Mass number given in a bracket atom (13 for ``[13C]``), or
        ``None`` when no isotope was written.
    hcount : int | None
        Hydrogen count written in a bracket atom (2 for ``[NH2]``).
        ``None`` means no count was written, so implicit-hydrogen rules
        apply; ``0`` means the atom explicitly carries no hydrogens.
    charge : int
        Formal charge given in a bracket atom (+1 for ``[NH4+]``).
    aromatic : bool
        ``True`` when the atom was written in lowercase.
    chirality : str | None
        ``"@"`` (anticlockwise) or ``"@@"`` (clockwise) when a bracket
        atom carries a chirality mark, otherwise ``None``.
    """
    type: TokenType
    value: str
    isotope: int | None = None
    hcount: int | None = None
    charge: int = 0
    aromatic: bool = False
    chirality: str | None = None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ===================================================================
|
|
72
|
+
# Constants
|
|
73
|
+
# ===================================================================
|
|
74
|
+
|
|
75
|
+
# Element symbols that may appear in SMILES without square brackets.
ORGANIC_SUBSET = {
    "B", "C", "N", "O", "P", "S",
    "F", "Cl", "Br", "I",
}
# Lowercase symbols that denote aromatic atoms.
AROMATIC_ATOMS = {"b", "c", "n", "o", "p", "s"}

# Organic-subset symbols made of two characters; the tokenizer tests
# these before falling back to single-letter symbols.
TWO_LETTER_ORGANIC = {"Cl", "Br"}

# Allowed valences per symbol, used when computing implicit hydrogens.
DEFAULT_VALENCE: dict[str, list[int]] = {
    "B": [3],
    "C": [4],
    "N": [3, 5],
    "O": [2],
    "P": [3, 5],
    "S": [2, 4, 6],
    "F": [1],
    "Cl": [1],
    "Br": [1],
    "I": [1],
    "b": [3],
    "c": [4],
    "n": [3],
    "o": [2],
    "p": [3],
    "s": [2],
}

# Every character the tokenizer turns into a BOND token.
BOND_CHARS = {"-", "=", "#", ":", "/", "\\"}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ===================================================================
|
|
93
|
+
# Bracket-atom parser helper
|
|
94
|
+
# ===================================================================
|
|
95
|
+
|
|
96
|
+
def _parse_bracket(smiles: str, start: int) -> tuple[Token, int]:
    """Parse a bracket atom ``[...]`` starting at *start* (the ``[``).

    The bracket content is read left to right as: optional isotope
    digits, the element symbol, optional chirality (``@`` or ``@@``),
    optional hydrogen count (``H`` or ``Hn``), optional charge (``+``,
    ``-``, ``+n``, ``-n`` or repeated signs such as ``++``), and an
    optional atom-class suffix ``:n`` which is accepted but not stored.

    Parameters
    ----------
    smiles : str
        The complete SMILES string being tokenized.
    start : int
        Index of the opening ``[`` in *smiles*.

    Returns
    -------
    tuple[Token, int]
        The ATOM token and the index of the character after the closing
        ``]``.

    Raises
    ------
    ValueError
        If the bracket is never closed, if no element symbol is found,
        or if text remains inside the bracket after all recognised
        fields have been read.
    """
    pos = start + 1  # skip '['
    end = smiles.find("]", pos)
    if end == -1:
        # str.index would also raise ValueError, but with the opaque
        # message "substring not found".
        raise ValueError(
            f"Unclosed bracket atom at position {start} in SMILES "
            f"string {smiles!r}")
    inner = smiles[pos:end]

    isotope: int | None = None
    symbol: str = ""
    hcount: int | None = None
    charge: int = 0
    aromatic: bool = False
    chirality: str | None = None

    i = 0
    n = len(inner)

    # --- isotope (leading digits) ---
    iso_start = i
    while i < n and inner[i].isdigit():
        i += 1
    if i > iso_start:
        isotope = int(inner[iso_start:i])

    # --- element symbol ---
    # Symbol starts with an uppercase letter followed by optional lowercase,
    # OR a single lowercase letter for aromatic atoms.
    if i < n and inner[i].isupper():
        symbol = inner[i]
        i += 1
        while i < n and inner[i].islower():
            symbol += inner[i]
            i += 1
    elif i < n and inner[i].islower():
        # aromatic bracket atom
        symbol = inner[i]
        aromatic = True
        i += 1
    else:
        raise ValueError(
            f"Expected element symbol in bracket atom: [{inner}]")

    # --- chirality (@, @@) ---
    if i < n and inner[i] == "@":
        i += 1
        if i < n and inner[i] == "@":
            chirality = "@@"
            i += 1
        else:
            chirality = "@"

    # --- hydrogen count ---
    if i < n and inner[i] == "H":
        i += 1
        h_start = i
        while i < n and inner[i].isdigit():
            i += 1
        if i > h_start:
            hcount = int(inner[h_start:i])
        else:
            hcount = 1  # bare H means 1 hydrogen

    # --- charge ---
    if i < n and inner[i] in ("+", "-"):
        sign = 1 if inner[i] == "+" else -1
        i += 1
        ch_start = i
        while i < n and inner[i].isdigit():
            i += 1
        if i > ch_start:
            charge = sign * int(inner[ch_start:i])
        else:
            # Count consecutive +/- signs (e.g. ++ means +2)
            extra = 0
            ch = "+" if sign == 1 else "-"
            while i < n and inner[i] == ch:
                extra += 1
                i += 1
            charge = sign * (1 + extra)

    # --- atom class (":n") ---
    # Token has no field for the class, so it is skipped rather than
    # treated as an error.
    if i < n and inner[i] == ":":
        j = i + 1
        while j < n and inner[j].isdigit():
            j += 1
        if j > i + 1:
            i = j

    # --- leftover text ---
    # Anything still unread would otherwise be dropped silently and the
    # atom misread (e.g. "[se]" becoming aromatic sulfur), so reject it.
    if i < n:
        raise ValueError(
            f"Unexpected text {inner[i:]!r} in bracket atom: [{inner}]")

    return Token(
        type=TokenType.ATOM,
        value=symbol,
        isotope=isotope,
        hcount=hcount,
        charge=charge,
        aromatic=aromatic,
        chirality=chirality,
    ), end + 1
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ===================================================================
|
|
191
|
+
# Main tokenizer
|
|
192
|
+
# ===================================================================
|
|
193
|
+
|
|
194
|
+
def tokenize(smiles: str) -> list[Token]:
    """Split a SMILES string into its sequence of tokens.

    The string is scanned left to right. At each position the first
    matching rule wins: bracket atom, two-letter organic atom,
    single-letter organic atom, aromatic atom, bond character, branch
    parenthesis, single ring digit, ``%nn`` ring number, dot.

    Parameters
    ----------
    smiles : str
        The SMILES text to scan, e.g. ``"CCO"``, ``"c1ccccc1"`` or
        ``"[NH4+]"``.

    Returns
    -------
    list[Token]
        The tokens in the order they occur in *smiles*.

    Raises
    ------
    ValueError
        If a character matches no rule, if ``%`` is not followed by two
        digits, or if a bracket atom cannot be parsed.
    """
    plain_atoms = {"B", "C", "N", "O", "P", "S", "F", "I"}
    lowercase_atoms = {"b", "c", "n", "o", "p", "s"}
    # Characters that translate one-to-one into a structural token.
    structural = {
        "(": TokenType.BRANCH_OPEN,
        ")": TokenType.BRANCH_CLOSE,
        ".": TokenType.DOT,
    }

    result: list[Token] = []
    length = len(smiles)
    pos = 0

    while pos < length:
        char = smiles[pos]

        if char == "[":
            # The helper reports where scanning should resume.
            token, pos = _parse_bracket(smiles, pos)
        elif pos + 1 < length and smiles[pos:pos + 2] in TWO_LETTER_ORGANIC:
            # Cl / Br must be tried before the single letters C / B.
            token = Token(type=TokenType.ATOM, value=smiles[pos:pos + 2])
            pos += 2
        elif char in plain_atoms:
            token = Token(type=TokenType.ATOM, value=char)
            pos += 1
        elif char in lowercase_atoms:
            token = Token(type=TokenType.ATOM, value=char, aromatic=True)
            pos += 1
        elif char in BOND_CHARS:
            token = Token(type=TokenType.BOND, value=char)
            pos += 1
        elif char in structural:
            token = Token(type=structural[char], value=char)
            pos += 1
        elif char.isdigit():
            token = Token(type=TokenType.RING_DIGIT, value=char)
            pos += 1
        elif char == "%":
            number = smiles[pos + 1:pos + 3]
            if len(number) != 2 or not (
                    number[0].isdigit() and number[1].isdigit()):
                raise ValueError(
                    f"Expected two digits after '%' at position {pos}")
            token = Token(type=TokenType.RING_DIGIT, value=number)
            pos += 3
        else:
            raise ValueError(
                f"Unexpected character {char!r} at position {pos} in SMILES "
                f"string {smiles!r}")

        result.append(token)

    return result
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""SMILES writer: Molecule -> canonical SMILES string.
|
|
2
|
+
|
|
3
|
+
Uses Morgan's algorithm for canonical atom numbering, then DFS traversal
|
|
4
|
+
to generate the SMILES string.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections import deque
|
|
10
|
+
|
|
11
|
+
from molbuilder.molecule.graph import Molecule
|
|
12
|
+
from molbuilder.core.elements import SYMBOL_TO_Z
|
|
13
|
+
from molbuilder.smiles.tokenizer import ORGANIC_SUBSET, DEFAULT_VALENCE
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# ===================================================================
|
|
17
|
+
# Morgan canonical ordering
|
|
18
|
+
# ===================================================================
|
|
19
|
+
|
|
20
|
+
def _morgan_canonical_order(mol: Molecule) -> dict[int, int]:
|
|
21
|
+
"""Compute canonical atom ranks using Morgan's extended connectivity.
|
|
22
|
+
|
|
23
|
+
Algorithm
|
|
24
|
+
---------
|
|
25
|
+
1. Initialize each atom's invariant to its degree (number of
|
|
26
|
+
neighbours).
|
|
27
|
+
2. Iteratively replace each atom's invariant with the sum of its
|
|
28
|
+
neighbours' invariants until the number of distinct values
|
|
29
|
+
stabilises.
|
|
30
|
+
3. Break ties using atomic number (higher Z = higher rank).
|
|
31
|
+
4. Return a mapping from atom index to rank (0 = lowest priority,
|
|
32
|
+
first in canonical order).
|
|
33
|
+
|
|
34
|
+
Parameters
|
|
35
|
+
----------
|
|
36
|
+
mol : Molecule
|
|
37
|
+
The molecule whose atoms should be ranked.
|
|
38
|
+
|
|
39
|
+
Returns
|
|
40
|
+
-------
|
|
41
|
+
dict[int, int]
|
|
42
|
+
Mapping from atom index to canonical rank.
|
|
43
|
+
"""
|
|
44
|
+
n = len(mol.atoms)
|
|
45
|
+
if n == 0:
|
|
46
|
+
return {}
|
|
47
|
+
|
|
48
|
+
# Initial connectivity value = degree
|
|
49
|
+
ec = [len(mol.neighbors(i)) for i in range(n)]
|
|
50
|
+
|
|
51
|
+
prev_classes = 0
|
|
52
|
+
for _iteration in range(100):
|
|
53
|
+
new_ec = [0] * n
|
|
54
|
+
for i in range(n):
|
|
55
|
+
new_ec[i] = sum(ec[j] for j in mol.neighbors(i))
|
|
56
|
+
|
|
57
|
+
# Count distinct classes
|
|
58
|
+
classes = len(set(new_ec))
|
|
59
|
+
if classes == prev_classes:
|
|
60
|
+
break
|
|
61
|
+
prev_classes = classes
|
|
62
|
+
ec = new_ec
|
|
63
|
+
|
|
64
|
+
# Build (ec_value, atomic_number, original_index) for stable sort
|
|
65
|
+
sort_keys = []
|
|
66
|
+
for i in range(n):
|
|
67
|
+
z = SYMBOL_TO_Z.get(mol.atoms[i].symbol, 0)
|
|
68
|
+
sort_keys.append((ec[i], z, i))
|
|
69
|
+
|
|
70
|
+
sorted_atoms = sorted(sort_keys)
|
|
71
|
+
order: dict[int, int] = {}
|
|
72
|
+
for rank, (_, _, idx) in enumerate(sorted_atoms):
|
|
73
|
+
order[idx] = rank
|
|
74
|
+
|
|
75
|
+
return order
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ===================================================================
|
|
79
|
+
# Connected components
|
|
80
|
+
# ===================================================================
|
|
81
|
+
|
|
82
|
+
def _connected_components(mol: Molecule) -> list[list[int]]:
|
|
83
|
+
"""Return lists of atom indices for each connected fragment.
|
|
84
|
+
|
|
85
|
+
Considers only heavy (non-hydrogen) atoms.
|
|
86
|
+
"""
|
|
87
|
+
n = len(mol.atoms)
|
|
88
|
+
heavy = {i for i in range(n) if mol.atoms[i].symbol != "H"}
|
|
89
|
+
visited: set[int] = set()
|
|
90
|
+
components: list[list[int]] = []
|
|
91
|
+
|
|
92
|
+
for start in heavy:
|
|
93
|
+
if start in visited:
|
|
94
|
+
continue
|
|
95
|
+
comp: list[int] = []
|
|
96
|
+
stack = [start]
|
|
97
|
+
while stack:
|
|
98
|
+
cur = stack.pop()
|
|
99
|
+
if cur in visited:
|
|
100
|
+
continue
|
|
101
|
+
visited.add(cur)
|
|
102
|
+
if cur not in heavy:
|
|
103
|
+
continue
|
|
104
|
+
comp.append(cur)
|
|
105
|
+
for nb in mol.neighbors(cur):
|
|
106
|
+
if nb not in visited and nb in heavy:
|
|
107
|
+
stack.append(nb)
|
|
108
|
+
if comp:
|
|
109
|
+
components.append(comp)
|
|
110
|
+
|
|
111
|
+
return components
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ===================================================================
|
|
115
|
+
# Bond order symbol
|
|
116
|
+
# ===================================================================
|
|
117
|
+
|
|
118
|
+
def _bond_symbol(mol: Molecule, i: int, j: int) -> str:
|
|
119
|
+
"""Return the SMILES bond symbol between atoms *i* and *j*.
|
|
120
|
+
|
|
121
|
+
Single bonds return ``""`` (implicit), double ``"="``, triple ``"#"``.
|
|
122
|
+
"""
|
|
123
|
+
bond = mol.get_bond(i, j)
|
|
124
|
+
if bond is None:
|
|
125
|
+
return ""
|
|
126
|
+
if bond.order == 2:
|
|
127
|
+
return "="
|
|
128
|
+
if bond.order == 3:
|
|
129
|
+
return "#"
|
|
130
|
+
return ""
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ===================================================================
|
|
134
|
+
# DFS SMILES generation
|
|
135
|
+
# ===================================================================
|
|
136
|
+
|
|
137
|
+
def _dfs_smiles(mol: Molecule, order: dict[int, int],
                component: list[int]) -> str:
    """Generate a SMILES string for one connected component via DFS.

    The traversal starts at the atom of lowest canonical rank. At every
    atom the unvisited heavy neighbours are taken in ascending rank:
    all but the last are written as parenthesised branches and the last
    continues the main chain. An edge leading to an already visited atom
    is written as a ring-closure number at both of its endpoints.

    Parameters
    ----------
    mol : Molecule
        The full molecule.
    order : dict[int, int]
        Canonical ranking from ``_morgan_canonical_order``. Atoms
        missing from the mapping are treated as rank 0.
    component : list[int]
        Atom indices belonging to this fragment (heavy atoms only).

    Returns
    -------
    str
        SMILES string for the fragment, or ``""`` when *component* is
        empty.

    Raises
    ------
    ValueError
        If the fragment needs more than 99 ring-closure numbers. The
        ``%nn`` form holds exactly two digits, so a larger number would
        be written as text that reads back as a different ring number.
    """
    if not component:
        return ""

    comp_set = set(component)

    # Start DFS from the atom with the lowest canonical rank in this component
    start = min(component, key=lambda i: order.get(i, 0))

    visited: set[int] = set()
    # Ring-closure number per back-edge, keyed by (low index, high index).
    ring_bonds: dict[tuple[int, int], int] = {}
    next_ring_digit = 1

    parts: list[str] = []

    def _heavy_neighbors(idx: int) -> list[int]:
        """Return the non-hydrogen neighbours of *idx* that belong to
        this component, sorted by ascending canonical rank."""
        nbs = []
        for nb in mol.neighbors(idx):
            if mol.atoms[nb].symbol == "H":
                continue
            if nb not in comp_set:
                continue
            nbs.append(nb)
        # Lower rank is visited first (as a branch); the last element
        # ends up as the main-chain child.
        nbs.sort(key=lambda x: order.get(x, 0))
        return nbs

    def _atom_str(idx: int) -> str:
        """Return the SMILES atom token for atom *idx*.

        Organic-subset atoms with no chirality, isotope or charge are
        written bare. Everything else uses bracket notation
        ``[<isotope><symbol><chirality><Hn><charge>]``.
        """
        atom = mol.atoms[idx]
        sym = atom.symbol
        has_chirality = atom.chirality is not None
        has_isotope = atom.isotope is not None
        has_charge = atom.formal_charge != 0
        needs_bracket = (has_chirality or has_isotope or has_charge
                         or sym not in ORGANIC_SUBSET)

        if not needs_bracket:
            return sym

        pieces: list[str] = []
        if has_isotope:
            pieces.append(str(atom.isotope))
        pieces.append(sym)
        if has_chirality:
            pieces.append(atom.chirality)

        # Hydrogen count = number of neighbouring atoms whose symbol is H.
        h_count = sum(1 for nb in mol.neighbors(idx)
                      if mol.atoms[nb].symbol == "H")
        if h_count == 1:
            pieces.append("H")
        elif h_count > 1:
            pieces.append(f"H{h_count}")

        if has_charge:
            ch = atom.formal_charge
            if ch == 1:
                pieces.append("+")
            elif ch == -1:
                pieces.append("-")
            elif ch > 0:
                pieces.append(f"+{ch}")
            else:
                # str() of a negative int already carries the '-' sign.
                pieces.append(str(ch))

        return f"[{''.join(pieces)}]"

    def _emit_ring_closure(idx: int, nb: int) -> None:
        """Write the ring-closure number for edge *idx*-*nb*, assigning
        a new number the first time the edge is seen."""
        nonlocal next_ring_digit
        edge = (min(idx, nb), max(idx, nb))
        if edge not in ring_bonds:
            if next_ring_digit > 99:
                # "%100" would be tokenized as ring 10 followed by ring 0.
                raise ValueError(
                    "Cannot write SMILES: more than 99 ring closures are "
                    "needed, but the '%nn' form only holds two digits")
            ring_bonds[edge] = next_ring_digit
            next_ring_digit += 1
        digit = ring_bonds[edge]
        bsym = _bond_symbol(mol, idx, nb)
        if digit < 10:
            parts.append(f"{bsym}{digit}")
        else:
            parts.append(f"{bsym}%{digit:02d}")

    def _dfs(idx: int, parent: int | None = None) -> None:
        """Write atom *idx* and, recursively, everything reachable from
        it that has not been written yet."""
        visited.add(idx)
        parts.append(_atom_str(idx))

        # Classify neighbours into ring closures and tree children
        parent_consumed = False
        ring_nbs: list[int] = []
        unvisited_nbs: list[int] = []

        for nb in _heavy_neighbors(idx):
            if nb == parent and not parent_consumed:
                # Skip the tree edge we arrived on (consume once)
                parent_consumed = True
                continue
            if nb in visited:
                ring_nbs.append(nb)
            else:
                unvisited_nbs.append(nb)

        # Emit ring closure digits at this atom
        for nb in ring_nbs:
            _emit_ring_closure(idx, nb)

        remaining = list(unvisited_nbs)
        while remaining:
            # A child may have been reached through a ring while a
            # sibling branch was being written. Its side of the ring
            # closure is already in the output, so the matching number
            # must be written at this atom.
            still_unvisited: list[int] = []
            for c in remaining:
                if c in visited:
                    edge = (min(idx, c), max(idx, c))
                    if edge in ring_bonds:
                        _emit_ring_closure(idx, c)
                else:
                    still_unvisited.append(c)
            remaining = still_unvisited

            if not remaining:
                return

            if len(remaining) == 1:
                # Last remaining child: main chain, no parentheses
                child = remaining[0]
                parts.append(_bond_symbol(mol, idx, child))
                _dfs(child, parent=idx)
                return

            # Branch child (not the last one). It is guaranteed to be
            # unvisited because of the filtering pass above.
            child = remaining.pop(0)
            bsym = _bond_symbol(mol, idx, child)
            parts.append(f"({bsym}")
            _dfs(child, parent=idx)
            parts.append(")")

    _dfs(start)

    return "".join(parts)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
# ===================================================================
|
|
330
|
+
# Public API
|
|
331
|
+
# ===================================================================
|
|
332
|
+
|
|
333
|
+
def to_smiles(mol: Molecule) -> str:
    """Write *mol* as a SMILES string.

    Hydrogen atoms are not written as separate atoms. Each connected
    fragment of heavy atoms is written on its own and the fragments are
    joined with ``.``, largest fragment first; ties are ordered by the
    lowest canonical rank found in the fragment.

    Parameters
    ----------
    mol : Molecule
        A molecule built with the ``molbuilder`` framework.

    Returns
    -------
    str
        The SMILES text, or ``""`` when the molecule has no atoms or no
        heavy-atom fragments.

    Examples
    --------
    >>> from molbuilder.smiles.parser import parse
    >>> mol = parse("CCO")
    >>> to_smiles(mol)  # may return "CCO" or "OCC" depending on canonicalization
    '...'
    """
    if not mol.atoms:
        return ""

    ranks = _morgan_canonical_order(mol)
    fragments = _connected_components(mol)
    if not fragments:
        return ""

    def _fragment_key(atoms: list[int]) -> tuple[int, int]:
        # Negative size puts the largest fragment first.
        return -len(atoms), min(ranks.get(a, 0) for a in atoms)

    written = (
        _dfs_smiles(mol, ranks, atoms)
        for atoms in sorted(fragments, key=_fragment_key)
    )
    # Empty fragment strings are dropped before joining.
    return ".".join(filter(None, written))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Visualization: Bohr models, orbital clouds, molecule rendering."""
|