cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2843 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Name-driven IUPAC decomposition.
|
|
3
|
+
|
|
4
|
+
Parse the bracket hierarchy of an IUPAC name to find substituent
|
|
5
|
+
boundaries, then generate alternative valid names by swapping
|
|
6
|
+
parent ↔ substituent roles. Uses ChemDraw (via ChemScript) as
|
|
7
|
+
the naming oracle — we never try to parse IUPAC grammar ourselves.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python name_decomposer.py <SMILES> [-v] [--json] [--max-depth N]
|
|
11
|
+
"""
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import re
|
|
15
|
+
import sys
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass, field, asdict
|
|
18
|
+
from functools import lru_cache
|
|
19
|
+
from typing import List, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
from rdkit import Chem, RDLogger
|
|
22
|
+
from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
|
|
23
|
+
|
|
24
|
+
RDLogger.logger().setLevel(RDLogger.ERROR)
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Data classes
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
@dataclass
class BracketNode:
    """One parenthesised ``(...)`` group located in an IUPAC name.

    ``start`` and ``end`` are offsets into the full name string that was
    parsed; ``children`` holds the groups nested directly inside this one.
    """

    # Text between the parentheses (the parentheses themselves are excluded).
    text: str
    # Offset of the opening '(' in the full name.
    start: int
    # Offset of the matching ')' in the full name (inclusive).
    end: int
    # Groups nested immediately inside this group.
    children: List["BracketNode"] = field(default_factory=list)
    # Nesting level; 0 means a top-level group.
    depth: int = 0
    # Classification label: "stereo" | "multiplier" | "substituent" | "unknown"
    kind: str = ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class Alternative:
    """A single alternative IUPAC name proposed for the molecule."""

    # The assembled alternative name.
    name: str
    # Name of the fragment that plays the parent role.
    parent_name: str
    # Name of the fragment that plays the substituent role.
    sub_name: str
    # Locant on the new parent.
    locant: str
    # Whether the name passed round-trip validation.
    valid: bool
    # How the name was assembled.
    strategy: str = ""
    # Free-form remarks.
    notes: str = ""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class DecompositionResult:
    """Everything produced by one name-decomposition run."""

    # SMILES exactly as supplied.
    original_smiles: str
    # Canonical form of the supplied SMILES.
    canonical_smiles: str
    # Name assigned to the canonical structure.
    canonical_name: str
    # Root of the parsed bracket hierarchy, if one was built.
    bracket_tree: Optional[BracketNode]
    # Alternative names that were generated.
    alternatives: List[Alternative] = field(default_factory=list)
    # Error messages collected along the way.
    errors: List[str] = field(default_factory=list)
    # Parent name in the canonical naming.
    canonical_parent: str = ""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Bracket tree parser
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def parse_bracket_tree(name: str) -> BracketNode:
    """Build a tree of the parenthesised groups in an IUPAC name.

    Square-bracket blocks ``[...]`` (stereo descriptors, ring-fusion) are
    skipped wholesale, so parentheses inside them are not seen.

    Args:
        name: The IUPAC name to scan.

    Returns:
        A synthetic root node (``depth == -1``) spanning the whole name; its
        children are the top-level ``(...)`` groups, each carrying its own
        nested groups.  A ``)`` with no open ``(`` is ignored, and groups
        inside a ``(`` that is never closed are not attached to the tree.
    """
    root = BracketNode(text=name, start=0, end=len(name) - 1, depth=-1)

    # One entry per currently open '(': its position plus the child groups
    # that have already been closed inside it.
    open_groups: List[Tuple[int, List[BracketNode]]] = []

    pos = 0
    length = len(name)
    while pos < length:
        char = name[pos]

        if char == '[':
            # Jump past the closing ']' when there is one; otherwise just
            # step over the stray '['.
            close = name.find(']', pos + 1)
            pos = pos + 1 if close == -1 else close + 1
            continue

        if char == '(':
            open_groups.append((pos, []))
        elif char == ')' and open_groups:
            opened_at, nested = open_groups.pop()
            group = BracketNode(
                text=name[opened_at + 1:pos],
                start=opened_at,
                end=pos,
                children=nested,
                # Number of groups still open == nesting depth of this one.
                depth=len(open_groups),
            )
            siblings = open_groups[-1][1] if open_groups else root.children
            siblings.append(group)

        pos += 1

    return root
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
# Bracket group classification
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
# Patterns for bracket groups that are NOT substituents.

# Stereo descriptors such as "R", "rac", "R,S", "2E" or "3aR,7aS".
_STEREO_RE = re.compile(
    r'^[RSEZ±]$|^rac$|^rel$|^[RSrs],[RSrs]$|^[0-9]+[RSEZ]$'
    r'|^[0-9]+[a-z]*[RSEZ](,[0-9]+[a-z]*[RSEZ])*$',
    re.IGNORECASE
)
# Multiplying prefixes standing alone inside the parentheses.
_MULTIPLIER_RE = re.compile(
    r'^di$|^tri$|^tetra$|^penta$|^hexa$|^bis$|^tris$', re.IGNORECASE
)
# Groups made only of digits, commas, primes and spaces.
_NUMBERSONLY_RE = re.compile(r'^[\d,\' ]+$')


def classify_node(node: BracketNode) -> str:
    """Classify a bracket group with a few cheap regex checks.

    Args:
        node: The group to classify; only ``node.text`` is read.

    Returns:
        ``"stereo"``, ``"multiplier"``, ``"skip"`` or ``"candidate"``.
    """
    content = node.text.strip()
    if not content:
        return "skip"

    # First matching pattern wins; numbers-only groups are ring assembly
    # numbering and are skipped.
    rules = (
        (_STEREO_RE, "stereo"),
        (_MULTIPLIER_RE, "multiplier"),
        (_NUMBERSONLY_RE, "skip"),
    )
    for pattern, label in rules:
        if pattern.match(content):
            return label

    # Very short texts are unlikely to be substituents, unless they end
    # in "yl".
    if len(content) > 2 or content.endswith("yl"):
        return "candidate"
    return "skip"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
# ChemDraw interaction helpers
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
# Module-wide ChemScript bridge; stays None until _get_cs() first runs.
_cs: Optional[ChemScriptBridge] = None


def _get_cs() -> ChemScriptBridge:
    """Return the shared ChemScriptBridge, constructing it on first use."""
    global _cs
    bridge = _cs
    if bridge is None:
        bridge = ChemScriptBridge()
        _cs = bridge
    return bridge
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _name_to_smiles(name: str) -> Optional[str]:
    """Resolve an IUPAC name to SMILES through ChemDraw, best effort.

    Args:
        name: The name to convert.

    Returns:
        The SMILES string when the bridge returns one that RDKit can parse;
        ``None`` when nothing usable comes back or any step raises.
    """
    try:
        candidate = _get_cs().write_data(name, "smiles", source_format="name")
        usable = bool(candidate) and Chem.MolFromSmiles(candidate) is not None
    except Exception:
        # Any bridge or RDKit failure simply means "could not resolve".
        return None
    return candidate if usable else None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _smiles_to_name(smiles: str) -> Optional[str]:
    """Ask the ChemScript bridge for the IUPAC name of a SMILES string.

    Returns whatever ``get_name`` yields, or ``None`` if the call raises.
    """
    try:
        generated = _get_cs().get_name(smiles)
    except Exception:
        generated = None
    return generated
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _canonical(smiles: str) -> Optional[str]:
    """Return the RDKit canonical SMILES, or None if RDKit cannot parse it."""
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else Chem.MolToSmiles(parsed)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _add_at(mol: Chem.Mol, atom_idx: int) -> Optional[Tuple[Chem.Mol, str]]:
    """Attach an astatine atom (At, Z=85) to ``atom_idx`` of a copy of ``mol``.

    When the target is a non-carbon ring atom that carries hydrogens, one of
    its explicit Hs (if it has any) is dropped so that At takes its place.

    Args:
        mol: Molecule to tag; it is copied, not modified.
        atom_idx: Index of the atom that receives the At.

    Returns:
        ``(tagged_mol, smiles)``, or ``None`` when the edited molecule fails
        sanitization or SMILES generation.
    """
    rw = Chem.RWMol(mol)
    anchor = rw.GetAtomWithIdx(atom_idx)

    ring_hetero_with_h = (anchor.IsInRing()
                          and anchor.GetAtomicNum() != 6
                          and anchor.GetTotalNumHs() > 0)
    if ring_hetero_with_h:
        n_explicit = anchor.GetNumExplicitHs()
        if n_explicit > 0:
            anchor.SetNumExplicitHs(n_explicit - 1)

    astatine_idx = rw.AddAtom(Chem.Atom(85))
    rw.AddBond(atom_idx, astatine_idx, Chem.BondType.SINGLE)

    try:
        Chem.SanitizeMol(rw)
        tagged = rw.GetMol()
        tagged_smiles = Chem.MolToSmiles(tagged)
    except Exception:
        return None
    return tagged, tagged_smiles
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _get_yl_via_acid_probe(mol: Chem.Mol, attach_idx: int,
                           verbose: bool = False) -> Optional[str]:
    """Name a fragment as a substituent by hanging it on icosanoic acid.

    The fragment is bonded to the terminal carbon of icosanoic acid (C20,
    COOH), the adduct is named through ChemDraw, and the substituent text is
    cut out of "20-(SUBSTITUENT)icosanoic acid".

    Why a C20 acid:
    - COOH is a PCG, so the chain is always forced to be the naming parent
    - no drug molecule has a C20 chain, so there is nothing to confuse it with
    - locant 20 and the "icosanoic acid" suffix are unambiguous to parse

    Args:
        mol: Fragment without its dummy atom.
        attach_idx: Index in ``mol`` of the atom that bonds to the probe.
        verbose: Print the probe name to stderr.

    Returns:
        The extracted substituent text, or ``None`` if the adduct cannot be
        sanitized, cannot be named, or its name does not fit either pattern.
    """
    chain = Chem.MolFromSmiles("CCCCCCCCCCCCCCCCCCCC(=O)O")
    if chain is None:
        return None

    adduct = Chem.RWMol(Chem.CombineMols(mol, chain))
    # The acid's first carbon (C20, terminal) sits right after the
    # fragment's atoms, i.e. at index mol.GetNumAtoms().
    adduct.AddBond(attach_idx, mol.GetNumAtoms(), Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(adduct)
    except Exception:
        return None

    probe_name = _smiles_to_name(Chem.MolToSmiles(adduct.GetMol()))
    if probe_name is None:
        return None

    if verbose:
        print(f" Icosanoic acid probe: '{probe_name}'", file=sys.stderr)

    # Parenthesised form first ("20-(substituent)icosanoic acid"), then the
    # bare form ("20-substituenticosanoic acid").
    for pattern in (r'20-\((.+)\)icosanoic acid$',
                    r'20-(.+)icosanoic acid$'):
        hit = re.match(pattern, probe_name)
        if hit:
            return hit.group(1)
    return None
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# Icosanoic acid probe chain; its first atom is the C-20 carbon.
_ACID_SMILES = "CCCCCCCCCCCCCCCCCCCC(=O)O"


def _get_yl_via_selenyl_probe(mol: Chem.Mol, attach_idx: int,
                              verbose: bool = False) -> Optional[str]:
    """Name a fragment as a substituent through a Se-linked acid probe.

    Builds fragment—Se—CH₂(C₁₈)—COOH, names it via ChemDraw, and pulls the
    substituent out of ``20-({sub}selanyl)icosanoic acid``.

    The selenium spacer separates the fragment from the acid chain, which
    sidesteps the ambiguity that defeats the direct acid probe for carbonyl
    and hydroxyl fragments (formyl, acetyl, Boc, phenylmethanol, etc.).

    Args:
        mol: Fragment without its dummy atom.
        attach_idx: Index in ``mol`` of the atom that bonds to Se.
        verbose: Print the probe name to stderr.

    Returns:
        The extracted substituent text, or ``None`` on sanitization failure,
        naming failure, or an unrecognised probe name.
    """
    chain = Chem.MolFromSmiles(_ACID_SMILES)
    if chain is None:
        return None

    probe = Chem.RWMol(Chem.CombineMols(mol, chain))
    selenium_idx = probe.AddAtom(Chem.Atom(34))  # Se
    probe.AddBond(attach_idx, selenium_idx, Chem.BondType.SINGLE)
    # C-20 of the acid is the first atom after the fragment's atoms.
    probe.AddBond(selenium_idx, mol.GetNumAtoms(), Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(probe)
    except Exception:
        return None

    probe_name = _smiles_to_name(Chem.MolToSmiles(probe.GetMol()))
    if probe_name is None:
        return None

    if verbose:
        print(f" Se-probe: '{probe_name}'", file=sys.stderr)

    # "20-({sub}selanyl)icosanoic acid" first, then the variant without the
    # outer parentheses: "20-{sub}selanylicosanoic acid".
    for pattern in (r'20-\((.+?)selanyl\)icosanoic acid$',
                    r'20-(.+?)selanylicosanoic acid$'):
        hit = re.match(pattern, probe_name)
        if hit:
            return hit.group(1)
    return None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# ---------------------------------------------------------------------------
|
|
310
|
+
# Public fragment-naming API
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
|
|
313
|
+
# Halogen lookup: atomic number -> substituent prefix.  Lets single-atom
# substituents be named without a ChemScript call.
_SIMPLE_SUB_MAP = {
    9: "fluoro",    # F
    17: "chloro",   # Cl
    35: "bromo",    # Br
    53: "iodo",     # I
}
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _name_via_naphthalene_probe(mol: Chem.Mol, attach_idx: int,
                                verbose: bool = False) -> Optional[str]:
    """Fallback naming: bond the fragment to naphthalene and read it back.

    Intended for fragments the icosanoic acid probe cannot handle (e.g.
    simple alkyl groups that merge into the acid chain).  Naphthalene is a
    named bicyclic ring system that takes IUPAC parent priority over most
    drug-like fragments.

    The substituent is taken from "2-(SUBSTITUENT)naphthalene" or
    "2-SUBSTITUENTnaphthalene"; any numeric locant is accepted.

    Args:
        mol: Fragment without its dummy atom.
        attach_idx: Index in ``mol`` of the atom that bonds to the ring.
        verbose: Print the probe name to stderr.

    Returns:
        The extracted substituent text, or ``None`` on sanitization failure,
        naming failure, or an unrecognised probe name.
    """
    ring = Chem.MolFromSmiles("c1ccc2ccccc2c1")
    if ring is None:
        return None

    adduct = Chem.RWMol(Chem.CombineMols(mol, ring))
    # Attach at the second atom of 'c1ccc2ccccc2c1' (index 1 within the
    # ring, shifted by the fragment's atom count); this is meant to be
    # naphthalene position 2.
    ring_attach_idx = mol.GetNumAtoms() + 1
    adduct.AddBond(attach_idx, ring_attach_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(adduct)
    except Exception:
        return None

    adduct_name = _smiles_to_name(Chem.MolToSmiles(adduct.GetMol()))
    if adduct_name is None:
        return None

    if verbose:
        print(f" Naphthalene probe: '{adduct_name}'", file=sys.stderr)

    # Bracketed form first, then the unbracketed one.
    for pattern in (r'\d+-\((.+)\)naphthalene$',
                    r'\d+-(.+)naphthalene$'):
        hit = re.match(pattern, adduct_name)
        if hit:
            return hit.group(1)
    return None
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _acyl_prefix_from_acid_name(acid_name: str) -> Optional[str]:
    """Turn a carboxylic acid name into the matching acyl prefix.

    "formic acid" -> "formyl", "acetic acid" -> "acetyl",
    "benzoic acid" -> "benzoyl", and "X-carboxylic acid" -> "X-carbonyl".
    Returns ``None`` when the name does not end in "ic acid".
    """
    suffix = "ic acid"
    if not acid_name.endswith(suffix):
        return None
    stem = acid_name[:-len(suffix)]
    if stem.endswith("carboxyl"):
        # "...carboxylic acid" gives the "...carbonyl" acyl prefix; simply
        # appending "yl" would produce the non-name "...carboxylyl".
        return stem[:-len("carboxyl")] + "carbonyl"
    return stem + "yl"


def _alkoxy_side_smiles(frag: "Chem.Mol", carbonyl_c_idx: int,
                        ester_o_idx: int) -> Optional[str]:
    """Return SMILES for the ``[*]-O-R`` side of an ester-like fragment.

    Collects every atom reachable from ``ester_o_idx`` without passing
    through ``carbonyl_c_idx``, copies them (element, formal charge and
    aromatic flag only) into a new molecule, and puts a dummy atom on the
    oxygen where the carbonyl carbon used to be.  Returns ``None`` if the
    copy cannot be sanitized or written as SMILES.
    """
    kept = set()
    pending = [ester_o_idx]
    while pending:
        idx = pending.pop()
        if idx in kept or idx == carbonyl_c_idx:
            continue
        kept.add(idx)
        for nbr in frag.GetAtomWithIdx(idx).GetNeighbors():
            nbr_idx = nbr.GetIdx()
            if nbr_idx != carbonyl_c_idx and nbr_idx not in kept:
                pending.append(nbr_idx)

    ordered = sorted(kept)
    piece = Chem.RWMol()
    remap = {}
    for old_idx in ordered:
        src = frag.GetAtomWithIdx(old_idx)
        clone = Chem.Atom(src.GetAtomicNum())
        clone.SetFormalCharge(src.GetFormalCharge())
        clone.SetIsAromatic(src.GetIsAromatic())
        remap[old_idx] = piece.AddAtom(clone)

    # Dummy atom marks where the C(=O) was attached.
    star_idx = piece.AddAtom(Chem.Atom(0))
    piece.AddBond(remap[ester_o_idx], star_idx, Chem.BondType.SINGLE)

    # Recreate each bond inside the kept set exactly once (low -> high).
    for old_idx in ordered:
        for bond in frag.GetAtomWithIdx(old_idx).GetBonds():
            other_idx = bond.GetOtherAtomIdx(old_idx)
            if other_idx in kept and old_idx < other_idx:
                piece.AddBond(remap[old_idx], remap[other_idx],
                              bond.GetBondType())
    try:
        Chem.SanitizeMol(piece)
        return Chem.MolToSmiles(piece)
    except Exception:
        return None


@lru_cache(maxsize=256)
def _name_fragment_cached(canonical_frag_smiles: str,
                          verbose: bool = False) -> Optional[str]:
    """Name a ``[*]``-bearing fragment as a substituent (cached worker).

    Results are memoised on ``(canonical_frag_smiles, verbose)``.

    Strategy, in order:
      1. single-atom lookup (halogens, hydroxy, amino, sulfanyl);
      2. icosanoic acid probe;
      3. naphthalene probe;
      4. acyl fallback: name the corresponding carboxylic acid and convert
         it to an acyl prefix;
      5. acyl-ester fallback: name the ``[*]-O-R`` part and wrap it as
         ``(R-oxy)carbonyl``.

    Args:
        canonical_frag_smiles: Canonical SMILES containing a dummy atom that
            marks the attachment point.  Only the first dummy atom and its
            first neighbour are used.
        verbose: Print probe names to stderr.

    Returns:
        The substituent prefix, or ``None`` if the SMILES cannot be parsed,
        has no attached dummy atom, or every strategy fails.
    """
    mol = Chem.MolFromSmiles(canonical_frag_smiles)
    if mol is None:
        return None

    # Locate the dummy atom and the fragment atom it is bonded to.
    dummy_idx = next(
        (a.GetIdx() for a in mol.GetAtoms() if a.GetAtomicNum() == 0), None)
    if dummy_idx is None:
        return None
    neighbors = list(mol.GetAtomWithIdx(dummy_idx).GetNeighbors())
    if not neighbors:
        return None
    attach_idx = neighbors[0].GetIdx()
    attach_atom = mol.GetAtomWithIdx(attach_idx)

    # --- Simple single-atom check ---
    # The fragment must be exactly [*]-X.  The total atom count is used
    # because GetNumHeavyAtoms() only counts atoms with atomic number > 1,
    # so the dummy atom is never included in it.  The lookup is limited to
    # a plain neutral atom on a single bond so that charged, isotopic,
    # radical or doubly bonded cases still go to ChemDraw.
    link = mol.GetBondBetweenAtoms(dummy_idx, attach_idx)
    if (mol.GetNumAtoms() == 2
            and link is not None
            and link.GetBondType() == Chem.BondType.SINGLE
            and attach_atom.GetFormalCharge() == 0
            and attach_atom.GetIsotope() == 0
            and attach_atom.GetNumRadicalElectrons() == 0):
        z = attach_atom.GetAtomicNum()
        if z in _SIMPLE_SUB_MAP:
            return _SIMPLE_SUB_MAP[z]
        hetero_prefix = {8: "hydroxy", 7: "amino", 16: "sulfanyl"}
        if z in hetero_prefix:
            return hetero_prefix[z]

    # --- General case: probes ---
    # Strip the dummy atom; indices above it shift down by one.
    edit = Chem.RWMol(mol)
    edit.RemoveAtom(dummy_idx)
    adjusted_idx = attach_idx if attach_idx < dummy_idx else attach_idx - 1
    try:
        Chem.SanitizeMol(edit)
    except Exception:
        return None
    frag_clean = edit.GetMol()

    # Acid probe first (works for ring-based and hetero-chain fragments).
    result = _get_yl_via_acid_probe(frag_clean, adjusted_idx, verbose=verbose)
    if result is not None:
        return result

    # The acid probe fails for simple alkyls (they extend the C20 chain);
    # fall back to naphthalene.
    naph_result = _name_via_naphthalene_probe(frag_clean, adjusted_idx,
                                              verbose=verbose)
    if naph_result is not None:
        return naph_result

    # --- Acyl fallbacks ---
    # Acyl groups ([*]-C(=O)-R) make the probe parents flip because C=O
    # becomes the principal characteristic group.  Both fallbacks need a
    # carbon attachment atom that carries a C=O.
    attach_a = frag_clean.GetAtomWithIdx(adjusted_idx)
    if attach_a.GetAtomicNum() != 6:
        return None

    carbonyl_o_idx = None
    ester_o_idx = None
    for bond in attach_a.GetBonds():
        other_idx = bond.GetOtherAtomIdx(adjusted_idx)
        if frag_clean.GetAtomWithIdx(other_idx).GetAtomicNum() != 8:
            continue
        bond_type = bond.GetBondType()
        if bond_type == Chem.BondType.DOUBLE and carbonyl_o_idx is None:
            carbonyl_o_idx = other_idx
        elif bond_type == Chem.BondType.SINGLE and ester_o_idx is None:
            ester_o_idx = other_idx
    if carbonyl_o_idx is None:
        return None

    # Cap the attachment point with OH to get the carboxylic acid, name it,
    # and derive the acyl prefix from that name.
    acid_edit = Chem.RWMol(frag_clean)
    oh_idx = acid_edit.AddAtom(Chem.Atom(8))
    acid_edit.AddBond(adjusted_idx, oh_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(acid_edit)
        acid_smi = Chem.MolToSmiles(acid_edit.GetMol())
    except Exception:
        acid_smi = None
    if acid_smi is not None:
        acid_name = _smiles_to_name(acid_smi)
        if verbose:
            print(f" Acyl acid form: '{acid_name}'",
                  file=sys.stderr)
        if acid_name:
            acyl = _acyl_prefix_from_acid_name(acid_name)
            if acyl is not None:
                return acyl

    # Acyl-ester pattern: [*]-C(=O)-O-R -> "(R-oxy)carbonyl".
    if ester_o_idx is not None:
        r_smi = _alkoxy_side_smiles(frag_clean, adjusted_idx, ester_o_idx)
        if r_smi:
            try:
                # Naming the [*]-O-R fragment should give "R-oxy".
                r_name = name_fragment_as_substituent(r_smi, verbose=verbose)
            except Exception:
                r_name = None
            if r_name:
                return f"({r_name})carbonyl"

    return None
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def name_fragment_as_substituent(frag_smiles: str,
                                 verbose: bool = False) -> Optional[str]:
    """Return the IUPAC substituent prefix for a ``[*]``-bearing fragment.

    The SMILES is canonicalised with RDKit and handed to the cached worker
    ``_name_fragment_cached``, so equivalent spellings of one fragment share
    a cache entry.  The worker's main route is the icosanoic acid probe
    (C20 acid): the fragment is attached at ``[*]`` to the acid's terminal
    carbon, the whole molecule is named via ChemScript, and the substituent
    is read from "20-(SUBSTITUENT)icosanoic acid".  Simple single-atom
    fragments (F, Cl, Br, I, O, N, S) are meant to be answered from a lookup
    table without a ChemScript call.

    Args:
        frag_smiles: SMILES with [*] marking the attachment point.
            E.g. "[*]c1ccccc1" for phenyl, "[*]F" for fluoro.
        verbose: Print debug info to stderr.

    Returns:
        Substituent prefix name (e.g. "phenyl", "fluoro", "morpholino",
        "(piperidin-1-yl)") or None on failure, including when the SMILES
        cannot be parsed.

    Examples::

        >>> name_fragment_as_substituent("[*]F")
        'fluoro'
        >>> name_fragment_as_substituent("[*]c1ccccc1")
        'phenyl'
    """
    parsed = Chem.MolFromSmiles(frag_smiles)
    if parsed is None:
        return None
    # The canonical form is the cache key.
    cache_key = Chem.MolToSmiles(parsed)
    return _name_fragment_cached(cache_key, verbose=verbose)
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def _get_yl_suffix_via_acid(parent_mol: Chem.Mol, parent_attach_idx: int,
                            heteroatom_num: int,
                            verbose: bool = False) -> Optional[str]:
    """Get the "-yl + heteroatom suffix" substituent form of a parent.

    For heteroatom linkages (O, N, S) the substituent name carries the
    heteroatom suffix (e.g. "pyridin-4-yloxy" for O, "phenylamino" for N).
    This builds parent-[heteroatom]-icosanoic acid, names it, and extracts
    the substituent from "20-(SUBSTITUENT)icosanoic acid".

    Args:
        parent_mol: The parent fragment.
        parent_attach_idx: Index in ``parent_mol`` that bonds to the
            heteroatom.
        heteroatom_num: Atomic number of the linking heteroatom.
        verbose: Print the probe name to stderr.

    Returns:
        The extracted substituent text, or ``None`` on sanitization failure,
        naming failure, or an unrecognised probe name.
    """
    chain = Chem.MolFromSmiles("CCCCCCCCCCCCCCCCCCCC(=O)O")
    if chain is None:
        return None

    linker = Chem.Atom(heteroatom_num)
    probe = Chem.RWMol(Chem.CombineMols(parent_mol, chain))
    linker_idx = probe.AddAtom(linker)
    probe.AddBond(parent_attach_idx, linker_idx, Chem.BondType.SINGLE)
    # The acid's first atom follows directly after the parent's atoms.
    probe.AddBond(linker_idx, parent_mol.GetNumAtoms(), Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(probe)
    except Exception:
        return None

    probe_name = _smiles_to_name(Chem.MolToSmiles(probe.GetMol()))
    if probe_name is None:
        return None
    if verbose:
        print(f" Acid+heteroatom probe: '{probe_name}'", file=sys.stderr)

    # Parenthesised form first, then the bare form.
    for pattern in (r'20-\((.+)\)icosanoic acid$',
                    r'20-(.+)icosanoic acid$'):
        hit = re.match(pattern, probe_name)
        if hit:
            return hit.group(1)
    return None
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def _get_locant_replace_heteroatom(sub_mol: Chem.Mol, sub_attach_idx: int,
                                   verbose: bool = False
                                   ) -> Optional[Tuple[str, Optional[str]]]:
    """Swap the attachment heteroatom for an At marker and name the result.

    The atom at *sub_attach_idx* is deleted, astatine is bonded to the first
    carbon neighbour that atom had, and the resulting structure is named.
    The name acts as an assembly template: "astato" can later be replaced
    by the yl+suffix form.

    Args:
        sub_mol: Substituent fragment containing the heteroatom.
        sub_attach_idx: Index of the heteroatom within *sub_mol*.
        verbose: When true, print the probe name to stderr.

    Returns:
        ``(at_probe_name, locant)``, or None on any failure.  *locant* is
        the digits preceding "-astato", "" when "astato" appears without a
        numeric locant, and None when "astato" is absent from the name.
    """
    hetero = sub_mol.GetAtomWithIdx(sub_attach_idx)
    carbon_idx = next(
        (nbr.GetIdx() for nbr in hetero.GetNeighbors()
         if nbr.GetAtomicNum() == 6),
        None,
    )
    if carbon_idx is None:
        return None

    probe = Chem.RWMol(sub_mol)
    probe.RemoveAtom(sub_attach_idx)
    try:
        Chem.SanitizeMol(probe)
    except Exception:
        return None

    # Atoms indexed above the removed one are treated as shifted down by one.
    if sub_attach_idx < carbon_idx:
        carbon_idx -= 1
    marker_idx = probe.AddAtom(Chem.Atom(85))
    probe.AddBond(carbon_idx, marker_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(probe)
    except Exception:
        return None

    at_name = _smiles_to_name(Chem.MolToSmiles(probe.GetMol()))
    if at_name is None:
        return None
    if verbose:
        print(f" Het->At probe: '{at_name}'", file=sys.stderr)

    numbered = re.search(r'(\d+)-astato', at_name, re.IGNORECASE)
    if numbered:
        locant = numbered.group(1)
    elif 'astato' in at_name.lower():
        locant = ""
    else:
        locant = None
    return at_name, locant
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
# ---------------------------------------------------------------------------
|
|
676
|
+
# Core decomposition logic
|
|
677
|
+
# ---------------------------------------------------------------------------
|
|
678
|
+
|
|
679
|
+
def validate_as_substituent(full_name: str, node: BracketNode,
                            verbose: bool = False) -> bool:
    """Report whether the name still resolves with the group swapped for At.

    The characters ``full_name[node.start:node.end + 1]`` are replaced by
    the literal "astato".  If the modified name resolves to SMILES, the
    position is treated as a genuine substituent slot.

    Args:
        full_name: Complete chemical name.
        node: Bracket group whose span is replaced.
        verbose: When true, print the probe name to stderr.

    Returns:
        True when the probe name resolves, otherwise False.
    """
    probe_name = "".join(
        (full_name[:node.start], "astato", full_name[node.end + 1:])
    )
    if verbose:
        print(f" At-probe name: {probe_name}", file=sys.stderr)
    resolved = _name_to_smiles(probe_name)
    return resolved is not None
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
def _find_at_atom(mol: Chem.Mol) -> Optional[int]:
    """Return the index of the first astatine atom in *mol*, or None."""
    return next(
        (candidate.GetIdx() for candidate in mol.GetAtoms()
         if candidate.GetAtomicNum() == 85),
        None,
    )
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def _split_at_at(smiles: str) -> Optional[Tuple[str, str, int]]:
    """Strip the At marker from *smiles* and report where it was attached.

    Args:
        smiles: SMILES containing an astatine placeholder atom.

    Returns:
        ``(smiles_without_at, original_smiles, attach_idx)`` or None when
        the SMILES cannot be parsed, contains no At, the At has no
        neighbour, or the At-free molecule cannot be sanitized.

        *attach_idx* is the index of the former At neighbour in the atom
        order of ``Chem.MolFromSmiles(smiles_without_at)``, so the two
        values can be used together.  If the SMILES writer's output order
        is unavailable, the index in the internal (pre-canonicalisation)
        molecule is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    at_idx = _find_at_atom(mol)
    if at_idx is None:
        return None

    neighbors = mol.GetAtomWithIdx(at_idx).GetNeighbors()
    if not neighbors:
        return None
    neighbor_idx = neighbors[0].GetIdx()
    # Removing At shifts every higher-indexed atom down by one.
    attach_idx = neighbor_idx - 1 if at_idx < neighbor_idx else neighbor_idx

    # Remove At.  Try sanitization first; if it fails (e.g. ring N losing
    # a bond needs H compensation), try again with an explicit H.
    edit = Chem.RWMol(mol)
    edit.RemoveAtom(at_idx)
    try:
        Chem.SanitizeMol(edit)
    except Exception:
        edit = Chem.RWMol(mol)
        edit.RemoveAtom(at_idx)
        atom = edit.GetAtomWithIdx(attach_idx)
        atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)
        try:
            Chem.SanitizeMol(edit)
        except Exception:
            return None

    clean_mol = edit.GetMol()
    clean_smi = Chem.MolToSmiles(clean_mol)

    # Canonical SMILES output reorders atoms, so an index into clean_mol is
    # not valid for a molecule re-parsed from clean_smi.  Translate it using
    # the writer's recorded output order (entry i = old index of the atom
    # written i-th).
    try:
        raw_order = clean_mol.GetProp('_smilesAtomOutputOrder')
    except (KeyError, RuntimeError):
        raw_order = ""
    if isinstance(raw_order, str):
        output_order = [int(tok) for tok in re.findall(r'\d+', raw_order)]
    else:
        output_order = [int(tok) for tok in raw_order]
    if (len(output_order) == clean_mol.GetNumAtoms()
            and attach_idx in output_order):
        attach_idx = output_order.index(attach_idx)

    return clean_smi, smiles, attach_idx
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
def get_parent_smiles_from_at_probe(full_name: str,
                                    node: BracketNode) -> Optional[Tuple[str, int]]:
    """Derive the parent fragment by probing a bracket group with astatine.

    The span ``full_name[node.start:node.end + 1]`` is replaced by "astato",
    the probe name is resolved to SMILES, and the At atom is split off.

    Args:
        full_name: Complete chemical name.
        node: Bracket group to replace.

    Returns:
        ``(parent_smiles, attach_idx_in_parent)``, or None if the probe name
        does not resolve or the At atom cannot be split off.
    """
    probe_name = f"{full_name[:node.start]}astato{full_name[node.end + 1:]}"
    probe_smiles = _name_to_smiles(probe_name)
    if probe_smiles is None:
        return None

    split = _split_at_at(probe_smiles)
    if split is None:
        return None
    parent_smiles, _original, attach_idx = split
    return parent_smiles, attach_idx
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
def get_sub_smiles_from_bracket(node: BracketNode) -> Optional[str]:
    """Resolve a bracket group's text to SMILES as a stand-alone name.

    Attempts, in order:
      1. the stripped text exactly as written (e.g. "phenyl");
      2. for a trailing "-yl" and then a trailing "yl": the text with that
         ending removed, tried with "e" appended, with "ene" appended, and
         bare.

    Args:
        node: Bracket group whose ``text`` is the substituent name.

    Returns:
        The first SMILES that resolves, or None if the text is empty or
        nothing resolves.
    """
    label = node.text.strip()
    if not label:
        return None

    direct = _name_to_smiles(label)
    if direct:
        return direct

    for ending in ("-yl", "yl"):
        if not label.endswith(ending):
            continue
        base = label[:-len(ending)]
        for guess in (base + "e", base + "ene", base):
            resolved = _name_to_smiles(guess)
            if resolved:
                return resolved
    return None
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
# ---------------------------------------------------------------------------
|
|
802
|
+
# -yl form construction
|
|
803
|
+
# ---------------------------------------------------------------------------
|
|
804
|
+
|
|
805
|
+
# Well-known parent → substituent name mappings
|
|
806
|
+
_YL_SPECIALS = {
|
|
807
|
+
"benzene": ["phenyl"],
|
|
808
|
+
"naphthalene": ["naphthyl"],
|
|
809
|
+
"toluene": ["tolyl"],
|
|
810
|
+
}
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
def construct_yl_form(parent_name: str, locant: str) -> List[str]:
|
|
814
|
+
"""Construct candidate '-yl' substituent forms from a parent name.
|
|
815
|
+
|
|
816
|
+
Returns a list of candidates to try (most likely first).
|
|
817
|
+
ChemDraw round-trip validation will pick the correct one.
|
|
818
|
+
"""
|
|
819
|
+
lower = parent_name.lower().strip()
|
|
820
|
+
|
|
821
|
+
# Check specials
|
|
822
|
+
if lower in _YL_SPECIALS:
|
|
823
|
+
candidates = list(_YL_SPECIALS[lower])
|
|
824
|
+
# Also add the locant variant if applicable
|
|
825
|
+
if locant:
|
|
826
|
+
for c in list(candidates):
|
|
827
|
+
candidates.append(f"{c.replace('yl', f'-{locant}-yl')}")
|
|
828
|
+
return candidates
|
|
829
|
+
|
|
830
|
+
# General rule: drop trailing 'e' (if present), insert locant, add '-yl'
|
|
831
|
+
name = parent_name.strip()
|
|
832
|
+
if name.endswith('e') and not name.endswith('ene'):
|
|
833
|
+
stem = name[:-1]
|
|
834
|
+
else:
|
|
835
|
+
stem = name
|
|
836
|
+
|
|
837
|
+
candidates = []
|
|
838
|
+
if locant:
|
|
839
|
+
candidates.append(f"{stem}-{locant}-yl")
|
|
840
|
+
# Also try without locant (some names omit it)
|
|
841
|
+
candidates.append(f"{stem}-yl")
|
|
842
|
+
else:
|
|
843
|
+
candidates.append(f"{stem}-yl")
|
|
844
|
+
|
|
845
|
+
return candidates
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
def get_locant_via_at_probe(fragment_smiles: str,
                            attach_idx: int) -> Optional[str]:
    """Find the locant of an attachment point by naming an At-tagged copy.

    Args:
        fragment_smiles: SMILES of the fragment.
        attach_idx: Atom index (in the parsed fragment) that receives At.

    Returns:
        The digits preceding "-astato" in the generated name (e.g. "4"),
        "" when "astato" appears without a numeric locant, or None when the
        fragment cannot be parsed, tagged, or named, or the name lacks
        "astato".
    """
    fragment = Chem.MolFromSmiles(fragment_smiles)
    if fragment is None:
        return None

    tagged = _add_at(fragment, attach_idx)
    if tagged is None:
        return None
    _tagged_mol, tagged_smiles = tagged

    probe_name = _smiles_to_name(tagged_smiles)
    if probe_name is None:
        return None

    numbered = re.search(r'(\d+)-astato', probe_name, re.IGNORECASE)
    if numbered:
        return numbered.group(1)
    # "astato" with no number in front: position 1 is implied.
    return "" if 'astato' in probe_name.lower() else None
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
# ---------------------------------------------------------------------------
|
|
880
|
+
# Prefix substituent detection (for bracketless names)
|
|
881
|
+
# ---------------------------------------------------------------------------
|
|
882
|
+
|
|
883
|
+
def _resolve_prefix_substituent(text: str) -> Optional[str]:
    """Resolve substituent *text* to SMILES, directly or via its parent name."""
    smi = _name_to_smiles(text)
    if smi is not None or not text.endswith('yl'):
        return smi
    # Slice the suffix off: str.rstrip('yl') strips a *character set* and
    # would over-trim stems such as "xylyl" -> "x".
    stem = text[:-2].rstrip('-')
    for restore_suffix in ('e', 'ene', ''):
        parent_form = stem + restore_suffix
        if not parent_form:
            continue
        smi = _name_to_smiles(parent_form)
        if smi is not None:
            return smi
    return None


def find_prefix_substituents(name: str,
                             verbose: bool = False,
                             skip_single_prefix: bool = False,
                             ) -> List[BracketNode]:
    """Detect non-bracketed substituent prefixes in a name.

    For names like "4-phenylpyridine", bracket parsing finds nothing.  Every
    split point whose right-hand side resolves as a name is collected as a
    candidate parent, then three strategies are tried in order:

    1. Single prefix: the longest parent whose whole (locant-stripped)
       prefix resolves as one substituent.  Skipped when
       *skip_single_prefix* is true.
    2. Multi-prefix: split the prefix on locant boundaries
       ("2-chloro-4-phenyl") and keep every piece that resolves.
    3. Multiplied prefix: "2,3-diphenyl..." yields one node per locant, each
       carrying a pre-built At-probe name.

    Args:
        name: Chemical name to scan.  Names containing "(" return [].
        verbose: When true, print diagnostics to stderr.
        skip_single_prefix: Bypass strategy 1.

    Returns:
        Synthetic ``BracketNode`` objects.  ``kind="prefix_substituent"``
        nodes carry start/end positions in *name* so the span can be
        replaced by "astato".  ``kind="multiplied_prefix"`` nodes use
        start=end=-1 and carry ``_probe_name``, ``_locant`` and
        ``_parent_suffix`` attributes instead.  Returns [] if nothing is
        found.
    """
    # Skip names that already have bracket groups (handled elsewhere)
    if '(' in name:
        return []

    locant_lead_re = re.compile(r'^(?:N(?:,N)*[,-]|\d+[,-])+')

    # Collect every split point whose suffix resolves as a name.
    candidates = []
    for i in range(1, len(name)):
        suffix = name[i:]
        prefix = name[:i]

        # Quick filter: too short to be a real parent name.
        if len(suffix) < 4:
            continue
        # The suffix should start with a letter, an indicated hydrogen
        # ("1H-"), or a locant list ("1,3,4-oxadiazole").
        if (not suffix[0].isalpha()
                and not re.match(r'\d+H-', suffix)
                and not re.match(r'[\d,]+-[a-zA-Z]', suffix)):
            continue

        smi = _name_to_smiles(suffix)
        if smi is not None:
            candidates.append((i, suffix, prefix, smi))

    if not candidates:
        return []

    # --- Strategy 1: the whole prefix is a single substituent -------------
    if not skip_single_prefix:
        best = None
        # Candidates are ordered longest parent first, so the first hit is
        # the preferred one and the scan can stop there.
        for i, suffix, prefix, smi in candidates:
            stripped = locant_lead_re.sub('', prefix).rstrip('-')
            if not stripped:
                continue
            # Internal or trailing locants mean this is really several
            # prefixes (e.g. "chloro-4-phenyl"); leave it to strategy 2.
            if re.search(r'.\d+[,-]|[a-z]-\d+$', stripped):
                continue
            sub_smi = _resolve_prefix_substituent(stripped)
            if sub_smi is not None:
                best = (i, suffix, prefix, smi, stripped, sub_smi)
                break

        if best is not None:
            _split_pos, suffix, prefix, parent_smi, sub_text, sub_smi = best

            if verbose:
                print(f" Prefix scan: '{prefix}' + '{suffix}'", file=sys.stderr)
                print(f" Substituent text: '{sub_text}' -> {sub_smi}",
                      file=sys.stderr)
                print(f" Parent: '{suffix}' -> {parent_smi}", file=sys.stderr)

            # Position the node so that replacing name[start:end+1] with
            # "astato" gives a valid At-probe name.
            sub_start = prefix.find(sub_text)
            if sub_start == -1:
                sub_start = 0
            sub_end = sub_start + len(sub_text) - 1

            return [BracketNode(
                text=sub_text,
                start=sub_start,
                end=sub_end,
                depth=0,
                kind="prefix_substituent",
            )]

    # --- Strategy 2: several prefixes separated by locants ----------------
    # Shortest parent first = longest prefix = most substituents found.
    sorted_candidates = sorted(candidates, key=lambda c: len(c[1]))

    for _, suffix, prefix, _parent_smi in sorted_candidates:
        # e.g. "2-chloro-4-phenyl" → ["2-chloro-", "4-phenyl"]
        parts = [p for p in
                 re.split(r'(?=(?:N(?:,N)*|\d+)[,-])', prefix) if p]

        if verbose:
            print(f" Multi-prefix scan: prefix='{prefix}' parent='{suffix}'",
                  file=sys.stderr)
            print(f" Parts: {parts}", file=sys.stderr)

        nodes = []
        offset = 0  # start of the current part within prefix (== name)
        for part in parts:
            part_start = offset
            offset += len(part)

            unlocanted = locant_lead_re.sub('', part)
            stripped = unlocanted.rstrip('-')
            if len(stripped) < 3:
                continue

            sub_smi = _resolve_prefix_substituent(stripped)
            if sub_smi is None:
                continue

            # Derive the position from the part's own offset.  Searching
            # the name for the text would hit an earlier look-alike, e.g.
            # "ethyl" inside "methyl" in "2-methyl-4-ethyl...".
            sub_start = part_start + (len(part) - len(unlocanted))
            sub_end = sub_start + len(stripped) - 1

            if verbose:
                print(f" Multi-prefix sub: '{stripped}' -> {sub_smi} "
                      f"(pos {sub_start}-{sub_end})", file=sys.stderr)

            nodes.append(BracketNode(
                text=stripped,
                start=sub_start,
                end=sub_end,
                depth=0,
                kind="prefix_substituent",
            ))

        if nodes:
            return nodes

    # --- Strategy 3: multiplied prefix ------------------------------------
    # For "2,3-diphenylquinoline": locants=2,3, multiplier=di, sub=phenyl.
    # Build an At-probe per locant: "3-phenyl-2-astatoquinoline".
    multiplier_re = re.compile(
        r'^([\d,]+)-'                 # locant list: "2,3-"
        r'(di|tri|tetra|penta|hexa)'  # multiplier
        r'(.+)$'                      # base substituent: "phenyl"
    )
    # Multiplier for the copies that remain once one is replaced by At.
    remaining_multiplier = {2: 'di', 3: 'tri', 4: 'tetra', 5: 'penta'}

    for _, suffix, prefix, _parent_smi in sorted_candidates:
        m = multiplier_re.match(prefix)
        if not m:
            continue
        locant_str, multiplier, base_sub = m.groups()
        locants = locant_str.split(',')
        clean_sub = base_sub.rstrip('-')

        # Verify the base substituent resolves
        if _name_to_smiles(clean_sub) is None:
            continue

        if verbose:
            print(f" Multiplied prefix: locants={locants} "
                  f"mult={multiplier} sub='{base_sub}' "
                  f"parent='{suffix}'", file=sys.stderr)

        # A parent that starts with locants needs a separating dash.
        dash_suffix = suffix if suffix[0].isalpha() else f"-{suffix}"

        nodes = []
        seen_probes = set()
        for pos, loc in enumerate(locants):
            # Drop exactly this one occurrence, so a repeated locant
            # ("2,2-dimethyl") keeps its sibling.
            other_locs = locants[:pos] + locants[pos + 1:]
            if other_locs:
                mult = remaining_multiplier.get(len(other_locs), '')
                other_prefix = ','.join(other_locs) + f'-{mult}{clean_sub}'
                probe_name = f"{other_prefix}-{loc}-astato{dash_suffix}"
            else:
                probe_name = f"{loc}-astato{dash_suffix}"

            # Identical probes (repeated locants) describe the same swap.
            if probe_name in seen_probes:
                continue
            seen_probes.add(probe_name)

            # Verify the probe resolves
            if _name_to_smiles(probe_name) is None:
                if verbose:
                    print(f" Mult At-probe FAIL: '{probe_name}'",
                          file=sys.stderr)
                continue

            if verbose:
                print(f" Mult At-probe OK: '{probe_name}'",
                      file=sys.stderr)

            # Simple text replacement cannot express a multiplied prefix,
            # so the node carries the full probe name instead of a span.
            node = BracketNode(
                text=clean_sub,
                start=-1,  # sentinel: not a simple text position
                end=-1,
                depth=0,
                kind="multiplied_prefix",
            )
            node._probe_name = probe_name
            node._locant = loc
            node._parent_suffix = suffix
            nodes.append(node)

        if nodes:
            return nodes

    return []
|
|
1134
|
+
|
|
1135
|
+
|
|
1136
|
+
def _at_probe_for_prefix(full_name: str, node: BracketNode,
                         verbose: bool = False) -> bool:
    """Check that a prefix substituent can be swapped for 'astato'.

    Multiplied-prefix nodes that carry a ``_probe_name`` use it as is.  For
    every other node the span ``full_name[node.start:node.end + 1]`` is
    replaced directly (there are no parentheses to remove).

    Args:
        full_name: Complete chemical name.
        node: Prefix node under test.
        verbose: When true, print the probe name to stderr.

    Returns:
        True when the probe name resolves to SMILES.
    """
    has_stored_probe = (node.kind == "multiplied_prefix"
                        and hasattr(node, '_probe_name'))
    if has_stored_probe:
        probe_name = node._probe_name
    else:
        probe_name = (full_name[:node.start] + "astato"
                      + full_name[node.end + 1:])
    if verbose:
        print(f" Prefix At-probe: '{probe_name}'", file=sys.stderr)
    resolved = _name_to_smiles(probe_name)
    return resolved is not None
|
|
1152
|
+
|
|
1153
|
+
|
|
1154
|
+
@dataclass
class FragmentResult:
    """Result of splitting a molecule into parent and substituent.

    Produced by ``_get_fragments_via_at_probe``: the bond between the two
    attachment atoms is removed and each side is kept both as a molecule
    object and as the SMILES written from that object.  The attachment
    indices refer to the fragment molecules, not to the full molecule.
    """
    # SMILES written from parent_mol.
    parent_smi: str
    # Parent-side fragment molecule.
    parent_mol: Chem.Mol
    # Index within parent_mol of the atom that was bonded to the substituent.
    parent_attach_idx: int
    # SMILES written from sub_mol.
    sub_smi: str
    # Substituent-side fragment molecule.
    sub_mol: Chem.Mol
    # Index within sub_mol of the atom that was bonded to the parent.
    sub_attach_idx: int
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
def _get_fragments_via_at_probe(canonical_smiles: str, at_probe_smiles: str,
                                verbose: bool = False
                                ) -> Optional[FragmentResult]:
    """Split the full molecule into parent and substituent fragments.

    The At-probe SMILES has At in place of the substituent.  Steps:
      1. Remove At from the probe to obtain the parent SMILES.
      2. Substructure-match that parent in the full molecule; unmatched
         atoms belong to the substituent.
      3. Cut the first bond that crosses from matched to unmatched atoms
         and take the two resulting fragments.

    Args:
        canonical_smiles: SMILES of the complete molecule.
        at_probe_smiles: SMILES of the probe with At replacing the
            substituent.
        verbose: Accepted for signature parity; not used here.

    Returns:
        A ``FragmentResult`` whose attachment indices refer to the fragment
        molecules, or None when any step fails — including when cutting the
        bond does not separate the two sides (e.g. the bond is in a ring).
    """
    result = _split_at_at(at_probe_smiles)
    if result is None:
        return None
    # Only the At-free SMILES is needed; attachment atoms are re-derived
    # from the full molecule's own graph below.
    parent_smi = result[0]

    # Match parent in full molecule to find substituent atoms
    full_mol = Chem.MolFromSmiles(canonical_smiles)
    parent_mol = Chem.MolFromSmiles(parent_smi)
    if full_mol is None or parent_mol is None:
        return None

    parent_match = full_mol.GetSubstructMatch(parent_mol)
    if not parent_match:
        return None
    parent_set = set(parent_match)
    if all(idx in parent_set for idx in range(full_mol.GetNumAtoms())):
        return None  # nothing left over to act as the substituent

    # Find attachment bond: the first one with exactly one end in the parent.
    parent_attach_full = None
    sub_attach_full = None
    for bond in full_mol.GetBonds():
        a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        if (a1 in parent_set) == (a2 in parent_set):
            continue
        if a1 in parent_set:
            parent_attach_full, sub_attach_full = a1, a2
        else:
            parent_attach_full, sub_attach_full = a2, a1
        break
    if sub_attach_full is None:
        return None

    # Clear aromaticity before bond removal to avoid kekulization issues,
    # then let sanitization recompute it for each fragment.  Any RDKit
    # failure here is reported as "no result" rather than raised.
    edit = Chem.RWMol(full_mol)
    try:
        Chem.Kekulize(edit, clearAromaticFlags=True)
        edit.RemoveBond(parent_attach_full, sub_attach_full)
        Chem.SanitizeMol(edit)
        frag_atom_lists = Chem.GetMolFrags(edit, asMols=False)
        frag_mols = Chem.GetMolFrags(edit, asMols=True, sanitizeFrags=True)
    except Exception:
        return None

    # Identify which fragment is the substituent and which is the parent
    sub_frag_idx = None
    parent_frag_idx = None
    for fi, atom_list in enumerate(frag_atom_lists):
        if sub_attach_full in atom_list:
            sub_frag_idx = fi
        if parent_attach_full in atom_list:
            parent_frag_idx = fi

    if sub_frag_idx is None or parent_frag_idx is None:
        return None
    if sub_frag_idx == parent_frag_idx:
        # The cut did not disconnect the two sides, so there is no
        # meaningful parent/substituent pair to return.
        return None

    sub_frag_mol = frag_mols[sub_frag_idx]
    parent_frag_mol = frag_mols[parent_frag_idx]

    # Map attachment atoms from full-molecule indices to fragment indices;
    # this assumes each fragment mol keeps the atom order of its index list.
    sub_attach_in_frag = list(frag_atom_lists[sub_frag_idx]).index(
        sub_attach_full)
    parent_attach_in_frag = list(frag_atom_lists[parent_frag_idx]).index(
        parent_attach_full)

    return FragmentResult(
        parent_smi=Chem.MolToSmiles(parent_frag_mol),
        parent_mol=parent_frag_mol,
        parent_attach_idx=parent_attach_in_frag,
        sub_smi=Chem.MolToSmiles(sub_frag_mol),
        sub_mol=sub_frag_mol,
        sub_attach_idx=sub_attach_in_frag,
    )
|
|
1265
|
+
|
|
1266
|
+
|
|
1267
|
+
def generate_alternative_from_prefix(full_name: str, canonical_smiles: str,
                                     node: BracketNode,
                                     verbose: bool = False,
                                     max_depth: int = 0,
                                     _deadline: Optional[float] = None,
                                     ) -> List[Alternative]:
    """Produce alternative names for an unbracketed prefix substituent.

    The parent and substituent fragments come from the molecular graph via
    an At-probe, so the substituent text itself (e.g. "phenyl", which can
    resolve to a radical SMILES) never has to be resolved.

    Args:
        full_name: Complete chemical name.
        canonical_smiles: SMILES of the complete molecule.
        node: Prefix node; multiplied-prefix nodes supply ``_probe_name``.
        verbose: When true, print diagnostics to stderr.
        max_depth: Forwarded to ``_assemble_alternatives``.
        _deadline: Forwarded to ``_assemble_alternatives``.

    Returns:
        The assembled alternatives, or [] if the probe name does not
        resolve or the fragments cannot be extracted.
    """
    if node.kind == "multiplied_prefix" and hasattr(node, '_probe_name'):
        probe_name = node._probe_name
    else:
        probe_name = "".join(
            (full_name[:node.start], "astato", full_name[node.end + 1:])
        )

    probe_smiles = _name_to_smiles(probe_name)
    if probe_smiles is None:
        if verbose:
            print(f" Prefix At-probe failed: '{probe_name}'", file=sys.stderr)
        return []

    pieces = _get_fragments_via_at_probe(canonical_smiles, probe_smiles,
                                         verbose=verbose)
    if pieces is None:
        if verbose:
            print(" Fragment extraction failed", file=sys.stderr)
        return []

    return _assemble_alternatives(pieces, canonical_smiles, verbose=verbose,
                                  max_depth=max_depth, _deadline=_deadline)
|
|
1303
|
+
|
|
1304
|
+
|
|
1305
|
+
# ---------------------------------------------------------------------------
|
|
1306
|
+
# Helpers for recursive assembly
|
|
1307
|
+
# ---------------------------------------------------------------------------
|
|
1308
|
+
|
|
1309
|
+
def _parent_name_from_bracket_yl(yl_text: str) -> Optional[str]:
    """Recover a parent name from bracket text ending in "-<digits>-yl".

    Example: '2-morpholino-4-phenylquinolin-3-yl' loses '-3-yl' to give
    '2-morpholino-4-phenylquinolin'; that stem is tried with a trailing 'e'
    restored and then bare.

    Args:
        yl_text: Bracket-group text in -yl form.

    Returns:
        The first candidate that resolves to SMILES, or None if the text
        does not end in "-<digits>-yl" or no candidate resolves.
    """
    tail = re.search(r'-(\d+)-yl$', yl_text)
    if tail is None:
        return None
    stem = yl_text[:tail.start()]
    # Most ring names drop a trailing 'e' before -yl, so try restoring it
    # first.  The generator stops at the first candidate that resolves.
    return next(
        (guess for guess in (stem + 'e', stem)
         if _name_to_smiles(guess) is not None),
        None,
    )
|
|
1329
|
+
|
|
1330
|
+
|
|
1331
|
+
def _insert_prefix_by_locant(name: str, locant: str,
|
|
1332
|
+
prefix_text: str) -> str:
|
|
1333
|
+
"""Insert '{locant}-{prefix_text}-' at the correct numerical position.
|
|
1334
|
+
|
|
1335
|
+
Scans top-level locants (skipping bracketed content) and inserts
|
|
1336
|
+
before the first locant that is numerically greater than *locant*.
|
|
1337
|
+
|
|
1338
|
+
>>> _insert_prefix_by_locant('2-morpholino-4-phenylquinoline',
|
|
1339
|
+
... '3', '(phenylmethanol-yl)')
|
|
1340
|
+
'2-morpholino-3-(phenylmethanol-yl)-4-phenylquinoline'
|
|
1341
|
+
"""
|
|
1342
|
+
target = int(locant)
|
|
1343
|
+
depth = 0
|
|
1344
|
+
i = 0
|
|
1345
|
+
while i < len(name):
|
|
1346
|
+
c = name[i]
|
|
1347
|
+
if c in '([':
|
|
1348
|
+
depth += 1
|
|
1349
|
+
i += 1
|
|
1350
|
+
elif c in ')]':
|
|
1351
|
+
depth -= 1
|
|
1352
|
+
i += 1
|
|
1353
|
+
elif c.isdigit() and depth == 0:
|
|
1354
|
+
j = i
|
|
1355
|
+
while j < len(name) and name[j].isdigit():
|
|
1356
|
+
j += 1
|
|
1357
|
+
if j < len(name) and name[j] == '-':
|
|
1358
|
+
num = int(name[i:j])
|
|
1359
|
+
if num > target:
|
|
1360
|
+
return (name[:i] + f"{locant}-{prefix_text}-"
|
|
1361
|
+
+ name[i:])
|
|
1362
|
+
i = j
|
|
1363
|
+
else:
|
|
1364
|
+
i += 1
|
|
1365
|
+
# Fallback: no locant greater than target was found. The new prefix
|
|
1366
|
+
# should go AFTER all existing locant-prefix groups (before the parent
|
|
1367
|
+
# stem), not at the very start.
|
|
1368
|
+
#
|
|
1369
|
+
# If the last prefix is bracketed, we can find its closing bracket and
|
|
1370
|
+
# insert right after it. For unbracketed prefixes we fall back to
|
|
1371
|
+
# prepending (may give non-ascending locant order, but ChemDraw's
|
|
1372
|
+
# resolver is lenient).
|
|
1373
|
+
#
|
|
1374
|
+
# Find the last locant-dash at depth 0:
|
|
1375
|
+
last_ld_end = None # position right after the last locant's '-'
|
|
1376
|
+
d2 = 0
|
|
1377
|
+
k = 0
|
|
1378
|
+
while k < len(name):
|
|
1379
|
+
ch = name[k]
|
|
1380
|
+
if ch in '([':
|
|
1381
|
+
d2 += 1; k += 1
|
|
1382
|
+
elif ch in ')]':
|
|
1383
|
+
d2 -= 1; k += 1
|
|
1384
|
+
elif ch.isdigit() and d2 == 0:
|
|
1385
|
+
kj = k
|
|
1386
|
+
while kj < len(name) and name[kj].isdigit():
|
|
1387
|
+
kj += 1
|
|
1388
|
+
if kj < len(name) and name[kj] == '-':
|
|
1389
|
+
last_ld_end = kj + 1
|
|
1390
|
+
k = kj
|
|
1391
|
+
else:
|
|
1392
|
+
k += 1
|
|
1393
|
+
|
|
1394
|
+
if last_ld_end is not None and last_ld_end < len(name) and name[last_ld_end] == '(':
|
|
1395
|
+
# Last prefix is bracketed — find the matching ')'.
|
|
1396
|
+
bd = 1
|
|
1397
|
+
bp = last_ld_end + 1
|
|
1398
|
+
while bp < len(name) and bd > 0:
|
|
1399
|
+
if name[bp] in '([':
|
|
1400
|
+
bd += 1
|
|
1401
|
+
elif name[bp] in ')]':
|
|
1402
|
+
bd -= 1
|
|
1403
|
+
bp += 1
|
|
1404
|
+
# bp is right after the closing bracket.
|
|
1405
|
+
# Insert: {existing}-{locant}-{prefix}{rest}
|
|
1406
|
+
rest = name[bp:]
|
|
1407
|
+
sep = "" if not rest or not rest[0].isdigit() else "-"
|
|
1408
|
+
return name[:bp] + f"-{locant}-{prefix_text}{sep}" + rest
|
|
1409
|
+
|
|
1410
|
+
# Ultimate fallback: prepend. No trailing hyphen when *name* starts
|
|
1411
|
+
# with a letter (the parent stem).
|
|
1412
|
+
sep = "-" if name and name[0].isdigit() else ""
|
|
1413
|
+
return f"{locant}-{prefix_text}{sep}" + name
|
|
1414
|
+
|
|
1415
|
+
|
|
1416
|
+
# ---------------------------------------------------------------------------
|
|
1417
|
+
# Alternative name generation
|
|
1418
|
+
# ---------------------------------------------------------------------------
|
|
1419
|
+
|
|
1420
|
+
def generate_alternative(full_name: str, canonical_smiles: str,
                         node: BracketNode,
                         verbose: bool = False,
                         max_depth: int = 0,
                         _deadline: Optional[float] = None,
                         ) -> List[Alternative]:
    """Produce alternative names in which one bracket group becomes the parent.

    The span ``node.start`` .. ``node.end`` (inclusive) of *full_name* is
    replaced by the literal ``astato`` to form an "At-probe" name.  That
    name is resolved to SMILES, the probe is used to split
    *canonical_smiles* into parent/substituent fragments, and the fragments
    are handed to ``_assemble_alternatives``.

    Args:
        full_name: The complete name containing the bracket group.
        canonical_smiles: Canonical SMILES every alternative is validated
            against (passed through unchanged).
        node: Bracket group to swap; only ``start``, ``end`` and ``text``
            are read here.
        verbose: When true, failure diagnostics are written to stderr.
        max_depth: Forwarded to ``_assemble_alternatives``.
        _deadline: Forwarded to ``_assemble_alternatives``.

    Returns:
        The alternatives built by ``_assemble_alternatives``, or an empty
        list when the probe name cannot be resolved or fragment extraction
        fails.
    """
    # Splice the astatine placeholder over the bracket span.
    head = full_name[:node.start]
    tail = full_name[node.end + 1:]
    probe_name = "".join((head, "astato", tail))

    probe_smiles = _name_to_smiles(probe_name)
    if probe_smiles is None:
        if verbose:
            print(f" At-probe failed: '{probe_name}'", file=sys.stderr)
        return []

    # Split the real molecule at the position marked by the probe.
    fragments = _get_fragments_via_at_probe(
        canonical_smiles, probe_smiles, verbose=verbose)
    if fragments is None:
        if verbose:
            print(" Fragment extraction failed", file=sys.stderr)
        return []

    return _assemble_alternatives(
        fragments,
        canonical_smiles,
        verbose=verbose,
        max_depth=max_depth,
        _deadline=_deadline,
        _bracket_yl_text=node.text,
    )
|
|
1452
|
+
|
|
1453
|
+
|
|
1454
|
+
def _astato_locant_of(at_name: Optional[str]) -> Optional[str]:
    """Return the numeric locant in front of ``astato`` in *at_name*.

    Returns ``""`` when ``astato`` occurs without a ``<digits>-`` locant and
    ``None`` when *at_name* is empty/None or does not mention astato at all.
    Matching is case-insensitive.
    """
    if not at_name:
        return None
    m = re.search(r'(\d+)-astato', at_name, re.IGNORECASE)
    if m:
        return m.group(1)
    if 'astato' in at_name.lower():
        return ""
    return None


def _assembly_bracket_forms(text: str, square: bool = False) -> list:
    """Return ``(kind, text)`` pairs for *text* bare, in ``()`` and optionally ``[]``."""
    forms = [("noparens", text), ("parens", f"({text})")]
    if square:
        forms.append(("brackets", f"[{text}]"))
    return forms


def _record_assembled_variants(alternatives: list, variants: list,
                               canonical_smiles: str, *,
                               parent_name: str, sub_name: str, locant: str,
                               label: str, verbose: bool) -> None:
    """Validate ``(strategy, name)`` pairs in order and append each to *alternatives*.

    Stops after the first variant that validates, so more heavily bracketed
    spellings are only recorded when the simpler ones fail.
    """
    for strat, assembled in variants:
        valid = _validate_name(assembled, canonical_smiles)
        if verbose:
            tag = "VALID" if valid else "INVALID"
            print(f" {label} ({strat}): '{assembled}' [{tag}]",
                  file=sys.stderr)
        alternatives.append(Alternative(
            name=assembled,
            parent_name=parent_name,
            sub_name=sub_name,
            locant=locant,
            valid=valid,
            strategy=strat,
        ))
        if valid:
            break  # skip more-bracketed variants


def _assemble_alternatives(frags: FragmentResult, canonical_smiles: str,
                           verbose: bool = False,
                           max_depth: int = 0,
                           _deadline: Optional[float] = None,
                           _bracket_yl_text: str = "",
                           ) -> List[Alternative]:
    """Shared assembly logic for both bracket and prefix substituents.

    Given parent/sub fragments, the old parent is turned into substituent
    ("-yl") text, the attachment locant on the old substituent (the new
    parent) is found with an astatine probe, and candidate names are built
    by putting the yl text where ``astato`` sat.  Every candidate is
    round-trip validated against *canonical_smiles* and recorded, valid or
    not.

    Stages, in order:

    1. Name both fragments; give up if either cannot be named.
    2. Collect yl candidates for the old parent (``construct_yl_form`` plus
       the selenyl probe, which is tried first when it adds a new form).
    3. Locate the attachment on the new parent.  Exocyclic N/O/S
       attachment atoms use the heteroatom pathway (acid-derived yl+suffix
       and heteroatom-replacement locant); everything else gets a direct
       At-probe.
    4. "replace-hal" assembly for every yl form.
    5. If nothing validated, an acid-probe yl form is tried.
    6. When ``max_depth != 0``: recursive decomposition of the
       sub-fragment (skipped once *_deadline* has passed) and the
       Se-probe reverse assembly.

    Args:
        frags: Fragment data; reads ``parent_smi``, ``sub_smi``,
            ``parent_mol``, ``sub_mol``, ``parent_attach_idx`` and
            ``sub_attach_idx``.
        canonical_smiles: Canonical SMILES used for validation.
        verbose: Write progress diagnostics to stderr.
        max_depth: ``-1`` = unlimited recursion (until timeout), ``0`` =
            recursion disabled, ``>0`` = that many levels.
        _deadline: ``time.monotonic()`` value after which recursive
            decomposition is skipped; ``None`` disables the check.
        _bracket_yl_text: Original bracket/yl text, used as an extra source
            of an alternative parent name during recursion.

    Returns:
        All candidates that were assembled, including invalid ones
        (``valid=False``).  Empty when a prerequisite step fails.
    """
    alternatives: List[Alternative] = []

    # --- 1. Name both fragments -------------------------------------------
    parent_name = _smiles_to_name(frags.parent_smi)
    sub_parent_name = _smiles_to_name(frags.sub_smi)
    if parent_name is None or sub_parent_name is None:
        if verbose:
            print(f" Could not name fragments: parent={frags.parent_smi} "
                  f"sub={frags.sub_smi}", file=sys.stderr)
        return alternatives

    # Locant on the current parent.  The mol object is used directly so the
    # attachment atom index stays meaningful.
    parent_locant = None
    parent_locant_result = _add_at(frags.parent_mol, frags.parent_attach_idx)
    if parent_locant_result:
        _, parent_at_smi = parent_locant_result
        parent_locant = _astato_locant_of(_smiles_to_name(parent_at_smi))

    # --- 2. -yl candidates for the old parent -----------------------------
    yl_candidates = construct_yl_form(parent_name, parent_locant or "")

    # Se-probe: often gives superior -yl forms (e.g. formyl, acetyl,
    # hydroxy(phenyl)methyl) that construct_yl_form cannot derive.
    se_yl = _get_yl_via_selenyl_probe(
        frags.parent_mol, frags.parent_attach_idx, verbose=verbose)
    if se_yl and se_yl not in yl_candidates:
        yl_candidates.insert(0, se_yl)

    if verbose:
        print(f" Sub fragment: {frags.sub_smi} → '{sub_parent_name}'",
              file=sys.stderr)
        print(f" Parent: {frags.parent_smi} → '{parent_name}' "
              f"(locant={parent_locant})", file=sys.stderr)
        print(f" -yl candidates: {yl_candidates}", file=sys.stderr)

    # --- 3. Locant on the new parent (the old substituent) ----------------
    # Ring heteroatoms (like N in morpholine) work fine with the At-probe;
    # only exocyclic N/O/S (ethers, amines, ...) need the special pathway.
    sub_attach_atom = frags.sub_mol.GetAtomWithIdx(frags.sub_attach_idx)
    is_heteroatom = (sub_attach_atom.GetAtomicNum() in (7, 8, 16)
                     and not sub_attach_atom.IsInRing())

    new_parent_at_name = None
    new_parent_locant = None
    heteroatom_yl_suffix = None  # e.g., "pyridin-4-yloxy"

    if is_heteroatom:
        # Heteroatom pathway: can't add At to O/N/S directly.
        het_num = sub_attach_atom.GetAtomicNum()
        if verbose:
            print(f" Heteroatom attachment: {sub_attach_atom.GetSymbol()} "
                  f"(Z={het_num})", file=sys.stderr)

        # Step A: yl+suffix via acid probe through the heteroatom.
        heteroatom_yl_suffix = _get_yl_suffix_via_acid(
            frags.parent_mol, frags.parent_attach_idx, het_num,
            verbose=verbose)

        # Step B: locant by replacing the heteroatom with At.
        loc_result = _get_locant_replace_heteroatom(
            frags.sub_mol, frags.sub_attach_idx, verbose=verbose)
        if loc_result is not None:
            new_parent_at_name, new_parent_locant = loc_result

        if new_parent_at_name is None or heteroatom_yl_suffix is None:
            if verbose:
                print(f" Heteroatom pathway failed: at_name={new_parent_at_name} "
                      f"yl_suffix={heteroatom_yl_suffix}", file=sys.stderr)
            return alternatives
    else:
        # Normal pathway: At directly on carbon.
        at_result = _add_at(frags.sub_mol, frags.sub_attach_idx)
        if at_result is None:
            if verbose:
                print(f" At addition to sub failed at "
                      f"{sub_attach_atom.GetSymbol()}"
                      f"(idx={frags.sub_attach_idx})", file=sys.stderr)
            return alternatives
        _, new_parent_at_smi = at_result
        new_parent_at_name = _smiles_to_name(new_parent_at_smi)
        if new_parent_at_name is None:
            if verbose:
                print(f" ChemScript can't name At-probe: "
                      f"{new_parent_at_smi}", file=sys.stderr)
            return alternatives
        new_parent_locant = _astato_locant_of(new_parent_at_name)

    if verbose:
        print(f" New parent At-probe: '{new_parent_at_name}' "
              f"(locant={new_parent_locant})", file=sys.stderr)
        if heteroatom_yl_suffix:
            print(f" Heteroatom yl+suffix: '{heteroatom_yl_suffix}'",
                  file=sys.stderr)

    # --- 4. Assemble alternatives via replace-hal -------------------------
    # For heteroatom cases, use the acid-derived yl+suffix form instead.
    if is_heteroatom and heteroatom_yl_suffix:
        all_yl_forms = [heteroatom_yl_suffix]
    else:
        all_yl_forms = list(yl_candidates)

    at_name_lower = new_parent_at_name.lower()
    has_astato = "astato" in at_name_lower

    if new_parent_locant:
        # Derive the locanted sub-parent from the At-probe name by stripping
        # the astatine prefix, e.g. "1-astato-4-fluorobenzene" →
        # "4-fluorobenzene".  This keeps locant info that the canonical
        # sub_parent_name (e.g. "fluorobenzene") lacks, enabling correct
        # prefix ordering.  The choice does not depend on the yl form, so it
        # is computed once.
        at_prefix = f"{new_parent_locant}-astato"
        if at_name_lower.startswith(at_prefix.lower()):
            locanted_parent = new_parent_at_name[len(at_prefix):]
            if locanted_parent.startswith("-"):
                locanted_parent = locanted_parent[1:]
        else:
            locanted_parent = sub_parent_name

        for yl_form in all_yl_forms:
            variants = [
                (f"replace-hal-{kind}",
                 _insert_prefix_by_locant(
                     locanted_parent, new_parent_locant, text))
                for kind, text in _assembly_bracket_forms(yl_form)
            ]
            _record_assembled_variants(
                alternatives, variants, canonical_smiles,
                parent_name=sub_parent_name, sub_name=yl_form,
                locant=new_parent_locant or "",
                label="Assembled", verbose=verbose)
    elif has_astato:
        # No numeric locant: substitute the astato token in place.
        for yl_form in all_yl_forms:
            variants = [
                (f"replace-hal-{kind}",
                 re.sub(r'\d*-?astato', text, new_parent_at_name,
                        flags=re.IGNORECASE))
                for kind, text in _assembly_bracket_forms(yl_form)
            ]
            _record_assembled_variants(
                alternatives, variants, canonical_smiles,
                parent_name=sub_parent_name, sub_name=yl_form,
                locant=new_parent_locant or "",
                label="Assembled", verbose=verbose)

    # --- 5. Fallback: acid probe when nothing validated -------------------
    if not any(a.valid for a in alternatives):
        acid_yl = _get_yl_via_acid_probe(
            frags.parent_mol, frags.parent_attach_idx, verbose=verbose
        )
        if acid_yl and acid_yl not in yl_candidates:
            if verbose:
                print(f" Acid probe -yl: '{acid_yl}'", file=sys.stderr)
            variants = None
            if new_parent_locant:
                # Case-insensitive, like every other astato check in this
                # function; the locant may have come from a name spelled
                # "Astato".  A callable replacement keeps the inserted text
                # literal.
                at_re = re.compile(
                    re.escape(f"{new_parent_locant}-astato"), re.IGNORECASE)
                if at_re.search(new_parent_at_name):
                    variants = []
                    for kind, text in _assembly_bracket_forms(acid_yl):
                        replacement = f"{new_parent_locant}-{text}"
                        variants.append((
                            f"acid-probe-{kind}",
                            at_re.sub(lambda _m: replacement,
                                      new_parent_at_name),
                        ))
            elif has_astato:
                variants = [
                    (f"acid-probe-{kind}",
                     re.sub(r'\d*-?astato', text, new_parent_at_name,
                            flags=re.IGNORECASE))
                    for kind, text in _assembly_bracket_forms(acid_yl)
                ]
            if variants is not None:
                _record_assembled_variants(
                    alternatives, variants, canonical_smiles,
                    parent_name=sub_parent_name, sub_name=acid_yl,
                    locant=new_parent_locant or "",
                    label="Assembled", verbose=verbose)

    # --- 6a. Recursive decomposition --------------------------------------
    # Try alternative parent names for the sub-fragment.
    # max_depth: -1 = unlimited (until timeout), 0 = disabled, >0 = N levels
    if max_depth != 0 and new_parent_locant:
        if _deadline is not None and time.monotonic() > _deadline:
            if verbose:
                print(f" Skipping recursive decomposition (timeout)",
                      file=sys.stderr)
        else:
            if verbose:
                print(f" Recursive decomposition of sub-fragment "
                      f"(max_depth={max_depth})...", file=sys.stderr)
            next_depth = max_depth - 1 if max_depth > 0 else max_depth
            sub_decomp = decompose_name(frags.sub_smi, max_depth=next_depth,
                                        verbose=verbose, _deadline=_deadline)

            # Collect recursive alt parent names (deduplicated).
            recursive_parents = []
            seen_parents = set()
            sub_canon = _canonical(frags.sub_smi)
            for sub_alt in sub_decomp.alternatives:
                if (sub_alt.valid and sub_alt.name not in seen_parents
                        and sub_alt.name != sub_parent_name):
                    seen_parents.add(sub_alt.name)
                    recursive_parents.append(sub_alt.name)

            # Bracket-text shortcut: the bracket content may encode a
            # flat-prefix parent name unreachable by recursive decomp
            # (e.g. "2-morpholino-4-phenylquinolin-3-yl"
            # → "2-morpholino-4-phenylquinoline").
            if _bracket_yl_text:
                bt_parent = _parent_name_from_bracket_yl(_bracket_yl_text)
                if bt_parent and bt_parent not in seen_parents:
                    bt_smi = _name_to_smiles(bt_parent)
                    if (bt_smi and sub_canon
                            and _canonical(bt_smi) == sub_canon):
                        if verbose:
                            print(f" Bracket-text parent: '{bt_parent}'",
                                  file=sys.stderr)
                        seen_parents.add(bt_parent)
                        recursive_parents.append(bt_parent)

            for alt_parent in recursive_parents:
                if verbose:
                    print(f" Recursive alt: '{alt_parent}'",
                          file=sys.stderr)
                for yl_form in all_yl_forms:
                    variants = [
                        (f"recursive-{kind}",
                         _insert_prefix_by_locant(
                             alt_parent, new_parent_locant, text))
                        for kind, text in _assembly_bracket_forms(
                            yl_form, square=True)
                    ]
                    _record_assembled_variants(
                        alternatives, variants, canonical_smiles,
                        parent_name=alt_parent, sub_name=yl_form,
                        locant=new_parent_locant,
                        label="Recursive", verbose=verbose)

    # --- 6b. Se-probe reverse assembly ------------------------------------
    # The Se-probe yl of the old parent may encode a suffix→prefix
    # conversion (e.g. "-3-carbaldehyde" becomes "3-formyl-" inside the yl
    # text).  Extract the converted parent name and insert the
    # sub-fragment's prefix form at the attachment locant.
    if se_yl and max_depth != 0:
        yl_text = se_yl.strip('()[]')
        se_parent = _parent_name_from_bracket_yl(yl_text)
        if (se_parent and se_parent != sub_parent_name
                and se_parent != parent_name):
            # Verify the extracted parent resolves to the same molecule.
            se_parent_smi = _name_to_smiles(se_parent)
            parent_canon = _canonical(frags.parent_smi)
            if (se_parent_smi and parent_canon
                    and _canonical(se_parent_smi) == parent_canon):
                # Attachment locant is the trailing "-N-yl" of the yl text.
                m_loc = re.search(r'-(\d+)-yl$', yl_text)
                if m_loc:
                    se_locant = m_loc.group(1)
                    # Compute sub-fragment prefix/yl forms.
                    sub_yl_candidates = construct_yl_form(
                        sub_parent_name, new_parent_locant or "")
                    sub_se_yl = _get_yl_via_selenyl_probe(
                        frags.sub_mol, frags.sub_attach_idx,
                        verbose=verbose)
                    if sub_se_yl and sub_se_yl not in sub_yl_candidates:
                        sub_yl_candidates.insert(0, sub_se_yl)

                    if verbose:
                        print(f" Se-reverse parent: '{se_parent}' "
                              f"(locant={se_locant})",
                              file=sys.stderr)
                        print(f" Sub-fragment yl candidates: "
                              f"{sub_yl_candidates}", file=sys.stderr)

                    for sub_yl in sub_yl_candidates:
                        variants = [
                            (f"se-reverse-{kind}",
                             _insert_prefix_by_locant(
                                 se_parent, se_locant, text))
                            for kind, text in _assembly_bracket_forms(sub_yl)
                        ]
                        _record_assembled_variants(
                            alternatives, variants, canonical_smiles,
                            parent_name=se_parent, sub_name=sub_yl,
                            locant=se_locant,
                            label="Se-reverse", verbose=verbose)

    return alternatives
|
|
1845
|
+
|
|
1846
|
+
|
|
1847
|
+
def _validate_name(name: str, expected_canonical: str) -> bool:
    """Return True when *name* round-trips to *expected_canonical*.

    The name is resolved to SMILES with ``_name_to_smiles`` and
    canonicalised with ``_canonical``; a failure at either step (``None``
    result) counts as invalid.
    """
    resolved = _name_to_smiles(name)
    round_trip = _canonical(resolved) if resolved is not None else None
    if round_trip is None:
        return False
    return round_trip == expected_canonical
|
|
1856
|
+
|
|
1857
|
+
|
|
1858
|
+
# ---------------------------------------------------------------------------
|
|
1859
|
+
# Suffix → prefix conversion
|
|
1860
|
+
# ---------------------------------------------------------------------------
|
|
1861
|
+
|
|
1862
|
+
# Suffix → substituent-prefix table used by _suffix_to_prefix_alternatives.
# Each entry is (suffix, prefix_form, terminal_e_elided_before_suffix).
# The flag is True when the consumer must append the "e" that the stem lost
# in front of the suffix (e.g. "pyridin-4-amine" → parent "pyridine").
#
# Ordered longest suffix first, so a short suffix ("ol") is never tried
# before a longer one that ends with it ("thiol").
_SUFFIX_PREFIX_MAP = [
    ("carboxylic acid", "carboxy", False),
    ("sulfonic acid", "sulfo", False),
    ("carbonitrile", "cyano", False),
    ("carbaldehyde", "formyl", False),
    ("sulfonamide", "sulfamoyl", False),
    ("carboxamide", "carbamoyl", False),
    ("thiol", "sulfanyl", False),
    ("amine", "amino", True),
    ("one", "oxo", True),
    ("ol", "hydroxy", True),
]
|
|
1877
|
+
|
|
1878
|
+
|
|
1879
|
+
def _suffix_to_prefix_alternatives(canonical_name: str,
                                   canonical_smiles: str,
                                   verbose: bool = False,
                                   ) -> List[Alternative]:
    """Re-express a suffix-named characteristic group as a prefix.

    Examples: ``pyridin-4-amine`` → ``4-aminopyridine`` and
    ``cyclohexan-1-ol`` → ``1-hydroxycyclohexane``.

    The name must end in one of the ``_SUFFIX_PREFIX_MAP`` suffixes,
    immediately preceded by ``{stem}-{locant}-``.  Multiplied suffixes with
    comma-separated locants (``benzene-1,4-diamine``) are skipped.  Only the
    first suffix that fits is converted.

    Args:
        canonical_name: Name to convert.
        canonical_smiles: Canonical SMILES the assembled name is validated
            against.
        verbose: Write the assembled name and its validity to stderr.

    Returns:
        A list holding the single assembled alternative (valid or not), or
        an empty list when no suffix pattern applies.
    """
    for suffix, prefix, restore_e in _SUFFIX_PREFIX_MAP:
        if not canonical_name.endswith(suffix):
            continue

        # Everything in front of the suffix must read "{stem}-{locant(s)}-".
        head = canonical_name[:len(canonical_name) - len(suffix)]
        parsed = re.match(r'^(.+)-(\d+(?:,\d+)*)-$', head)
        if parsed is None:
            continue
        stem, locants_str = parsed.groups()

        # Multiplied locants (e.g. benzene-1,4-diamine) are not handled yet.
        if ',' in locants_str:
            continue

        # Put back the terminal 'e' that was elided before the suffix.
        if restore_e:
            parent = stem + 'e'
        else:
            parent = stem

        # Locant-ordered insertion of the prefix into the parent name.
        assembled = _insert_prefix_by_locant(parent, locants_str, prefix)
        valid = _validate_name(assembled, canonical_smiles)
        if verbose:
            tag = "VALID" if valid else "INVALID"
            print(f" Suffix→prefix ({suffix}→{prefix}): "
                  f"'{assembled}' [{tag}]", file=sys.stderr)

        # Only one principal characteristic group per name, so stop here.
        return [Alternative(
            name=assembled,
            parent_name=parent,
            sub_name=prefix,
            locant=locants_str,
            valid=valid,
            strategy="suffix-to-prefix",
        )]

    return []
|
|
1934
|
+
|
|
1935
|
+
|
|
1936
|
+
def _deduplicate_alternatives(alternatives: List[Alternative],
                              verbose: bool = False) -> List[Alternative]:
    """Drop redundant validated alternatives; invalid ones pass through.

    **Step 1 — bracket-stripped dedup.**  Validated names are grouped by
    their text with every ``(``, ``)``, ``[``, ``]`` removed.  Each group
    keeps the name with the fewest bracket characters (ties: shortest name,
    then first seen).

    **Step 2 — single-segment synonym collapse.**  Survivors are sorted by
    ``(length, name)``.  For each pair, the common prefix and common suffix
    are cut away; if exactly one of the two remaining middle segments starts
    and ends with a matching bracket pair, and its inner text shares at
    least 4 leading characters with the other segment, the bracketed name is
    discarded (e.g. ``(morpholin-4-yl)`` loses to ``morpholino``).

    Args:
        alternatives: Candidates; only ``name`` and ``valid`` are read.
        verbose: Write each removal to stderr.

    Returns:
        *alternatives* itself when fewer than two entries are valid;
        otherwise the surviving valid entries (in step-2 sort order)
        followed by all invalid entries in their original order.
    """
    bracket_table = str.maketrans('', '', '()[]')

    def _debracket(text: str) -> str:
        return text.translate(bracket_table)

    def _rank(alt):
        # (number of bracket characters, total length)
        text = alt.name
        return (len(text) - len(_debracket(text)), len(text))

    def _shared_len(left: str, right: str) -> int:
        # Length of the common leading run of two strings.
        count = 0
        for a, b in zip(left, right):
            if a != b:
                break
            count += 1
        return count

    def _is_wrapped(segment: str) -> bool:
        # True when the segment starts and ends with a matching bracket pair.
        return len(segment) >= 2 and (segment[0] + segment[-1]) in ('()', '[]')

    validated = [alt for alt in alternatives if alt.valid]
    rejected = [alt for alt in alternatives if not alt.valid]
    if len(validated) < 2:
        return alternatives

    # --- Step 1: bracket-stripped dedup ---
    by_skeleton: dict = {}
    for alt in validated:
        by_skeleton.setdefault(_debracket(alt.name), []).append(alt)

    survivors: List[Alternative] = []
    for members in by_skeleton.values():
        keeper = min(members, key=_rank)
        survivors.append(keeper)
        if verbose and len(members) > 1:
            removed = [a.name for a in members if a is not keeper]
            print(f" Dedup (bracket-strip): kept '{keeper.name}', "
                  f"removed {removed}", file=sys.stderr)

    # --- Step 2: single-segment synonym collapse ---
    # Shortest first, so the outer loop always holds the shorter name.
    survivors.sort(key=lambda alt: (len(alt.name), alt.name))

    dropped: set = set()
    for i, first in enumerate(survivors):
        if i in dropped:
            continue
        name_i = first.name
        for j in range(i + 1, len(survivors)):
            if j in dropped:
                continue
            name_j = survivors[j].name

            # Common prefix, then common suffix of what is left (the
            # reversed tails guarantee the suffix never overlaps the prefix).
            lead = _shared_len(name_i, name_j)
            tail = _shared_len(name_i[lead:][::-1], name_j[lead:][::-1])
            mid_i = name_i[lead:len(name_i) - tail]
            mid_j = name_j[lead:len(name_j) - tail]
            if not (mid_i and mid_j):
                continue

            # Exactly one of the two segments must be bracket-wrapped.
            i_wrapped = _is_wrapped(mid_i)
            if i_wrapped == _is_wrapped(mid_j):
                continue

            # Only collapse when the wrapped text and the bare segment
            # share a stem of >= 4 characters; this avoids collapsing
            # genuinely different yl-forms.
            if i_wrapped:
                inner, other = mid_i[1:-1], mid_j
            else:
                inner, other = mid_j[1:-1], mid_i
            if _shared_len(inner, other) < 4:
                continue

            if i_wrapped:
                dropped.add(i)
                if verbose:
                    print(f" Dedup (synonym): removed '{name_i}' "
                          f"(kept shorter '{name_j}')",
                          file=sys.stderr)
                break  # i is removed, skip remaining j
            dropped.add(j)
            if verbose:
                print(f" Dedup (synonym): removed '{name_j}' "
                      f"(kept shorter '{name_i}')",
                      file=sys.stderr)

    kept = [alt for idx, alt in enumerate(survivors) if idx not in dropped]
    return kept + rejected
|
|
2066
|
+
|
|
2067
|
+
|
|
2068
|
+
# ---------------------------------------------------------------------------
|
|
2069
|
+
# Space-separated yl-group alternatives
|
|
2070
|
+
# ---------------------------------------------------------------------------
|
|
2071
|
+
|
|
2072
|
+
# Ring-name stems recognised in unbracketed "ring-N-yl" fragments
# (e.g. pyridin-4-yl).  Order is preserved from the original table; the
# consumer re-sorts by length before building its regex.
_YL_RING_STEMS = (
    # benzo-fused six-membered N-heterocycles
    "quinolin isoquinolin quinoxalin quinazolin "
    # monocyclic azines
    "pyridin pyrimidin pyrazin pyridazin "
    # saturated heterocycles
    "morpholin piperidin piperazin pyrrolidin "
    # benzo-fused five-membered rings
    "indol benzimidazol benzothiazol benzofuran benzoxazol "
    # polycyclic systems
    "naphthal acridin carbazol phenanthrol "
    # five-membered heteroaromatics
    "thien furan pyrrol imidazol oxazol thiazol "
    # poly-aza rings
    "triazin tetrazol triazol oxadiazol "
    # for phenylbenzoate etc.
    "phenyl"
).split()
|
|
2083
|
+
|
|
2084
|
+
|
|
2085
|
+
def _space_sep_yl_alternatives(
    canonical_name: str,
    canonical_smiles: str,
    verbose: bool = False,
    max_depth: int = 0,
    _deadline: Optional[float] = None,
) -> List[Alternative]:
    """Generate alternatives for space-separated names with embedded yl-groups.

    Handles names like "tert-butyl pyridin-4-ylcarbamate", where a ring-yl
    pattern is fused into the name without brackets.  Each qualifying
    ring-yl match is replaced by ``astato`` to form an At-probe name; the
    probe is resolved to SMILES, used to split *canonical_smiles* into
    fragments, and the fragments are passed to ``_assemble_alternatives``
    with the matched yl text as ``_bracket_yl_text``.

    A match qualifies only when a space occurs somewhere before it and it
    is directly followed by more text that does not start with a space.

    Produces alternatives like "4-((tert-butoxycarbonyl)amino)pyridine"
    where the ring becomes the parent.

    Args:
        canonical_name: Name to scan; names without a space return ``[]``
            immediately.
        canonical_smiles: Canonical SMILES used for fragmenting and
            validation.
        verbose: Write progress diagnostics to stderr.
        max_depth: Forwarded to ``_assemble_alternatives``.
        _deadline: Forwarded to ``_assemble_alternatives``.

    Returns:
        All alternatives assembled from every qualifying match, in match
        order.
    """
    if ' ' not in canonical_name:
        return []

    alternatives: List[Alternative] = []

    # Longest stems first so that e.g. "isoquinolin" wins over "quinolin".
    stem_pattern = '|'.join(re.escape(s) for s in
                            sorted(_YL_RING_STEMS, key=len, reverse=True))
    # Ring stem + optional fused annotation + locant + yl
    # E.g.: pyridin-4-yl, quinolin-7-yl, thieno[2,3-d]pyrimidin-4-yl
    yl_re = re.compile(
        r'((?:' + stem_pattern + r')'
        r'(?:\[[^\]]+\])?'        # optional fused ring annotation [2,3-d]
        r'(?:[a-z]*)'             # optional stem continuation
        r'(?:-\d+(?:,\d+)*)?'     # optional locant(s)
        r'-yl)',
        re.IGNORECASE,
    )

    for m in yl_re.finditer(canonical_name):
        yl_text = m.group(1)  # e.g. "pyridin-4-yl"
        yl_start, yl_stop = m.span()  # yl_stop is exclusive
        before_text = canonical_name[:yl_start]
        after_text = canonical_name[yl_stop:]

        # The yl-group must sit after a space; this also rejects a match
        # at position 0, where before_text is empty.
        if ' ' not in before_text:
            continue
        # ...and be followed directly by the functional-group text.
        if not after_text or after_text.startswith(' '):
            continue  # yl at end of name or before space — not embedded

        if verbose:
            print(f" Space-sep yl: '{yl_text}' in '{canonical_name}'",
                  file=sys.stderr)
            print(f" before='{before_text}' after='{after_text}'",
                  file=sys.stderr)

        # At-probe: replace the yl text with "astato".
        at_name = before_text + "astato" + after_text
        if verbose:
            print(f" At-probe name: '{at_name}'", file=sys.stderr)

        at_smi = _name_to_smiles(at_name)
        if at_smi is None:
            if verbose:
                print(f" At-probe failed", file=sys.stderr)
            continue

        # Split the real molecule at the probed position.
        frags = _get_fragments_via_at_probe(canonical_smiles, at_smi,
                                            verbose=verbose)
        if frags is None:
            if verbose:
                print(f" Fragment extraction failed", file=sys.stderr)
            continue

        alts = _assemble_alternatives(
            frags, canonical_smiles, verbose=verbose,
            max_depth=max_depth, _deadline=_deadline,
            _bracket_yl_text=yl_text,
        )
        alternatives.extend(alts)

        if verbose:
            print(f" Generated {len(alts)} alternatives from space-sep yl",
                  file=sys.stderr)

    return alternatives
|
|
2186
|
+
|
|
2187
|
+
|
|
2188
|
+
# ---------------------------------------------------------------------------
|
|
2189
|
+
# Main decomposition
|
|
2190
|
+
# ---------------------------------------------------------------------------
|
|
2191
|
+
|
|
2192
|
+
def decompose_name(smiles: str, max_depth: int = -1,
                   verbose: bool = False,
                   timeout: Optional[float] = 30.0,
                   _deadline: Optional[float] = None,
                   ) -> DecompositionResult:
    """Main entry point: decompose a structure's name into alternatives.

    Steps performed:

    1. Canonicalise the SMILES and obtain its canonical name.
    2. Parse the bracket tree of that name.
    3. Classify every bracket group (at all depths).
    4. For each group validated as a substituent, generate alternative
       names.
    5. If the bracket groups produced nothing, fall back to prefix
       scanning, retrying with ``skip_single_prefix=True`` when that
       produced no valid alternative.
    6. Add suffix→prefix and space-separated yl-group alternatives.
    7. De-duplicate, then try to infer the canonical parent name.

    Args:
        smiles: SMILES string of the structure to name.
        max_depth: Recursion depth limit. ``-1`` (default) = unlimited
            (recurse until timeout or convergence). ``0`` = no recursion.
            Positive integer = that many levels.  The value is forwarded
            unchanged to the alternative generators.
        verbose: Print progress information to stderr.
        timeout: Wall-clock seconds before recursive decomposition is
            skipped. Set to ``None`` to disable. Only used on the
            outermost call; recursive calls inherit the computed deadline
            via ``_deadline``.
        _deadline: Absolute ``time.monotonic()`` deadline.  Internal;
            forwarded to the alternative generators.

    Returns:
        A ``DecompositionResult``.  When the SMILES is invalid or the
        structure cannot be named, the result carries an entry in
        ``errors`` and no alternatives.
    """
    # Compute deadline on the outermost call; inner calls inherit it.
    if _deadline is None and timeout is not None:
        _deadline = time.monotonic() + timeout

    canon_smi = _canonical(smiles)
    if canon_smi is None:
        return DecompositionResult(
            original_smiles=smiles, canonical_smiles="",
            canonical_name="", bracket_tree=None,
            errors=["Invalid SMILES"]
        )

    canonical_name = _smiles_to_name(smiles)
    if canonical_name is None:
        return DecompositionResult(
            original_smiles=smiles, canonical_smiles=canon_smi,
            canonical_name="", bracket_tree=None,
            errors=["ChemDraw could not name this structure"]
        )

    if verbose:
        print(f"\nCanonical name: {canonical_name}", file=sys.stderr)

    tree = parse_bracket_tree(canonical_name)

    if verbose:
        print(f"Top-level bracket groups: {len(tree.children)}",
              file=sys.stderr)
        for i, child in enumerate(tree.children):
            print(f" [{i}] depth={child.depth} "
                  f"pos={child.start}-{child.end} "
                  f"text='{child.text}'", file=sys.stderr)

    result = DecompositionResult(
        original_smiles=smiles,
        canonical_smiles=canon_smi,
        canonical_name=canonical_name,
        bracket_tree=tree,
    )

    # Collect ALL bracket nodes at all depths (pre-order, depth-first:
    # each child is followed immediately by its own descendants).
    def _collect_nodes(node):
        nodes = []
        for child in node.children:
            nodes.append(child)
            nodes.extend(_collect_nodes(child))
        return nodes

    # Shared by the first prefix pass and the multi-prefix retry: every
    # prefix node that passes the At-probe contributes its alternatives.
    def _expand_prefix_nodes(nodes):
        for pnode in nodes:
            if not _at_probe_for_prefix(canonical_name, pnode,
                                        verbose=verbose):
                continue
            if pnode.kind not in ("multiplied_prefix",):
                pnode.kind = "prefix_substituent"
            alts = generate_alternative_from_prefix(
                canonical_name, canon_smi, pnode, verbose=verbose,
                max_depth=max_depth, _deadline=_deadline,
            )
            result.alternatives.extend(alts)

    all_nodes = _collect_nodes(tree)
    if verbose:
        print(f"Total bracket nodes (all depths): {len(all_nodes)}",
              file=sys.stderr)

    # Process all bracket groups at all depths
    for node in all_nodes:
        kind = classify_node(node)
        node.kind = kind
        if verbose:
            print(f"\n Bracket '({node.text})' depth={node.depth} → {kind}",
                  file=sys.stderr)

        if kind != "candidate":
            continue

        # Validate as substituent via At-probe
        if not validate_as_substituent(canonical_name, node,
                                       verbose=verbose):
            node.kind = "invalid_sub"
            if verbose:
                print(" At-probe validation failed", file=sys.stderr)
            continue

        node.kind = "substituent"

        # Generate alternatives
        alts = generate_alternative(
            canonical_name, canon_smi, node, verbose=verbose,
            max_depth=max_depth, _deadline=_deadline,
        )
        result.alternatives.extend(alts)

    # Fallback: if no bracket groups found substituents, try prefix scanning
    if not result.alternatives:
        prefix_nodes = find_prefix_substituents(
            canonical_name, verbose=verbose
        )
        _expand_prefix_nodes(prefix_nodes)

        # Retry: if single-prefix nodes produced no valid alternatives,
        # try again with multi-prefix fallback (skip_single_prefix=True).
        # This handles cases like "2-chloro-4-phenylquinoline" where "chloro"
        # is found first as a single-prefix but can't produce useful alts.
        valid_alts = [a for a in result.alternatives if a.valid]
        if not valid_alts and prefix_nodes:
            if verbose:
                print(" Retrying with multi-prefix fallback...",
                      file=sys.stderr)
            prefix_nodes2 = find_prefix_substituents(
                canonical_name, verbose=verbose,
                skip_single_prefix=True
            )
            _expand_prefix_nodes(prefix_nodes2)

    # Suffix→prefix conversion (e.g. pyridin-4-amine → 4-aminopyridine)
    suffix_alts = _suffix_to_prefix_alternatives(
        canonical_name, canon_smi, verbose=verbose)
    result.alternatives.extend(suffix_alts)

    # Space-separated names with embedded yl-groups
    # (e.g. "tert-butyl pyridin-4-ylcarbamate" — no brackets, no prefix match)
    if ' ' in canonical_name:
        yl_alts = _space_sep_yl_alternatives(
            canonical_name, canon_smi, verbose=verbose,
            max_depth=max_depth, _deadline=_deadline,
        )
        result.alternatives.extend(yl_alts)

    # Deduplicate alternatives:
    # 1. Remove exact-name duplicates and names identical to canonical
    seen: set = {canonical_name}  # canonical is already listed separately
    unique: list = []
    for alt in result.alternatives:
        if alt.name not in seen:
            seen.add(alt.name)
            unique.append(alt)
    # 2. Remove bracket-only variants and single-position synonyms
    result.alternatives = _deduplicate_alternatives(unique, verbose=verbose)

    # Infer the canonical parent: the fragment that remains when a
    # substituent bracket group is removed from the canonical name.
    if not result.canonical_parent:
        for node in all_nodes:
            if node.kind == "substituent":
                # At-probe: replace bracket with At → SMILES → remove At → name
                before = canonical_name[:node.start]
                after = canonical_name[node.end + 1:]
                probe = before + "astato" + after
                at_smi = _name_to_smiles(probe)
                if at_smi:
                    split = _split_at_at(at_smi)
                    if split:
                        parent_smi, _, _ = split
                        parent_name = _smiles_to_name(parent_smi)
                        if parent_name:
                            result.canonical_parent = parent_name
                            break

    # Fallback for bracket-free names: scan suffixes of the canonical name,
    # growing from four characters upward, and take the first one that
    # starts with a letter and resolves to a structure.
    if not result.canonical_parent and '(' not in canonical_name:
        for i in range(len(canonical_name) - 4, 0, -1):
            suffix = canonical_name[i:]
            if suffix[0].isalpha():
                smi = _name_to_smiles(suffix)
                if smi:
                    result.canonical_parent = suffix
                    break

    return result
|
|
2405
|
+
|
|
2406
|
+
|
|
2407
|
+
# ---------------------------------------------------------------------------
|
|
2408
|
+
# R-group / placeholder handling
|
|
2409
|
+
# ---------------------------------------------------------------------------
|
|
2410
|
+
|
|
2411
|
+
# Two probe sets for dual-probe consensus.  We run the decomposition with
# each set, replace probe names with R-labels, and only keep names that
# agree across both runs.  This cleanly handles molecules that contain
# real halogens — if probe A collides with a real halogen, probe B won't,
# and the intersection filters out the bad names.
#
# Each entry: (atomic_number, IUPAC_prefix, IUPAC_stem)
#   atomic_number — element substituted for a dummy atom
#   IUPAC_prefix  — substituent prefix searched for first in generated names
#   IUPAC_stem    — shorter stem searched for when the prefix is absent
# Entries are handed out in list order: the first distinct R-group label
# gets entry 0, the second gets entry 1.
_PROBE_SET_A = [
    (9, 'fluoro', 'fluor'),  # F — first label
    (17, 'chloro', 'chlor'),  # Cl — second label (multi-R-group)
]
_PROBE_SET_B = [
    (53, 'iodo', 'iod'),  # I — first label
    (35, 'bromo', 'brom'),  # Br — second label (multi-R-group)
]
|
|
2426
|
+
|
|
2427
|
+
|
|
2428
|
+
def _replace_probe_in_name(name: str, label: str,
|
|
2429
|
+
probe_prefix: str = 'bromo',
|
|
2430
|
+
probe_stem: str = 'brom') -> str:
|
|
2431
|
+
"""Replace probe-atom name fragments with the R-group label.
|
|
2432
|
+
|
|
2433
|
+
Tries several patterns; replaces only the FIRST match to avoid
|
|
2434
|
+
clobbering legitimate atoms in the rest of the molecule.
|
|
2435
|
+
"""
|
|
2436
|
+
# Try exact prefix replacement first (most common case)
|
|
2437
|
+
# e.g. "4-fluoropyridine" -> '4-"R"-pyridine'
|
|
2438
|
+
m = re.search(r'(\d+-)?' + re.escape(probe_prefix), name, re.IGNORECASE)
|
|
2439
|
+
if m:
|
|
2440
|
+
locant = m.group(1) or ""
|
|
2441
|
+
after = name[m.end():]
|
|
2442
|
+
# Add dash before suffix if it starts with a letter
|
|
2443
|
+
sep = "-" if after and after[0].isalpha() else ""
|
|
2444
|
+
return name[:m.start()] + locant + '"' + label + '"' + sep + after
|
|
2445
|
+
|
|
2446
|
+
# Bracket form: "(fluoro)" -> '("R")'
|
|
2447
|
+
pat_bracket = re.compile(r'\(' + re.escape(probe_prefix) + r'\)',
|
|
2448
|
+
re.IGNORECASE)
|
|
2449
|
+
m = pat_bracket.search(name)
|
|
2450
|
+
if m:
|
|
2451
|
+
return name[:m.start()] + '("' + label + '")' + name[m.end():]
|
|
2452
|
+
|
|
2453
|
+
# Any remaining probe stem substring
|
|
2454
|
+
pat_stem = re.compile(re.escape(probe_stem) + r'\w*', re.IGNORECASE)
|
|
2455
|
+
m = pat_stem.search(name)
|
|
2456
|
+
if m:
|
|
2457
|
+
after = name[m.end():]
|
|
2458
|
+
sep = "-" if after and after[0].isalpha() else ""
|
|
2459
|
+
return name[:m.start()] + '"' + label + '"' + sep + after
|
|
2460
|
+
|
|
2461
|
+
return name
|
|
2462
|
+
|
|
2463
|
+
|
|
2464
|
+
@dataclass
class RGroupMapping:
    """Tracks an R-group label and its position in the molecule.

    ``prepare_rgroup_smiles`` creates one instance per dummy atom it
    replaces with a halogen probe.
    """
    label: str  # Text label: "R", "R1", "X", "Ar", etc.
    atom_idx: int  # Atom index in the original SMILES (dummy atom)
    # Atom index in the probed SMILES (halogen atom).  prepare_rgroup_smiles
    # edits the atom in place and stores the same value as ``atom_idx``.
    probe_atom_idx: int
|
|
2470
|
+
|
|
2471
|
+
|
|
2472
|
+
def _build_label_map(dummy_indices: List[int],
|
|
2473
|
+
labels) -> dict:
|
|
2474
|
+
"""Build {atom_idx: label_str} from various label formats."""
|
|
2475
|
+
if labels is None:
|
|
2476
|
+
if len(dummy_indices) == 1:
|
|
2477
|
+
return {dummy_indices[0]: "R"}
|
|
2478
|
+
else:
|
|
2479
|
+
return {idx: f"R{i}" for i, idx in enumerate(dummy_indices, 1)}
|
|
2480
|
+
elif isinstance(labels, (list, tuple)):
|
|
2481
|
+
label_map = {}
|
|
2482
|
+
for i, idx in enumerate(dummy_indices):
|
|
2483
|
+
label_map[idx] = labels[i] if i < len(labels) else f"R{i+1}"
|
|
2484
|
+
return label_map
|
|
2485
|
+
else:
|
|
2486
|
+
return dict(labels)
|
|
2487
|
+
|
|
2488
|
+
|
|
2489
|
+
def prepare_rgroup_smiles(smiles: str,
                          labels=None,
                          probe_set=None,
                          label_probe_map=None,
                          ) -> Tuple[Optional[str], List[RGroupMapping]]:
    """Replace dummy atoms (*) with halogen probe atoms.

    Args:
        smiles: SMILES string, possibly containing [*] dummy atoms.
        labels: Optional. Can be:
            - dict mapping atom index -> label string
            - list of label strings (matched to dummy atoms in order)
            - None: auto-generate ("R" for a single dummy atom,
              "R1", "R2", ... for several)
        probe_set: List of (atomic_num, prefix, stem) tuples.
            Defaults to _PROBE_SET_A when omitted or empty.  With
            ``label_probe_map`` given, only its first entry is used, as
            the probe for labels missing from the map.
        label_probe_map: Explicit {label: (atomic_num, prefix, stem)} dict.
            Takes precedence over probe_set for the labels it contains.

    Returns:
        (probed_smiles, mappings) where probed_smiles has halogens
        instead of *, and mappings tracks which atoms were replaced.
        Returns (None, []) if the SMILES cannot be parsed, contains no
        dummy atoms, or the probed molecule fails sanitization.
    """
    # Always fall back to the default probes when none are usable.  The
    # default used to be applied only when ``label_probe_map`` was None, so
    # passing an empty map left ``probe_set`` as None and the sequential
    # assignment below failed on ``len(None)``.
    if not probe_set:
        probe_set = _PROBE_SET_A

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, []

    # Find dummy atoms (atomic number 0)
    dummy_indices = [atom.GetIdx() for atom in mol.GetAtoms()
                     if atom.GetAtomicNum() == 0]
    if not dummy_indices:
        return None, []  # No R-groups

    label_map = _build_label_map(dummy_indices, labels)
    # A dict passed as ``labels`` need not cover every dummy atom; missing
    # ones get a label derived from their atom index.
    atom_labels = [(idx, label_map.get(idx, f"R{idx}"))
                   for idx in dummy_indices]

    # Build label → probe assignment
    label_to_probe = {}
    if label_probe_map:
        for _idx, label in atom_labels:
            if label in label_probe_map:
                label_to_probe[label] = label_probe_map[label]
            else:
                label_to_probe[label] = probe_set[0]
    else:
        # Hand out probes in order, one per distinct label; once the set
        # is exhausted, further labels reuse the first probe.
        probe_idx = 0
        for _idx, label in atom_labels:
            if label in label_to_probe:
                continue
            if probe_idx < len(probe_set):
                label_to_probe[label] = probe_set[probe_idx]
                probe_idx += 1
            else:
                label_to_probe[label] = probe_set[0]

    # Replace dummy atoms with their assigned probe atom
    edit = Chem.RWMol(mol)
    mappings = []
    for idx, label in atom_labels:
        atom = edit.GetAtomWithIdx(idx)
        probe_z, _prefix, _stem = label_to_probe[label]
        atom.SetAtomicNum(probe_z)
        atom.SetFormalCharge(0)
        atom.SetNoImplicit(False)
        mappings.append(RGroupMapping(
            label=label, atom_idx=idx, probe_atom_idx=idx
        ))

    # Best effort: any failure while sanitizing or writing the probed
    # molecule is reported to the caller as "no result".
    try:
        Chem.SanitizeMol(edit)
        probed_smi = Chem.MolToSmiles(edit)
        return probed_smi, mappings
    except Exception:
        return None, []
|
|
2574
|
+
|
|
2575
|
+
|
|
2576
|
+
def _probe_label_mapping(mappings: List[RGroupMapping],
|
|
2577
|
+
probe_set: list) -> dict:
|
|
2578
|
+
"""Build {label: (prefix, stem)} from mappings and probe_set."""
|
|
2579
|
+
result = {}
|
|
2580
|
+
probe_idx = 0
|
|
2581
|
+
for m in mappings:
|
|
2582
|
+
if m.label not in result:
|
|
2583
|
+
if probe_idx < len(probe_set):
|
|
2584
|
+
_z, prefix, stem = probe_set[probe_idx]
|
|
2585
|
+
result[m.label] = (prefix, stem)
|
|
2586
|
+
probe_idx += 1
|
|
2587
|
+
else:
|
|
2588
|
+
_z, prefix, stem = probe_set[0]
|
|
2589
|
+
result[m.label] = (prefix, stem)
|
|
2590
|
+
return result
|
|
2591
|
+
|
|
2592
|
+
|
|
2593
|
+
def _replace_all_probes(name: str, label_to_probe: dict) -> str:
    """Swap every probe-atom fragment in *name* for its R-group label.

    ``label_to_probe`` maps each label to a ``(prefix, stem)`` pair; the
    pairs are applied one after another, in the mapping's iteration
    order, each via ``_replace_probe_in_name``.
    """
    relabelled = name
    for label in label_to_probe:
        prefix, stem = label_to_probe[label]
        relabelled = _replace_probe_in_name(
            relabelled, label, probe_prefix=prefix, probe_stem=stem)
    return relabelled
|
|
2601
|
+
|
|
2602
|
+
|
|
2603
|
+
def decompose_name_with_rgroups(smiles: str,
                                labels=None,
                                verbose: bool = False
                                ) -> DecompositionResult:
    """Decompose a molecule with R-group placeholders using dual-probe consensus.

    Strategy: run decomposition twice with different probe halogen sets
    (A: F/Cl, B: I/Br), replace probe names with R-group labels, and
    INTERSECT — only keep names that both sets agree on.

    The two probe sets are designed with matching alphabetical orderings:
        Set A: fluoro (1st label), chloro (2nd label) → chloro < fluoro
        Set B: iodo (1st label), bromo (2nd label) → bromo < iodo
    This ensures that IUPAC alphabetical prefix ordering is consistent
    between sets, so name strings match after probe→label replacement.

    When the two canonical names disagree, the set whose probe halogens
    do not already occur in the molecule is preferred.  When no
    alternative name is common to both sets, the alternatives of a single
    set are returned instead, marked "(single-probe fallback)".

    If the SMILES has no dummy atoms, falls through to regular decompose_name.

    Args:
        smiles: SMILES string, may contain [*] dummy atoms for R-groups.
        labels: Optional labels for R-groups. Can be:
            - None: auto-generate ("R" for one dummy atom,
              "R1", "R2", ... for several)
            - list: ['R', 'X'] matched to dummies in order
            - dict: {atom_idx: label}
        verbose: Print debug info to stderr.

    Returns:
        A ``DecompositionResult`` whose names use the R-group labels.
        ``bracket_tree`` is always ``None``; ``errors`` carries the errors
        of the decomposition the canonical name was taken from.
    """
    probed_a, mappings_a = prepare_rgroup_smiles(
        smiles, labels, probe_set=_PROBE_SET_A)
    if probed_a is None:
        # No R-groups found — regular decomposition
        return decompose_name(smiles, verbose=verbose)
    probed_b, mappings_b = prepare_rgroup_smiles(
        smiles, labels, probe_set=_PROBE_SET_B)

    # Build label→(prefix, stem) for each probe set
    ltp_a = _probe_label_mapping(mappings_a, _PROBE_SET_A)
    ltp_b = _probe_label_mapping(mappings_b, _PROBE_SET_B)

    if verbose:
        print(" R-group dual-probe consensus:", file=sys.stderr)
        print(f" Set A ({probed_a}): "
              + ", ".join(f"{l}={p}" for l, (p, _) in ltp_a.items()),
              file=sys.stderr)
        print(f" Set B ({probed_b}): "
              + ", ".join(f"{l}={p}" for l, (p, _) in ltp_b.items()),
              file=sys.stderr)

    # Run decomposition with each probe set
    result_a = decompose_name(probed_a, verbose=verbose)
    result_b = decompose_name(probed_b, verbose=verbose)

    # Replace probes with labels in canonical names
    canon_a = _replace_all_probes(result_a.canonical_name, ltp_a)
    canon_b = _replace_all_probes(result_b.canonical_name, ltp_b)

    # A probe set "collides" when one of its halogens is already a real
    # atom of the molecule.
    mol = Chem.MolFromSmiles(smiles)
    real_elements = {a.GetAtomicNum() for a in mol.GetAtoms()
                     if a.GetAtomicNum() != 0}
    a_collides = any(z in real_elements for z, _, _ in _PROBE_SET_A)
    b_collides = any(z in real_elements for z, _, _ in _PROBE_SET_B)

    # Determine canonical name: prefer consensus, fall back to non-colliding
    if canon_a == canon_b:
        use_b = False
    elif a_collides and not b_collides:
        use_b = True
    elif b_collides and not a_collides:
        use_b = False
    else:
        # No decisive collision — names just differ slightly.  Keep set A
        # unless it produced no name at all while set B did.
        use_b = not canon_a and bool(canon_b)

    if use_b:
        canonical, chosen, chosen_ltp = canon_b, result_b, ltp_b
    else:
        canonical, chosen, chosen_ltp = canon_a, result_a, ltp_a

    if verbose:
        if canon_a == canon_b:
            print(f" Canonical consensus: {canonical}", file=sys.stderr)
        else:
            print(f" Canonical disagree: A={canon_a}, B={canon_b}",
                  file=sys.stderr)
            print(f" Using: {canonical}", file=sys.stderr)

    # Collect alternatives from each set, keyed by name-after-replacement
    def _alts_by_name(result, ltp):
        by_name = {}
        for alt in result.alternatives:
            if alt.valid:
                replaced = _replace_all_probes(alt.name, ltp)
                if replaced not in by_name:
                    by_name[replaced] = alt
        return by_name

    alts_a = _alts_by_name(result_a, ltp_a)
    alts_b = _alts_by_name(result_b, ltp_b)

    # Intersect: only keep names that both sets agree on
    common_names = set(alts_a.keys()) & set(alts_b.keys())

    if verbose:
        print(f" Set A alts: {len(alts_a)}, Set B alts: {len(alts_b)}, "
              f"consensus: {len(common_names)}", file=sys.stderr)
        for name in sorted(common_names):
            print(f" [OK] {name}", file=sys.stderr)
        only_a = set(alts_a.keys()) - common_names
        only_b = set(alts_b.keys()) - common_names
        for name in sorted(only_a):
            print(f" [A only] {name}", file=sys.stderr)
        for name in sorted(only_b):
            print(f" [B only] {name}", file=sys.stderr)

    # --- Build final result ---
    # The parent name and the errors come from the same decomposition as
    # the canonical name, so the three fields describe one run instead of
    # mixing a colliding set's parent with the other set's name, and a
    # naming failure is not silently dropped.
    canon_smi = Chem.MolToSmiles(mol) if mol else ""
    result = DecompositionResult(
        original_smiles=smiles,
        canonical_smiles=canon_smi,
        canonical_name=canonical,
        canonical_parent=_replace_all_probes(
            chosen.canonical_parent or "", chosen_ltp) or None,
        bracket_tree=None,
        errors=list(chosen.errors or []),
    )

    for name in sorted(common_names):
        alt_a = alts_a[name]
        result.alternatives.append(Alternative(
            name=name,
            parent_name=_replace_all_probes(alt_a.parent_name, ltp_a),
            sub_name=_replace_all_probes(alt_a.sub_name, ltp_a),
            locant=alt_a.locant,
            valid=True,
            strategy=alt_a.strategy,
        ))

    # Fallback: if consensus is empty, use the non-colliding set
    if not common_names:
        if a_collides and not b_collides and alts_b:
            fallback_alts, fallback_ltp = alts_b, ltp_b
        elif b_collides and not a_collides and alts_a:
            fallback_alts, fallback_ltp = alts_a, ltp_a
        elif alts_a:
            fallback_alts, fallback_ltp = alts_a, ltp_a
        else:
            fallback_alts, fallback_ltp = alts_b, ltp_b

        for name, alt in fallback_alts.items():
            result.alternatives.append(Alternative(
                name=name,
                parent_name=_replace_all_probes(alt.parent_name, fallback_ltp),
                sub_name=_replace_all_probes(alt.sub_name, fallback_ltp),
                locant=alt.locant,
                valid=True,
                strategy=alt.strategy + " (single-probe fallback)",
            ))
        if verbose and fallback_alts:
            print(f" Fallback to single probe: {len(fallback_alts)} alts",
                  file=sys.stderr)

    return result
|
|
2762
|
+
|
|
2763
|
+
|
|
2764
|
+
# ---------------------------------------------------------------------------
|
|
2765
|
+
# CLI
|
|
2766
|
+
# ---------------------------------------------------------------------------
|
|
2767
|
+
|
|
2768
|
+
def _format_text(result: DecompositionResult) -> str:
|
|
2769
|
+
"""Format result as human-readable text."""
|
|
2770
|
+
lines = []
|
|
2771
|
+
lines.append(f"Input SMILES: {result.original_smiles}")
|
|
2772
|
+
lines.append(f"Canonical SMILES: {result.canonical_smiles}")
|
|
2773
|
+
lines.append(f"Canonical name: {result.canonical_name}")
|
|
2774
|
+
|
|
2775
|
+
if result.errors:
|
|
2776
|
+
for e in result.errors:
|
|
2777
|
+
lines.append(f" ERROR: {e}")
|
|
2778
|
+
return "\n".join(lines)
|
|
2779
|
+
|
|
2780
|
+
if result.bracket_tree:
|
|
2781
|
+
lines.append(f"\nBracket groups ({len(result.bracket_tree.children)}):")
|
|
2782
|
+
for child in result.bracket_tree.children:
|
|
2783
|
+
lines.append(f" ({child.text}) [{child.kind}]")
|
|
2784
|
+
|
|
2785
|
+
valid_alts = [a for a in result.alternatives if a.valid]
|
|
2786
|
+
invalid_alts = [a for a in result.alternatives if not a.valid]
|
|
2787
|
+
|
|
2788
|
+
lines.append(f"\nAlternatives ({len(valid_alts)} valid, "
|
|
2789
|
+
f"{len(invalid_alts)} invalid):")
|
|
2790
|
+
|
|
2791
|
+
lines.append(f" 1. {result.canonical_name} [canonical]")
|
|
2792
|
+
for i, alt in enumerate(valid_alts, 2):
|
|
2793
|
+
lines.append(f" {i}. {alt.name} [VALID, parent: {alt.parent_name}]")
|
|
2794
|
+
|
|
2795
|
+
if invalid_alts:
|
|
2796
|
+
lines.append(f"\n Invalid attempts:")
|
|
2797
|
+
for alt in invalid_alts:
|
|
2798
|
+
lines.append(f" - {alt.name} [{alt.strategy}]")
|
|
2799
|
+
|
|
2800
|
+
return "\n".join(lines)
|
|
2801
|
+
|
|
2802
|
+
|
|
2803
|
+
def _format_json(result: DecompositionResult) -> str:
|
|
2804
|
+
"""Format result as JSON."""
|
|
2805
|
+
d = {
|
|
2806
|
+
"original_smiles": result.original_smiles,
|
|
2807
|
+
"canonical_smiles": result.canonical_smiles,
|
|
2808
|
+
"canonical_name": result.canonical_name,
|
|
2809
|
+
"errors": result.errors,
|
|
2810
|
+
"alternatives": [asdict(a) for a in result.alternatives],
|
|
2811
|
+
}
|
|
2812
|
+
return json.dumps(d, indent=2)
|
|
2813
|
+
|
|
2814
|
+
|
|
2815
|
+
def main():
    """Command-line entry point.

    Parses the arguments, decomposes the given SMILES with
    ``decompose_name`` and prints the result to stdout, as JSON when
    ``--json`` is given and as a text report otherwise.
    """
    cli = argparse.ArgumentParser(
        description="Name-driven IUPAC decomposition"
    )
    cli.add_argument("smiles", help="SMILES string to decompose")
    cli.add_argument("-v", "--verbose", action="store_true",
                     help="Print detailed progress to stderr")
    cli.add_argument("--json", action="store_true",
                     help="Output as JSON")
    cli.add_argument("--max-depth", type=int, default=-1,
                     help="Maximum recursion depth (default: -1, "
                          "unlimited until timeout). 0 = no recursion.")
    cli.add_argument("--timeout", type=float, default=30.0,
                     help="Timeout in seconds (default: 30). "
                          "Use 0 to disable.")
    options = cli.parse_args()

    # Only a strictly positive value enables the timeout.
    timeout = None
    if options.timeout > 0:
        timeout = options.timeout

    outcome = decompose_name(options.smiles, max_depth=options.max_depth,
                             verbose=options.verbose, timeout=timeout)

    render = _format_json if options.json else _format_text
    print(render(outcome))
|
|
2840
|
+
|
|
2841
|
+
|
|
2842
|
+
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|