cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2843 @@
1
+ """
2
+ Name-driven IUPAC decomposition.
3
+
4
+ Parse the bracket hierarchy of an IUPAC name to find substituent
5
+ boundaries, then generate alternative valid names by swapping
6
+ parent ↔ substituent roles. Uses ChemDraw (via ChemScript) as
7
+ the naming oracle — we never try to parse IUPAC grammar ourselves.
8
+
9
+ Usage:
10
+ python name_decomposer.py <SMILES> [-v] [--json] [--max-depth N]
11
+ """
12
+ import argparse
13
+ import json
14
+ import re
15
+ import sys
16
+ import time
17
+ from dataclasses import dataclass, field, asdict
18
+ from functools import lru_cache
19
+ from typing import List, Optional, Tuple
20
+
21
+ from rdkit import Chem, RDLogger
22
+ from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
23
+
24
+ RDLogger.logger().setLevel(RDLogger.ERROR)
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Data classes
28
+ # ---------------------------------------------------------------------------
29
+
30
@dataclass
class BracketNode:
    """One parenthesised ``(...)`` group found in an IUPAC name.

    Positions refer to the full name string the node was parsed from.
    """
    # Text between the parentheses; the parentheses themselves are excluded.
    text: str
    # Index of the opening '(' in the full name.
    start: int
    # Index of the matching ')' in the full name (inclusive).
    end: int
    # Groups nested directly inside this one.
    children: List["BracketNode"] = field(default_factory=list)
    # Nesting depth; 0 means a top-level group.
    depth: int = 0
    # Classification label: "stereo" | "multiplier" | "substituent" | "unknown".
    # NOTE(review): classify_node() returns a different label set
    # ("stereo"/"multiplier"/"skip"/"candidate") -- confirm which is stored here.
    kind: str = ""
39
+
40
+
41
@dataclass
class Alternative:
    """A single alternative IUPAC name proposed for the molecule."""
    # The assembled alternative name.
    name: str
    # Name of the fragment that plays the parent role.
    parent_name: str
    # Name of the fragment that plays the substituent role.
    sub_name: str
    # Locant on the new parent.
    locant: str
    # Whether the name passed round-trip validation.
    valid: bool
    # Description of how the name was assembled.
    strategy: str = ""
    # Free-form remarks.
    notes: str = ""
51
+
52
+
53
@dataclass
class DecompositionResult:
    """Result record of one name decomposition run."""
    # SMILES as supplied by the caller.
    original_smiles: str
    # Canonical SMILES of the same molecule.
    canonical_smiles: str
    # Name of the molecule in its canonical naming.
    canonical_name: str
    # Root of the parsed bracket tree, or None when unavailable.
    bracket_tree: Optional[BracketNode]
    # Alternative names that were generated.
    alternatives: List[Alternative] = field(default_factory=list)
    # Error messages collected along the way.
    errors: List[str] = field(default_factory=list)
    # Parent name in the canonical naming.
    canonical_parent: str = ""
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Bracket tree parser
66
+ # ---------------------------------------------------------------------------
67
+
68
def parse_bracket_tree(name: str) -> BracketNode:
    """Build a tree of the ``(...)`` groups contained in an IUPAC name.

    Square-bracket blocks ``[...]`` are jumped over as a whole, so any
    parentheses inside them are ignored.  A ')' without a matching '(' is
    ignored, and a '(' that is never closed produces no node.

    Returns:
        A root node (depth -1) spanning the whole name; its children are
        the top-level groups, each carrying its own nested groups.
    """
    root = BracketNode(text=name, start=0, end=len(name) - 1, depth=-1)
    open_positions: List[int] = []          # indices of unmatched '('
    by_depth: dict = {}                     # depth -> nodes in closing order
    length = len(name)
    pos = 0
    while pos < length:
        char = name[pos]
        if char == '[':
            # Jump past the closing ']' (or just past '[' if unterminated).
            closing = name.find(']', pos + 1)
            pos = pos + 1 if closing == -1 else closing + 1
            continue
        if char == '(':
            open_positions.append(pos)
        elif char == ')' and open_positions:
            opened_at = open_positions.pop()
            # Depth equals the number of groups still open around this one.
            level = len(open_positions)
            by_depth.setdefault(level, []).append(
                BracketNode(text=name[opened_at + 1:pos],
                            start=opened_at, end=pos, depth=level)
            )
        pos += 1

    # Link each node to the first enclosing node one level up; depth-0
    # nodes hang directly off the root.
    for level in sorted(by_depth):
        if level == 0:
            root.children = by_depth[level]
            continue
        candidates = by_depth.get(level - 1, [])
        for child in by_depth[level]:
            for outer in candidates:
                if outer.start < child.start and child.end < outer.end:
                    outer.children.append(child)
                    break

    return root
116
+
117
+
118
+ # ---------------------------------------------------------------------------
119
+ # Bracket group classification
120
+ # ---------------------------------------------------------------------------
121
+
122
+ # Patterns that are NOT substituents
123
_STEREO_RE = re.compile(
    r'^[RSEZ±]$|^rac$|^rel$|^[RSrs],[RSrs]$|^[0-9]+[RSEZ]$'
    r'|^[0-9]+[a-z]*[RSEZ](,[0-9]+[a-z]*[RSEZ])*$',
    re.IGNORECASE,
)
_MULTIPLIER_RE = re.compile(
    r'^di$|^tri$|^tetra$|^penta$|^hexa$|^bis$|^tris$', re.IGNORECASE
)
_NUMBERSONLY_RE = re.compile(r'^[\d,\' ]+$')


def classify_node(node: BracketNode) -> str:
    """Classify a bracket group by regex, without any chemistry lookup.

    Returns:
        "stereo" for stereo descriptors, "multiplier" for multiplying
        prefixes, "skip" for empty / digits-only / very short content,
        and "candidate" for everything else.
    """
    content = node.text.strip()
    if not content:
        return "skip"

    # Checked in order; the digits-only pattern (ring assembly numbering)
    # maps to "skip".
    rules = (
        (_STEREO_RE, "stereo"),
        (_MULTIPLIER_RE, "multiplier"),
        (_NUMBERSONLY_RE, "skip"),
    )
    for pattern, label in rules:
        if pattern.match(content):
            return label

    # One- or two-character texts are unlikely to be substituents.
    too_short = len(content) <= 2 and not content.endswith("yl")
    return "skip" if too_short else "candidate"
152
+
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # ChemDraw interaction helpers
156
+ # ---------------------------------------------------------------------------
157
+
158
_cs: Optional[ChemScriptBridge] = None


def _get_cs() -> ChemScriptBridge:
    """Return the module-wide ChemScriptBridge, creating it on first use."""
    global _cs
    if _cs is not None:
        return _cs
    _cs = ChemScriptBridge()
    return _cs
166
+
167
+
168
def _name_to_smiles(name: str) -> Optional[str]:
    """Resolve a chemical name to SMILES through the ChemScript bridge.

    Returns the SMILES only if it is non-empty and RDKit can parse it;
    any failure (including exceptions from the bridge) yields None.
    """
    try:
        candidate = _get_cs().write_data(name, "smiles", source_format="name")
        parses = bool(candidate) and Chem.MolFromSmiles(candidate) is not None
    except Exception:
        # Best effort: an unresolvable name is reported as None.
        return None
    return candidate if parses else None
177
+
178
+
179
def _smiles_to_name(smiles: str) -> Optional[str]:
    """Ask the ChemScript bridge for the name of *smiles*; None on error."""
    try:
        bridge = _get_cs()
        return bridge.get_name(smiles)
    except Exception:
        # Best effort: naming failures are reported as None.
        return None
185
+
186
+
187
def _canonical(smiles: str) -> Optional[str]:
    """Return the RDKit canonical SMILES, or None if *smiles* cannot be parsed."""
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else Chem.MolToSmiles(parsed)
193
+
194
+
195
def _add_at(mol: Chem.Mol, atom_idx: int) -> Optional[Tuple[Chem.Mol, str]]:
    """Attach an astatine atom (Z=85) to atom *atom_idx* of a copy of *mol*.

    When the target is a non-carbon ring atom that carries hydrogens and
    has at least one explicit H, one explicit H is removed so that At
    takes its place.

    Returns:
        ``(new_mol, smiles)`` or None if the result fails sanitization.
    """
    rw = Chem.RWMol(mol)
    anchor = rw.GetAtomWithIdx(atom_idx)

    ring_hetero_with_h = (
        anchor.IsInRing()
        and anchor.GetAtomicNum() != 6
        and anchor.GetTotalNumHs() > 0
    )
    if ring_hetero_with_h:
        n_explicit = anchor.GetNumExplicitHs()
        if n_explicit > 0:
            anchor.SetNumExplicitHs(n_explicit - 1)

    astatine_idx = rw.AddAtom(Chem.Atom(85))
    rw.AddBond(atom_idx, astatine_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(rw)
        product = rw.GetMol()
        return product, Chem.MolToSmiles(product)
    except Exception:
        return None
216
+
217
+
218
def _get_yl_via_acid_probe(mol: Chem.Mol, attach_idx: int,
                           verbose: bool = False) -> Optional[str]:
    """Get the -yl substituent form of a fragment using an icosanoic acid probe.

    Bonds atom *attach_idx* of *mol* to the first atom of the module-level
    ``_ACID_SMILES`` chain, names the product via ChemScript and extracts
    the substituent from ``20-(SUBSTITUENT)icosanoic acid`` (or the
    unparenthesised ``20-SUBSTITUENTicosanoic acid``).

    Rationale for the C20 acid, as given by the original author:
    - COOH is a principal characteristic group, forcing the chain as parent
    - drug-like molecules do not contain a C20 chain
    - locant 20 and the "icosanoic acid" suffix are easy to parse

    Args:
        mol: Fragment molecule (without dummy atom).
        attach_idx: Index in *mol* of the atom to bond to the probe.
        verbose: Print the probe name to stderr.

    Returns:
        The extracted substituent text, or None when the product cannot be
        sanitized, cannot be named, or the name has a different shape.
    """
    # Use the shared constant rather than a second copy of the literal so
    # all icosanoic-acid probes are guaranteed to use the same chain.
    acid = Chem.MolFromSmiles(_ACID_SMILES)
    if acid is None:
        return None

    combo = Chem.RWMol(Chem.CombineMols(mol, acid))
    # CombineMols appends the acid's atoms after mol's, so the acid's first
    # atom (the terminal chain carbon) sits at index mol.GetNumAtoms().
    acid_c_idx = mol.GetNumAtoms()
    combo.AddBond(attach_idx, acid_c_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(combo)
    except Exception:
        return None

    acid_name = _smiles_to_name(Chem.MolToSmiles(combo.GetMol()))
    if acid_name is None:
        return None

    if verbose:
        print(f" Icosanoic acid probe: '{acid_name}'", file=sys.stderr)

    # Parenthesised form first, then "20-substituenticosanoic acid".
    for pattern in (r'20-\((.+)\)icosanoic acid$',
                    r'20-(.+)icosanoic acid$'):
        m = re.match(pattern, acid_name)
        if m:
            return m.group(1)
    return None
260
+
261
+
262
_ACID_SMILES = "CCCCCCCCCCCCCCCCCCCC(=O)O"


def _get_yl_via_selenyl_probe(mol: Chem.Mol, attach_idx: int,
                              verbose: bool = False) -> Optional[str]:
    """Name a fragment as a substituent through a Se-bridged acid probe.

    The probe is ``fragment-Se-(icosanoic acid chain)``.  It is named via
    ChemScript and the substituent is read out of
    ``20-({sub}selanyl)icosanoic acid`` (with or without the outer
    parentheses).

    Per the original author, the Se linker separates the fragment from the
    acid chain and so avoids the ambiguity that breaks the direct acid
    probe for carbonyl and hydroxyl fragments.

    Returns:
        The text preceding "selanyl", or None on any failure.
    """
    chain = Chem.MolFromSmiles(_ACID_SMILES)
    if chain is None:
        return None

    probe = Chem.RWMol(Chem.CombineMols(mol, chain))
    chain_first_idx = mol.GetNumAtoms()      # first atom of the acid chain
    selenium_idx = probe.AddAtom(Chem.Atom(34))
    probe.AddBond(attach_idx, selenium_idx, Chem.BondType.SINGLE)
    probe.AddBond(selenium_idx, chain_first_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(probe)
    except Exception:
        return None

    probe_name = _smiles_to_name(Chem.MolToSmiles(probe.GetMol()))
    if probe_name is None:
        return None

    if verbose:
        print(f" Se-probe: '{probe_name}'", file=sys.stderr)

    # Parenthesised form first, then the form without outer parentheses.
    for pattern in (r'20-\((.+?)selanyl\)icosanoic acid$',
                    r'20-(.+?)selanylicosanoic acid$'):
        hit = re.match(pattern, probe_name)
        if hit:
            return hit.group(1)
    return None
307
+
308
+
309
+ # ---------------------------------------------------------------------------
310
+ # Public fragment-naming API
311
+ # ---------------------------------------------------------------------------
312
+
313
+ # Simple single-atom substituent lookup (avoids ChemScript calls)
314
+ _SIMPLE_SUB_MAP = {
315
+ 9: "fluoro", # F
316
+ 17: "chloro", # Cl
317
+ 35: "bromo", # Br
318
+ 53: "iodo", # I
319
+ }
320
+
321
+
322
def _name_via_naphthalene_probe(mol: Chem.Mol, attach_idx: int,
                                verbose: bool = False) -> Optional[str]:
    """Name a fragment as a substituent by hanging it on naphthalene.

    Intended as a fallback for fragments the icosanoic acid probe cannot
    handle (the original author cites simple alkyl groups that merge into
    the acid chain).  The fragment is bonded to the second atom of
    ``c1ccc2ccccc2c1``, the product is named via ChemScript, and the
    substituent is read from ``N-(SUBSTITUENT)naphthalene`` or
    ``N-SUBSTITUENTnaphthalene`` where N is any run of digits.

    Returns:
        The extracted substituent text, or None on any failure.
    """
    naphthalene = Chem.MolFromSmiles("c1ccc2ccccc2c1")
    if naphthalene is None:
        return None

    probe = Chem.RWMol(Chem.CombineMols(mol, naphthalene))
    # Naphthalene atoms follow the fragment's atoms; index offset + 1 is the
    # second ring carbon written in the SMILES above.
    ring_atom_idx = mol.GetNumAtoms() + 1
    probe.AddBond(attach_idx, ring_atom_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(probe)
    except Exception:
        return None

    combo_name = _smiles_to_name(Chem.MolToSmiles(probe.GetMol()))
    if combo_name is None:
        return None

    if verbose:
        print(f" Naphthalene probe: '{combo_name}'", file=sys.stderr)

    # Bracketed form first, then the unbracketed one.
    for pattern in (r'\d+-\((.+)\)naphthalene$', r'\d+-(.+)naphthalene$'):
        hit = re.match(pattern, combo_name)
        if hit:
            return hit.group(1)
    return None
365
+
366
+
367
def _acyl_prefix_from_acid_name(acid_name: str) -> Optional[str]:
    """Turn a carboxylic acid name into its acyl prefix, or return None.

    "formic acid" -> "formyl", "acetic acid" -> "acetyl",
    "benzoic acid" -> "benzoyl", and "...carboxylic acid" -> "...carbonyl".
    Names that do not end in "ic acid" are not handled.
    """
    if not acid_name.endswith("ic acid"):
        return None
    base = acid_name[:-len("ic acid")]
    if base.endswith("carboxyl"):
        # "cyclohexanecarboxylic acid" -> "cyclohexanecarbonyl"; simply
        # appending "yl" would give the non-existent "...carboxylyl".
        return base[:-len("carboxyl")] + "carbonyl"
    return base + "yl"


def _alkoxy_fragment_smiles(frag: Chem.Mol, carbonyl_c_idx: int,
                            ester_o_idx: int) -> Optional[str]:
    """Return SMILES of the ``[*]-O-R`` part of a ``C(=O)-O-R`` fragment.

    Collects every atom reachable from the ester oxygen without crossing
    the carbonyl carbon, copies them (element, formal charge, aromatic
    flag) into a new molecule and puts a dummy atom where the carbonyl
    carbon was.  Returns None if the copy cannot be sanitized.
    """
    keep = set()
    pending = [ester_o_idx]
    while pending:
        idx = pending.pop()
        if idx in keep or idx == carbonyl_c_idx:
            continue
        keep.add(idx)
        for nbr in frag.GetAtomWithIdx(idx).GetNeighbors():
            n_idx = nbr.GetIdx()
            if n_idx != carbonyl_c_idx and n_idx not in keep:
                pending.append(n_idx)

    piece = Chem.RWMol()
    remap = {}
    for old_idx in sorted(keep):
        src = frag.GetAtomWithIdx(old_idx)
        atom = Chem.Atom(src.GetAtomicNum())
        atom.SetFormalCharge(src.GetFormalCharge())
        atom.SetIsAromatic(src.GetIsAromatic())
        remap[old_idx] = piece.AddAtom(atom)

    # Dummy atom marks the former bond to the carbonyl carbon.
    dummy_new = piece.AddAtom(Chem.Atom(0))
    piece.AddBond(remap[ester_o_idx], dummy_new, Chem.BondType.SINGLE)

    for old_idx in keep:
        for bond in frag.GetAtomWithIdx(old_idx).GetBonds():
            other_idx = bond.GetOtherAtomIdx(old_idx)
            # old_idx < other_idx adds every internal bond exactly once.
            if other_idx in keep and old_idx < other_idx:
                piece.AddBond(remap[old_idx], remap[other_idx],
                              bond.GetBondType())
    try:
        Chem.SanitizeMol(piece)
        return Chem.MolToSmiles(piece)
    except Exception:
        return None


@lru_cache(maxsize=256)
def _name_fragment_cached(canonical_frag_smiles: str,
                          verbose: bool = False) -> Optional[str]:
    """Name a ``[*]``-bearing fragment as a substituent prefix (cached).

    The cache key is the pair (canonical SMILES, verbose).

    Strategy, in order:
      1. Lone neutral atom single-bonded to the dummy: table lookup
         (halogens, hydroxy, amino, sulfanyl) without a ChemScript call.
      2. Icosanoic acid probe.
      3. Naphthalene probe.
      4. Acyl fallback for ``[*]-C(=O)-R``: name the carboxylic acid and
         convert it to the acyl prefix.
      5. Acyl-ester fallback for ``[*]-C(=O)-O-R``: name ``[*]-O-R`` and
         wrap it as ``(R-oxy)carbonyl``.

    Args:
        canonical_frag_smiles: Fragment SMILES containing one dummy atom.
        verbose: Print probe names to stderr.

    Returns:
        The substituent prefix, or None when every strategy fails.
    """
    mol = Chem.MolFromSmiles(canonical_frag_smiles)
    if mol is None:
        return None

    # Locate the dummy atom and the fragment atom it is bonded to.
    dummy_atom = next(
        (a for a in mol.GetAtoms() if a.GetAtomicNum() == 0), None
    )
    if dummy_atom is None:
        return None
    dummy_idx = dummy_atom.GetIdx()
    neighbors = list(dummy_atom.GetNeighbors())
    if not neighbors:
        return None
    attach_atom = neighbors[0]
    attach_idx = attach_atom.GetIdx()

    # --- Simple single-atom check ---
    # GetNumHeavyAtoms() only counts atoms with atomic number > 1, so the
    # dummy (Z=0) is excluded and "[*]F" reports 1, not 2.  Count all atoms
    # instead: dummy + exactly one real atom.  Restrict the shortcut to a
    # neutral atom on a single bond so e.g. "[*]=O" or "[*][O-]" still go
    # through the probes.
    link = mol.GetBondBetweenAtoms(dummy_idx, attach_idx)
    if (mol.GetNumAtoms() == 2
            and attach_atom.GetFormalCharge() == 0
            and link.GetBondType() == Chem.BondType.SINGLE):
        z = attach_atom.GetAtomicNum()
        if z in _SIMPLE_SUB_MAP:
            return _SIMPLE_SUB_MAP[z]
        if z == 8:
            return "hydroxy"
        if z == 7:
            return "amino"
        if z == 16:
            return "sulfanyl"

    # --- General case: probes ---
    # Drop the dummy atom; indices above it shift down by one.
    edit = Chem.RWMol(mol)
    edit.RemoveAtom(dummy_idx)
    adjusted_idx = attach_idx if attach_idx < dummy_idx else attach_idx - 1
    try:
        Chem.SanitizeMol(edit)
    except Exception:
        return None
    frag_clean = edit.GetMol()

    result = _get_yl_via_acid_probe(frag_clean, adjusted_idx, verbose=verbose)
    if result is not None:
        return result

    # The acid probe fails for simple alkyls (they extend the C20 chain).
    naph_result = _name_via_naphthalene_probe(frag_clean, adjusted_idx,
                                              verbose=verbose)
    if naph_result is not None:
        return naph_result

    # --- Acyl fallbacks: only for a carbon attachment bearing C=O ---
    attach_a = frag_clean.GetAtomWithIdx(adjusted_idx)
    if attach_a.GetAtomicNum() != 6:
        return None

    carbonyl_o_idx = None
    ester_o_idx = None
    for bond in attach_a.GetBonds():
        other_idx = bond.GetOtherAtomIdx(adjusted_idx)
        if frag_clean.GetAtomWithIdx(other_idx).GetAtomicNum() != 8:
            continue
        bond_type = bond.GetBondType()
        if bond_type == Chem.BondType.DOUBLE and carbonyl_o_idx is None:
            carbonyl_o_idx = other_idx
        elif bond_type == Chem.BondType.SINGLE and ester_o_idx is None:
            ester_o_idx = other_idx
    if carbonyl_o_idx is None:
        return None

    # Cap the attachment point with OH to get the carboxylic acid, name it,
    # and derive the acyl prefix from the acid name.
    acid_edit = Chem.RWMol(frag_clean)
    oh_idx = acid_edit.AddAtom(Chem.Atom(8))
    acid_edit.AddBond(adjusted_idx, oh_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(acid_edit)
        acid_smi = Chem.MolToSmiles(acid_edit.GetMol())
        acid_name = _smiles_to_name(acid_smi)
        if verbose:
            print(f" Acyl acid form: '{acid_name}'", file=sys.stderr)
        if acid_name:
            prefix = _acyl_prefix_from_acid_name(acid_name)
            if prefix is not None:
                return prefix
    except Exception:
        # Best effort: fall through to the ester pattern.
        pass

    # Acyl-ester pattern: [*]-C(=O)-O-R -> "(R-oxy)carbonyl".
    if ester_o_idx is not None:
        try:
            r_smi = _alkoxy_fragment_smiles(frag_clean, adjusted_idx,
                                            ester_o_idx)
            if r_smi is not None:
                # Naming [*]-O-R should give the "R-oxy" prefix.
                r_name = name_fragment_as_substituent(r_smi, verbose=verbose)
                if r_name:
                    return f"({r_name})carbonyl"
        except Exception:
            # Best effort: report failure as None below.
            pass

    return None
549
+
550
+
551
def name_fragment_as_substituent(frag_smiles: str,
                                 verbose: bool = False) -> Optional[str]:
    """Return the IUPAC substituent prefix for a ``[*]``-bearing fragment.

    The fragment SMILES is canonicalised with RDKit so that equivalent
    inputs share one cache entry, then handed to ``_name_fragment_cached``,
    which does the actual naming.

    Args:
        frag_smiles: SMILES with [*] marking the attachment point.
            E.g. "[*]c1ccccc1" for phenyl, "[*]F" for fluoro.
        verbose: Print debug info to stderr.

    Returns:
        Substituent prefix name (e.g. "phenyl", "fluoro") or None when
        the SMILES cannot be parsed or naming fails.

    Examples::

        >>> name_fragment_as_substituent("[*]F")
        'fluoro'
        >>> name_fragment_as_substituent("[*]c1ccccc1")
        'phenyl'
    """
    parsed = Chem.MolFromSmiles(frag_smiles)
    if parsed is None:
        return None
    cache_key = Chem.MolToSmiles(parsed)
    return _name_fragment_cached(cache_key, verbose=verbose)
584
+
585
+
586
def _get_yl_suffix_via_acid(parent_mol: Chem.Mol, parent_attach_idx: int,
                            heteroatom_num: int,
                            verbose: bool = False) -> Optional[str]:
    """Get the -yl+suffix form via parent + heteroatom + icosanoic acid.

    For heteroatom linkages the substituent name includes the heteroatom
    suffix (the original author's examples: "pyridin-4-yloxy" for O,
    "phenylamino" for N).  Builds ``parent-[heteroatom]-acid chain``,
    names it via ChemScript and extracts the substituent from
    ``20-(SUBSTITUENT)icosanoic acid`` (or the unparenthesised form).

    Args:
        parent_mol: Molecule that will be named as a substituent.
        parent_attach_idx: Index in *parent_mol* bonded to the heteroatom.
        heteroatom_num: Atomic number of the linking heteroatom.
        verbose: Print the probe name to stderr.

    Returns:
        The extracted substituent text, or None on any failure.
    """
    # Shared constant instead of another copy of the SMILES literal, so
    # every icosanoic-acid probe uses the same chain.
    acid = Chem.MolFromSmiles(_ACID_SMILES)
    if acid is None:
        return None

    combo = Chem.RWMol(Chem.CombineMols(parent_mol, acid))
    het_idx = combo.AddAtom(Chem.Atom(heteroatom_num))
    combo.AddBond(parent_attach_idx, het_idx, Chem.BondType.SINGLE)
    # The acid's atoms follow the parent's; its first atom is the terminal
    # chain carbon.
    acid_c_start = parent_mol.GetNumAtoms()
    combo.AddBond(het_idx, acid_c_start, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(combo)
    except Exception:
        return None

    acid_name = _smiles_to_name(Chem.MolToSmiles(combo.GetMol()))
    if acid_name is None:
        return None
    if verbose:
        print(f" Acid+heteroatom probe: '{acid_name}'", file=sys.stderr)

    for pattern in (r'20-\((.+)\)icosanoic acid$',
                    r'20-(.+)icosanoic acid$'):
        m = re.match(pattern, acid_name)
        if m:
            return m.group(1)
    return None
621
+
622
+
623
def _get_locant_replace_heteroatom(sub_mol: Chem.Mol, sub_attach_idx: int,
                                   verbose: bool = False
                                   ) -> Optional[Tuple[str, Optional[str]]]:
    """Swap the attachment heteroatom for At on its carbon neighbour and name it.

    The atom at *sub_attach_idx* is deleted, astatine is bonded to the
    first carbon that was attached to it, and the result is named via
    ChemScript.  Per the original author the At-probe name is an assembly
    template in which "astato" is later replaced by the yl+suffix form.

    Returns:
        ``(at_probe_name, locant)`` or None on failure.  ``locant`` is the
        digits preceding "-astato", "" when "astato" occurs without such a
        locant, and None when "astato" does not occur at all.
    """
    hetero = sub_mol.GetAtomWithIdx(sub_attach_idx)
    carbon_idx = next(
        (nbr.GetIdx() for nbr in hetero.GetNeighbors()
         if nbr.GetAtomicNum() == 6),
        None,
    )
    if carbon_idx is None:
        return None

    rw = Chem.RWMol(sub_mol)
    rw.RemoveAtom(sub_attach_idx)
    try:
        Chem.SanitizeMol(rw)
    except Exception:
        return None

    # Indices above the removed atom have shifted down by one.
    shifted_c_idx = carbon_idx - 1 if sub_attach_idx < carbon_idx else carbon_idx
    astatine_idx = rw.AddAtom(Chem.Atom(85))
    rw.AddBond(shifted_c_idx, astatine_idx, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(rw)
    except Exception:
        return None

    at_name = _smiles_to_name(Chem.MolToSmiles(rw.GetMol()))
    if at_name is None:
        return None

    if verbose:
        print(f" Het->At probe: '{at_name}'", file=sys.stderr)

    hit = re.search(r'(\d+)-astato', at_name, re.IGNORECASE)
    if hit:
        locant = hit.group(1)
    elif 'astato' in at_name.lower():
        locant = ""
    else:
        locant = None
    return at_name, locant
673
+
674
+
675
+ # ---------------------------------------------------------------------------
676
+ # Core decomposition logic
677
+ # ---------------------------------------------------------------------------
678
+
679
def validate_as_substituent(full_name: str, node: BracketNode,
                            verbose: bool = False) -> bool:
    """Test whether a bracket group occupies a real substituent slot.

    The group (parentheses included) is replaced by "astato" and the
    modified name is resolved via ``_name_to_smiles``; the group counts as
    a substituent when that resolution succeeds.
    """
    probe_name = (
        f"{full_name[:node.start]}astato{full_name[node.end + 1:]}"
    )
    if verbose:
        print(f" At-probe name: {probe_name}", file=sys.stderr)
    resolved = _name_to_smiles(probe_name)
    return resolved is not None
692
+
693
+
694
def _find_at_atom(mol: Chem.Mol) -> Optional[int]:
    """Return the index of the first astatine atom (Z=85), or None."""
    return next(
        (atom.GetIdx() for atom in mol.GetAtoms()
         if atom.GetAtomicNum() == 85),
        None,
    )
700
+
701
+
702
def _split_at_at(smiles: str) -> Optional[Tuple[str, str, int]]:
    """Strip the At atom from an At-containing SMILES.

    Returns:
        ``(smiles_without_at, original_smiles, neighbor_idx)`` where
        ``neighbor_idx`` is the index, after At removal, of the first atom
        that At was bonded to.  None if the SMILES cannot be parsed,
        contains no At, At has no neighbour, or the stripped molecule
        cannot be sanitized.

    NOTE(review): ``neighbor_idx`` follows the atom order of the edited
    molecule, while ``smiles_without_at`` comes from ``Chem.MolToSmiles``;
    re-parsing that SMILES may number atoms differently -- confirm against
    callers.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    at_idx = _find_at_atom(mol)
    if at_idx is None:
        return None

    at_neighbors = mol.GetAtomWithIdx(at_idx).GetNeighbors()
    if not at_neighbors:
        return None
    neighbor_idx = at_neighbors[0].GetIdx()
    # Removing At shifts every higher index down by one.
    shifted_idx = neighbor_idx - 1 if at_idx < neighbor_idx else neighbor_idx

    def _without_at() -> Chem.RWMol:
        stripped = Chem.RWMol(mol)
        stripped.RemoveAtom(at_idx)
        return stripped

    edit = _without_at()
    try:
        Chem.SanitizeMol(edit)
    except Exception:
        # Second attempt: give the former neighbour one extra explicit H
        # (e.g. a ring N that lost a bond needs H compensation).
        edit = _without_at()
        former_neighbor = edit.GetAtomWithIdx(shifted_idx)
        former_neighbor.SetNumExplicitHs(
            former_neighbor.GetNumExplicitHs() + 1
        )
        try:
            Chem.SanitizeMol(edit)
        except Exception:
            return None

    clean_smi = Chem.MolToSmiles(edit.GetMol())
    return clean_smi, smiles, shifted_idx
749
+
750
+
751
def get_parent_smiles_from_at_probe(full_name: str,
                                    node: BracketNode) -> Optional[Tuple[str, int]]:
    """Resolve the parent fragment left after swapping a bracket for 'astato'.

    The characters ``full_name[node.start:node.end + 1]`` are replaced by
    the word "astato", the resulting name is converted to SMILES, and the
    At atom is then removed again.

    Returns ``(parent_smiles, attach_idx_in_parent)``, or ``None`` when the
    probe name does not resolve or the At atom cannot be split off.
    """
    probe_name = "".join(
        (full_name[:node.start], "astato", full_name[node.end + 1:])
    )
    probe_smiles = _name_to_smiles(probe_name)
    split = None if probe_smiles is None else _split_at_at(probe_smiles)
    if split is None:
        return None
    parent_smiles, _original_smiles, attach_idx = split
    return parent_smiles, attach_idx
769
+
770
+
771
def get_sub_smiles_from_bracket(node: BracketNode) -> Optional[str]:
    """Try to resolve the bracket content as a standalone chemical name.

    The bracket content is the substituent in -yl form.  Strategies, in
    the order they are attempted:

      1. Direct: the text as-is (works for e.g. "phenyl", "morpholino").
      2. Strip a trailing "-yl"/"yl" and restore a parent ending
         ("e", "ene", or nothing).
      3. As 2, but with a trailing numeric locant removed from the stem
         first, so that "pyridin-4-yl" reaches "pyridine".

    Returns the first SMILES that resolves, or ``None``.
    """
    text = node.text.strip()
    if not text:
        return None

    # Try as-is (e.g., "phenyl", "morpholino")
    smi = _name_to_smiles(text)
    if smi:
        return smi

    direct_forms = []
    locant_free_forms = []
    for suffix in ("-yl", "yl"):
        if not text.endswith(suffix):
            continue
        stem = text[:-len(suffix)]
        direct_forms.extend((stem + "e", stem + "ene", stem))
        # "pyridin-4" / "pyridin-4-" -> "pyridin": without this the locant
        # stays glued to the stem and the parent name can never match.
        bare = re.sub(r'-\d+-?$', '', stem)
        if bare != stem:
            locant_free_forms.extend((bare + "e", bare + "ene", bare))

    # Original candidates keep their priority; locant-free ones are only
    # consulted afterwards.  Skip repeats and empty strings to avoid
    # pointless resolver round-trips.
    tried = set()
    for candidate in direct_forms + locant_free_forms:
        if not candidate or candidate in tried:
            continue
        tried.add(candidate)
        smi = _name_to_smiles(candidate)
        if smi:
            return smi

    return None
799
+
800
+
801
+ # ---------------------------------------------------------------------------
802
+ # -yl form construction
803
+ # ---------------------------------------------------------------------------
804
+
805
# Parent names whose substituent form is irregular, keyed by the
# lower-cased parent name.
_YL_SPECIALS = {
    "benzene": ["phenyl"],
    "naphthalene": ["naphthyl"],
    "toluene": ["tolyl"],
}


def construct_yl_form(parent_name: str, locant: str) -> List[str]:
    """Build candidate '-yl' substituent names for *parent_name*.

    Irregular parents listed in ``_YL_SPECIALS`` yield their special form,
    followed (when *locant* is non-empty) by a variant with the locant
    spliced in front of "yl".  Every other parent loses a trailing "e"
    (unless it ends in "ene") and gains "-<locant>-yl" and/or "-yl".

    The list is ordered most-likely-first; the caller is expected to pick
    the right one by round-trip validation.
    """
    special_forms = _YL_SPECIALS.get(parent_name.lower().strip())
    if special_forms is not None:
        locanted = (
            [form.replace('yl', f'-{locant}-yl') for form in special_forms]
            if locant else []
        )
        return list(special_forms) + locanted

    # General rule: drop a final 'e' (but keep '-ene' intact), then append.
    trimmed = parent_name.strip()
    drops_final_e = trimmed.endswith('e') and not trimmed.endswith('ene')
    stem = trimmed[:-1] if drops_final_e else trimmed

    plain = f"{stem}-yl"
    if not locant:
        return [plain]
    # Locanted form first; the locant-free form covers names that omit it.
    return [f"{stem}-{locant}-yl", plain]
846
+
847
+
848
def get_locant_via_at_probe(fragment_smiles: str,
                            attach_idx: int) -> Optional[str]:
    """Find the locant of an attachment atom by naming an At-tagged copy.

    An At atom is attached at *attach_idx*, the tagged molecule is named,
    and the number in front of "astato" is read back from that name.

    Returns the locant digits (e.g. "4"), an empty string when the name
    contains "astato" with no number directly in front of it, or ``None``
    when any step fails or the name does not mention "astato".
    """
    mol = Chem.MolFromSmiles(fragment_smiles)
    tagged = _add_at(mol, attach_idx) if mol is not None else None
    if tagged is None:
        return None
    _tagged_mol, tagged_smiles = tagged

    tagged_name = _smiles_to_name(tagged_smiles)
    if tagged_name is None:
        return None

    # "X-astato..." carries an explicit numeric locant.
    numbered = re.search(r'(\d+)-astato', tagged_name, re.IGNORECASE)
    if numbered:
        return numbered.group(1)

    # "astato" present but un-numbered: report an empty locant.
    return "" if 'astato' in tagged_name.lower() else None
877
+
878
+
879
+ # ---------------------------------------------------------------------------
880
+ # Prefix substituent detection (for bracketless names)
881
+ # ---------------------------------------------------------------------------
882
+
883
def find_prefix_substituents(name: str,
                             verbose: bool = False,
                             skip_single_prefix: bool = False,
                             ) -> List[BracketNode]:
    """Detect non-bracketed substituent prefixes in a name.

    For names like "4-phenylpyridine", bracket parsing finds nothing.
    Every tail of *name* (at least 4 characters long) that resolves as a
    chemical name is treated as a candidate parent; the text in front of
    it is the prefix.  Three strategies are then tried in order:

      1. Single prefix: the whole prefix (minus leading locants) resolves
         as one substituent.  Skipped when *skip_single_prefix* is true.
      2. Multi-prefix: the prefix is split on locant boundaries and each
         piece is resolved separately ("2-chloro-4-phenyl").
      3. Multiplied prefix: "<locants>-<di|tri|...><sub>"; one At-probe
         name is built per locant and stored on the node.

    Args:
        name: the chemical name to scan.  Names containing "(" are left
            to bracket parsing and yield an empty list.
        verbose: print progress to stderr.
        skip_single_prefix: go straight to strategies 2 and 3.

    Returns:
        Synthetic ``BracketNode`` objects.  For kind "prefix_substituent"
        ``name[start:end + 1]`` is the substituent text, so replacing that
        slice with "astato" gives the At-probe name.  For kind
        "multiplied_prefix" start/end are -1 and the probe name is stored
        in ``_probe_name``.  Empty list when nothing is found.
    """

    def strip_locants(text):
        """Return (offset, core): *text* minus leading locants/trailing '-'.

        *offset* is the index in *text* at which *core* begins.
        """
        lead = re.match(r'(?:N(?:,N)*[,-]|\d+[,-])+', text)
        offset = lead.end() if lead else 0
        return offset, text[offset:].rstrip('-')

    def resolve_yl_via_parent(yl_text):
        """Resolve a name ending in 'yl' through its parent spellings."""
        # Slice the suffix off: str.rstrip('yl') strips a *character set*
        # and would reduce e.g. "xylyl" to "x".
        stem = yl_text[:-2].rstrip('-')
        for restore_suffix in ('e', 'ene', ''):
            parent_form = stem + restore_suffix
            if parent_form:
                smi = _name_to_smiles(parent_form)
                if smi is not None:
                    return smi
        return None

    # Skip names that already have bracket groups (handled elsewhere)
    if '(' in name:
        return []

    # Try each position as a split point; the parent is at the END of the
    # name.  Tails shorter than 4 characters are never accepted, so the
    # range stops early.
    candidates = []
    for i in range(1, len(name) - 3):
        suffix = name[i:]
        # The tail must start with a letter, a "1H-" style hydrogen
        # designation, or a locant list such as "1,3,4-oxadiazole".
        if not (suffix[0].isalpha()
                or re.match(r'\d+H-', suffix)
                or re.match(r'[\d,]+-[a-zA-Z]', suffix)):
            continue
        smi = _name_to_smiles(suffix)
        if smi is not None:
            candidates.append((i, suffix, name[:i], smi))

    if not candidates:
        return []

    # Strategy 1: the prefix as a whole is one substituent.  Prefer the
    # longest parent.  The resolution work is only done when the result
    # can actually be used.
    best = None
    if not skip_single_prefix:
        for i, suffix, prefix, smi in candidates:
            offset, stripped = strip_locants(prefix)
            if not stripped:
                continue
            # Internal or trailing locants (e.g. "chloro-4-phenyl",
            # "cyclohexyl-2") mean several prefixes: leave to strategy 2.
            if re.search(r'.\d+[,-]|[a-z]-\d+$', stripped):
                continue
            sub_smi = _name_to_smiles(stripped)
            if sub_smi is None and stripped.endswith('yl'):
                sub_smi = resolve_yl_via_parent(stripped)
            if sub_smi is None:
                continue
            if best is None or len(suffix) > len(best[1]):
                best = (i, suffix, prefix, smi, stripped, sub_smi, offset)

    if best is not None:
        _split_pos, suffix, prefix, parent_smi, sub_text, sub_smi, sub_start = best

        if verbose:
            print(f" Prefix scan: '{prefix}' + '{suffix}'", file=sys.stderr)
            print(f" Substituent text: '{sub_text}' -> {sub_smi}",
                  file=sys.stderr)
            print(f" Parent: '{suffix}' -> {parent_smi}", file=sys.stderr)

        # The prefix starts at index 0 of the name, so the offset inside
        # the prefix is also the position inside the full name.
        return [BracketNode(
            text=sub_text,
            start=sub_start,
            end=sub_start + len(sub_text) - 1,
            depth=0,
            kind="prefix_substituent",
        )]

    # Strategy 2: multi-prefix scan.
    # For "2-chloro-4-phenylquinoline", the whole prefix doesn't resolve
    # as one substituent.  Split on locant boundaries and try each piece.
    # Shortest parent first = longest prefix = most substituents.
    sorted_candidates = sorted(candidates, key=lambda c: len(c[1]))

    for _, suffix, prefix, _parent_smi in sorted_candidates:
        # e.g. "2-chloro-4-phenyl" -> ["2-chloro-", "4-phenyl"]
        parts = [p for p in re.split(r'(?=(?:N(?:,N)*|\d+)[,-])', prefix)
                 if p]

        if verbose:
            print(f" Multi-prefix scan: prefix='{prefix}' parent='{suffix}'",
                  file=sys.stderr)
            print(f" Parts: {parts}", file=sys.stderr)

        nodes = []
        next_part_start = 0
        for part in parts:
            # The parts concatenate back to the prefix, so a running
            # offset gives each part's true position in the name.  A
            # plain name.find() could hit an earlier word instead
            # ("ethyl" inside "methyl").
            part_start = next_part_start
            next_part_start += len(part)

            offset, stripped = strip_locants(part)
            if not stripped or len(stripped) < 3:
                continue

            sub_smi = _name_to_smiles(stripped)
            if sub_smi is None and stripped.endswith('yl'):
                sub_smi = resolve_yl_via_parent(stripped)
            if sub_smi is None:
                continue

            sub_start = part_start + offset
            sub_end = sub_start + len(stripped) - 1

            if verbose:
                print(f" Multi-prefix sub: '{stripped}' -> {sub_smi} "
                      f"(pos {sub_start}-{sub_end})", file=sys.stderr)

            nodes.append(BracketNode(
                text=stripped,
                start=sub_start,
                end=sub_end,
                depth=0,
                kind="prefix_substituent",
            ))

        if nodes:
            return nodes

    # Strategy 3: multiplied prefix scan.
    # For "2,3-diphenylquinoline": locants=2,3, multiplier=di, sub=phenyl.
    # Construct an At-probe for each locant: "3-phenyl-2-astatoquinoline".
    _MULTIPLIER_RE = re.compile(
        r'^([\d,]+)-'                    # locant list: "2,3-"
        r'(di|tri|tetra|penta|hexa)'     # multiplier
        r'(.+)$'                         # base substituent: "phenyl"
    )
    # Multiplier for the substituents that REMAIN after one is replaced.
    remaining_multiplier = {1: '', 2: 'di', 3: 'tri', 4: 'tetra', 5: 'penta'}

    for _, suffix, prefix, _parent_smi in sorted_candidates:
        m = _MULTIPLIER_RE.match(prefix)
        if not m:
            continue
        locant_str, multiplier, base_sub = m.groups()
        locants = locant_str.split(',')
        clean_sub = base_sub.rstrip('-')

        # Verify the base substituent resolves
        if _name_to_smiles(clean_sub) is None:
            continue

        if verbose:
            print(f" Multiplied prefix: locants={locants} "
                  f"mult={multiplier} sub='{base_sub}' "
                  f"parent='{suffix}'", file=sys.stderr)

        # Ensure suffix starts with dash if it starts with a digit (locants)
        dash_suffix = suffix if suffix[0].isalpha() else f"-{suffix}"

        nodes = []
        for pos, loc in enumerate(locants):
            # Drop exactly ONE instance by position: filtering by value
            # would also discard the twin in "2,2-dimethyl".
            other_locs = locants[:pos] + locants[pos + 1:]
            if other_locs:
                # Build: "other_locs-[mult]sub-loc-astato-parent"
                mult = remaining_multiplier.get(len(other_locs), '')
                other_prefix = ','.join(other_locs) + f'-{mult}{clean_sub}'
                probe_name = f"{other_prefix}-{loc}-astato{dash_suffix}"
            else:
                probe_name = f"{loc}-astato{dash_suffix}"

            # Verify the probe resolves
            if _name_to_smiles(probe_name) is None:
                if verbose:
                    print(f" Mult At-probe FAIL: '{probe_name}'",
                          file=sys.stderr)
                continue

            if verbose:
                print(f" Mult At-probe OK: '{probe_name}'",
                      file=sys.stderr)

            # Simple text replacement cannot express this probe, so the
            # node carries sentinel positions plus the full probe name.
            node = BracketNode(
                text=clean_sub,
                start=-1,  # sentinel: not a simple text position
                end=-1,
                depth=0,
                kind="multiplied_prefix",
            )
            node._probe_name = probe_name
            node._locant = loc
            node._parent_suffix = suffix
            nodes.append(node)

        if nodes:
            return nodes

    return []
1134
+
1135
+
1136
def _at_probe_for_prefix(full_name: str, node: BracketNode,
                         verbose: bool = False) -> bool:
    """Check that the name still resolves once the prefix becomes 'astato'.

    Nodes of kind "multiplied_prefix" that carry a ``_probe_name``
    attribute use that pre-built name.  Any other node has the slice
    ``full_name[node.start:node.end + 1]`` swapped for "astato" (there are
    no parentheses to remove).

    Returns ``True`` when the probe name converts to SMILES.
    """
    has_stored_probe = (node.kind == "multiplied_prefix"
                        and hasattr(node, '_probe_name'))
    if has_stored_probe:
        modified = node._probe_name
    else:
        modified = "".join(
            (full_name[:node.start], "astato", full_name[node.end + 1:])
        )
    if verbose:
        print(f" Prefix At-probe: '{modified}'", file=sys.stderr)
    return _name_to_smiles(modified) is not None
1152
+
1153
+
1154
@dataclass
class FragmentResult:
    """Parent/substituent pair obtained by cutting one bond of a molecule.

    Each side is carried as a SMILES string and as a molecule object,
    together with the index (within that side's own molecule object) of
    the atom that sat on the cut bond.
    """
    # --- parent side ---
    parent_smi: str
    parent_mol: Chem.Mol
    parent_attach_idx: int
    # --- substituent side ---
    sub_smi: str
    sub_mol: Chem.Mol
    sub_attach_idx: int
1163
+
1164
+
1165
def _get_fragments_via_at_probe(canonical_smiles: str, at_probe_smiles: str,
                                verbose: bool = False
                                ) -> Optional[FragmentResult]:
    """Cut the full molecule into parent and substituent using an At-probe.

    *at_probe_smiles* is the molecule with At standing in for the
    substituent.  The steps are:

      1. Strip At from the probe to obtain the parent skeleton.
      2. Locate that skeleton inside the full molecule by substructure
         match; atoms outside the match belong to the substituent.
      3. Take the first bond that crosses from matched to unmatched atoms,
         break it, and split the result into fragments.

    Returns a ``FragmentResult`` whose attachment indices refer to the
    fragment molecules, or ``None`` if any step fails.  *verbose* is
    accepted for signature parity and is not used here.
    """
    split = _split_at_at(at_probe_smiles)
    if split is None:
        return None
    skeleton_smi, _probe_smi, _skeleton_attach_idx = split

    # Match parent in full molecule to find substituent atoms
    full_mol = Chem.MolFromSmiles(canonical_smiles)
    skeleton_mol = Chem.MolFromSmiles(skeleton_smi)
    if full_mol is None or skeleton_mol is None:
        return None

    parent_atoms = set(full_mol.GetSubstructMatch(skeleton_mol))
    if not parent_atoms:
        return None
    # Nothing left over means there is no substituent to split off.
    if parent_atoms.issuperset(range(full_mol.GetNumAtoms())):
        return None

    # First bond with exactly one end inside the parent match.
    parent_attach_full = None
    sub_attach_full = None
    for bond in full_mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        begin_in_parent = begin in parent_atoms
        if begin_in_parent == (end in parent_atoms):
            continue
        if begin_in_parent:
            parent_attach_full, sub_attach_full = begin, end
        else:
            parent_attach_full, sub_attach_full = end, begin
        break

    if sub_attach_full is None:
        return None

    # Clear aromaticity before bond removal to avoid kekulization issues,
    # then let SanitizeMol recalculate properly for each fragment.
    cut_mol = Chem.RWMol(full_mol)
    Chem.Kekulize(cut_mol, clearAromaticFlags=True)
    cut_mol.RemoveBond(parent_attach_full, sub_attach_full)
    try:
        Chem.SanitizeMol(cut_mol)
    except Exception:
        return None

    frag_atom_lists = Chem.GetMolFrags(cut_mol, asMols=False)
    frag_mols = Chem.GetMolFrags(cut_mol, asMols=True, sanitizeFrags=True)

    # Every atom lives in exactly one fragment: look up both cut ends.
    fragment_of = {
        atom_idx: frag_idx
        for frag_idx, atom_list in enumerate(frag_atom_lists)
        for atom_idx in atom_list
    }
    sub_frag_idx = fragment_of.get(sub_attach_full)
    parent_frag_idx = fragment_of.get(parent_attach_full)
    if sub_frag_idx is None or parent_frag_idx is None:
        return None

    sub_frag_mol = frag_mols[sub_frag_idx]
    parent_frag_mol = frag_mols[parent_frag_idx]

    # Position within the fragment's atom list = index inside the fragment.
    sub_attach_in_frag = list(frag_atom_lists[sub_frag_idx]).index(
        sub_attach_full)
    parent_attach_in_frag = list(frag_atom_lists[parent_frag_idx]).index(
        parent_attach_full)

    return FragmentResult(
        parent_smi=Chem.MolToSmiles(parent_frag_mol),
        parent_mol=parent_frag_mol,
        parent_attach_idx=parent_attach_in_frag,
        sub_smi=Chem.MolToSmiles(sub_frag_mol),
        sub_mol=sub_frag_mol,
        sub_attach_idx=sub_attach_in_frag,
    )
1265
+
1266
+
1267
def generate_alternative_from_prefix(full_name: str, canonical_smiles: str,
                                     node: BracketNode,
                                     verbose: bool = False,
                                     max_depth: int = 0,
                                     _deadline: Optional[float] = None,
                                     ) -> List[Alternative]:
    """Generate alternatives for a prefix substituent (no brackets).

    The parent/substituent fragments are taken from the molecular graph
    via the At-probe, which avoids resolving substituent names (such as
    "phenyl") that can give radical SMILES.

    Returns an empty list when the probe name does not resolve or the
    fragments cannot be extracted; otherwise whatever
    ``_assemble_alternatives`` produces.
    """
    # Multiplied prefixes carry a ready-made probe name; everything else
    # gets the substituent slice swapped for "astato".
    if node.kind == "multiplied_prefix" and hasattr(node, '_probe_name'):
        probe_name = node._probe_name
    else:
        probe_name = "".join(
            (full_name[:node.start], "astato", full_name[node.end + 1:])
        )

    probe_smiles = _name_to_smiles(probe_name)
    if probe_smiles is None:
        if verbose:
            print(f" Prefix At-probe failed: '{probe_name}'", file=sys.stderr)
        return []

    fragments = _get_fragments_via_at_probe(canonical_smiles, probe_smiles,
                                            verbose=verbose)
    if fragments is None:
        if verbose:
            print(f" Fragment extraction failed", file=sys.stderr)
        return []

    return _assemble_alternatives(fragments, canonical_smiles,
                                  verbose=verbose, max_depth=max_depth,
                                  _deadline=_deadline)
1303
+
1304
+
1305
+ # ---------------------------------------------------------------------------
1306
+ # Helpers for recursive assembly
1307
+ # ---------------------------------------------------------------------------
1308
+
1309
def _parent_name_from_bracket_yl(yl_text: str) -> Optional[str]:
    """Turn a bracket-group '-<n>-yl' text back into its parent name.

    Example: '2-morpholino-4-phenylquinolin-3-yl'
      -> drop '-3-yl'  -> '2-morpholino-4-phenylquinolin'
      -> append 'e'    -> '2-morpholino-4-phenylquinoline'

    The form with a restored trailing 'e' is tried first, then the bare
    stem.  Returns the first form that resolves to SMILES, or ``None``
    when the text does not end in '-<digits>-yl' or nothing resolves.
    """
    locant_yl = re.search(r'-(\d+)-yl$', yl_text)
    if locant_yl is None:
        return None
    stem = yl_text[:locant_yl.start()]
    # Most ring names drop a trailing 'e' to form -yl
    # (quinoline -> quinolin-yl, pyridine -> pyridin-yl).
    resolvable = (
        form for form in (stem + 'e', stem)
        if _name_to_smiles(form) is not None
    )
    return next(resolvable, None)
1329
+
1330
+
1331
def _insert_prefix_by_locant(name: str, locant: str,
                             prefix_text: str) -> str:
    """Insert '{locant}-{prefix_text}-' at the correct numerical position.

    Only top-level locants (digit runs outside brackets that are directly
    followed by '-') are considered.  The new prefix goes in front of the
    first such locant that is numerically greater than *locant*.

    When no greater locant exists:
      * if the last top-level locant introduces a '(' group, the new prefix
        is placed right after that group's closing bracket;
      * otherwise it is prepended to the name (locant order may then be
        non-ascending).

    Raises ``ValueError`` if *locant* is not an integer string.

    >>> _insert_prefix_by_locant('2-morpholino-4-phenylquinoline',
    ...                          '3', '(phenylmethanol-yl)')
    '2-morpholino-3-(phenylmethanol-yl)-4-phenylquinoline'
    """
    target = int(locant)
    length = len(name)

    # One pass: record (start, stop) of every top-level "digits-" locant.
    locant_spans = []
    nesting = 0
    pos = 0
    while pos < length:
        char = name[pos]
        if char in '([':
            nesting += 1
        elif char in ')]':
            nesting -= 1
        elif nesting == 0 and char.isdigit():
            stop = pos
            while stop < length and name[stop].isdigit():
                stop += 1
            if stop < length and name[stop] == '-':
                locant_spans.append((pos, stop))
            pos = stop
            continue
        pos += 1

    # Preferred spot: just before the first larger locant.
    for start, stop in locant_spans:
        if int(name[start:stop]) > target:
            return name[:start] + f"{locant}-{prefix_text}-" + name[start:]

    # No larger locant.  If the last locant is followed by a bracketed
    # prefix, insert right behind that bracket group.
    if locant_spans:
        after_dash = locant_spans[-1][1] + 1
        if after_dash < length and name[after_dash] == '(':
            open_brackets = 1
            cursor = after_dash + 1
            while cursor < length and open_brackets > 0:
                if name[cursor] in '([':
                    open_brackets += 1
                elif name[cursor] in ')]':
                    open_brackets -= 1
                cursor += 1
            # cursor now sits just past the closing bracket.
            tail = name[cursor:]
            joiner = "-" if tail and tail[0].isdigit() else ""
            return name[:cursor] + f"-{locant}-{prefix_text}{joiner}" + tail

    # Last resort: prepend.  A joining hyphen is only needed when the name
    # itself begins with a digit; a leading letter is the parent stem.
    joiner = "-" if name and name[0].isdigit() else ""
    return f"{locant}-{prefix_text}{joiner}" + name
1414
+
1415
+
1416
+ # ---------------------------------------------------------------------------
1417
+ # Alternative name generation
1418
+ # ---------------------------------------------------------------------------
1419
+
1420
def generate_alternative(full_name: str, canonical_smiles: str,
                         node: BracketNode,
                         verbose: bool = False,
                         max_depth: int = 0,
                         _deadline: Optional[float] = None,
                         ) -> List[Alternative]:
    """Generate alternative names by swapping parent and substituent at one bracket.

    The bracket slice ``full_name[node.start:node.end + 1]`` is replaced
    by "astato" to obtain an At-probe; the probe then drives fragmentation
    of the molecular graph so both fragments keep correct atom indices.

    Returns an empty list when the probe name does not resolve or the
    fragments cannot be extracted; otherwise the result of
    ``_assemble_alternatives`` (which also receives the bracket text).
    """
    probe_name = "".join(
        (full_name[:node.start], "astato", full_name[node.end + 1:])
    )
    probe_smiles = _name_to_smiles(probe_name)
    if probe_smiles is None:
        if verbose:
            print(f" At-probe failed: '{probe_name}'", file=sys.stderr)
        return []

    # Extract parent and substituent fragments from the molecular graph
    fragments = _get_fragments_via_at_probe(canonical_smiles, probe_smiles,
                                            verbose=verbose)
    if fragments is None:
        if verbose:
            print(f" Fragment extraction failed", file=sys.stderr)
        return []

    return _assemble_alternatives(fragments, canonical_smiles,
                                  verbose=verbose, max_depth=max_depth,
                                  _deadline=_deadline,
                                  _bracket_yl_text=node.text)
1452
+
1453
+
1454
+ def _assemble_alternatives(frags: FragmentResult, canonical_smiles: str,
1455
+ verbose: bool = False,
1456
+ max_depth: int = 0,
1457
+ _deadline: Optional[float] = None,
1458
+ _bracket_yl_text: str = "",
1459
+ ) -> List[Alternative]:
1460
+ """Shared assembly logic for both bracket and prefix substituents.
1461
+
1462
+ Given parent/sub fragments (with correct mol objects and attachment indices),
1463
+ construct -yl form of parent, get locant on new parent, assemble via
1464
+ replace-hal, and validate.
1465
+ """
1466
+ alternatives = []
1467
+
1468
+ # Name both fragments
1469
+ parent_name = _smiles_to_name(frags.parent_smi)
1470
+ sub_parent_name = _smiles_to_name(frags.sub_smi)
1471
+ if parent_name is None or sub_parent_name is None:
1472
+ if verbose:
1473
+ print(f" Could not name fragments: parent={frags.parent_smi} "
1474
+ f"sub={frags.sub_smi}", file=sys.stderr)
1475
+ return alternatives
1476
+
1477
+ # Get locant on current parent
1478
+ # Use the mol object directly to preserve atom indices
1479
+ parent_locant_result = _add_at(frags.parent_mol, frags.parent_attach_idx)
1480
+ parent_locant = None
1481
+ if parent_locant_result:
1482
+ _, parent_at_smi = parent_locant_result
1483
+ parent_at_name = _smiles_to_name(parent_at_smi)
1484
+ if parent_at_name:
1485
+ m = re.search(r'(\d+)-astato', parent_at_name, re.IGNORECASE)
1486
+ if m:
1487
+ parent_locant = m.group(1)
1488
+ elif 'astato' in parent_at_name.lower():
1489
+ parent_locant = ""
1490
+
1491
+ # Construct -yl form candidates for the old parent
1492
+ yl_candidates = construct_yl_form(parent_name, parent_locant or "")
1493
+
1494
+ # Se-probe: often gives superior -yl forms (e.g. formyl, acetyl,
1495
+ # hydroxy(phenyl)methyl) that construct_yl_form cannot derive.
1496
+ se_yl = _get_yl_via_selenyl_probe(
1497
+ frags.parent_mol, frags.parent_attach_idx, verbose=verbose)
1498
+ if se_yl and se_yl not in yl_candidates:
1499
+ yl_candidates.insert(0, se_yl)
1500
+
1501
+ if verbose:
1502
+ print(f" Sub fragment: {frags.sub_smi} → '{sub_parent_name}'",
1503
+ file=sys.stderr)
1504
+ print(f" Parent: {frags.parent_smi} → '{parent_name}' "
1505
+ f"(locant={parent_locant})", file=sys.stderr)
1506
+ print(f" -yl candidates: {yl_candidates}", file=sys.stderr)
1507
+
1508
+ # Get locant on new parent (the old substituent) using mol object
1509
+ # Check if attachment is on a heteroatom (O, N, S) — needs special handling
1510
+ # BUT: ring heteroatoms (like N in morpholine) work fine with At-probe,
1511
+ # only exocyclic heteroatoms (O in ethers, N in amines) need special path
1512
+ sub_attach_atom = frags.sub_mol.GetAtomWithIdx(frags.sub_attach_idx)
1513
+ is_heteroatom = (sub_attach_atom.GetAtomicNum() in (7, 8, 16)
1514
+ and not sub_attach_atom.IsInRing()) # exocyclic only
1515
+
1516
+ new_parent_at_name = None
1517
+ new_parent_locant = None
1518
+ heteroatom_yl_suffix = None # e.g., "pyridin-4-yloxy"
1519
+
1520
+ if is_heteroatom:
1521
+ # Heteroatom pathway: can't add At to O/N/S directly
1522
+ het_num = sub_attach_atom.GetAtomicNum()
1523
+ if verbose:
1524
+ print(f" Heteroatom attachment: {sub_attach_atom.GetSymbol()} "
1525
+ f"(Z={het_num})", file=sys.stderr)
1526
+
1527
+ # Step A: Get yl+suffix via acid probe through heteroatom
1528
+ heteroatom_yl_suffix = _get_yl_suffix_via_acid(
1529
+ frags.parent_mol, frags.parent_attach_idx, het_num,
1530
+ verbose=verbose)
1531
+
1532
+ # Step B: Get locant by replacing heteroatom with At
1533
+ loc_result = _get_locant_replace_heteroatom(
1534
+ frags.sub_mol, frags.sub_attach_idx, verbose=verbose)
1535
+ if loc_result is not None:
1536
+ new_parent_at_name, new_parent_locant = loc_result
1537
+
1538
+ if new_parent_at_name is None or heteroatom_yl_suffix is None:
1539
+ if verbose:
1540
+ print(f" Heteroatom pathway failed: at_name={new_parent_at_name} "
1541
+ f"yl_suffix={heteroatom_yl_suffix}", file=sys.stderr)
1542
+ return alternatives
1543
+ else:
1544
+ # Normal pathway: At directly on carbon
1545
+ at_result = _add_at(frags.sub_mol, frags.sub_attach_idx)
1546
+ if at_result is None:
1547
+ if verbose:
1548
+ print(f" At addition to sub failed at "
1549
+ f"{sub_attach_atom.GetSymbol()}"
1550
+ f"(idx={frags.sub_attach_idx})", file=sys.stderr)
1551
+ return alternatives
1552
+ _, new_parent_at_smi = at_result
1553
+ new_parent_at_name = _smiles_to_name(new_parent_at_smi)
1554
+ if new_parent_at_name is None:
1555
+ if verbose:
1556
+ print(f" ChemScript can't name At-probe: "
1557
+ f"{new_parent_at_smi}", file=sys.stderr)
1558
+ return alternatives
1559
+
1560
+ m = re.search(r'(\d+)-astato', new_parent_at_name, re.IGNORECASE)
1561
+ if m:
1562
+ new_parent_locant = m.group(1)
1563
+ elif 'astato' in new_parent_at_name.lower():
1564
+ new_parent_locant = ""
1565
+
1566
+ if verbose:
1567
+ print(f" New parent At-probe: '{new_parent_at_name}' "
1568
+ f"(locant={new_parent_locant})", file=sys.stderr)
1569
+ if heteroatom_yl_suffix:
1570
+ print(f" Heteroatom yl+suffix: '{heteroatom_yl_suffix}'",
1571
+ file=sys.stderr)
1572
+
1573
+ # Assemble alternatives via replace-hal
1574
+ # For heteroatom cases, use the acid-derived yl+suffix form instead
1575
+ if is_heteroatom and heteroatom_yl_suffix:
1576
+ all_yl_forms = [heteroatom_yl_suffix]
1577
+ else:
1578
+ all_yl_forms = list(yl_candidates)
1579
+
1580
+ for yl_form in all_yl_forms:
1581
+ if new_parent_locant:
1582
+ # Derive locanted sub-parent from the At-probe name by stripping
1583
+ # the astatine prefix. E.g. "1-astato-4-fluorobenzene" → "4-fluorobenzene".
1584
+ # This preserves locant info that the canonical sub_parent_name
1585
+ # (e.g. "fluorobenzene") lacks, enabling correct prefix ordering.
1586
+ at_prefix = f"{new_parent_locant}-astato"
1587
+ if new_parent_at_name.lower().startswith(at_prefix.lower()):
1588
+ locanted_parent = new_parent_at_name[len(at_prefix):]
1589
+ if locanted_parent.startswith("-"):
1590
+ locanted_parent = locanted_parent[1:]
1591
+ else:
1592
+ locanted_parent = sub_parent_name
1593
+ for strat, assembled in [
1594
+ ("replace-hal-noparens",
1595
+ _insert_prefix_by_locant(
1596
+ locanted_parent, new_parent_locant, yl_form)),
1597
+ ("replace-hal-parens",
1598
+ _insert_prefix_by_locant(
1599
+ locanted_parent, new_parent_locant,
1600
+ f"({yl_form})")),
1601
+ ]:
1602
+ valid = _validate_name(assembled, canonical_smiles)
1603
+ if verbose:
1604
+ tag = "VALID" if valid else "INVALID"
1605
+ print(f" Assembled ({strat}): '{assembled}' [{tag}]",
1606
+ file=sys.stderr)
1607
+ alternatives.append(Alternative(
1608
+ name=assembled,
1609
+ parent_name=sub_parent_name,
1610
+ sub_name=yl_form,
1611
+ locant=new_parent_locant or "",
1612
+ valid=valid,
1613
+ strategy=strat,
1614
+ ))
1615
+ if valid:
1616
+ break # skip more-bracketed variants
1617
+ continue
1618
+
1619
+ if "astato" in new_parent_at_name.lower():
1620
+ # Try noparens first; skip parens if noparens validates
1621
+ for strat, repl in [("replace-hal-noparens", yl_form),
1622
+ ("replace-hal-parens", f"({yl_form})")]:
1623
+ assembled = re.sub(
1624
+ r'\d*-?astato', repl, new_parent_at_name,
1625
+ flags=re.IGNORECASE
1626
+ )
1627
+ valid = _validate_name(assembled, canonical_smiles)
1628
+ if verbose:
1629
+ tag = "VALID" if valid else "INVALID"
1630
+ print(f" Assembled ({strat}): '{assembled}' [{tag}]",
1631
+ file=sys.stderr)
1632
+ alternatives.append(Alternative(
1633
+ name=assembled,
1634
+ parent_name=sub_parent_name,
1635
+ sub_name=yl_form,
1636
+ locant=new_parent_locant or "",
1637
+ valid=valid,
1638
+ strategy=strat,
1639
+ ))
1640
+ if valid:
1641
+ break # skip more-bracketed variants
1642
+
1643
+ # Fallback: if no valid alternatives from construct_yl_form, try acid probe
1644
+ has_valid = any(a.valid for a in alternatives)
1645
+ if not has_valid:
1646
+ acid_yl = _get_yl_via_acid_probe(
1647
+ frags.parent_mol, frags.parent_attach_idx, verbose=verbose
1648
+ )
1649
+ if acid_yl and acid_yl not in yl_candidates:
1650
+ if verbose:
1651
+ print(f" Acid probe -yl: '{acid_yl}'", file=sys.stderr)
1652
+ # Try assembly with acid-probe -yl form
1653
+ # Try noparens first; skip parens if noparens validates
1654
+ if new_parent_locant:
1655
+ pattern = f"{new_parent_locant}-astato"
1656
+ if pattern in new_parent_at_name:
1657
+ for strat, assembled in [
1658
+ ("acid-probe-noparens",
1659
+ new_parent_at_name.replace(
1660
+ pattern, f"{new_parent_locant}-{acid_yl}")),
1661
+ ("acid-probe-parens",
1662
+ new_parent_at_name.replace(
1663
+ pattern, f"{new_parent_locant}-({acid_yl})")),
1664
+ ]:
1665
+ valid = _validate_name(assembled, canonical_smiles)
1666
+ if verbose:
1667
+ tag = "VALID" if valid else "INVALID"
1668
+ print(f" Assembled ({strat}): '{assembled}' "
1669
+ f"[{tag}]", file=sys.stderr)
1670
+ alternatives.append(Alternative(
1671
+ name=assembled,
1672
+ parent_name=sub_parent_name,
1673
+ sub_name=acid_yl,
1674
+ locant=new_parent_locant or "",
1675
+ valid=valid,
1676
+ strategy=strat,
1677
+ ))
1678
+ if valid:
1679
+ break # skip more-bracketed variants
1680
+ elif "astato" in new_parent_at_name.lower():
1681
+ for strat, repl in [("acid-probe-noparens", acid_yl),
1682
+ ("acid-probe-parens", f"({acid_yl})")]:
1683
+ assembled = re.sub(
1684
+ r'\d*-?astato', repl, new_parent_at_name,
1685
+ flags=re.IGNORECASE
1686
+ )
1687
+ valid = _validate_name(assembled, canonical_smiles)
1688
+ if verbose:
1689
+ tag = "VALID" if valid else "INVALID"
1690
+ print(f" Assembled ({strat}): '{assembled}' "
1691
+ f"[{tag}]", file=sys.stderr)
1692
+ alternatives.append(Alternative(
1693
+ name=assembled,
1694
+ parent_name=sub_parent_name,
1695
+ sub_name=acid_yl,
1696
+ locant=new_parent_locant or "",
1697
+ valid=valid,
1698
+ strategy=strat,
1699
+ ))
1700
+ if valid:
1701
+ break # skip more-bracketed variants
1702
+
1703
+ # Recursive decomposition: try alternative parent names for sub-fragment
1704
+ # max_depth: -1 = unlimited (until timeout), 0 = disabled, >0 = N levels
1705
+ if max_depth != 0 and new_parent_locant:
1706
+ if _deadline is not None and time.monotonic() > _deadline:
1707
+ if verbose:
1708
+ print(f" Skipping recursive decomposition (timeout)",
1709
+ file=sys.stderr)
1710
+ else:
1711
+ if verbose:
1712
+ print(f" Recursive decomposition of sub-fragment "
1713
+ f"(max_depth={max_depth})...", file=sys.stderr)
1714
+ next_depth = max_depth - 1 if max_depth > 0 else max_depth
1715
+ sub_decomp = decompose_name(frags.sub_smi, max_depth=next_depth,
1716
+ verbose=verbose, _deadline=_deadline)
1717
+
1718
+ # Collect recursive alt parent names (deduplicated)
1719
+ recursive_parents = []
1720
+ seen_parents = set()
1721
+ sub_canon = _canonical(frags.sub_smi)
1722
+ for sub_alt in sub_decomp.alternatives:
1723
+ if sub_alt.valid and sub_alt.name not in seen_parents:
1724
+ if sub_alt.name != sub_parent_name:
1725
+ seen_parents.add(sub_alt.name)
1726
+ recursive_parents.append(sub_alt.name)
1727
+
1728
+ # Bracket-text shortcut: the bracket content may encode a
1729
+ # flat-prefix parent name unreachable by recursive decomp
1730
+ # (e.g. "2-morpholino-4-phenylquinolin-3-yl"
1731
+ # → "2-morpholino-4-phenylquinoline")
1732
+ if _bracket_yl_text:
1733
+ bt_parent = _parent_name_from_bracket_yl(_bracket_yl_text)
1734
+ if bt_parent and bt_parent not in seen_parents:
1735
+ bt_smi = _name_to_smiles(bt_parent)
1736
+ if (bt_smi and sub_canon
1737
+ and _canonical(bt_smi) == sub_canon):
1738
+ if verbose:
1739
+ print(f" Bracket-text parent: '{bt_parent}'",
1740
+ file=sys.stderr)
1741
+ seen_parents.add(bt_parent)
1742
+ recursive_parents.append(bt_parent)
1743
+
1744
+ for alt_parent in recursive_parents:
1745
+ if verbose:
1746
+ print(f" Recursive alt: '{alt_parent}'",
1747
+ file=sys.stderr)
1748
+ for yl_form in all_yl_forms:
1749
+ # Try noparens first; skip more-bracketed if valid
1750
+ for strat, assembled in [
1751
+ ("recursive-noparens",
1752
+ _insert_prefix_by_locant(
1753
+ alt_parent, new_parent_locant,
1754
+ yl_form)),
1755
+ ("recursive-parens",
1756
+ _insert_prefix_by_locant(
1757
+ alt_parent, new_parent_locant,
1758
+ f"({yl_form})")),
1759
+ ("recursive-brackets",
1760
+ _insert_prefix_by_locant(
1761
+ alt_parent, new_parent_locant,
1762
+ f"[{yl_form}]")),
1763
+ ]:
1764
+ valid = _validate_name(assembled, canonical_smiles)
1765
+ if verbose:
1766
+ tag = "VALID" if valid else "INVALID"
1767
+ print(f" Recursive ({strat}): "
1768
+ f"'{assembled}' [{tag}]",
1769
+ file=sys.stderr)
1770
+ alternatives.append(Alternative(
1771
+ name=assembled,
1772
+ parent_name=alt_parent,
1773
+ sub_name=yl_form,
1774
+ locant=new_parent_locant,
1775
+ valid=valid,
1776
+ strategy=strat,
1777
+ ))
1778
+ if valid:
1779
+ break # skip more-bracketed variants
1780
+
1781
+ # Se-probe reverse assembly: the Se-probe yl of the old parent may
1782
+ # encode a suffix→prefix conversion (e.g. "-3-carbaldehyde" becomes
1783
+ # "3-formyl-" inside the yl text). Extract the converted parent name
1784
+ # and insert the sub-fragment's prefix form at the attachment locant.
1785
+ if se_yl and max_depth != 0:
1786
+ yl_text = se_yl.strip('()[]')
1787
+ se_parent = _parent_name_from_bracket_yl(yl_text)
1788
+ if (se_parent and se_parent != sub_parent_name
1789
+ and se_parent != parent_name):
1790
+ # Verify the extracted parent resolves to the same molecule
1791
+ se_parent_smi = _name_to_smiles(se_parent)
1792
+ parent_canon = _canonical(frags.parent_smi)
1793
+ if (se_parent_smi and parent_canon
1794
+ and _canonical(se_parent_smi) == parent_canon):
1795
+ # Get the attachment locant from the yl text
1796
+ m_loc = re.search(r'-(\d+)-yl$', yl_text)
1797
+ if m_loc:
1798
+ se_locant = m_loc.group(1)
1799
+ # Compute sub-fragment prefix/yl forms
1800
+ sub_yl_candidates = construct_yl_form(
1801
+ sub_parent_name, new_parent_locant or "")
1802
+ sub_se_yl = _get_yl_via_selenyl_probe(
1803
+ frags.sub_mol, frags.sub_attach_idx,
1804
+ verbose=verbose)
1805
+ if sub_se_yl and sub_se_yl not in sub_yl_candidates:
1806
+ sub_yl_candidates.insert(0, sub_se_yl)
1807
+
1808
+ if verbose:
1809
+ print(f" Se-reverse parent: '{se_parent}' "
1810
+ f"(locant={se_locant})",
1811
+ file=sys.stderr)
1812
+ print(f" Sub-fragment yl candidates: "
1813
+ f"{sub_yl_candidates}", file=sys.stderr)
1814
+
1815
+ for sub_yl in sub_yl_candidates:
1816
+ # Try noparens first; skip parens if valid
1817
+ for strat, assembled in [
1818
+ ("se-reverse-noparens",
1819
+ _insert_prefix_by_locant(
1820
+ se_parent, se_locant, sub_yl)),
1821
+ ("se-reverse-parens",
1822
+ _insert_prefix_by_locant(
1823
+ se_parent, se_locant,
1824
+ f"({sub_yl})")),
1825
+ ]:
1826
+ valid = _validate_name(
1827
+ assembled, canonical_smiles)
1828
+ if verbose:
1829
+ tag = "VALID" if valid else "INVALID"
1830
+ print(f" Se-reverse ({strat}): "
1831
+ f"'{assembled}' [{tag}]",
1832
+ file=sys.stderr)
1833
+ alternatives.append(Alternative(
1834
+ name=assembled,
1835
+ parent_name=se_parent,
1836
+ sub_name=sub_yl,
1837
+ locant=se_locant,
1838
+ valid=valid,
1839
+ strategy=strat,
1840
+ ))
1841
+ if valid:
1842
+ break # skip more-bracketed variants
1843
+
1844
+ return alternatives
1845
+
1846
+
1847
def _validate_name(name: str, expected_canonical: str) -> bool:
    """Report whether *name* round-trips to ``expected_canonical``.

    The name is converted to SMILES with ``_name_to_smiles`` and the
    result is canonicalised with ``_canonical``.  If either step yields
    ``None`` the name is treated as not valid.

    Args:
        name: Chemical name to check.
        expected_canonical: Canonical SMILES the name must reproduce.

    Returns:
        ``True`` only when the canonicalised SMILES of *name* equals
        ``expected_canonical``.
    """
    resolved = _name_to_smiles(name)
    # Skip canonicalisation entirely when the name could not be resolved.
    canonical = _canonical(resolved) if resolved is not None else None
    return canonical is not None and canonical == expected_canonical
1856
+
1857
+
1858
+ # ---------------------------------------------------------------------------
1859
+ # Suffix → prefix conversion
1860
+ # ---------------------------------------------------------------------------
1861
+
1862
# Rows are (suffix, prefix_form, terminal_e_elided_before_suffix).
#
# Ordering invariant: longest suffix first, so that a suffix is always
# tried before any shorter suffix it ends with (e.g. "thiol" before "ol").
# The third field says whether the parent stem lost its terminal "e" in
# front of the suffix and needs it restored ("pyridin-4-ol" -> "pyridine").
_SUFFIX_PREFIX_MAP = [
    ("carboxylic acid", "carboxy", False),
    ("sulfonic acid", "sulfo", False),
    ("carbonitrile", "cyano", False),
    ("carbaldehyde", "formyl", False),
    ("sulfonamide", "sulfamoyl", False),
    ("carboxamide", "carbamoyl", False),
    ("amine", "amino", True),
    ("thiol", "sulfanyl", False),
    ("one", "oxo", True),
    ("ol", "hydroxy", True),
]
1877
+
1878
+
1879
def _suffix_to_prefix_alternatives(canonical_name: str,
                                   canonical_smiles: str,
                                   verbose: bool = False,
                                   ) -> List[Alternative]:
    """Re-express a locanted IUPAC suffix as a substituent prefix.

    E.g. pyridin-4-amine → 4-aminopyridine,
         cyclohexan-1-ol → 1-hydroxycyclohexane.

    The first row of ``_SUFFIX_PREFIX_MAP`` whose suffix ends the name and
    is preceded by ``{stem}-{locant}-`` is used; no further rows are tried
    after that.  Multiplied suffixes (comma-separated locants such as
    ``1,4-diamine``) are skipped.

    Args:
        canonical_name: Name to convert.
        canonical_smiles: Canonical SMILES used to validate the result.
        verbose: Print the assembled name and its validity to stderr.

    Returns:
        A list with at most one ``Alternative`` (strategy
        ``"suffix-to-prefix"``); its ``valid`` flag records whether the
        assembled name passed ``_validate_name``.
    """
    for suffix, prefix, restore_e in _SUFFIX_PREFIX_MAP:
        if not canonical_name.endswith(suffix):
            continue

        # Whatever precedes the suffix must read "{stem}-{locant(s)}-".
        head = canonical_name[:-len(suffix)]
        parsed = re.match(r'^(.+)-(\d+(?:,\d+)*)-$', head)
        if parsed is None:
            continue
        stem, locant = parsed.groups()

        # Multiplied locants (e.g. benzene-1,4-diamine) are not handled.
        if ',' in locant:
            continue

        # Put back the terminal 'e' that was elided before the suffix.
        parent = stem + 'e' if restore_e else stem
        candidate = _insert_prefix_by_locant(parent, locant, prefix)
        is_valid = _validate_name(candidate, canonical_smiles)

        if verbose:
            tag = "VALID" if is_valid else "INVALID"
            print(f" Suffix→prefix ({suffix}→{prefix}): "
                  f"'{candidate}' [{tag}]", file=sys.stderr)

        # One principal characteristic group per name: stop at first hit.
        return [Alternative(
            name=candidate,
            parent_name=parent,
            sub_name=prefix,
            locant=locant,
            valid=is_valid,
            strategy="suffix-to-prefix",
        )]

    return []
1934
+
1935
+
1936
def _deduplicate_alternatives(alternatives: List[Alternative],
                              verbose: bool = False) -> List[Alternative]:
    """Remove redundant alternatives: bracket-only variants and
    single-position synonym variants.

    Only alternatives whose ``valid`` flag is set take part; invalid ones
    are passed through unchanged.

    **Step 1 — bracket-stripped dedup.**  Valid names are grouped by their
    bracket-stripped form (all ``()``, ``[]`` removed).  Within each group
    only the name with the fewest bracket characters is kept (ties broken
    by shortest total length).

    **Step 2 — single-segment synonym collapse.**  Survivors are sorted
    shortest-first.  For any two names that differ in exactly one
    contiguous segment, where one segment is wrapped in outer brackets and
    the other is not, the bracketed variant is discarded — but only when
    the bracket content and the unbracketed segment share at least four
    leading characters (e.g. ``morpholino`` vs ``(morpholin-4-yl)``).

    Args:
        alternatives: Candidate alternatives; each item needs ``name`` and
            ``valid`` attributes.
        verbose: Print every removal to stderr.

    Returns:
        The input list itself when it holds at most one valid entry.
        Otherwise a new list: surviving valid alternatives ordered by
        ``(len(name), name)``, followed by all invalid alternatives in
        their original order.
    """
    valid = [a for a in alternatives if a.valid]
    invalid = [a for a in alternatives if not a.valid]

    if len(valid) <= 1:
        return alternatives

    def _strip_brackets(name: str) -> str:
        return ''.join(c for c in name if c not in '()[]')

    def _bracket_count(name: str) -> int:
        return sum(1 for c in name if c in '()[]')

    def _has_outer_brackets(s: str) -> bool:
        # Looks only at the first and last character of the segment.
        return (len(s) >= 2
                and ((s[0] == '(' and s[-1] == ')')
                     or (s[0] == '[' and s[-1] == ']')))

    def _common_prefix_len(a: str, b: str) -> int:
        n = 0
        for ca, cb in zip(a, b):
            if ca != cb:
                break
            n += 1
        return n

    def _common_suffix_len(a: str, b: str, limit: int) -> int:
        # ``limit`` keeps the suffix from overlapping the common prefix.
        n = 0
        while n < limit and a[-(n + 1)] == b[-(n + 1)]:
            n += 1
        return n

    # --- Step 1: bracket-stripped dedup ---
    groups: dict = {}
    for alt in valid:
        groups.setdefault(_strip_brackets(alt.name), []).append(alt)

    step1: List[Alternative] = []
    for group in groups.values():
        best = min(group, key=lambda a: (_bracket_count(a.name), len(a.name)))
        step1.append(best)
        if verbose and len(group) > 1:
            removed = [a.name for a in group if a is not best]
            print(f" Dedup (bracket-strip): kept '{best.name}', "
                  f"removed {removed}", file=sys.stderr)

    # --- Step 2: single-segment synonym collapse ---
    # Sort shortest-first so shorter names are visited first.
    step1.sort(key=lambda a: (len(a.name), a.name))

    to_remove: set = set()
    for i, alt_i in enumerate(step1):
        if i in to_remove:
            continue
        name_i = alt_i.name
        for j in range(i + 1, len(step1)):
            if j in to_remove:
                continue
            name_j = step1[j].name

            # Isolate the one segment in which the two names differ.
            pfx = _common_prefix_len(name_i, name_j)
            sfx = _common_suffix_len(
                name_i, name_j, min(len(name_i), len(name_j)) - pfx)
            mid_i = name_i[pfx:len(name_i) - sfx]
            mid_j = name_j[pfx:len(name_j) - sfx]
            if not mid_i or not mid_j:
                continue

            i_outer = _has_outer_brackets(mid_i)
            j_outer = _has_outer_brackets(mid_j)
            if i_outer == j_outer:
                continue

            # Only collapse if the bracket content shares a common stem
            # (≥ 4 chars) with the non-bracketed form — this avoids
            # collapsing genuinely different yl-forms (e.g.
            # "hydroxy(phenyl)methyl" vs "phenylmethanol-yl").
            if i_outer:
                inner, other = mid_i[1:-1], mid_j
            else:
                inner, other = mid_j[1:-1], mid_i
            if _common_prefix_len(inner, other) < 4:
                continue

            if i_outer:
                to_remove.add(i)
                if verbose:
                    # name_j sorts after name_i, so it is never the shorter
                    # one; it is kept because it is the unbracketed form.
                    print(f" Dedup (synonym): removed '{name_i}' "
                          f"(kept unbracketed '{name_j}')",
                          file=sys.stderr)
                break  # i is removed, skip remaining j

            to_remove.add(j)
            if verbose:
                print(f" Dedup (synonym): removed '{name_j}' "
                      f"(kept shorter '{name_i}')",
                      file=sys.stderr)

    step2 = [alt for idx, alt in enumerate(step1) if idx not in to_remove]

    return step2 + invalid
2066
+
2067
+
2068
+ # ---------------------------------------------------------------------------
2069
+ # Space-separated yl-group alternatives
2070
+ # ---------------------------------------------------------------------------
2071
+
2072
# Ring-name stems recognised in "ring-N-yl" patterns (e.g. pyridin-4-yl).
# Entry order is preserved as-is; consumers sort by length themselves.
_YL_RING_STEMS = [
    # benzo-fused six-membered N-heterocycles
    "quinolin",
    "isoquinolin",
    "quinoxalin",
    "quinazolin",
    # monocyclic azines
    "pyridin",
    "pyrimidin",
    "pyrazin",
    "pyridazin",
    # saturated heterocycles
    "morpholin",
    "piperidin",
    "piperazin",
    "pyrrolidin",
    # benzo-fused five-membered heterocycles
    "indol",
    "benzimidazol",
    "benzothiazol",
    "benzofuran",
    "benzoxazol",
    # larger polycycles
    "naphthal",
    "acridin",
    "carbazol",
    "phenanthrol",
    # five-membered heteroaromatics
    "thien",
    "furan",
    "pyrrol",
    "imidazol",
    "oxazol",
    "thiazol",
    # nitrogen-rich rings
    "triazin",
    "tetrazol",
    "triazol",
    "oxadiazol",
    # for phenylbenzoate etc.
    "phenyl",
]
2083
+
2084
+
2085
def _space_sep_yl_alternatives(
    canonical_name: str,
    canonical_smiles: str,
    verbose: bool = False,
    max_depth: int = 0,
    _deadline: Optional[float] = None,
) -> List[Alternative]:
    """Generate alternatives for space-separated names with embedded yl-groups.

    Handles names like "tert-butyl pyridin-4-ylcarbamate" where a ring-yl
    pattern is fused into the name without brackets.  For every ring-yl
    match (stems taken from ``_YL_RING_STEMS``) that has a space somewhere
    before it and is directly followed by more text, the yl text is
    replaced with "astato", the probe name is resolved with
    ``_name_to_smiles``, fragments are extracted with
    ``_get_fragments_via_at_probe`` and the alternatives are built by
    ``_assemble_alternatives``.

    The intended result is a name such as
    "4-((tert-butoxycarbonyl)amino)pyridine" where the ring becomes the
    parent.

    Args:
        canonical_name: Name to scan; names without a space yield ``[]``.
        canonical_smiles: Canonical SMILES of the whole molecule.
        verbose: Print progress messages to stderr.
        max_depth: Forwarded to ``_assemble_alternatives``.
        _deadline: Forwarded to ``_assemble_alternatives``.

    Returns:
        All alternatives produced for every usable yl match, in match
        order; empty when nothing matched or every probe failed.
    """
    if ' ' not in canonical_name:
        return []

    alternatives: List[Alternative] = []

    # Longest stems first so "isoquinolin" wins over "quinolin".
    stem_pattern = '|'.join(re.escape(s) for s in
                            sorted(_YL_RING_STEMS, key=len, reverse=True))
    # Match ring stem + optional fused annotation + locant + yl
    # E.g.: pyridin-4-yl, quinolin-7-yl, thieno[2,3-d]pyrimidin-4-yl
    yl_re = re.compile(
        r'((?:' + stem_pattern + r')'
        r'(?:\[[^\]]+\])?'        # optional fused ring annotation [2,3-d]
        r'(?:[a-z]*)'             # optional stem continuation letters
        r'(?:-\d+(?:,\d+)*)?'     # optional locant(s)
        r'-yl)',
        re.IGNORECASE,
    )

    for m in yl_re.finditer(canonical_name):
        yl_text = m.group(1)  # e.g. "pyridin-4-yl"
        yl_start, yl_stop = m.start(), m.end()

        # The yl-group must come after a space (this also rejects a match
        # at position 0, where the text before it is empty) ...
        before_text = canonical_name[:yl_start]
        if ' ' not in before_text:
            continue
        # ... and be directly followed by a suffix, not end-of-name/space.
        after_text = canonical_name[yl_stop:]
        if not after_text or after_text.startswith(' '):
            continue  # yl at end of name or before space — not embedded

        if verbose:
            print(f" Space-sep yl: '{yl_text}' in '{canonical_name}'",
                  file=sys.stderr)
            print(f" before='{before_text}' after='{after_text}'",
                  file=sys.stderr)

        # At-probe: replace the yl text with "astato".
        at_name = before_text + "astato" + after_text
        if verbose:
            print(f" At-probe name: '{at_name}'", file=sys.stderr)

        at_smi = _name_to_smiles(at_name)
        if at_smi is None:
            if verbose:
                print(f" At-probe failed", file=sys.stderr)
            continue

        frags = _get_fragments_via_at_probe(canonical_smiles, at_smi,
                                            verbose=verbose)
        if frags is None:
            if verbose:
                print(f" Fragment extraction failed", file=sys.stderr)
            continue

        alts = _assemble_alternatives(
            frags, canonical_smiles, verbose=verbose,
            max_depth=max_depth, _deadline=_deadline,
            _bracket_yl_text=yl_text,
        )
        alternatives.extend(alts)

        if verbose:
            print(f" Generated {len(alts)} alternatives from space-sep yl",
                  file=sys.stderr)

    return alternatives
2186
+
2187
+
2188
+ # ---------------------------------------------------------------------------
2189
+ # Main decomposition
2190
+ # ---------------------------------------------------------------------------
2191
+
2192
+ def decompose_name(smiles: str, max_depth: int = -1,
2193
+ verbose: bool = False,
2194
+ timeout: Optional[float] = 30.0,
2195
+ _deadline: Optional[float] = None,
2196
+ ) -> DecompositionResult:
2197
+ """Main entry point: decompose an IUPAC name into alternatives.
2198
+
2199
+ 1. Get canonical name from ChemDraw
2200
+ 2. Parse bracket tree
2201
+ 3. Classify bracket groups
2202
+ 4. For each substituent group, generate alternative names
+ 5. If that produced nothing, fall back to scanning prefix substituents
+ 6. Also try suffix→prefix conversion and, for names containing a
+ space, embedded yl-groups
+ 7. Deduplicate the alternatives and try to fill ``canonical_parent``
2203
+
2204
+ Args:
+ smiles: Input SMILES. It is canonicalised with ``_canonical`` and
+ named with ``_smiles_to_name``.
2205
+ max_depth: Recursion depth limit. ``-1`` (default) = unlimited
2206
+ (recurse until timeout or convergence). ``0`` = no recursion.
2207
+ Positive integer = that many levels.
+ verbose: When true, progress messages are printed to stderr.
2208
+ timeout: Wall-clock seconds before recursive decomposition is
2209
+ skipped. Set to ``None`` to disable. Only used on the
2210
+ outermost call; recursive calls inherit the computed deadline
2211
+ via ``_deadline``.
+ _deadline: Absolute ``time.monotonic()`` deadline. When ``None``
+ and ``timeout`` is given it is computed here as now + timeout.
+
+ Returns:
+ A ``DecompositionResult``. If ``_canonical`` returns ``None`` the
+ result carries ``errors=["Invalid SMILES"]``; if ``_smiles_to_name``
+ returns ``None`` it carries
+ ``errors=["ChemDraw could not name this structure"]``. In both
+ cases ``canonical_name`` is empty and ``bracket_tree`` is ``None``.
2212
+ """
2213
+ # Compute deadline on the outermost call; inner calls inherit it.
2214
+ if _deadline is None and timeout is not None:
2215
+ _deadline = time.monotonic() + timeout
2216
+ canon_smi = _canonical(smiles)
2217
+ if canon_smi is None:
2218
+ return DecompositionResult(
2219
+ original_smiles=smiles, canonical_smiles="",
2220
+ canonical_name="", bracket_tree=None,
2221
+ errors=["Invalid SMILES"]
2222
+ )
2223
+
2224
+ canonical_name = _smiles_to_name(smiles)
2225
+ if canonical_name is None:
2226
+ return DecompositionResult(
2227
+ original_smiles=smiles, canonical_smiles=canon_smi,
2228
+ canonical_name="", bracket_tree=None,
2229
+ errors=["ChemDraw could not name this structure"]
2230
+ )
2231
+
2232
+ if verbose:
2233
+ print(f"\nCanonical name: {canonical_name}", file=sys.stderr)
2234
+
2235
+ tree = parse_bracket_tree(canonical_name)
2236
+
2237
+ if verbose:
2238
+ print(f"Top-level bracket groups: {len(tree.children)}",
2239
+ file=sys.stderr)
2240
+ for i, child in enumerate(tree.children):
2241
+ print(f" [{i}] depth={child.depth} "
2242
+ f"pos={child.start}-{child.end} "
2243
+ f"text='{child.text}'", file=sys.stderr)
2244
+
2245
+ result = DecompositionResult(
2246
+ original_smiles=smiles,
2247
+ canonical_smiles=canon_smi,
2248
+ canonical_name=canonical_name,
2249
+ bracket_tree=tree,
2250
+ )
2251
+
2252
+ # Collect ALL bracket nodes at all depths (depth-first, pre-order: each child is followed by its own descendants)
2253
+ def _collect_nodes(node):
2254
+ nodes = []
2255
+ for child in node.children:
2256
+ nodes.append(child)
2257
+ nodes.extend(_collect_nodes(child))
2258
+ return nodes
2259
+
2260
+ all_nodes = _collect_nodes(tree)
2261
+ if verbose:
2262
+ print(f"Total bracket nodes (all depths): {len(all_nodes)}",
2263
+ file=sys.stderr)
2264
+
2265
+ # Process all bracket groups at all depths
2266
+ for node in all_nodes:
2267
+ kind = classify_node(node)
2268
+ node.kind = kind
2269
+ if verbose:
2270
+ print(f"\n Bracket '({node.text})' depth={node.depth} → {kind}",
2271
+ file=sys.stderr)
2272
+
2273
+ if kind != "candidate":
2274
+ continue
2275
+
2276
+ # Validate as substituent via At-probe
2277
+ if not validate_as_substituent(canonical_name, node,
2278
+ verbose=verbose):
2279
+ node.kind = "invalid_sub"
2280
+ if verbose:
2281
+ print(f" At-probe validation failed", file=sys.stderr)
2282
+ continue
2283
+
2284
+ node.kind = "substituent"
2285
+
2286
+ # Generate alternatives
2287
+ alts = generate_alternative(
2288
+ canonical_name, canon_smi, node, verbose=verbose,
2289
+ max_depth=max_depth, _deadline=_deadline,
2290
+ )
2291
+ result.alternatives.extend(alts)
2292
+
2293
+ # Fallback: if no bracket groups found substituents, try prefix scanning
2294
+ if not result.alternatives:
2295
+ prefix_nodes = find_prefix_substituents(
2296
+ canonical_name, verbose=verbose
2297
+ )
2298
+ for pnode in prefix_nodes:
2299
+ if _at_probe_for_prefix(canonical_name, pnode, verbose=verbose):
2300
+ if pnode.kind not in ("multiplied_prefix",):
2301
+ pnode.kind = "prefix_substituent"
2302
+ alts = generate_alternative_from_prefix(
2303
+ canonical_name, canon_smi, pnode, verbose=verbose,
2304
+ max_depth=max_depth, _deadline=_deadline,
2305
+ )
2306
+ result.alternatives.extend(alts)
2307
+
2308
+ # Retry: if single-prefix nodes produced no valid alternatives,
2309
+ # try again with multi-prefix fallback (skip_single_prefix=True).
2310
+ # This handles cases like "2-chloro-4-phenylquinoline" where "chloro"
2311
+ # is found first as a single-prefix but can't produce useful alts.
2312
+ valid_alts = [a for a in result.alternatives if a.valid]
2313
+ if not valid_alts and prefix_nodes:
2314
+ if verbose:
2315
+ print(" Retrying with multi-prefix fallback...",
2316
+ file=sys.stderr)
2317
+ prefix_nodes2 = find_prefix_substituents(
2318
+ canonical_name, verbose=verbose,
2319
+ skip_single_prefix=True
2320
+ )
2321
+ for pnode in prefix_nodes2:
2322
+ if _at_probe_for_prefix(canonical_name, pnode,
2323
+ verbose=verbose):
2324
+ if pnode.kind not in ("multiplied_prefix",):
2325
+ pnode.kind = "prefix_substituent"
2326
+ alts = generate_alternative_from_prefix(
2327
+ canonical_name, canon_smi, pnode, verbose=verbose,
2328
+ max_depth=max_depth, _deadline=_deadline,
2329
+ )
2330
+ result.alternatives.extend(alts)
2331
+
2332
+ # Suffix→prefix conversion (e.g. pyridin-4-amine → 4-aminopyridine)
2333
+ suffix_alts = _suffix_to_prefix_alternatives(
2334
+ canonical_name, canon_smi, verbose=verbose)
2335
+ result.alternatives.extend(suffix_alts)
2336
+
2337
+ # Space-separated names with embedded yl-groups
2338
+ # (e.g. "tert-butyl pyridin-4-ylcarbamate" — no brackets, no prefix match)
2339
+ if ' ' in canonical_name:
2340
+ yl_alts = _space_sep_yl_alternatives(
2341
+ canonical_name, canon_smi, verbose=verbose,
2342
+ max_depth=max_depth, _deadline=_deadline,
2343
+ )
2344
+ result.alternatives.extend(yl_alts)
2345
+
2346
+ # Deduplicate alternatives:
2347
+ # 1. Remove exact-name duplicates and names identical to canonical
2348
+ seen: set = {canonical_name} # canonical is already listed separately
2349
+ unique: list = []
2350
+ for alt in result.alternatives:
2351
+ if alt.name not in seen:
2352
+ seen.add(alt.name)
2353
+ unique.append(alt)
2354
+ # 2. Remove bracket-only variants and single-position synonyms
2355
+ result.alternatives = _deduplicate_alternatives(unique, verbose=verbose)
2356
+
2357
+ # Infer canonical parent: for the canonical name, the parent is the
2358
+ # fragment that remains when the first substituent is removed.
2359
+ # We can extract this from the At-probe of the first substituent node.
2360
+ if result.alternatives:
2361
+ # The first alternative's parent_name is the OLD substituent that
2362
+ # became new parent — so the OLD parent is what the canonical name
2363
+ # uses. We can extract it from the At-probe: replace substituent
2364
+ # with At, resolve, remove At → parent fragment → name it.
2365
+ for alt in result.alternatives:
2366
+ if alt.valid:
2367
+ # NOTE: placeholder — this loop only stops at the first valid
2368
+ # alternative and assigns nothing, so it has no effect on the
2369
+ # result. canonical_parent is actually filled in by the
2370
+ # At-probe and suffix-scan code further down.
2371
+ break
2372
+
2373
+ # Try to determine canonical parent from prefix scan or bracket analysis
2374
+ if not result.canonical_parent:
2375
+ # For bracket names: parent is name minus bracket group text
2376
+ # For prefix names: parent is the suffix
2377
+ # Simplest heuristic: try removing first valid bracket group
2378
+ for node in all_nodes:
2379
+ if node.kind == "substituent":
2380
+ # At-probe: replace bracket with At → SMILES → remove At → name
2381
+ before = canonical_name[:node.start]
2382
+ after = canonical_name[node.end + 1:]
2383
+ probe = before + "astato" + after
2384
+ at_smi = _name_to_smiles(probe)
2385
+ if at_smi:
2386
+ split = _split_at_at(at_smi)
2387
+ if split:
2388
+ parent_smi, _, _ = split
2389
+ parent_name = _smiles_to_name(parent_smi)
2390
+ if parent_name:
2391
+ result.canonical_parent = parent_name
2392
+ break
2393
+
2394
+ # Fallback (only when the name has no '('): scan suffixes of the name, starting from the shortest (4 chars)
2395
+ if not result.canonical_parent and '(' not in canonical_name:
2396
+ for i in range(len(canonical_name) - 4, 0, -1):
2397
+ suffix = canonical_name[i:]
2398
+ if suffix[0].isalpha():
2399
+ smi = _name_to_smiles(suffix)
2400
+ if smi:
2401
+ result.canonical_parent = suffix
2402
+ break
2403
+
2404
+ return result
2405
+
2406
+
2407
+ # ---------------------------------------------------------------------------
2408
+ # R-group / placeholder handling
2409
+ # ---------------------------------------------------------------------------
2410
+
2411
# Dual-probe consensus uses two independent halogen probe sets.  The
# decomposition is run once per set, probe names are swapped for the
# R-labels, and only names on which both runs agree are kept.  A molecule
# that already contains a real halogen may collide with one set, but not
# with both, so the intersection drops the bad names.
#
# Row layout: (atomic_number, IUPAC_prefix, IUPAC_stem).  Within a set the
# first row serves the first label and the second row the second label
# (multi-R-group case).
_PROBE_SET_A = [
    (9, 'fluoro', 'fluor'),    # F
    (17, 'chloro', 'chlor'),   # Cl
]
_PROBE_SET_B = [
    (53, 'iodo', 'iod'),       # I
    (35, 'bromo', 'brom'),     # Br
]
2426
+
2427
+
2428
def _replace_probe_in_name(name: str, label: str,
                           probe_prefix: str = 'bromo',
                           probe_stem: str = 'brom') -> str:
    """Replace probe-atom name fragments with the R-group label.

    Only the FIRST match is replaced, to avoid clobbering legitimate atoms
    in the rest of the molecule.  Matching is case-insensitive.  The label
    is inserted wrapped in double quotes, and a ``-`` is added after it
    when the text that follows starts with a letter.

    Two patterns are tried in order:

    1. ``probe_prefix`` with an optional leading ``N-`` locant, which is
       kept: "4-fluoropyridine" -> '4-"R"-pyridine'.  This also covers the
       bracketed form: "(fluoro)" -> '("R")'.
    2. Any word starting with ``probe_stem``: "... fluoride" -> '... "R"'.

    Args:
        name: Chemical name containing the probe atom.
        label: R-group label to insert (without quotes).
        probe_prefix: Substituent-prefix spelling of the probe atom.
        probe_stem: Stem used as a fallback when the prefix is absent.

    Returns:
        The name with the first probe fragment replaced, or ``name``
        unchanged when neither pattern matches.
    """
    quoted = '"' + label + '"'

    def _separator(tail: str) -> str:
        # Keep the label visually apart from a directly following word.
        return "-" if tail and tail[0].isalpha() else ""

    # Exact prefix replacement (most common case).  A separate "(prefix)"
    # pattern is unnecessary: it could only match where this one already
    # does, and the result is the same.
    m = re.search(r'(\d+-)?' + re.escape(probe_prefix), name, re.IGNORECASE)
    if m:
        locant = m.group(1) or ""
        after = name[m.end():]
        return name[:m.start()] + locant + quoted + _separator(after) + after

    # Any remaining word that starts with the probe stem.
    m = re.search(re.escape(probe_stem) + r'\w*', name, re.IGNORECASE)
    if m:
        after = name[m.end():]
        return name[:m.start()] + quoted + _separator(after) + after

    return name
2462
+
2463
+
2464
@dataclass
class RGroupMapping:
    """Tracks an R-group label and its position in the molecule.

    Attributes:
        label: Text label, e.g. "R", "R1", "X", "Ar".
        atom_idx: Atom index of the dummy atom in the original SMILES.
        probe_atom_idx: Atom index of the halogen atom in the probed SMILES.
    """

    label: str
    atom_idx: int
    probe_atom_idx: int
2470
+
2471
+
2472
def _build_label_map(dummy_indices: List[int],
                     labels) -> dict:
    """Return ``{atom_idx: label}`` for the given dummy-atom indices.

    Args:
        dummy_indices: Atom indices of the dummy atoms.
        labels: One of
            - ``None``: a single index is labelled ``"R"``; several are
              labelled ``"R1"``, ``"R2"``, ... in order;
            - list/tuple: labels are paired with the indices in order, and
              indices beyond the end of the sequence get ``"R<position>"``
              (1-based);
            - anything else: passed to ``dict()`` and returned as a copy.

    Returns:
        A new dict mapping atom index to label text.
    """
    if labels is None:
        if len(dummy_indices) == 1:
            return {dummy_indices[0]: "R"}
        return {atom_idx: f"R{number}"
                for number, atom_idx in enumerate(dummy_indices, 1)}

    if not isinstance(labels, (list, tuple)):
        return dict(labels)

    supplied = len(labels)
    return {
        atom_idx: (labels[pos] if pos < supplied else f"R{pos+1}")
        for pos, atom_idx in enumerate(dummy_indices)
    }
2487
+
2488
+
2489
def prepare_rgroup_smiles(smiles: str,
                          labels=None,
                          probe_set=None,
                          label_probe_map=None,
                          ) -> Tuple[Optional[str], List[RGroupMapping]]:
    """Replace dummy atoms (*) with halogen probe atoms.

    Args:
        smiles: SMILES string, possibly containing [*] dummy atoms.
        labels: Optional. Can be:
            - dict mapping atom index -> label string
            - list of label strings (matched to dummy atoms in order)
            - None: auto-generate "R" for a single dummy atom, otherwise
              "R1", "R2", ...
        probe_set: List of (atomic_num, prefix, stem) tuples. Defaults to
            _PROBE_SET_A when None or empty. When label_probe_map is given,
            only its first entry is used, as the fallback for labels missing
            from the map.
        label_probe_map: Explicit {label: (atomic_num, prefix, stem)} dict.
            Takes precedence over probe_set when non-empty.

    Returns:
        (probed_smiles, mappings) where probed_smiles has halogens instead
        of *, and mappings tracks which atoms were replaced.
        Returns (None, []) if the SMILES cannot be parsed, contains no dummy
        atoms, or the probed molecule fails sanitization.
    """
    # Default whenever no usable probe set was supplied. The previous check
    # only covered "both arguments are None", so an empty label_probe_map
    # (or an empty probe_set) crashed in the assignment loop below.
    if not probe_set:
        probe_set = _PROBE_SET_A

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, []

    # Dummy atoms are the ones with atomic number 0.
    dummy_indices = [atom.GetIdx() for atom in mol.GetAtoms()
                     if atom.GetAtomicNum() == 0]
    if not dummy_indices:
        return None, []  # No R-groups

    label_map = _build_label_map(dummy_indices, labels)
    # A dict-style `labels` may not cover every dummy atom; those fall back
    # to a label derived from the atom index.
    atom_labels = [label_map.get(idx, f"R{idx}") for idx in dummy_indices]

    # Build label -> probe assignment
    label_to_probe = {}
    if label_probe_map:
        fallback_probe = probe_set[0]
        for label in atom_labels:
            label_to_probe[label] = label_probe_map.get(label, fallback_probe)
    else:
        next_probe = 0
        for label in atom_labels:
            if label in label_to_probe:
                continue
            if next_probe < len(probe_set):
                label_to_probe[label] = probe_set[next_probe]
                next_probe += 1
            else:
                # More distinct labels than probes: reuse the first probe.
                label_to_probe[label] = probe_set[0]

    # Replace dummy atoms in place, so atom indices are unchanged.
    # NOTE(review): only atomic number, formal charge and the no-implicit
    # flag are reset here; isotope / atom-map annotations on the dummy atom
    # (e.g. [1*], [*:1]) are left as-is — confirm that is intended.
    edit = Chem.RWMol(mol)
    mappings = []
    for idx, label in zip(dummy_indices, atom_labels):
        probe_z, _prefix, _stem = label_to_probe[label]
        atom = edit.GetAtomWithIdx(idx)
        atom.SetAtomicNum(probe_z)
        atom.SetFormalCharge(0)
        atom.SetNoImplicit(False)
        mappings.append(RGroupMapping(
            label=label, atom_idx=idx, probe_atom_idx=idx
        ))

    # Best effort: any failure to sanitize or serialise the probed molecule
    # is reported as "no result" rather than raised.
    try:
        Chem.SanitizeMol(edit)
        probed_smi = Chem.MolToSmiles(edit)
    except Exception:
        return None, []
    return probed_smi, mappings
2574
+
2575
+
2576
def _probe_label_mapping(mappings: List[RGroupMapping],
                         probe_set: list) -> dict:
    """Return {label: (prefix, stem)} for the labels found in mappings.

    Distinct labels are paired with probe_set entries in order of first
    appearance; once the probes run out, every further label reuses the
    first probe. The atomic number in each probe tuple is dropped.
    """
    assigned = {}
    available = len(probe_set)
    for mapping in mappings:
        if mapping.label in assigned:
            continue
        # The number of labels already assigned doubles as the probe cursor.
        slot = len(assigned)
        _z, prefix, stem = probe_set[slot if slot < available else 0]
        assigned[mapping.label] = (prefix, stem)
    return assigned
2591
+
2592
+
2593
def _replace_all_probes(name: str, label_to_probe: dict) -> str:
    """Swap every probe-atom fragment in name for its R-group label.

    label_to_probe maps label -> (prefix, stem). Labels are applied one
    after another in dict order, each call replacing at most one fragment.
    """
    for label in label_to_probe:
        prefix, stem = label_to_probe[label]
        name = _replace_probe_in_name(name, label,
                                      probe_prefix=prefix,
                                      probe_stem=stem)
    return name
2601
+
2602
+
2603
def decompose_name_with_rgroups(smiles: str,
                                labels=None,
                                verbose: bool = False
                                ) -> DecompositionResult:
    """Decompose a molecule with R-group placeholders using dual-probe consensus.

    Strategy: run decomposition twice with different probe halogen sets
    (A: F/Cl, B: I/Br), replace probe names with R-group labels, and
    INTERSECT — only keep alternative names that both sets agree on.

    The two probe sets are designed with matching alphabetical orderings:
      Set A: fluoro (1st label), chloro (2nd label) -> chloro < fluoro
      Set B: iodo (1st label), bromo (2nd label) -> bromo < iodo
    This keeps IUPAC alphabetical prefix ordering consistent between sets,
    so name strings match after probe->label replacement.

    A probe set "collides" when the input molecule already contains one of
    its halogens. The canonical name, the canonical parent and the
    parent/sub names of consensus alternatives are all taken from set B when
    only set A collides, and from set A otherwise, so that these fields are
    never mixed between a colliding and a non-colliding run. If no
    alternative name is common to both sets, the alternatives of a single
    set are used instead and marked "(single-probe fallback)".

    If prepare_rgroup_smiles yields no probed SMILES for set A (e.g. no
    dummy atoms), falls through to regular decompose_name.

    Args:
        smiles: SMILES string, may contain [*] dummy atoms for R-groups.
        labels: Optional labels for R-groups. Can be:
            - None: auto-generated labels
            - list: ['R', 'X'] matched to dummies in order
            - dict: {atom_idx: label}
        verbose: Print debug info to stderr.

    Returns:
        A DecompositionResult whose names carry quoted R-group labels in
        place of the probe halogens; bracket_tree is always None.
    """
    # Prepare both probe sets
    probed_a, mappings_a = prepare_rgroup_smiles(
        smiles, labels, probe_set=_PROBE_SET_A)
    probed_b, mappings_b = prepare_rgroup_smiles(
        smiles, labels, probe_set=_PROBE_SET_B)

    if probed_a is None:
        # No R-groups found — regular decomposition
        return decompose_name(smiles, verbose=verbose)

    # Build label -> (prefix, stem) for each probe set
    ltp_a = _probe_label_mapping(mappings_a, _PROBE_SET_A)
    ltp_b = _probe_label_mapping(mappings_b, _PROBE_SET_B)

    if verbose:
        print(" R-group dual-probe consensus:", file=sys.stderr)
        print(f" Set A ({probed_a}): "
              + ", ".join(f"{l}={p}" for l, (p, _) in ltp_a.items()),
              file=sys.stderr)
        print(f" Set B ({probed_b}): "
              + ", ".join(f"{l}={p}" for l, (p, _) in ltp_b.items()),
              file=sys.stderr)

    # Run decomposition with each probe set
    result_a = decompose_name(probed_a, verbose=verbose)
    result_b = decompose_name(probed_b, verbose=verbose)

    # Replace probes with labels in canonical names
    canon_a = _replace_all_probes(result_a.canonical_name, ltp_a)
    canon_b = _replace_all_probes(result_b.canonical_name, ltp_b)

    # Detect probe halogens that are already real atoms of the input.
    # `mol` parses here because prepare_rgroup_smiles succeeded on `smiles`.
    mol = Chem.MolFromSmiles(smiles)
    real_elements = {a.GetAtomicNum() for a in mol.GetAtoms()
                     if a.GetAtomicNum() != 0}
    a_collides = any(z in real_elements for z, _, _ in _PROBE_SET_A)
    b_collides = any(z in real_elements for z, _, _ in _PROBE_SET_B)

    # Set B is the trusted source only when A collides and B does not; in
    # every other case A is used. Name, parent and alternative metadata all
    # follow this one choice.
    prefer_b = a_collides and not b_collides
    if prefer_b:
        canonical, src_result, src_ltp = canon_b, result_b, ltp_b
    else:
        canonical, src_result, src_ltp = canon_a, result_a, ltp_a

    if verbose:
        if canon_a == canon_b:
            print(f" Canonical consensus: {canonical}", file=sys.stderr)
        else:
            print(f" Canonical disagree: A={canon_a}, B={canon_b}",
                  file=sys.stderr)
            print(f" Using: {canonical}", file=sys.stderr)

    # Collect alternatives from each set, keyed by name-after-replacement
    def _alts_by_name(result, ltp):
        by_name = {}
        for alt in result.alternatives:
            if alt.valid:
                # First alternative wins when two map to the same name.
                by_name.setdefault(_replace_all_probes(alt.name, ltp), alt)
        return by_name

    alts_a = _alts_by_name(result_a, ltp_a)
    alts_b = _alts_by_name(result_b, ltp_b)

    # Intersect: only keep names that both sets agree on
    common_names = set(alts_a) & set(alts_b)

    if verbose:
        print(f" Set A alts: {len(alts_a)}, Set B alts: {len(alts_b)}, "
              f"consensus: {len(common_names)}", file=sys.stderr)
        for name in sorted(common_names):
            print(f" [OK] {name}", file=sys.stderr)
        for name in sorted(set(alts_a) - common_names):
            print(f" [A only] {name}", file=sys.stderr)
        for name in sorted(set(alts_b) - common_names):
            print(f" [B only] {name}", file=sys.stderr)

    # --- Build final result ---
    result = DecompositionResult(
        original_smiles=smiles,
        canonical_smiles=Chem.MolToSmiles(mol),
        canonical_name=canonical,
        canonical_parent=_replace_all_probes(
            src_result.canonical_parent or "", src_ltp) or None,
        bracket_tree=None,
    )

    src_alts = alts_b if prefer_b else alts_a
    for name in sorted(common_names):
        alt = src_alts[name]
        result.alternatives.append(Alternative(
            name=name,
            parent_name=_replace_all_probes(alt.parent_name, src_ltp),
            sub_name=_replace_all_probes(alt.sub_name, src_ltp),
            locant=alt.locant,
            valid=True,
            strategy=alt.strategy,
        ))

    # Fallback: if consensus is empty, use the non-colliding set
    if not common_names:
        if a_collides and not b_collides and alts_b:
            fallback_alts, fallback_ltp = alts_b, ltp_b
        elif b_collides and not a_collides and alts_a:
            fallback_alts, fallback_ltp = alts_a, ltp_a
        elif alts_a:
            fallback_alts, fallback_ltp = alts_a, ltp_a
        else:
            fallback_alts, fallback_ltp = alts_b, ltp_b

        for name, alt in fallback_alts.items():
            result.alternatives.append(Alternative(
                name=name,
                parent_name=_replace_all_probes(alt.parent_name, fallback_ltp),
                sub_name=_replace_all_probes(alt.sub_name, fallback_ltp),
                locant=alt.locant,
                valid=True,
                strategy=alt.strategy + " (single-probe fallback)",
            ))
        if verbose and fallback_alts:
            print(f" Fallback to single probe: {len(fallback_alts)} alts",
                  file=sys.stderr)

    return result
2762
+
2763
+
2764
+ # ---------------------------------------------------------------------------
2765
+ # CLI
2766
+ # ---------------------------------------------------------------------------
2767
+
2768
def _format_text(result: DecompositionResult) -> str:
    """Render a decomposition result as a plain-text report.

    The report starts with the input SMILES, canonical SMILES and canonical
    name. If the result carries errors, they are listed and nothing else is
    printed. Otherwise it lists the bracket groups (when a bracket tree is
    present), the canonical name as entry 1 followed by the valid
    alternatives, and finally any invalid attempts.
    """
    out = [
        f"Input SMILES: {result.original_smiles}",
        f"Canonical SMILES: {result.canonical_smiles}",
        f"Canonical name: {result.canonical_name}",
    ]

    # Errors short-circuit the rest of the report.
    if result.errors:
        out.extend(f" ERROR: {err}" for err in result.errors)
        return "\n".join(out)

    if result.bracket_tree:
        groups = result.bracket_tree.children
        out.append(f"\nBracket groups ({len(groups)}):")
        out.extend(f" ({node.text}) [{node.kind}]" for node in groups)

    accepted = []
    rejected = []
    for candidate in result.alternatives:
        (accepted if candidate.valid else rejected).append(candidate)

    out.append(f"\nAlternatives ({len(accepted)} valid, "
               f"{len(rejected)} invalid):")

    # The canonical name always takes slot 1; alternatives start at 2.
    out.append(f" 1. {result.canonical_name} [canonical]")
    out.extend(
        f" {rank}. {candidate.name} [VALID, parent: {candidate.parent_name}]"
        for rank, candidate in enumerate(accepted, 2)
    )

    if rejected:
        out.append("\n Invalid attempts:")
        out.extend(f" - {candidate.name} [{candidate.strategy}]"
                   for candidate in rejected)

    return "\n".join(out)
2801
+
2802
+
2803
def _format_json(result: DecompositionResult) -> str:
    """Serialise a decomposition result to an indented JSON string.

    The output holds the original and canonical SMILES, the canonical name,
    the error list and every alternative converted with ``asdict``.
    """
    payload = dict(
        original_smiles=result.original_smiles,
        canonical_smiles=result.canonical_smiles,
        canonical_name=result.canonical_name,
        errors=result.errors,
        alternatives=list(map(asdict, result.alternatives)),
    )
    return json.dumps(payload, indent=2)
2813
+
2814
+
2815
def main():
    """Command-line entry point: decompose one SMILES and print the result.

    Parses the arguments, runs decompose_name with the requested depth,
    verbosity and timeout, and prints the result as JSON when --json is
    given, otherwise as text.
    """
    cli = argparse.ArgumentParser(
        description="Name-driven IUPAC decomposition"
    )
    cli.add_argument("smiles", help="SMILES string to decompose")
    cli.add_argument("-v", "--verbose", action="store_true",
                     help="Print detailed progress to stderr")
    cli.add_argument("--json", action="store_true",
                     help="Output as JSON")
    cli.add_argument("--max-depth", type=int, default=-1,
                     help="Maximum recursion depth (default: -1, "
                          "unlimited until timeout). 0 = no recursion.")
    cli.add_argument("--timeout", type=float, default=30.0,
                     help="Timeout in seconds (default: 30). "
                          "Use 0 to disable.")
    opts = cli.parse_args()

    # Any non-positive timeout is passed on as None.
    if opts.timeout > 0:
        timeout = opts.timeout
    else:
        timeout = None

    result = decompose_name(opts.smiles, max_depth=opts.max_depth,
                            verbose=opts.verbose, timeout=timeout)

    render = _format_json if opts.json else _format_text
    print(render(result))
2840
+
2841
+
2842
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()