cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2342 @@
1
+ """
2
+ aligned_namer.py — Aligned IUPAC Name Generation
3
+
4
+ Pairwise alignment (SM→product pairs) and multi-step sequence alignment
5
+ for synthetic routes.
6
+
7
+ Uses name_decomposer to exhaustively generate alternative names for each
8
+ molecule, then picks names that share the same naming parent, making the
9
+ transformation obvious from the names alone.
10
+
11
+ Multi-step sequences use parent-aware dynamic programming (Viterbi) to
12
+ minimise parent-ring switches first, then chemistry-aware token diff as
13
+ tiebreaker.
14
+
15
+ Usage:
16
+ python aligned_namer.py --sm "BrC1=CC=CC=C1" --product "C1=CC=C(C2=CC=NC=C2)C=C1"
17
+ python aligned_namer.py --showcase # run on all showcase reactions
18
+ python aligned_namer.py --showcase --report alignment_report.txt
19
+ """
20
+ import argparse
21
+ import difflib
22
+ import html as html_mod
23
+ import re
24
+ import sys
25
+ import os
26
+ import glob
27
+ from collections import Counter, defaultdict
28
+ from dataclasses import dataclass, field
29
+ from typing import Dict, List, Tuple, Optional
30
+
31
+ from rdkit import Chem, RDLogger
32
+ from rdkit.Chem import rdFMCS
33
+ RDLogger.logger().setLevel(RDLogger.ERROR)
34
+
35
+ from cdxml_toolkit.naming.name_decomposer import (
36
+ decompose_name, DecompositionResult, name_fragment_as_substituent,
37
+ _validate_name, _canonical, _name_to_smiles,
38
+ )
39
+
40
+ try:
41
+ from rdkit.Chem.inchi import MolToInchi
42
+ except ImportError:
43
+ MolToInchi = None # type: ignore[assignment]
44
+
45
+
46
def _validate_variant(name: str, expected_canonical: str) -> bool:
    """Check that *name* denotes the molecule given by *expected_canonical*.

    The canonical-SMILES comparison (``_validate_name``) is attempted first.
    If that fails, both structures are compared through their InChI strings,
    which tolerates tautomeric differences (e.g. the NH position of a
    quinazolinone).

    Returns True on a match by either route.  Returns False when InChI
    support was not importable, when the name cannot be converted to SMILES,
    when either SMILES fails to parse, or when anything raises on the way.
    """
    if _validate_name(name, expected_canonical):
        return True

    if MolToInchi is None:
        # No InChI backend was importable, so there is no fallback to try.
        return False

    variant_smiles = _name_to_smiles(name)
    if variant_smiles is None:
        return False

    try:
        molecules = (
            Chem.MolFromSmiles(variant_smiles),
            Chem.MolFromSmiles(expected_canonical),
        )
        if any(mol is None for mol in molecules):
            return False
        variant_inchi, expected_inchi = (MolToInchi(mol) for mol in molecules)
        return variant_inchi == expected_inchi
    except Exception:
        # Best-effort check: a toolkit failure means "not validated".
        return False
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Levenshtein distance
72
+ # ---------------------------------------------------------------------------
73
+
74
+ def _levenshtein(s1: str, s2: str) -> int:
75
+ """Compute Levenshtein edit distance between two strings."""
76
+ if len(s1) < len(s2):
77
+ return _levenshtein(s2, s1)
78
+ if len(s2) == 0:
79
+ return len(s1)
80
+ prev = list(range(len(s2) + 1))
81
+ for i, c1 in enumerate(s1):
82
+ curr = [i + 1]
83
+ for j, c2 in enumerate(s2):
84
+ # insertion, deletion, substitution
85
+ curr.append(min(
86
+ prev[j + 1] + 1,
87
+ curr[j] + 1,
88
+ prev[j] + (0 if c1 == c2 else 1)
89
+ ))
90
+ prev = curr
91
+ return prev[-1]
92
+
93
+
94
def name_similarity(name1: str, name2: str) -> float:
    """Compute similarity between two names as 1 - normalized Levenshtein.

    The comparison is case-insensitive.  Returns a float in [0, 1] where
    1.0 means identical; 0.0 is returned when either name is empty.
    """
    if not name1 or not name2:
        return 0.0
    lowered1 = name1.lower()
    lowered2 = name2.lower()
    dist = _levenshtein(lowered1, lowered2)
    # Normalise by the lengths of the strings actually compared:
    # str.lower() can change the length of some Unicode strings, and using
    # the original lengths could push the result outside [0, 1].
    max_len = max(len(lowered1), len(lowered2))
    return 1.0 - (dist / max_len) if max_len > 0 else 0.0
104
+
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Chemistry-aware tokeniser
108
+ # ---------------------------------------------------------------------------
109
+
110
+ # Ring system names (ordered longest-first for greedy matching)
111
# Vocabulary consulted by _classify_token.  Every entry of _RING_SYSTEMS is
# emitted with the 'ring' category, including the non-cyclic parent names
# listed here ("carbamate", "benzamide", "acetamide").  The list is sorted
# longest-first so that suffix matching prefers e.g. "isoquinoline" over
# "quinoline".
_RING_SYSTEMS = sorted([
    "quinoline", "isoquinoline", "quinoxaline", "quinazoline",
    "pyridine", "pyrimidine", "pyrazine", "pyridazine",
    "benzene", "naphthalene", "anthracene",
    "indole", "benzimidazole", "benzothiazole", "benzofuran", "benzoxazole",
    "thiophene", "furan", "pyrrole", "imidazole", "oxazole",
    "thiazole", "triazine", "tetrazole", "triazole", "oxadiazole",
    "morpholine", "piperidine", "piperazine", "pyrrolidine",
    "carbazole", "acridine", "phenanthroline",
    "thienopyrimidine", "isoindoline", "isoindole",
    "carbamate", "benzamide", "acetamide",
    # Additional heterocycles common in drug synthesis
    "pyrazole", "isoxazole", "isothiazole",
    "oxazolidine", "oxazolidinone", "thiazolidine",
    "tetrahydronaphthalene", "dihydronaphthalene",
    "phthalazine", "cinnoline",
    "purine", "xanthine",
    "azetidine", "aziridine", "oxetane", "thietane",
    "diazepine", "oxazepine",
    # Retained names (for tokenizer splitting: "dimethylaniline" → "dimethyl"+"aniline")
    "aniline", "phenol", "benzenol",
    "anisole", "benzaldehyde", "acetophenone", "styrene",
], key=len, reverse=True)

# Substituent words, longest-first.  _classify_token uses them both for
# exact matches and for peeling a substituent off the END of a longer token.
_SUBSTITUENT_PREFIXES = sorted([
    "amino", "bromo", "chloro", "fluoro", "iodo", "nitro",
    "methyl", "ethyl", "propyl", "butyl", "phenyl", "benzyl",
    "methoxy", "ethoxy", "hydroxy", "oxo", "formyl",
    "methoxycarbonyl", "ethoxycarbonyl", "carbamoyl", "carboxy", "cyano",
    "morpholino", "morpholin", "piperidin", "pyrrolidin", "piperazin",
    "benzamido", "acetamido", "acetyl",
    "tert", "sec", "iso", "cyclo",
], key=len, reverse=True)

# Exact-match vocabularies.  Tokens are lowercased before classification,
# hence the lowercase stereo descriptors.
_MULTIPLIERS = {"di", "tri", "tetra", "penta", "hexa", "bis", "tris"}
_STEREO = {"r", "s", "e", "z", "cis", "trans", "rac", "dl", "meso"}


# _LINKERS is tested before _SUBSTITUENT_PREFIXES in _classify_token, so a
# word present in both (e.g. "amino") is reported as a 'linker'.
_LINKERS = frozenset({
    'yl', 'oxy', 'oyl', 'amido', 'amino', 'thio', 'sulfonyl',
    'amine', 'ol', 'one', 'thiol',
})
# Functional-group endings split off the end of a longer token and reported
# with the 'linker' category.
_FG_SUFFIXES = ('amine', 'amide', 'thiol', 'aldehyde', 'nitrile')
154
+
155
+
156
+ def _classify_token(tok: str, out: list) -> None:
157
+ """Recursively classify and split a single IUPAC token.
158
+
159
+ Appends (token, category) tuples to *out*.
160
+ """
161
+ if not tok:
162
+ return
163
+
164
+ # 1. Locant
165
+ if re.match(r'^\d+(?:,\d+)*$', tok):
166
+ out.append((tok, 'locant'))
167
+ return
168
+
169
+ # 2. Ring match (longest ring first — _RING_SYSTEMS is pre-sorted)
170
+ for ring in _RING_SYSTEMS:
171
+ if tok == ring or tok.endswith(ring):
172
+ prefix = tok[:len(tok) - len(ring)]
173
+ if prefix:
174
+ _classify_token(prefix, out) # recurse on prefix
175
+ out.append((ring, 'ring'))
176
+ return
177
+
178
+ # 3. Exact multiplier / stereo / linker
179
+ if tok in _MULTIPLIERS:
180
+ out.append((tok, 'multiplier'))
181
+ return
182
+ if tok in _STEREO:
183
+ out.append((tok, 'stereo'))
184
+ return
185
+ if tok in _LINKERS:
186
+ out.append((tok, 'linker'))
187
+ return
188
+
189
+ # 4. Exact substituent prefix
190
+ for sub in _SUBSTITUENT_PREFIXES:
191
+ if tok == sub:
192
+ out.append((sub, 'substituent'))
193
+ return
194
+
195
+ # 5. Split on substituent suffix (longest match first)
196
+ # E.g. "dimethylphenyl" → "dimethyl" + "phenyl"
197
+ for sub in _SUBSTITUENT_PREFIXES:
198
+ if tok.endswith(sub) and len(tok) > len(sub):
199
+ _classify_token(tok[:-len(sub)], out) # recurse on prefix
200
+ out.append((sub, 'substituent'))
201
+ return
202
+
203
+ # 6. Split on functional group suffix
204
+ # E.g. "phenylamine" → "phenyl" + "amine"
205
+ for fg in _FG_SUFFIXES:
206
+ if tok.endswith(fg) and len(tok) > len(fg):
207
+ _classify_token(tok[:-len(fg)], out) # recurse on prefix
208
+ out.append((fg, 'linker'))
209
+ return
210
+
211
+ # 7. Fallback
212
+ out.append((tok, 'other'))
213
+
214
+
215
+ def _tokenize_name_chem(name: str) -> List[Tuple[str, str]]:
216
+ """Chemistry-aware IUPAC name tokeniser.
217
+
218
+ Returns list of (token, category) tuples where category is one of:
219
+ locant, ring, substituent, multiplier, stereo, linker, other.
220
+ """
221
+ result: List[Tuple[str, str]] = []
222
+ s = name.lower().strip()
223
+ raw = re.findall(r'\d+(?:,\d+)*|[a-z]+|\S', s)
224
+
225
+ for tok in raw:
226
+ if not tok or tok in ('(', ')', '-', ',', '[', ']', ' '):
227
+ continue
228
+ _classify_token(tok, result)
229
+
230
+ return result
231
+
232
+
233
def _chem_tokens_flat(name: str) -> List[str]:
    """Return only the token texts from ``_tokenize_name_chem(name)``."""
    texts: List[str] = []
    for text, _category in _tokenize_name_chem(name):
        texts.append(text)
    return texts
236
+
237
+
238
def chem_token_diff_count(a: str, b: str) -> float:
    """Return a weighted count of tokens that differ between two names.

    Both names are tokenised with the chemistry-aware tokeniser and treated
    as multisets.  Tokens present on both sides cancel at no cost.  Leftover
    tokens that form a soft-equivalent pair (see ``_SOFT_EQUIV_TABLE``)
    cost ``_SOFT_EQUIV_COST`` per pair.  Every token still unmatched after
    that costs 1.0.
    """
    counts_a = Counter(_chem_tokens_flat(a))
    counts_b = Counter(_chem_tokens_flat(b))

    # Counter subtraction removes exact matches and keeps only positive
    # remainders, i.e. the tokens unique to each side.
    left = dict(counts_a - counts_b)
    right = dict(counts_b - counts_a)

    total = 0.0
    for tok_l, tok_r, need_l, need_r in _soft_equivalence_pairs(left, right):
        pairs = min(left.get(tok_l, 0) // need_l,
                    right.get(tok_r, 0) // need_r)
        if pairs <= 0:
            continue
        left[tok_l] = left.get(tok_l, 0) - pairs * need_l
        right[tok_r] = right.get(tok_r, 0) - pairs * need_r
        # Drop exhausted tokens so they are not charged again below.
        if left[tok_l] <= 0:
            del left[tok_l]
        if right[tok_r] <= 0:
            del right[tok_r]
        total += pairs * _SOFT_EQUIV_COST

    total += sum(left.values())
    total += sum(right.values())
    return total
281
+
282
+
283
# Soft equivalence: related tokens that should have a reduced mismatch cost.
# Each entry is (token_a, token_b, count_a, count_b): count_a occurrences of
# token_a on one side are treated as equivalent to count_b occurrences of
# token_b on the other side.  _soft_equivalence_pairs tries each entry in
# both directions.
_SOFT_EQUIV_TABLE = [
    # Ring/substituent forms of the same moiety
    ("benzene", "phenyl", 1, 1),
    ("naphthalene", "naphthyl", 1, 1),
    # Retained → systematic (only entries where BOTH sides are single tokens
    # after _chem_tokens_flat; multi-token targets like "phenylamine" and
    # "methoxybenzene" are handled by _retained_systematic_variants instead)
    ("phenol", "benzenol", 1, 1),
    # Functional group name equivalences
    ("amine", "amino", 1, 1),
    ("ol", "hydroxy", 1, 1),
    ("one", "oxo", 1, 1),
    ("thiol", "sulfanyl", 1, 1),
    # Ester naming equivalences
    ("carboxylate", "carboxylic", 1, 1),
]
_SOFT_EQUIV_COST = 0.3  # cost per soft-equivalent pair (vs 1.0 for hard mismatch)
303
+
304
+
305
def _soft_equivalence_pairs(ra: dict, rb: dict):
    """Yield the soft-equivalence entries that apply to two residual counts.

    For each row of ``_SOFT_EQUIV_TABLE`` a tuple
    ``(token_in_ra, token_in_rb, needed_from_ra, needed_from_rb)`` is
    yielded when *ra* and *rb* hold enough of the respective tokens.  The
    row is tried as written first and, failing that, with the roles of the
    two tokens swapped.  Counts are read lazily, at the moment each row is
    reached.
    """
    for first, second, need_first, need_second in _SOFT_EQUIV_TABLE:
        as_written = (ra.get(first, 0) >= need_first
                      and rb.get(second, 0) >= need_second)
        if as_written:
            yield first, second, need_first, need_second
            continue
        if ra.get(second, 0) >= need_second and rb.get(first, 0) >= need_first:
            yield second, first, need_second, need_first
316
+
317
+
318
+ # ---------------------------------------------------------------------------
319
+ # Parent ring extraction
320
+ # ---------------------------------------------------------------------------
321
+
322
+ # Known ring systems for parent classification.
323
+ # Fused names like "thieno[2,3-d]pyrimidine" are intentionally omitted —
324
+ # they contain "pyrimidine" as substring, so the simpler ring name matches.
325
+ # This avoids false switches when the decomposer reports different levels
326
+ # of specificity for the same scaffold.
327
# Ring names recognised by extract_parent_ring via substring search on the
# lowercased parent name.  Entries that are also keys of _RETAINED_TO_BASE
# ("toluene", "tetralin", "decalin", "indane") are folded to their base ring
# when matched.
_KNOWN_RINGS = {
    # 6-membered N-heterocycles
    "pyridine", "pyrimidine", "pyrazine", "pyridazine", "triazine",
    "pyran", "thiopyran",
    # 5-membered heterocycles
    "thiophene", "furan", "pyrrole", "imidazole", "oxazole",
    "thiazole", "tetrazole", "pyrazole", "isoxazole", "isothiazole",
    "triazole", "oxadiazole", "thiadiazole",
    "selenophene",
    # Saturated 5-membered
    "pyrrolidine", "oxazolidine", "thiazolidine", "dioxolane",
    # Saturated 6-membered
    "piperidine", "piperazine", "morpholine",
    "dioxane", "dithiane",
    # 3- and 4-membered
    "oxirane", "aziridine", "thiirane",
    "azetidine", "oxetane", "thietane",
    # 7-membered
    "diazepine", "oxazepine", "azepane", "azepine", "oxepane",
    # Benzo-fused N-heterocycles
    "quinoline", "isoquinoline", "quinoxaline", "quinazoline",
    "phthalazine", "cinnoline",
    "indole", "isoindole", "indazole", "indoline", "isoindoline",
    "benzimidazole", "benzotriazole",
    # Benzo-fused O/S heterocycles
    "benzofuran", "benzothiophene", "benzoxazole",
    "benzothiazole", "benzisoxazole", "benzisothiazole",
    "chromene", "chromone", "coumarin", "chroman",
    "benzodioxole", "benzodioxane",
    # Larger fused heterocycles
    "carbazole", "acridine", "phenanthroline",
    "phenothiazine", "phenoxazine", "phenazine", "phenanthridine",
    "purine", "xanthine", "xanthene", "pteridine",
    "naphthyridine", "benzodiazepine",
    # Fused N-rich (common in kinase inhibitors)
    "pyrrolopyrimidine", "pyrazolopyrimidine", "imidazopyridine",
    "pyrrolizine", "indolizine",
    "thienopyridine", "thienopyrimidine",
    # Drug-relevant lactams/imides
    "hydantoin",
    # Saturated fused / partial
    "tetrahydroisoquinoline", "tetrahydroquinoline",
    # Carbocycles — simple
    "benzene", "toluene", "naphthalene", "anthracene",
    "cyclopropane", "cyclobutane",
    "cyclopentane", "cyclopentene", "cyclopentadiene",
    "cyclohexane", "cyclohexene", "cyclohexadiene",
    "cycloheptane", "cyclooctane",
    # Carbocycles — polycyclic
    "indene", "indane", "fluorene", "phenanthrene", "azulene",
    "decalin", "tetralin",
    "adamantane", "norbornane",
    "biphenyl",
}
381
+
382
+
383
+ # Retained IUPAC names that map to a base ring system.
384
+ # These are trivially-substituted rings whose retained name doesn't
385
+ # contain the base ring string as a substring.
386
# Maps a retained name to the ring that extract_parent_ring should report.
# Keys are lowercase and are matched as substrings of the lowercased parent
# name, so multi-word keys such as "benzoic acid" work as well.
_RETAINED_TO_BASE = {
    # Benzene retained names
    "aniline": "benzene", "phenol": "benzene", "anisole": "benzene",
    "acetophenone": "benzene", "benzaldehyde": "benzene",
    "benzoic acid": "benzene", "styrene": "benzene",
    "catechol": "benzene", "resorcinol": "benzene",
    "hydroquinone": "benzene", "cresol": "benzene",
    "xylene": "benzene", "toluene": "benzene",
    "cumene": "benzene", "mesitylene": "benzene",
    # Naphthalene retained names
    "naphthol": "naphthalene",
    # Saturated/partial naphthalene
    "tetralin": "naphthalene", "decalin": "naphthalene",
    # Indene/indane family
    "indane": "indene",
}
402
+
403
# Pre-compute elided stems: "quinazoline" → "quinazolin", etc.
# IUPAC drops terminal 'e' before vowel-starting suffixes (-ol, -one, -amine).
# Maps every ring name, and its 'e'-less stem where applicable, to the full
# ring name.  Iterating in sorted order keeps the dict's insertion order
# reproducible from run to run.
_KNOWN_RING_STEMS = {}
for _ring in sorted(_KNOWN_RINGS):
    if _ring.endswith('e'):
        _KNOWN_RING_STEMS[_ring[:-1]] = _ring
    _KNOWN_RING_STEMS[_ring] = _ring

# Pre-sorted versions for hot-path functions (extract_parent_ring, etc.)
# Avoids re-sorting on every call inside the DP inner loop.
# Longest first; equal-length names are ordered alphabetically.  Sorting a
# set by length alone would leave ties in set-iteration order, which depends
# on per-process string hash randomisation and could make first-match
# lookups return different rings on different runs.
_KNOWN_RINGS_BY_LEN = sorted(_KNOWN_RINGS, key=lambda ring: (-len(ring), ring))
_KNOWN_RING_STEMS_BY_LEN = sorted(
    _KNOWN_RING_STEMS, key=lambda stem: (-len(stem), stem)
)
415
+
416
+
417
+ def _strip_locants(s: str) -> str:
418
+ """Remove IUPAC locant insertions so ring substrings become contiguous.
419
+
420
+ E.g. "cyclohex-1-ene-1-carboxylate" → "cyclohexenecarboxylate"
421
+ """
422
+ return re.sub(r'-[\d,()H]+-', '', s)
423
+
424
+
425
def extract_parent_ring(parent: str) -> str:
    """Reduce a parent-name string to the ring system it is built on.

    The lowercased, stripped name is tested in this order; the first hit
    wins:

      1. A known ring name occurring as a substring (longest names first).
      2. A known ring stem occurring as a substring, e.g. "quinazolin"
         for "quinazoline" (longest first).
      3. Steps 1 and 2 repeated on the name with locant insertions removed
         (only when removal actually changed the name).
      4. A retained name from ``_RETAINED_TO_BASE`` occurring as a substring.
      5. The substring "phenyl"                      → "benzene".
      6. A name ending in "phenone" or "phenol"      → "benzene".
      7. A name starting with "benz"                 → "benzene".
      8. Otherwise the lowercased, stripped input itself.

    Rings found in steps 1-3 are additionally mapped through
    ``_RETAINED_TO_BASE`` (so "toluene" reports "benzene").
    """
    text = parent.lower().strip()

    def _ring_in(candidate: str) -> Optional[str]:
        """Return the base ring for the first ring name or stem in *candidate*."""
        for ring_name in _KNOWN_RINGS_BY_LEN:
            if ring_name in candidate:
                return _RETAINED_TO_BASE.get(ring_name, ring_name)
        for stem in _KNOWN_RING_STEMS_BY_LEN:
            if stem in candidate:
                full_name = _KNOWN_RING_STEMS[stem]
                return _RETAINED_TO_BASE.get(full_name, full_name)
        return None

    found = _ring_in(text)
    if found is None:
        # Retry with locants removed: "cyclohex-1-ene" → "cyclohexene".
        collapsed = _strip_locants(text)
        if collapsed != text:
            found = _ring_in(collapsed)
    if found is not None:
        return found

    for retained_name, base_ring in _RETAINED_TO_BASE.items():
        if retained_name in text:
            return base_ring

    looks_benzenoid = (
        "phenyl" in text
        or text.endswith(("phenone", "phenol"))
        or text.startswith("benz")
    )
    return "benzene" if looks_benzenoid else text
472
+
473
+
474
+ # ---------------------------------------------------------------------------
475
+ # Post-hoc alignment variant generator
476
+ # ---------------------------------------------------------------------------
477
+ # Expands the candidate name set with IUPAC-equivalent alternatives that
478
+ # the decomposer may not have produced, specifically:
479
+ # 1. Ester naming: "alkyl X-ate" ↔ "X-ic acid alkyl ester"
480
+ # 2. Retained→systematic: "aniline" → "phenylamine", etc.
481
+ # 3. Indicated-H lactam suffix→prefix: "-4(3H)-one" → "4-oxo-"
482
+
483
# Lowercase alkyl/aryl words accepted by _ester_variants as the leading
# word of an "alkyl X-ate" ester name.
_COMMON_ALKYL_ESTERS = {
    "methyl", "ethyl", "propyl", "isopropyl", "butyl", "tert-butyl",
    "isobutyl", "sec-butyl", "benzyl", "allyl", "phenyl", "vinyl",
    "neopentyl", "cyclopentyl", "cyclohexyl",
}

# Alkyl → alkoxy mapping for ester suffix→prefix conversion.
# "methyl X-carboxylate" → "(methoxycarbonyl)X"
_ALKYL_TO_ALKOXY = {
    "methyl": "methoxy", "ethyl": "ethoxy", "propyl": "propoxy",
    "isopropyl": "isopropoxy", "butyl": "butoxy",
    "tert-butyl": "tert-butoxy", "isobutyl": "isobutoxy",
    "sec-butyl": "sec-butoxy", "benzyl": "benzyloxy",
    "allyl": "allyloxy", "phenyl": "phenoxy", "vinyl": "vinyloxy",
    "neopentyl": "neopentyloxy", "cyclopentyl": "cyclopentyloxy",
    "cyclohexyl": "cyclohexyloxy",
}
# Inverse lookup (alkoxy → alkyl), built by swapping keys and values.
_ALKOXY_TO_ALKYL = {v: k for k, v in _ALKYL_TO_ALKOXY.items()}
501
+
502
+
503
+ def _ester_variants(name: str, parent: str) -> List[Tuple[str, str]]:
504
+ """Generate ester naming alternatives.
505
+
506
+ "ethyl X-carboxylate" → "X-carboxylic acid ethyl ester"
507
+ "X-ic acid alkyl ester" → "alkyl X-ate"
508
+ """
509
+ variants: List[Tuple[str, str]] = []
510
+
511
+ # Direction 1: "alkyl ...ate" → "...ic acid alkyl ester"
512
+ parts = name.split(None, 1)
513
+ if len(parts) == 2:
514
+ first = parts[0]
515
+ rest = parts[1]
516
+ if first.lower() in _COMMON_ALKYL_ESTERS and rest.endswith("ate"):
517
+ acid_form = rest[:-3] + "ic acid"
518
+ variant = acid_form + " " + first + " ester"
519
+ # Parent: use the acid form as parent (same ring)
520
+ acid_parent = parent
521
+ if parent and parent.endswith("ate"):
522
+ acid_parent = parent[:-3] + "ic acid"
523
+ variants.append((variant, acid_parent))
524
+
525
+ # Direction 2: "...ic acid alkyl ester" → "alkyl ...ate"
526
+ m = re.match(r'^(.+ic acid)\s+(\S+)\s+ester$', name, re.IGNORECASE)
527
+ if m:
528
+ acid_part = m.group(1)
529
+ alkyl = m.group(2)
530
+ # "Xic acid" → "Xate" (strip "ic acid" = 7 chars, append "ate")
531
+ ester_form = alkyl + " " + acid_part[:-7] + "ate"
532
+ variants.append((ester_form, parent))
533
+
534
+ return variants
535
+
536
+
537
+ # Retained IUPAC name → systematic alternative(s).
538
+ # Includes both the base retained name and common derivatives.
539
# Lowercase retained name → systematic replacement text.
# _retained_systematic_variants searches for keys (and, for the reverse
# direction, values) as substrings of the lowercased name.
_RETAINED_TO_SYSTEMATIC = {
    # Benzene derivatives
    "aniline": "phenylamine",
    "phenol": "benzenol",
    "anisole": "methoxybenzene",
    "benzaldehyde": "benzenecarbaldehyde",
    "acetophenone": "1-phenylethanone",
    "styrene": "ethenylbenzene",
    "catechol": "benzene-1,2-diol",
    "resorcinol": "benzene-1,3-diol",
    "hydroquinone": "benzene-1,4-diol",
    "cresol": "methylphenol",
    "toluene": "methylbenzene",
    "xylene": "dimethylbenzene",
    "cumene": "isopropylbenzene",
    # Naphthalene derivatives
    "naphthol": "naphthalenol",
    # Common heterocycle retained names
    "nicotinamide": "pyridine-3-carboxamide",
    "nicotinic acid": "pyridine-3-carboxylic acid",
    "salicylaldehyde": "2-hydroxybenzaldehyde",
    "salicylic acid": "2-hydroxybenzoic acid",
}
562
+
563
+
564
def _retained_systematic_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Swap retained and systematic name fragments in both directions.

    Forward:  every retained fragment found in *name* is replaced by its
              systematic text, e.g. "2,6-dimethylaniline" →
              "2,6-dimethylphenylamine".
    Reverse:  every systematic fragment found in *name* is replaced by the
              retained name, unless that retained name already occurs.

    Only the first occurrence of a fragment is replaced.  The same swap is
    applied to *parent* when the fragment occurs there; an empty parent is
    replaced by the variant itself.  Returns ``(variant, parent)`` tuples,
    forward results first.
    """
    lowered = name.lower()

    def _swap_first(text: str, old: str, new: str) -> str:
        """Replace the first case-insensitive occurrence of *old* in *text*."""
        pos = text.lower().index(old)
        return text[:pos] + new + text[pos + len(old):]

    def _parent_for(variant: str, old: str, new: str) -> str:
        """Work out the parent that goes with *variant*."""
        if not parent:
            return variant
        if old in parent.lower():
            return _swap_first(parent, old, new)
        return parent

    results: List[Tuple[str, str]] = []

    for retained, systematic in _RETAINED_TO_SYSTEMATIC.items():
        if retained in lowered:
            variant = _swap_first(name, retained, systematic)
            results.append((variant, _parent_for(variant, retained, systematic)))

    for retained, systematic in _RETAINED_TO_SYSTEMATIC.items():
        if systematic in lowered and retained not in lowered:
            variant = _swap_first(name, systematic, retained)
            results.append((variant, _parent_for(variant, systematic, retained)))

    return results
608
+
609
+
610
+ def _indicated_h_variants(name: str, parent: str) -> List[Tuple[str, str]]:
611
+ """Generate suffix→prefix variants for indicated-H lactam names.
612
+
613
+ E.g. "6,7-dimethoxyquinazolin-4(3H)-one" → "4-oxo-6,7-dimethoxyquinazoline"
614
+
615
+ The decomposer's suffix→prefix sometimes fails for names with
616
+ indicated-hydrogen notation like (3H), (1H), etc.
617
+ """
618
+ variants: List[Tuple[str, str]] = []
619
+
620
+ suffix_map = {
621
+ "one": "oxo",
622
+ "ol": "hydroxy",
623
+ "amine": "amino",
624
+ "thione": "thioxo",
625
+ }
626
+
627
+ name_lower = name.lower()
628
+
629
+ for suffix, prefix in suffix_map.items():
630
+ # Look for "STEM-LOCANT(IH)-SUFFIX" where STEM is a known ring stem
631
+ tail_pattern = r'-(\d+)\(\d+[hH]\)-' + re.escape(suffix) + r'$'
632
+ tail_m = re.search(tail_pattern, name_lower)
633
+ if not tail_m:
634
+ continue
635
+
636
+ locant = tail_m.group(1)
637
+ before_locant = name[:tail_m.start()] # everything before "-LOCANT(IH)-SUFFIX"
638
+
639
+ # Find the longest known ring stem at the END of before_locant
640
+ best_stem = None
641
+ best_ring = None
642
+ for stem, ring in _KNOWN_RING_STEMS.items():
643
+ if before_locant.lower().endswith(stem):
644
+ if best_stem is None or len(stem) > len(best_stem):
645
+ best_stem = stem
646
+ best_ring = ring
647
+
648
+ if best_stem:
649
+ # Split: leading substituents + ring
650
+ leading = before_locant[:len(before_locant) - len(best_stem)]
651
+
652
+ # Build variant: "LOCANT-PREFIX-LEADING-RING_FULL"
653
+ # E.g. "4-oxo-6,7-dimethoxyquinazoline"
654
+ if leading:
655
+ variant = locant + "-" + prefix + "-" + leading + best_ring
656
+ else:
657
+ variant = locant + "-" + prefix + best_ring
658
+ variant = re.sub(r'-{2,}', '-', variant)
659
+ variant = variant.strip('-')
660
+
661
+ new_parent = variant
662
+ variants.append((variant, new_parent))
663
+
664
+ return variants
665
+
666
+
667
def _general_suffix_prefix_variants(name: str, parent: str) -> List[Tuple[str, str]]:
    """Rewrite a trailing ``-LOCANT-SUFFIX`` as a leading ``LOCANT-PREFIX``.

    Covers names that carry no indicated hydrogen, e.g.:
        "pyridin-2-amine"  → "2-aminopyridine"
        "naphthalen-1-ol"  → "1-hydroxynaphthalene"

    Names using the ``(NH)`` notation are left to _indicated_h_variants.

    *parent* is accepted for signature parity with the other variant
    generators and is not consulted.  Each returned tuple is
    ``(variant, variant)``: the rewritten name doubles as its own parent.
    """
    results: List[Tuple[str, str]] = []
    lowered = name.lower()

    # (characteristic-group suffix, equivalent substituent prefix)
    conversions = (
        ("amine", "amino"),
        ("ol", "hydroxy"),
        ("one", "oxo"),
        ("thiol", "sulfanyl"),
    )

    for suffix, prefix in conversions:
        escaped = re.escape(suffix)

        # The name has to end in "-LOCANT-SUFFIX".
        tail = re.search(r'-(\d+(?:,\d+)*)-' + escaped + r'$', lowered)
        if tail is None:
            continue

        # An indicated-H marker directly before the suffix belongs to the
        # other generator.
        if re.search(r'\(\d+[hH]\)-' + escaped + r'$', lowered):
            continue

        # Everything in front of "-LOCANT-SUFFIX" (original casing kept).
        ring_part = name[:tail.start()]
        ring_part_lower = ring_part.lower()

        # Longest known ring stem that terminates ring_part.
        matching = [s for s in _KNOWN_RING_STEMS if ring_part_lower.endswith(s)]
        stem = max(matching, key=len, default=None)
        if not stem:
            continue
        ring_name = _KNOWN_RING_STEMS[stem]

        # Substituent text that preceded the ring stem, if any.
        substituents = ring_part[:len(ring_part) - len(stem)]

        pieces = [tail.group(1), "-", prefix]
        if substituents:
            # "6,7-dimethoxypyridin-2-amine" → "2-amino-6,7-dimethoxypyridine"
            pieces += ["-", substituents]
        pieces.append(ring_name)

        candidate = re.sub(r'-{2,}', '-', "".join(pieces)).strip('-')
        results.append((candidate, candidate))

    return results
725
+
726
+
727
+ # ---------------------------------------------------------------------------
728
+ # Extended suffix→prefix for principal characteristic groups
729
+ # ---------------------------------------------------------------------------
730
+ # Suffixes that extend beyond the parent ring stem (carboxylate,
731
+ # carbaldehyde, carboxamide, carbonitrile, carboxylic acid).
732
+ # These follow the same ring_stem-LOCANT-SUFFIX pattern as the simple
733
+ # suffixes handled by _general_suffix_prefix_variants, but the suffix
734
+ # words themselves are longer and sometimes require an alkyl ester prefix.
735
+
736
+ # (suffix, prefix_or_None, needs_alkyl_prefix, wrap_in_parens)
737
+ # When needs_alkyl_prefix is True, the prefix is built from the
738
+ # space-separated first word of the name (e.g. "methyl" → "methoxycarbonyl").
739
+ _EXTENDED_SUFFIX_TABLE: List[Tuple[str, Optional[str], bool, bool]] = [
740
+ ("carboxylic acid", "carboxy", False, False),
741
+ ("carboxamide", "carbamoyl", False, True),
742
+ ("carbaldehyde", "formyl", False, False),
743
+ ("carbonitrile", "cyano", False, False),
744
+ ("carboxylate", None, True, True), # prefix from alkyl
745
+ ]
746
+
747
+
748
def _extended_suffix_prefix_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Generate suffix→prefix variants for principal characteristic groups.

    Handles suffixes that extend beyond the parent ring stem.  The
    converted group is always emitted FIRST (``LOCANT-PREFIX-LEADING-RING``);
    ascending-locant ordering is a separate pass:
        "methyl 2-(methylthio)thieno[2,3-d]pyrimidine-4-carboxylate"
        → "4-(methoxycarbonyl)-2-(methylthio)thieno[2,3-d]pyrimidine"
        "2-(methylthio)thieno[2,3-d]pyrimidine-4-carbaldehyde"
        → "4-formyl-2-(methylthio)thieno[2,3-d]pyrimidine"

    Algorithm mirrors _general_suffix_prefix_variants: find the suffix at
    the end of the name, locate the ring stem before it, and reconstruct
    with the prefix form prepended.

    Parameters
    ----------
    name : str
        Name to convert.  For ester rows the first space-separated word
        must be a key of ``_ALKYL_TO_ALKOXY``.
    parent : str
        Unused; kept for signature parity with the other generators.

    Returns
    -------
    list of (variant_name, variant_parent)
        ``variant_parent`` is the full ring name taken from
        ``_KNOWN_RING_STEMS``.
    """
    variants: List[Tuple[str, str]] = []

    for suffix, table_prefix, needs_alkyl, wrap in _EXTENDED_SUFFIX_TABLE:
        work_name = name
        prefix = table_prefix
        parenthesise = wrap

        if needs_alkyl:
            # Ester: peel off the leading alkyl word and derive the
            # "(alkoxycarbonyl)" prefix from it.
            parts = name.split(None, 1)
            if len(parts) != 2:
                continue
            alkyl_key = parts[0].lower()
            if alkyl_key not in _ALKYL_TO_ALKOXY:
                continue
            work_name = parts[1]
            prefix = f"{_ALKYL_TO_ALKOXY[alkyl_key]}carbonyl"
            parenthesise = True

        if prefix is None:
            continue

        # "-LOCANT-SUFFIX" anchored at the end.  One pattern serves both
        # single- and multi-word suffixes: re.escape handles the space.
        m = re.search(
            r'-(\d+(?:,\d+)*)-' + re.escape(suffix) + r'$',
            work_name.lower(),
        )
        if not m:
            continue

        locants = m.group(1)
        before = work_name[:m.start()]  # everything before "-LOCANT-SUFFIX"

        # Longest known ring stem at the end of 'before'.
        before_lower = before.lower()
        best_stem = None
        best_ring = None
        for stem, ring in _KNOWN_RING_STEMS.items():
            if before_lower.endswith(stem):
                if best_stem is None or len(stem) > len(best_stem):
                    best_stem = stem
                    best_ring = ring

        if best_stem is None:
            continue

        leading = before[:len(before) - len(best_stem)]
        pref = f"({prefix})" if parenthesise else prefix

        # Reconstruct: LOCANT-PREFIX-LEADING-RING
        if leading:
            variant = locants + "-" + pref + "-" + leading + best_ring
        else:
            variant = locants + "-" + pref + best_ring
        variant = re.sub(r'-{2,}', '-', variant)
        variant = variant.strip('-')

        variants.append((variant, best_ring))

    return variants
833
+
834
+
835
def _splice_out_prefix_group(name: str, start: int, end: int) -> str:
    """Return *name* without ``name[start:end]``, repairing the seam.

    Hyphens on both sides of the cut are dropped; a single hyphen is put
    back only when the right-hand side opens with a numeric locant, so
    "2-chloro-" + "6-methyl…" → "2-chloro-6-methyl…" while
    "2-chloro-" + "pyridine" → "2-chloropyridine".
    """
    left = name[:start].rstrip('-')
    right = name[end:].lstrip('-')
    if left and right and right[0].isdigit():
        core = left + "-" + right
    else:
        core = left + right
    core = re.sub(r'-{2,}', '-', core)
    return core.strip('-')


def _attach_suffix_after_ring_stem(
    core: str, locants: str, suffix: str,
) -> Optional[str]:
    """Insert ``-LOCANTS-SUFFIX`` right after the ring stem found in *core*.

    The longest key of ``_KNOWN_RING_STEMS`` occurring anywhere in *core*
    is used (last occurrence).  Returns ``None`` when no stem occurs.
    The result is not hyphen-normalised; callers do that.
    """
    core_lower = core.lower()
    best_stem = None
    best_ring = None
    for stem, ring in _KNOWN_RING_STEMS.items():
        if stem in core_lower:
            if best_stem is None or len(stem) > len(best_stem):
                best_stem = stem
                best_ring = ring

    if best_stem is None:
        return None

    stem_end = core_lower.rfind(best_stem) + len(best_stem)

    # Keep the ring's terminal 'e' with the ring when the matched stem
    # is the elided form (e.g. "pyridin" inside "pyridine").
    if (stem_end < len(core) and core[stem_end].lower() == 'e'
            and best_ring.endswith('e') and not best_stem.endswith('e')):
        stem_end += 1

    return core[:stem_end] + "-" + locants + "-" + suffix + core[stem_end:]


def _extended_prefix_to_suffix_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Generate prefix→suffix variants for principal characteristic groups.

    Reverse of _extended_suffix_prefix_variants:
        "2-(methylthio)-4-(methoxycarbonyl)thieno[2,3-d]pyrimidine"
        → "methyl 2-(methylthio)thieno[2,3-d]pyrimidine-4-carboxylate"
        "4-formyl-2-(methylthio)thieno[2,3-d]pyrimidine"
        → "2-(methylthio)thieno[2,3-d]pyrimidine-4-carbaldehyde"

    Detects known prefix patterns in the name's substituent chain and
    converts them to suffix form appended after the ring stem.

    When the converted prefix sits between two other substituent groups,
    the groups are rejoined with a hyphen ("2-chloro-6-methyl…"); when
    the ring follows directly, no hyphen is left behind
    ("2-chloropyridine-…").

    Returns a list of ``(variant_name, parent)`` tuples; *parent* is
    passed through unchanged.
    """
    variants: List[Tuple[str, str]] = []
    name_lower = name.lower()

    # (prefix_to_detect, suffix)
    simple_prefix_map = [
        ("carboxy", "carboxylic acid"),
        ("carbamoyl", "carboxamide"),
        ("formyl", "carbaldehyde"),
        ("cyano", "carbonitrile"),
    ]

    # --- Simple prefixes (no alkyl) ---
    for prefix, suffix in simple_prefix_map:
        # Match "LOCANT-(PREFIX)" first, then bare "LOCANT-PREFIX"; an
        # optional trailing hyphen is consumed with the match.
        m_pref = None
        for form in (f"({prefix})", prefix):
            m_pref = re.search(
                r'(\d+(?:,\d+)*)-' + re.escape(form) + r'[-]?', name_lower)
            if m_pref:
                break
        if m_pref is None:
            continue

        core = _splice_out_prefix_group(name, m_pref.start(), m_pref.end())
        attached = _attach_suffix_after_ring_stem(
            core, m_pref.group(1), suffix)
        if attached is None:
            continue

        variant = re.sub(r'-{2,}', '-', attached).strip('-')
        variants.append((variant, parent))

    # --- Ester prefixes: (alkoxycarbonyl) → alkyl ... carboxylate ---
    for alkoxy, alkyl in _ALKOXY_TO_ALKYL.items():
        target = f"({alkoxy}carbonyl)"
        idx = name_lower.find(target.lower())
        if idx < 0:
            continue

        # The locant must sit immediately before the prefix.
        m_loc = re.search(r'(\d+(?:,\d+)*)-$', name[:idx])
        if not m_loc:
            continue

        # Remove "LOCANT-(alkoxycarbonyl)" from the name.
        core = _splice_out_prefix_group(
            name, m_loc.start(), idx + len(target))
        attached = _attach_suffix_after_ring_stem(
            core, m_loc.group(1), "carboxylate")
        if attached is None:
            continue

        variant = re.sub(r'-{2,}', '-', alkyl + " " + attached)
        variants.append((variant, parent))

    return variants
957
+
958
+
959
+ def _find_locant_group_starts(text: str) -> List[int]:
960
+ """Find starting positions of top-level locant-prefix groups in *text*.
961
+
962
+ A locant group starts with ``\\d+(,\\d+)*-`` at bracket depth 0.
963
+ """
964
+ starts: List[int] = []
965
+ i = 0
966
+ depth = 0
967
+ while i < len(text):
968
+ c = text[i]
969
+ if c in '([':
970
+ depth += 1
971
+ i += 1
972
+ elif c in ')]':
973
+ depth -= 1
974
+ i += 1
975
+ elif c.isdigit() and depth == 0:
976
+ j = i
977
+ while j < len(text) and (
978
+ text[j].isdigit() or text[j] == ','):
979
+ j += 1
980
+ if j < len(text) and text[j] == '-':
981
+ starts.append(i)
982
+ i = max(i + 1, j)
983
+ else:
984
+ i += 1
985
+ return starts
986
+
987
+
988
def _reorder_locant_prefixes(name: str, parent: str) -> Optional[str]:
    """Sort the top-level locant-prefix groups of *name* by first locant.

    Example (parent "benzoic acid"):

        "5-(chlorosulfonyl)-2-ethoxybenzoic acid"
        → "2-ethoxy-5-(chlorosulfonyl)benzoic acid"

    *parent* is used to guess where the prefix section stops and the
    parent stem starts.  Several guesses are collected and tried from the
    left-most one onward; the first guess whose prefix section holds at
    least two groups that are out of order wins.

    Returns the reordered name, or ``None`` when *parent* is empty, no
    split point is found, or no split point yields an out-of-order
    prefix section.
    """
    if not parent:
        return None

    lowered_name = name.lower()
    lowered_parent = parent.lower().strip()

    split_points = set()

    def remember(position: int) -> bool:
        """Record *position* if it is a usable split (> 0); report success."""
        usable = position > 0
        if usable:
            split_points.add(position)
        return usable

    # 1. The parent text itself.
    remember(lowered_name.rfind(lowered_parent))

    # 2. The parent with its final 'e' elided ("quinazoline" → "quinazolin").
    if lowered_parent.endswith('e'):
        remember(lowered_name.rfind(lowered_parent[:-1]))

    ring = extract_parent_ring(lowered_parent)

    # 3. The first known stem (in _KNOWN_RING_STEMS_BY_LEN order) that maps
    #    to the parent's ring and occurs in the name.
    if ring and ring != lowered_parent:
        for stem in _KNOWN_RING_STEMS_BY_LEN:
            if (_KNOWN_RING_STEMS[stem].lower() == ring
                    and remember(lowered_name.rfind(stem))):
                break

    # 4. The longest leading slice of the ring name (>= 3 chars) that
    #    occurs in the name, e.g. "benz" from "benzene" inside "benzoic".
    if ring and len(ring) >= 3:
        for size in range(len(ring), 2, -1):
            if remember(lowered_name.rfind(ring[:size])):
                break

    if not split_points:
        return None

    # Left-most split first: it gives the longest prefix section.
    for cut in sorted(split_points):
        head = name[:cut]
        tail = name[cut:]

        starts = _find_locant_group_starts(head)
        if len(starts) <= 1:
            continue

        # Pair every group start with the start of the next group
        # (or the end of the prefix section for the last one).
        stops = starts[1:] + [len(head)]
        groups: List[Tuple[int, str]] = []
        for begin, stop in zip(starts, stops):
            text = head[begin:stop]
            digits = re.match(r'(\d+)', text)
            groups.append((int(digits.group(1)) if digits else 0, text))

        ordered = sorted(groups, key=lambda g: g[0])
        if [g[0] for g in groups] == [g[0] for g in ordered]:
            continue  # already ascending at this split point

        # Drop each group's trailing hyphen, then rejoin with single ones.
        rebuilt = (head[:starts[0]]
                   + '-'.join(text.rstrip('-') for _, text in ordered)
                   + tail)
        return re.sub(r'-{2,}', '-', rebuilt)

    return None
1088
+
1089
+
1090
+ # ---------------------------------------------------------------------------
1091
+ # Retained name → substitutive prefix + ring decomposition
1092
+ # ---------------------------------------------------------------------------
1093
+ # Retained names like "aniline" are systematically "aminobenzene" (prefix
1094
+ # + ring). When the retained name appears as a parent with numbered
1095
+ # substituent prefixes (e.g. "4-fluoroaniline"), we can generate the
1096
+ # fully substitutive form "4-fluoro-1-aminobenzene" where the retained
1097
+ # name's defining substituent gets its own locant. This often yields a
1098
+ # closer text match in aligned sequences.
1099
+
1100
+ # (retained_name, substituent_prefix, ring_name, default_locant)
1101
+ # default_locant: position of the defining substituent in the standard
1102
+ # numbering. None means the retained name is used for multiple isomers
1103
+ # (e.g. naphthol can be 1- or 2-).
1104
+ _RETAINED_SUBSTITUTIVE = [
1105
+ ("aniline", "amino", "benzene", "1"),
1106
+ ("phenol", "hydroxy", "benzene", "1"),
1107
+ ("anisole", "methoxy", "benzene", "1"),
1108
+ ("thiophenol", "sulfanyl", "benzene", "1"),
1109
+ ("naphthol", "hydroxy", "naphthalene", None),
1110
+ ]
1111
+
1112
+
1113
+ def _retained_to_substitutive_variants(
1114
+ name: str, parent: str,
1115
+ ) -> List[Tuple[str, str]]:
1116
+ """Generate fully substitutive variants from retained parent names.
1117
+
1118
+ "4-fluoroaniline" → "4-fluoro-1-aminobenzene" (parent: benzene)
1119
+ "2,6-dichlorophenol" → "2,6-dichloro-1-hydroxybenzene" (parent: benzene)
1120
+
1121
+ The locant reorder pass later normalises to ascending order:
1122
+ "4-fluoro-1-aminobenzene" → "1-amino-4-fluorobenzene"
1123
+ """
1124
+ variants: List[Tuple[str, str]] = []
1125
+ name_lower = name.lower()
1126
+
1127
+ for retained, prefix, ring, locant in _RETAINED_SUBSTITUTIVE:
1128
+ if retained not in name_lower:
1129
+ continue
1130
+ if locant is None:
1131
+ continue # skip ambiguous retained names
1132
+
1133
+ idx = name_lower.index(retained)
1134
+ leading = name[:idx] # e.g. "4-fluoro" from "4-fluoroaniline"
1135
+ trailing = name[idx + len(retained):] # e.g. "" (usually empty)
1136
+
1137
+ # Build substitutive form: "leading-LOCANT-PREFIX-ring-trailing"
1138
+ if leading:
1139
+ # Ensure proper hyphenation
1140
+ lead = leading.rstrip('-')
1141
+ variant = f"{lead}-{locant}-{prefix}{ring}{trailing}"
1142
+ else:
1143
+ variant = f"{locant}-{prefix}{ring}{trailing}"
1144
+
1145
+ variant = re.sub(r'-{2,}', '-', variant)
1146
+ new_parent = ring
1147
+ variants.append((variant, new_parent))
1148
+
1149
+ return variants
1150
+
1151
+
1152
def _generate_alignment_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Collect every context-free alignment variant of *name*.

    Returns ``(variant_name, variant_parent)`` tuples.  The
    locant-reordered form comes first (only when one exists and differs
    from *name*), followed by the output of each variant generator in a
    fixed order.  No de-duplication is performed.
    """
    collected: List[Tuple[str, str]] = []

    # Ascending-locant normalisation is always attempted.
    ascending = _reorder_locant_prefixes(name, parent)
    if ascending and ascending != name:
        collected.append((ascending, parent))

    generators = (
        _ester_variants,
        _extended_suffix_prefix_variants,
        _extended_prefix_to_suffix_variants,
        _retained_systematic_variants,
        _retained_to_substitutive_variants,
        _indicated_h_variants,
        _general_suffix_prefix_variants,
    )
    for generate in generators:
        collected.extend(generate(name, parent))

    return collected
1174
+
1175
+
1176
+ # ---------------------------------------------------------------------------
1177
+ # Two-pass contextual variant generation
1178
+ # ---------------------------------------------------------------------------
1179
+
1180
def _contextual_variants(
    name: str, parent: str, neighbor_name: str,
) -> List[Tuple[str, str]]:
    """Produce variants of *name* chosen to resemble *neighbor_name*.

    The neighbor's text decides which variant generators are run on
    *name*.  Returns ``(variant_name, variant_parent)`` tuples; the same
    variant may appear more than once when several rules fire.
    """
    found: List[Tuple[str, str]] = []
    neighbor_lc = neighbor_name.lower()
    own_lc = name.lower()

    # 1. Acid / ester wording in the neighbor: try ester variants.
    #    The two checks are independent and may both fire.
    if "acid" in neighbor_lc:
        found.extend(_ester_variants(name, parent))
    if "carboxylate" in neighbor_lc or "ester" in neighbor_lc:
        found.extend(_ester_variants(name, parent))

    # 2. Retained vs systematic: fire once, on the first table entry for
    #    which the two names use opposite forms.
    for retained, systematic in _RETAINED_TO_SYSTEMATIC.items():
        systematic_lc = systematic.lower()
        neighbor_is_systematic = (
            systematic_lc in neighbor_lc and retained in own_lc)
        neighbor_is_retained = (
            retained in neighbor_lc and systematic_lc in own_lc)
        if neighbor_is_systematic or neighbor_is_retained:
            found.extend(_retained_systematic_variants(name, parent))
            break

    neighbor_tokens = set(_chem_tokens_flat(neighbor_name))
    own_tokens = set(_chem_tokens_flat(name))

    prefix_style = {"amino", "hydroxy", "oxo", "sulfanyl"}
    suffix_style = {"amine", "ol", "one", "thiol"}

    # 3. Neighbor writes the group as a prefix, we write it as a suffix.
    if neighbor_tokens & prefix_style and own_tokens & suffix_style:
        found.extend(_general_suffix_prefix_variants(name, parent))
        found.extend(_indicated_h_variants(name, parent))

    # 4. The mirror case: neighbor "pyridin-2-amine", ours "2-aminopyridine".
    if neighbor_tokens & suffix_style and own_tokens & prefix_style:
        found.extend(_prefix_to_suffix_variants(name, parent))

    # 5. Same two directions for the extended principal groups
    #    (formyl, carboxy, carbamoyl, cyano, alkoxycarbonyl).
    ext_prefix_tokens = {"formyl", "carboxy", "carbamoyl", "cyano", "carbonyl"}
    ext_suffix_tokens = {"carboxylate", "carbaldehyde", "carboxamide",
                         "carbonitrile", "carboxylic"}
    if neighbor_tokens & ext_prefix_tokens and own_tokens & ext_suffix_tokens:
        found.extend(_extended_suffix_prefix_variants(name, parent))
    if neighbor_tokens & ext_suffix_tokens and own_tokens & ext_prefix_tokens:
        found.extend(_extended_prefix_to_suffix_variants(name, parent))

    return found
1250
+
1251
+
1252
def _prefix_to_suffix_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Generate prefix→suffix variants (reverse of _general_suffix_prefix_variants).

    E.g., "2-aminopyridine" → "pyridin-2-amine"
          "4-oxoquinazoline" → "quinazolin-4-one"
          "6-chloro-2-aminopyridine" → "6-chloropyridin-2-amine"

    Substituent text in front of the converted prefix is carried over
    into the variant; dropping it would name a different compound.  The
    original casing of the retained text is preserved.  A prefix that
    only occurs inside a parenthesised/bracketed substituent (e.g. the
    "2-amino" of "4-(2-aminoethyl)pyridine") is not converted.

    *parent* is unused.  Each returned tuple is ``(variant, variant)``.
    """
    variants: List[Tuple[str, str]] = []
    name_lower = name.lower()

    prefix_map = {
        "amino": "amine",
        "hydroxy": "ol",
        "oxo": "one",
        "sulfanyl": "thiol",
    }

    for prefix, suffix in prefix_map.items():
        # "LOCANT-PREFIX" with an optional hyphen after it.
        pat = r'(\d+(?:,\d+)*)-' + re.escape(prefix) + r'[-]?'

        # Take the first occurrence that sits at bracket depth 0.
        m = None
        for cand in re.finditer(pat, name_lower):
            ahead = name_lower[:cand.start()]
            depth = (ahead.count('(') + ahead.count('[')
                     - ahead.count(')') - ahead.count(']'))
            if depth == 0:
                m = cand
                break
        if m is None:
            continue

        locants = m.group(1)
        head = name[:m.start()].rstrip('-')   # groups before the prefix
        tail = name[m.end():]                 # everything after "N-prefix-"
        tail_lower = name_lower[m.end():]

        # Find a known ring closing the remaining part.
        for ring in _KNOWN_RINGS_BY_LEN:
            if not tail_lower.endswith(ring):
                continue

            # Shortest stem mapping to this ring: the elided form needed
            # for suffix attachment ("pyridine" → "pyridin").
            stem = ring
            for s in _KNOWN_RING_STEMS_BY_LEN:
                if _KNOWN_RING_STEMS[s] == ring and len(s) < len(stem):
                    stem = s

            # Build suffix form: head + leading + STEM-LOCANT-SUFFIX
            leading = tail[:len(tail) - len(ring)]
            body = leading + stem
            # Re-insert a hyphen only in front of a numeric locant.
            joiner = "-" if head and body[:1].isdigit() else ""
            variant = head + joiner + body + "-" + locants + "-" + suffix
            variant = re.sub(r'-{2,}', '-', variant)
            variant = variant.strip('-')
            variants.append((variant, variant))
            break

    return variants
1302
+
1303
+
1304
+ # ---------------------------------------------------------------------------
1305
+ # DP Viterbi for multi-step sequence alignment
1306
+ # ---------------------------------------------------------------------------
1307
+
1308
+ def _dp_viterbi(
1309
+ names_per_compound: List[List[str]],
1310
+ metric_fn,
1311
+ minimize: bool = True,
1312
+ ) -> Tuple[List[str], float]:
1313
+ """Dynamic programming (Viterbi-style) optimal path.
1314
+
1315
+ O(N * M^2) where N = num compounds, M = max names per compound.
1316
+ Picks one name per compound to optimise the sum of consecutive
1317
+ pairwise metric values.
1318
+ """
1319
+ N = len(names_per_compound)
1320
+ if N == 0:
1321
+ return [], 0.0
1322
+
1323
+ dp = [{} for _ in range(N)]
1324
+ backptr = [{} for _ in range(N)]
1325
+
1326
+ for j, name in enumerate(names_per_compound[0]):
1327
+ dp[0][j] = 0.0
1328
+ backptr[0][j] = -1
1329
+
1330
+ for i in range(1, N):
1331
+ for j, name_j in enumerate(names_per_compound[i]):
1332
+ best_prev_score = float("inf") if minimize else float("-inf")
1333
+ best_prev_idx = 0
1334
+ for k, name_k in enumerate(names_per_compound[i - 1]):
1335
+ edge = metric_fn(name_k, name_j)
1336
+ cumulative = dp[i - 1][k] + edge
1337
+ if (minimize and cumulative < best_prev_score) or \
1338
+ (not minimize and cumulative > best_prev_score):
1339
+ best_prev_score = cumulative
1340
+ best_prev_idx = k
1341
+ dp[i][j] = best_prev_score
1342
+ backptr[i][j] = best_prev_idx
1343
+
1344
+ if minimize:
1345
+ last_idx = min(dp[N - 1], key=dp[N - 1].get)
1346
+ else:
1347
+ last_idx = max(dp[N - 1], key=dp[N - 1].get)
1348
+
1349
+ total_score = dp[N - 1][last_idx]
1350
+ path = [last_idx]
1351
+ for i in range(N - 1, 0, -1):
1352
+ path.append(backptr[i][path[-1]])
1353
+ path.reverse()
1354
+
1355
+ chosen = [names_per_compound[i][path[i]] for i in range(N)]
1356
+ return chosen, total_score
1357
+
1358
+
1359
def _make_parent_penalised_metric(base_metric_fn, name_to_parent: dict,
                                  penalty: float = 100.0):
    """Wrap *base_metric_fn* so that differing parent rings cost extra.

    The returned callable takes two names, looks each one up in
    *name_to_parent* (missing names count as ""), reduces both parents
    with ``extract_parent_ring`` and adds *penalty* to the base value
    when the two rings differ.
    """
    def metric(a: str, b: str) -> float:
        distance = base_metric_fn(a, b)
        ring_a = extract_parent_ring(name_to_parent.get(a, ""))
        ring_b = extract_parent_ring(name_to_parent.get(b, ""))
        surcharge = 0.0 if ring_a == ring_b else penalty
        return distance + surcharge
    return metric
1372
+
1373
+
1374
+ # ---------------------------------------------------------------------------
1375
+ # Name diff
1376
+ # ---------------------------------------------------------------------------
1377
+
1378
+ def _tokenize_iupac(name: str) -> List[str]:
1379
+ """Split an IUPAC name into tokens at dashes, parens, spaces, commas.
1380
+
1381
+ Delimiters are kept as separate tokens so that the reconstructed
1382
+ string ``''.join(tokens)`` equals the original name.
1383
+ """
1384
+ tokens: List[str] = []
1385
+ buf: List[str] = []
1386
+ for ch in name:
1387
+ if ch in '-() ,':
1388
+ if buf:
1389
+ tokens.append(''.join(buf))
1390
+ buf = []
1391
+ tokens.append(ch)
1392
+ else:
1393
+ buf.append(ch)
1394
+ if buf:
1395
+ tokens.append(''.join(buf))
1396
+ return tokens
1397
+
1398
+
1399
+ def _refine_replace(t1: str, t2: str,
1400
+ min_affix: int = 3) -> List[Tuple[str, str, str]]:
1401
+ """Refine a 'replace' op by stripping the shared prefix/suffix.
1402
+
1403
+ IUPAC substituents are often concatenated without a delimiter
1404
+ (e.g. "bromoquinolin"), so the token-level diff may lump a
1405
+ substituent and its parent into one replace op. Stripping the
1406
+ common head/tail recovers the clean diff.
1407
+
1408
+ Only strips a prefix/suffix if it is at least *min_affix* characters
1409
+ long, to avoid noisy single-character splits (e.g. the shared "e"
1410
+ in "carbamate" / "amine").
1411
+
1412
+ Returns a list of (tag, text1, text2) ops.
1413
+ """
1414
+ # Common prefix
1415
+ i = 0
1416
+ while i < min(len(t1), len(t2)) and t1[i] == t2[i]:
1417
+ i += 1
1418
+ if i < min_affix:
1419
+ i = 0 # too short — don't split
1420
+
1421
+ # Common suffix (not overlapping prefix)
1422
+ j = 0
1423
+ while (j < min(len(t1), len(t2)) - i
1424
+ and t1[-(j + 1)] == t2[-(j + 1)]):
1425
+ j += 1
1426
+ if j < min_affix:
1427
+ j = 0 # too short — don't split
1428
+
1429
+ prefix = t1[:i]
1430
+ suffix = t1[len(t1) - j:] if j else ""
1431
+ mid1 = t1[i:len(t1) - j] if j else t1[i:]
1432
+ mid2 = t2[i:len(t2) - j] if j else t2[i:]
1433
+
1434
+ ops: List[Tuple[str, str, str]] = []
1435
+ if prefix:
1436
+ ops.append(('equal', prefix, prefix))
1437
+ if mid1 and mid2:
1438
+ ops.append(('replace', mid1, mid2))
1439
+ elif mid1:
1440
+ ops.append(('delete', mid1, ''))
1441
+ elif mid2:
1442
+ ops.append(('insert', '', mid2))
1443
+ if suffix:
1444
+ ops.append(('equal', suffix, suffix))
1445
+ return ops
1446
+
1447
+
1448
def name_diff(name1: str, name2: str) -> List[Tuple[str, str, str]]:
    """Diff two IUPAC names token by token.

    Both names are split at the IUPAC delimiters (``- ( ) , space``) and
    the token lists are compared with ``difflib.SequenceMatcher``
    (``autojunk`` disabled).  Every ``replace`` opcode is passed through
    ``_refine_replace`` so that fused tokens such as ``bromoquinolin``
    come out as ``bromo`` (changed) + ``quinolin`` (equal).

    Returns ``(tag, from_text, to_text)`` tuples, *tag* being one of
    ``'equal'``, ``'replace'``, ``'delete'``, ``'insert'``; the texts are
    the concatenated tokens of each side.

    Example::

        >>> name_diff('4-fluoropyridine', '4-(piperidin-1-yl)pyridine')
        [('equal', '4-', '4-'),
         ('replace', 'fluoro', '(piperidin-1-yl)'),
         ('equal', 'pyridine', 'pyridine')]
    """
    left = _tokenize_iupac(name1)
    right = _tokenize_iupac(name2)
    matcher = difflib.SequenceMatcher(None, left, right, autojunk=False)

    ops: List[Tuple[str, str, str]] = []
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        old_text = ''.join(left[a_lo:a_hi])
        new_text = ''.join(right[b_lo:b_hi])
        if tag != 'replace':
            ops.append((tag, old_text, new_text))
            continue
        ops.extend(_refine_replace(old_text, new_text))
    return ops
1481
+
1482
+
1483
def format_name_diff(name1: str, name2: str) -> str:
    """Summarise, as plain text, what changed between two aligned names.

    Replacements read ``old -> new``, deletions ``(-old)`` and insertions
    ``(+new)``; several changes are joined with `` ; ``.  When nothing
    changed the result is ``(identical)``.
    """
    # One formatter per op tag that counts as a change; 'equal' ops have
    # no entry and are therefore left out of the summary.
    describe = {
        'replace': lambda old, new: f"{old} -> {new}",
        'delete': lambda old, new: f"(-{old})",
        'insert': lambda old, new: f"(+{new})",
    }
    summary = [
        describe[tag](old, new)
        for tag, old, new in name_diff(name1, name2)
        if tag in describe
    ]
    if not summary:
        return "(identical)"
    return " ; ".join(summary)
1499
+
1500
+
1501
def format_name_diff_html(name1: str, name2: str) -> str:
    """Render the diff between two aligned names as an inline HTML fragment.

    Unchanged text is emitted as escaped plain text.  Removed/old text is
    wrapped in ``<span class="diff-del">``, added/new text in
    ``<span class="diff-ins">``; a replacement shows both, separated by a
    ``diff-arrow`` span.  All name text is HTML-escaped.  Colours and
    strikethrough are left to the stylesheet behind those classes.

    No enclosing element is added.
    """
    def render(tag: str, old: str, new: str) -> str:
        if tag == 'equal':
            return html_mod.escape(old)
        if tag == 'replace':
            return (
                f'<span class="diff-del">{html_mod.escape(old)}</span>'
                f'<span class="diff-arrow">\u2192</span>'
                f'<span class="diff-ins">{html_mod.escape(new)}</span>')
        if tag == 'delete':
            return f'<span class="diff-del">{html_mod.escape(old)}</span>'
        if tag == 'insert':
            return f'<span class="diff-ins">{html_mod.escape(new)}</span>'
        return ''  # unknown tags contribute nothing

    return ''.join(render(tag, old, new)
                   for tag, old, new in name_diff(name1, name2))
1526
+
1527
+
1528
+ # ---------------------------------------------------------------------------
1529
+ # Alignment result dataclass
1530
+ # ---------------------------------------------------------------------------
1531
+
1532
@dataclass
class AlignmentResult:
    """Outcome of aligning the names of one SM→product pair."""

    # SMILES of the starting material and of the product.
    sm_smiles: str
    prod_smiles: str

    # Decomposition results for each side, when available.
    sm_result: Optional[DecompositionResult] = None
    prod_result: Optional[DecompositionResult] = None

    # Exact parent matches: (sm_name, prod_name, shared_parent)
    aligned_pairs: List[Tuple[str, str, str]] = field(default_factory=list)

    # Most similar name pair (not necessarily an exact parent match)
    # together with its similarity score.
    best_sm_name: str = ""
    best_prod_name: str = ""
    best_similarity: float = 0.0

    @property
    def is_aligned(self) -> bool:
        """True when at least one exact parent match was recorded."""
        return bool(self.aligned_pairs)

    @property
    def alignment_quality(self) -> str:
        """Return "ALIGNED", "SEMI-ALIGNED" or "UNALIGNED".

        Any exact match gives "ALIGNED"; otherwise a best similarity of
        at least 0.5 gives "SEMI-ALIGNED".
        """
        if self.aligned_pairs:
            return "ALIGNED"
        if self.best_similarity >= 0.5:
            return "SEMI-ALIGNED"
        return "UNALIGNED"
1561
+
1562
+
1563
+ # ---------------------------------------------------------------------------
1564
+ # Core alignment function
1565
+ # ---------------------------------------------------------------------------
1566
+
1567
def find_aligned_names(sm_smiles: str, prod_smiles: str,
                       verbose: bool = False,
                       preferred_parent: Optional[str] = None,
                       ) -> AlignmentResult:
    """Find aligned name pairs for SM→product that share a naming parent.

    Parameters
    ----------
    sm_smiles, prod_smiles : str
        SMILES for starting material and product.
    verbose : bool
        Forwarded to ``decompose_name`` to print debug info.
    preferred_parent : str, optional
        Substring to match against available naming parents.  When set,
        the best-similarity pair is first searched among name pairs whose
        parents BOTH contain this string (or its stem without a trailing
        ``e``, e.g. ``"quinolin"`` for ``"quinoline"``), which keeps the
        naming parent consistent across a multi-step scheme.  That pair is
        used only if its similarity is at least 0.30; otherwise the
        unrestricted best pair is used.

    Returns
    -------
    AlignmentResult
        ``aligned_pairs`` holds every (sm_name, prod_name, parent) whose
        lower-cased parent strings are equal.  Placeholder parents
        (empty, ``"(unknown)"``, ``"(canonical)"``) never count as a shared
        parent.  If either decomposition reports errors the result is
        returned with only ``sm_result`` / ``prod_result`` filled in.
    """
    result = AlignmentResult(sm_smiles=sm_smiles, prod_smiles=prod_smiles)

    sm_result = decompose_name(sm_smiles, verbose=verbose)
    prod_result = decompose_name(prod_smiles, verbose=verbose)
    result.sm_result = sm_result
    result.prod_result = prod_result

    if sm_result.errors or prod_result.errors:
        return result

    def _candidates(decomp):
        """Canonical name plus every valid alternative, each with its parent."""
        names = [(decomp.canonical_name,
                  decomp.canonical_parent or "(unknown)")]
        names.extend((alt.name, alt.parent_name)
                     for alt in decomp.alternatives if alt.valid)
        return names

    sm_names = _candidates(sm_result)
    prod_names = _candidates(prod_result)

    # Parents that carry no information.  Two compounds that both lack a
    # real parent must not be reported as sharing one: the canonical entry
    # uses "(unknown)" as its placeholder, so filtering only "(canonical)"
    # let such trivial matches through.
    placeholder_parents = {"", "(canonical)", "(unknown)"}

    def _group_by_parent(named):
        """Map lower-cased real parent -> names, skipping placeholders."""
        grouped = defaultdict(list)
        for name, parent in named:
            key = (parent or "").lower()
            if key not in placeholder_parents:
                grouped[key].append(name)
        return grouped

    sm_by_parent = _group_by_parent(sm_names)
    prod_by_parent = _group_by_parent(prod_names)

    # Direct parent match: cross product of names under each shared parent.
    for parent_key, sm_group in sm_by_parent.items():
        prod_group = prod_by_parent.get(parent_key)
        if not prod_group:
            continue
        for sm_name in sm_group:
            for prod_name in prod_group:
                result.aligned_pairs.append((sm_name, prod_name, parent_key))

    # Preferred-parent keys: the substring itself and, because -yl suffixed
    # names drop the final 'e' ("quinolin-2-yl"), its truncated stem.
    pref_key = preferred_parent.lower().strip() if preferred_parent else ""
    pref_keys = []
    if pref_key:
        pref_keys.append(pref_key)
        if pref_key.endswith('e'):
            pref_keys.append(pref_key[:-1])

    def _has_pref(text: Optional[str]) -> bool:
        """Check if text contains the preferred parent (or its stem)."""
        lowered = (text or "").lower()
        return any(pk in lowered for pk in pref_keys)

    best_sim = 0.0
    best_pair = ("", "")
    best_pref_sim = 0.0
    best_pref_pair = ("", "")

    for sm_name, sm_par in sm_names:
        for prod_name, prod_par in prod_names:
            sim = name_similarity(sm_name, prod_name)
            if sim > best_sim:
                best_sim = sim
                best_pair = (sm_name, prod_name)
            # Track separately the best pair whose parents both match the
            # preferred parent.
            if (pref_keys
                    and sim > best_pref_sim
                    and _has_pref(sm_par)
                    and _has_pref(prod_par)):
                best_pref_sim = sim
                best_pref_pair = (sm_name, prod_name)

    # Use the preferred-parent pair if it exists and has reasonable
    # similarity (at least 30% — just enough to filter out nonsense).
    if best_pref_pair[0] and best_pref_sim >= 0.30:
        result.best_sm_name, result.best_prod_name = best_pref_pair
        result.best_similarity = best_pref_sim
    else:
        result.best_sm_name, result.best_prod_name = best_pair
        result.best_similarity = best_sim

    return result
1684
+
1685
+
1686
+ # ---------------------------------------------------------------------------
1687
+ # Multi-step sequence alignment
1688
+ # ---------------------------------------------------------------------------
1689
+
1690
@dataclass
class SequenceAlignmentResult:
    """Names chosen for every compound of a multi-step synthetic route.

    The list-valued fields are positional: entry ``i`` refers to the
    ``i``-th SMILES of ``smiles_list``.
    """
    smiles_list: List[str]          # input SMILES, in synthesis order
    chosen_names: List[str]         # one selected name per compound
    parent_names: List[str]         # parent string recorded for each chosen name
    parent_rings: List[str]         # extract_parent_ring() of each parent name
    parent_switches: int            # consecutive pairs whose parent rings differ
    base_score: float               # summed chem_token_diff_count over consecutive pairs
    decomposition_results: List[Optional[DecompositionResult]] = field(
        default_factory=list)
    errors: List[str] = field(default_factory=list)

    @property
    def is_fully_aligned(self) -> bool:
        """True when no parent-ring switch occurs along the route."""
        return not self.parent_switches
1706
+
1707
+
1708
def find_aligned_name_sequence(
        smiles_list: List[str],
        verbose: bool = False,
        parent_penalty: float = 100.0,
        timeout: float = 30.0,
) -> SequenceAlignmentResult:
    """Pick one IUPAC name per intermediate to minimise parent-ring switches.

    Uses parent-aware Viterbi DP: the objective is to minimise parent
    switches first (penalty >> base metric), then minimise chemistry-aware
    token diff as tiebreaker.  A second DP pass is run after generating
    contextual variants aimed at the names each compound's neighbours chose
    in the first pass.  Finally the locant order of each chosen name is
    normalised.

    Parameters
    ----------
    smiles_list : list of str
        SMILES for each intermediate in synthesis order.
    verbose : bool
        Print debug info during decomposition.
    parent_penalty : float
        Penalty added when consecutive names have different parent rings.
        Must be >> max possible base metric value.
    timeout : float
        Per-compound decomposition timeout in seconds.

    Returns
    -------
    SequenceAlignmentResult
        All per-compound lists have exactly one entry per input SMILES.
        A compound whose processing raised gets the placeholder name
        ``"[<first 30 chars of SMILES>]"``, a ``None`` decomposition result
        and an entry in ``errors``.
    """

    def _effective_parent(name: str, parent: str) -> str:
        """Return the string to use as *name*'s parent for ring extraction.

        An empty parent (retained names, single decompositions) falls back
        to the name itself.  So does a parent in which no ring is recognised
        while the full name does contain one (decomposer prefix-stripping
        can eat part of the ring name).
        """
        if not parent:
            return name
        if extract_parent_ring(parent) == parent.lower().strip():
            if extract_parent_ring(name) != name.lower().strip():
                return name
        return parent

    names_per_compound: List[List[str]] = []
    name_to_parent: Dict[str, str] = {}
    decomp_results: List[Optional[DecompositionResult]] = []
    errors: List[str] = []
    canonical_smiles: List[Optional[str]] = []  # for variant validation

    for smi in smiles_list:
        # Build everything for this compound in locals and commit only once
        # it has all succeeded.  Appending to the shared lists midway would
        # leave duplicate entries (and shift every later compound's index)
        # if a later step raised and the fallback branch appended again.
        try:
            r = decompose_name(smi, verbose=verbose, timeout=timeout)

            all_names = [(r.canonical_name, r.canonical_parent or "")]
            all_names.extend((alt.name, alt.parent_name)
                             for alt in r.alternatives if alt.valid)

            # Alignment variants are round-trip validated: name → SMILES →
            # canonical must match this compound's canonical SMILES.
            expected_canon = _canonical(smi)
            seen_names = {n for n, _ in all_names}
            extra = []
            for n, p in all_names:
                for vn, vp in _generate_alignment_variants(n, p):
                    if vn in seen_names:
                        continue
                    if expected_canon and _validate_variant(vn, expected_canon):
                        extra.append((vn, vp))
                        seen_names.add(vn)
            all_names.extend(extra)

            compound_parents = {n: _effective_parent(n, p)
                                for n, p in all_names}
            compound_error = (
                f"{smi[:40]}: {'; '.join(r.errors)}" if r.errors else None)
        except Exception as e:
            decomp_results.append(None)
            canonical_smiles.append(None)
            fallback = f"[{smi[:30]}]"
            names_per_compound.append([fallback])
            name_to_parent[fallback] = ""
            errors.append(f"{smi[:40]}: {e}")
            continue

        decomp_results.append(r)
        canonical_smiles.append(expected_canon)
        names_per_compound.append([n for n, _ in all_names])
        name_to_parent.update(compound_parents)
        if compound_error is not None:
            errors.append(compound_error)

    # --- Pass 1: parent-aware DP with chem_token_diff_count as base metric
    penalised_fn = _make_parent_penalised_metric(
        chem_token_diff_count, name_to_parent, parent_penalty)
    pass1_chosen, _total = _dp_viterbi(
        names_per_compound, penalised_fn, minimize=True)

    # --- Pass 2: each compound looks at what its neighbours chose in
    # Pass 1 and generates variants targeted at that naming style; the DP
    # is then re-run on the expanded candidate lists.
    names_per_compound_p2 = [list(names) for names in names_per_compound]
    added_any = False
    n_chosen = len(pass1_chosen)
    for i in range(n_chosen):
        candidates = names_per_compound_p2[i]
        existing = set(candidates)

        # Default parent for this compound (parent of its first name).
        comp_parent = name_to_parent.get(
            names_per_compound[i][0], "") if names_per_compound[i] else ""

        # Previous neighbour first, then next neighbour.
        neighbours = [pass1_chosen[j] for j in (i - 1, i + 1)
                      if 0 <= j < n_chosen]
        ctx_variants: List[Tuple[str, str]] = []
        for neighbour_name in neighbours:
            for n in candidates:
                ctx_variants.extend(
                    _contextual_variants(
                        n, name_to_parent.get(n, comp_parent),
                        neighbour_name))

        # Add new unique variants (validated against canonical SMILES when
        # one is available for this compound).
        exp_canon = canonical_smiles[i]
        for vn, vp in ctx_variants:
            if vn in existing:
                continue
            existing.add(vn)  # whether valid or not, don't try it again
            if exp_canon and not _validate_variant(vn, exp_canon):
                continue
            candidates.append(vn)
            added_any = True
            name_to_parent[vn] = _effective_parent(vn, vp)

    # Re-run DP only if we actually added new variants
    if added_any:
        penalised_fn_p2 = _make_parent_penalised_metric(
            chem_token_diff_count, name_to_parent, parent_penalty)
        chosen, _total = _dp_viterbi(
            names_per_compound_p2, penalised_fn_p2, minimize=True)
    else:
        chosen = pass1_chosen

    # --- Post-DP: normalise locant order to ascending ----------------------
    # chem_token_diff_count is order-agnostic (multiset), so the DP cannot
    # distinguish "5-X-2-Y-ring" from "2-Y-5-X-ring".  IUPAC convention
    # demands ascending locants, so we normalise here.
    for i, name in enumerate(chosen):
        parent = name_to_parent.get(name, "")
        reordered = _reorder_locant_prefixes(name, parent)
        if reordered and reordered != name:
            exp_canon = canonical_smiles[i]
            if exp_canon is None or _validate_variant(reordered, exp_canon):
                chosen[i] = reordered
                name_to_parent[reordered] = parent

    # Compute actual stats for the final selection.
    parent_names = [name_to_parent.get(n, "") for n in chosen]
    parent_rings = [extract_parent_ring(p) for p in parent_names]

    base_score = 0.0
    switches = 0
    for i in range(len(chosen) - 1):
        base_score += chem_token_diff_count(chosen[i], chosen[i + 1])
        if parent_rings[i] != parent_rings[i + 1]:
            switches += 1

    return SequenceAlignmentResult(
        smiles_list=smiles_list,
        chosen_names=chosen,
        parent_names=parent_names,
        parent_rings=parent_rings,
        parent_switches=switches,
        base_score=base_score,
        decomposition_results=decomp_results,
        errors=errors,
    )
1891
+
1892
+
1893
+ # ---------------------------------------------------------------------------
1894
+ # Molecular diff (MCS-based)
1895
+ # ---------------------------------------------------------------------------
1896
+
1897
@dataclass
class FragmentChange:
    """A single changed fragment reported by the molecular diff.

    Attributes:
        sm_frag_smiles: ``[*]``-bearing SMILES of the fragment on the
            starting-material side (``""`` for additions).
        prod_frag_smiles: ``[*]``-bearing SMILES of the fragment on the
            product side (``""`` for removals).
        sm_name: Substituent name on the SM side (``"fluoro"``, ``"H"``, ...).
        prod_name: Substituent name on the product side (``"phenyl"``, ...).
        change_type: One of ``"replace"``, ``"addition"``, ``"removal"``.
    """
    sm_frag_smiles: str
    prod_frag_smiles: str
    sm_name: str
    prod_name: str
    change_type: str
1905
+
1906
+
1907
@dataclass
class MolecularDiffResult:
    """Outcome of the MCS-based diff between a starting material and product.

    Attributes:
        sm_smiles: Starting-material SMILES as supplied by the caller.
        prod_smiles: Product SMILES as supplied by the caller.
        changes: Fragment-level changes found (may be empty).
        mcs_num_atoms: Number of atoms in the common substructure.
        fallback_used: True when no usable MCS diff could be produced and
            callers should fall back to a text diff.
        fallback_text: Optional message explaining the fallback.
        stereo_only: Set by ``molecular_diff`` for pairs whose difference
            is not expressible as fragment changes (intended for
            stereo-only differences).
    """
    sm_smiles: str
    prod_smiles: str
    changes: List[FragmentChange]
    mcs_num_atoms: int
    fallback_used: bool = False
    fallback_text: str = ""
    stereo_only: bool = False
1917
+
1918
+
1919
def _get_connected_components(mol: Chem.Mol,
                              atom_indices: set) -> List[set]:
    """Partition ``atom_indices`` into groups that are connected in ``mol``.

    Two atoms end up in the same group when a path of bonds joins them
    that only passes through atoms contained in ``atom_indices``.
    """
    seen: set = set()
    groups: List[set] = []
    for seed in atom_indices:
        if seed in seen:
            continue
        members: set = set()
        pending = [seed]
        # Depth-first walk restricted to the supplied atom set.
        while pending:
            current = pending.pop()
            if current in seen:
                continue
            seen.add(current)
            members.add(current)
            for neighbour in mol.GetAtomWithIdx(current).GetNeighbors():
                neighbour_idx = neighbour.GetIdx()
                if neighbour_idx in atom_indices and neighbour_idx not in seen:
                    pending.append(neighbour_idx)
        groups.append(members)
    return groups
1942
+
1943
+
1944
def _extract_fragment_smiles(mol: Chem.Mol, frag_atoms: set,
                             attachments: List[Tuple[int, int]]
                             ) -> str:
    """Build a SMILES for one fragment, marking attachment points with [*].

    Args:
        mol: Molecule the fragment is cut from.
        frag_atoms: Indices (in ``mol``) of the atoms forming the fragment.
        attachments: ``(frag_atom_idx, core_atom_idx)`` pairs, one per bond
            that crosses from the fragment into the MCS core.

    Returns:
        SMILES such as ``"[*]c1ccccc1"`` for a phenyl fragment, or ``""``
        when the rebuilt fragment cannot be sanitized.
    """
    builder = Chem.RWMol()
    index_map: dict = {}

    # Copy the fragment atoms (element, charge, explicit Hs, aromatic flag).
    for src_idx in sorted(frag_atoms):
        template = mol.GetAtomWithIdx(src_idx)
        clone = Chem.Atom(template.GetAtomicNum())
        clone.SetFormalCharge(template.GetFormalCharge())
        clone.SetNumExplicitHs(template.GetNumExplicitHs())
        clone.SetIsAromatic(template.GetIsAromatic())
        index_map[src_idx] = builder.AddAtom(clone)

    # One [*] dummy per distinct core atom; the crossing bond keeps its
    # original order (single if the bond cannot be found).
    dummy_for_core: dict = {}
    for frag_idx, core_idx in attachments:
        if core_idx not in dummy_for_core:
            dummy_for_core[core_idx] = builder.AddAtom(Chem.Atom(0))
        crossing = mol.GetBondBetweenAtoms(frag_idx, core_idx)
        order = crossing.GetBondType() if crossing else Chem.BondType.SINGLE
        builder.AddBond(index_map[frag_idx], dummy_for_core[core_idx], order)

    # Bonds internal to the fragment; the index comparison adds each once.
    for src_idx in frag_atoms:
        for bond in mol.GetAtomWithIdx(src_idx).GetBonds():
            partner = bond.GetOtherAtomIdx(src_idx)
            if partner in frag_atoms and src_idx < partner:
                builder.AddBond(index_map[src_idx], index_map[partner],
                                bond.GetBondType())

    # First attempt: full sanitization.  Second attempt: everything except
    # aromaticity perception.  Give up with "" if both fail.
    for full_sanitize in (True, False):
        try:
            if full_sanitize:
                Chem.SanitizeMol(builder)
            else:
                Chem.SanitizeMol(builder, Chem.SanitizeFlags.SANITIZE_ALL
                                 ^ Chem.SanitizeFlags.SANITIZE_SETAROMATICITY)
            return Chem.MolToSmiles(builder)
        except Exception:
            continue
    return ""
2001
+
2002
+
2003
def _name_fragment(frag_smiles: str) -> str:
    """Return a substituent name for a fragment SMILES.

    Empty input and fragments without heavy atoms are named ``"H"``.  A
    lone oxygen or sulfur is named by its bond to the ``[*]`` attachment
    (``oxo``/``hydroxy``, ``thioxo``/``sulfanyl``).  Everything else goes to
    ``name_fragment_as_substituent``; if that yields nothing, or the SMILES
    cannot be parsed, the SMILES itself is returned.
    """
    if not frag_smiles:
        return "H"

    mol = Chem.MolFromSmiles(frag_smiles)
    if mol is None:
        return frag_smiles

    # Dummy atoms (atomic number 0) and hydrogens do not count as heavy.
    heavy_atoms = [a for a in mol.GetAtoms() if a.GetAtomicNum() > 1]
    if not heavy_atoms:
        return "H"

    if len(heavy_atoms) == 1:
        # The generic substituent namer may not distinguish bond order, so
        # =O / -OH and =S / -SH are resolved here.
        lone = heavy_atoms[0]
        by_element = {
            8: ("oxo", "hydroxy"),        # oxygen
            16: ("thioxo", "sulfanyl"),   # sulfur
        }
        names = by_element.get(lone.GetAtomicNum())
        if names is not None:
            double_bonded_name, single_bonded_name = names
            for bond in lone.GetBonds():
                to_dummy = bond.GetOtherAtom(lone).GetAtomicNum() == 0  # [*]
                if to_dummy and bond.GetBondTypeAsDouble() == 2.0:
                    return double_bonded_name
            return single_bonded_name

    generic = name_fragment_as_substituent(frag_smiles, verbose=False)
    return generic if generic else frag_smiles
2035
+
2036
+
2037
def molecular_diff(sm_smiles: str, prod_smiles: str,
                   min_mcs_ratio: float = 0.4,
                   verbose: bool = False) -> MolecularDiffResult:
    """Compute molecular-level diff between SM and product using MCS.

    Finds the Maximum Common Substructure (invariant core), extracts
    changed fragments from each side, names them as IUPAC substituents,
    and returns structured diff results.

    The result has ``fallback_used=True`` (so callers can fall back to a
    text diff) when either SMILES is invalid, the MCS search fails, is
    cancelled, finds fewer than 3 atoms, or covers less than
    ``min_mcs_ratio`` of the smaller molecule.

    Args:
        sm_smiles: Starting material SMILES.
        prod_smiles: Product SMILES.
        min_mcs_ratio: Minimum fraction of smaller molecule covered by MCS.
            Below this, falls back to text diff.
        verbose: Print debug info to stderr.

    Returns:
        MolecularDiffResult with list of FragmentChange entries.  When the
        MCS covers every atom of both molecules there are no fragment
        changes; ``stereo_only`` is then True unless both molecules have the
        same canonical SMILES (i.e. they are identical).
    """
    empty = MolecularDiffResult(sm_smiles=sm_smiles, prod_smiles=prod_smiles,
                                changes=[], mcs_num_atoms=0)

    sm_mol = Chem.MolFromSmiles(sm_smiles)
    prod_mol = Chem.MolFromSmiles(prod_smiles)
    if sm_mol is None or prod_mol is None:
        empty.fallback_used = True
        empty.fallback_text = "(invalid SMILES)"
        return empty

    sm_n = sm_mol.GetNumAtoms()
    prod_n = prod_mol.GetNumAtoms()

    # --- MCS computation ---
    try:
        mcs = rdFMCS.FindMCS(
            [sm_mol, prod_mol],
            threshold=1.0,
            ringMatchesRingOnly=True,
            completeRingsOnly=True,
            atomCompare=rdFMCS.AtomCompare.CompareElements,
            bondCompare=rdFMCS.BondCompare.CompareOrder,
            timeout=5,
        )
    except Exception:
        empty.fallback_used = True
        return empty

    if mcs.canceled or mcs.numAtoms < 3:
        empty.fallback_used = True
        return empty

    # --- Quality gate ---
    smaller = min(sm_n, prod_n)
    if mcs.numAtoms < min_mcs_ratio * smaller:
        if verbose:
            print(f"  MCS too small: {mcs.numAtoms}/{smaller} "
                  f"({mcs.numAtoms/smaller:.0%})", file=sys.stderr)
        empty.fallback_used = True
        return empty

    # --- Atom mappings ---
    core = Chem.MolFromSmarts(mcs.smartsString)
    if core is None:
        empty.fallback_used = True
        return empty

    sm_match = sm_mol.GetSubstructMatch(core)
    prod_match = prod_mol.GetSubstructMatch(core)
    if not sm_match or not prod_match:
        empty.fallback_used = True
        return empty

    sm_core = set(sm_match)
    prod_core = set(prod_match)

    # --- No atoms changed: identical or stereo-only ---
    if mcs.numAtoms == sm_n == prod_n:
        # Equal canonical SMILES means the molecules are the same, which
        # must not be reported as a stereo change.
        identical = Chem.MolToSmiles(sm_mol) == Chem.MolToSmiles(prod_mol)
        return MolecularDiffResult(
            sm_smiles=sm_smiles, prod_smiles=prod_smiles,
            changes=[], mcs_num_atoms=mcs.numAtoms,
            stereo_only=not identical)

    # --- Extract non-MCS atoms ---
    sm_non_mcs = set(range(sm_n)) - sm_core
    prod_non_mcs = set(range(prod_n)) - prod_core

    # --- Group into connected components ---
    sm_comps = _get_connected_components(sm_mol, sm_non_mcs)
    prod_comps = _get_connected_components(prod_mol, prod_non_mcs)

    if verbose:
        print(f"  MCS: {mcs.numAtoms} atoms. SM changed: {len(sm_non_mcs)} "
              f"in {len(sm_comps)} frag(s). Prod changed: {len(prod_non_mcs)} "
              f"in {len(prod_comps)} frag(s).", file=sys.stderr)

    # --- Find attachment points ---
    # For each component, find bonds from non-MCS to MCS atoms.
    # Key: MCS core position (index in sm_match/prod_match tuple)
    # Multiple fragments can attach to the same core atom (e.g. Grignard
    # addition: C=O → C(OH)(R) produces two product fragments on one atom).
    def _find_attachments(mol, components, core_set, match_tuple):
        """Return {mcs_pos: [(component, [(frag_idx, core_idx), ...]), ...]}."""
        # Atom index -> position in the match tuple (one pass instead of a
        # linear tuple.index() per attachment).
        pos_of = {atom_idx: pos for pos, atom_idx in enumerate(match_tuple)}
        attach_map: dict = {}  # mcs_pos -> list of (comp, atts)
        for comp in components:
            atts: List[Tuple[int, int]] = []
            for atom_idx in comp:
                for nbr in mol.GetAtomWithIdx(atom_idx).GetNeighbors():
                    nidx = nbr.GetIdx()
                    if nidx in core_set:
                        atts.append((atom_idx, nidx))
            # Key by MCS core position (to enable pairing).  A component
            # bonded to several core atoms is registered once under each
            # distinct position.
            mcs_positions_seen: set = set()
            for _, core_idx in atts:
                mcs_pos = pos_of[core_idx]
                if mcs_pos not in mcs_positions_seen:
                    mcs_positions_seen.add(mcs_pos)
                    attach_map.setdefault(mcs_pos, []).append((comp, atts))
        return attach_map

    sm_attach = _find_attachments(sm_mol, sm_comps, sm_core, sm_match)
    prod_attach = _find_attachments(prod_mol, prod_comps, prod_core, prod_match)

    # --- Pair fragments by shared MCS attachment point ---
    all_mcs_positions = set(sm_attach) | set(prod_attach)
    changes: List[FragmentChange] = []

    for mcs_pos in sorted(all_mcs_positions):
        # (fragment SMILES, substituent name) for each side at this position
        sm_frags = []
        for comp, atts in sm_attach.get(mcs_pos, []):
            smi = _extract_fragment_smiles(sm_mol, comp, atts)
            sm_frags.append((smi, _name_fragment(smi) if smi else "H"))
        prod_frags = []
        for comp, atts in prod_attach.get(mcs_pos, []):
            smi = _extract_fragment_smiles(prod_mol, comp, atts)
            prod_frags.append((smi, _name_fragment(smi) if smi else "H"))

        if sm_frags and prod_frags:
            # Replacement at this position.  Multiple fragments on one
            # side are part of the same transformation (e.g. Grignard
            # C=O → C(OH)(R)), so combine all names with " + ".  Only the
            # first fragment SMILES of each side is recorded.
            changes.append(FragmentChange(
                sm_frag_smiles=sm_frags[0][0],
                prod_frag_smiles=prod_frags[0][0],
                sm_name=" + ".join(n for _, n in sm_frags),
                prod_name=" + ".join(n for _, n in prod_frags),
                change_type="replace",
            ))
        elif sm_frags:
            # Pure removals (nothing on product side at this position)
            for smi, name in sm_frags:
                changes.append(FragmentChange(
                    sm_frag_smiles=smi, prod_frag_smiles="",
                    sm_name=name, prod_name="H",
                    change_type="removal",
                ))
        elif prod_frags:
            # Pure additions (nothing on SM side at this position)
            for smi, name in prod_frags:
                changes.append(FragmentChange(
                    sm_frag_smiles="", prod_frag_smiles=smi,
                    sm_name="H", prod_name=name,
                    change_type="addition",
                ))

    # --- Post-processing: merge unpaired removals + additions ---
    # Symmetric molecules (e.g. benzene) can cause the MCS to map
    # substituted carbons to different positions, so a true substitution
    # appears as a removal + addition.  Merge them into replacements.
    removals = [c for c in changes if c.change_type == "removal"]
    additions = [c for c in changes if c.change_type == "addition"]

    if removals and additions:
        paired_changes = [c for c in changes if c.change_type == "replace"]
        # Pair removals with additions (1:1, in order)
        n_pairs = min(len(removals), len(additions))
        for removed, added in zip(removals, additions):
            paired_changes.append(FragmentChange(
                sm_frag_smiles=removed.sm_frag_smiles,
                prod_frag_smiles=added.prod_frag_smiles,
                sm_name=removed.sm_name,
                prod_name=added.prod_name,
                change_type="replace",
            ))
        # Keep any leftover unpaired removals/additions
        paired_changes.extend(removals[n_pairs:])
        paired_changes.extend(additions[n_pairs:])
        changes = paired_changes

    return MolecularDiffResult(
        sm_smiles=sm_smiles, prod_smiles=prod_smiles,
        changes=changes, mcs_num_atoms=mcs.numAtoms)
2237
+
2238
+
2239
+ # ---------------------------------------------------------------------------
2240
+ # Molecular diff formatting
2241
+ # ---------------------------------------------------------------------------
2242
+
2243
def format_molecular_diff(sm_smiles: str, prod_smiles: str,
                          alignment_result: Optional['AlignmentResult'] = None
                          ) -> str:
    """Describe the SM→product change in plain text, e.g. ``fluoro → phenyl``.

    The changed fragments come from ``molecular_diff`` and are shown by
    their substituent names; several changes are joined with `` ; ``.

    If the molecular diff signals a fallback, a text diff of two names
    (``format_name_diff``) is returned instead.  The names are the best
    pair of ``alignment_result`` when one is supplied, otherwise they are
    looked up with ``_quick_name``.  If either name is missing, the
    fallback text or ``"(no diff available)"`` is returned.
    """
    diff = molecular_diff(sm_smiles, prod_smiles)

    if diff.fallback_used:
        # Fall back to a text diff using the best available names.
        if alignment_result:
            sm_label = alignment_result.best_sm_name or ""
            prod_label = alignment_result.best_prod_name or ""
        else:
            sm_label = _quick_name(sm_smiles)
            prod_label = _quick_name(prod_smiles)
        if sm_label and prod_label:
            return format_name_diff(sm_label, prod_label)
        return diff.fallback_text or "(no diff available)"

    if diff.stereo_only:
        return "(stereo change)"
    if not diff.changes:
        return "(identical)"

    rendered = []
    for change in diff.changes:
        kind = change.change_type
        if kind == "replace":
            before, after = change.sm_name, change.prod_name
        elif kind == "removal":
            before, after = change.sm_name, "H"
        elif kind == "addition":
            before, after = "H", change.prod_name
        else:
            # Unknown change types are not rendered.
            continue
        rendered.append(f"{before} \u2192 {after}")

    if not rendered:
        return "(identical)"
    return " ; ".join(rendered)
2281
+
2282
+
2283
def format_molecular_diff_html(sm_smiles: str, prod_smiles: str,
                               alignment_result: Optional['AlignmentResult'] = None
                               ) -> str:
    """Describe the SM→product change as HTML with coloured spans.

    Each change is rendered as old-name span, arrow span, new-name span
    using the CSS classes ``.diff-del``, ``.diff-arrow`` and ``.diff-ins``
    (the same classes ``format_name_diff_html`` emits); changes are joined
    with `` ; ``.  Fragment names are HTML-escaped.

    When the molecular diff signals a fallback, the HTML name diff of the
    best available names is returned, or the escaped fallback text if
    either name is missing.
    """
    diff = molecular_diff(sm_smiles, prod_smiles)

    if diff.fallback_used:
        if alignment_result:
            sm_label = alignment_result.best_sm_name or ""
            prod_label = alignment_result.best_prod_name or ""
        else:
            sm_label = _quick_name(sm_smiles)
            prod_label = _quick_name(prod_smiles)
        if sm_label and prod_label:
            return format_name_diff_html(sm_label, prod_label)
        return html_mod.escape(diff.fallback_text or "(no diff available)")

    if diff.stereo_only:
        return '<span class="diff-ins">(stereo change)</span>'
    if not diff.changes:
        return "(identical)"

    def _spans(old_html: str, new_html: str) -> str:
        """Wrap already-escaped old/new text in the three diff spans."""
        return (f'<span class="diff-del">{old_html}</span>'
                f'<span class="diff-arrow">\u2192</span>'
                f'<span class="diff-ins">{new_html}</span>')

    rendered = []
    for change in diff.changes:
        kind = change.change_type
        if kind == "replace":
            rendered.append(_spans(html_mod.escape(change.sm_name),
                                   html_mod.escape(change.prod_name)))
        elif kind == "removal":
            rendered.append(_spans(html_mod.escape(change.sm_name), "H"))
        elif kind == "addition":
            rendered.append(_spans("H", html_mod.escape(change.prod_name)))

    if not rendered:
        return "(identical)"
    return " ; ".join(rendered)
2330
+
2331
+
2332
+ def _quick_name(smiles: str) -> str:
2333
+ """Get IUPAC name for a SMILES without full decomposition."""
2334
+ try:
2335
+ from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
2336
+ cs = ChemScriptBridge.get_instance()
2337
+ return cs.get_name(smiles)
2338
+ except Exception:
2339
+ return ""
2340
+
2341
+
2342
+ # Showcase runner and CLI entry points live in chem-pipeline/aligned_namer.py