cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2342 @@
|
|
|
1
|
+
"""
|
|
2
|
+
aligned_namer.py — Aligned IUPAC Name Generation
|
|
3
|
+
|
|
4
|
+
Pairwise alignment (SM→product pairs) and multi-step sequence alignment
|
|
5
|
+
for synthetic routes.
|
|
6
|
+
|
|
7
|
+
Uses name_decomposer to exhaustively generate alternative names for each
|
|
8
|
+
molecule, then picks names that share the same naming parent, making the
|
|
9
|
+
transformation obvious from the names alone.
|
|
10
|
+
|
|
11
|
+
Multi-step sequences use parent-aware dynamic programming (Viterbi) to
|
|
12
|
+
minimise parent-ring switches first, then chemistry-aware token diff as
|
|
13
|
+
tiebreaker.
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
python aligned_namer.py --sm "BrC1=CC=CC=C1" --product "C1=CC=C(C2=CC=NC=C2)C=C1"
|
|
17
|
+
python aligned_namer.py --showcase # run on all showcase reactions
|
|
18
|
+
python aligned_namer.py --showcase --report alignment_report.txt
|
|
19
|
+
"""
|
|
20
|
+
import argparse
|
|
21
|
+
import difflib
|
|
22
|
+
import html as html_mod
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
import os
|
|
26
|
+
import glob
|
|
27
|
+
from collections import Counter, defaultdict
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from typing import Dict, List, Tuple, Optional
|
|
30
|
+
|
|
31
|
+
from rdkit import Chem, RDLogger
|
|
32
|
+
from rdkit.Chem import rdFMCS
|
|
33
|
+
RDLogger.logger().setLevel(RDLogger.ERROR)
|
|
34
|
+
|
|
35
|
+
from cdxml_toolkit.naming.name_decomposer import (
|
|
36
|
+
decompose_name, DecompositionResult, name_fragment_as_substituent,
|
|
37
|
+
_validate_name, _canonical, _name_to_smiles,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
try:
|
|
41
|
+
from rdkit.Chem.inchi import MolToInchi
|
|
42
|
+
except ImportError:
|
|
43
|
+
MolToInchi = None # type: ignore[assignment]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _validate_variant(name: str, expected_canonical: str) -> bool:
    """Check that *name* denotes the molecule given by *expected_canonical*.

    ``_validate_name`` is tried first.  When it rejects the name and InChI
    support was importable, the name is converted to SMILES and the InChI
    strings of both molecules are compared instead, which tolerates
    tautomers (e.g. the NH position in a quinazolinone).

    Returns False when the name cannot be resolved, when either SMILES
    fails to parse, or when anything raises during the InChI comparison.
    """
    if _validate_name(name, expected_canonical):
        return True

    # No InChI support means there is no tautomer-tolerant fallback.
    if MolToInchi is None:
        return False

    variant_smiles = _name_to_smiles(name)
    if variant_smiles is None:
        return False

    try:
        candidates = (
            Chem.MolFromSmiles(variant_smiles),
            Chem.MolFromSmiles(expected_canonical),
        )
        if any(mol is None for mol in candidates):
            return False
        variant_inchi, expected_inchi = (MolToInchi(mol) for mol in candidates)
        return variant_inchi == expected_inchi
    except Exception:
        # Deliberate best effort: a variant that cannot be checked is
        # treated as not validated.
        return False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# Levenshtein distance
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
def _levenshtein(s1: str, s2: str) -> int:
    """Return the Levenshtein edit distance between *s1* and *s2*.

    Two-row dynamic programme: only the previous and the current row of the
    edit matrix are kept.  The shorter string is placed on the inner axis so
    the rows have ``len(shorter) + 1`` entries.
    """
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
    if not shorter:
        return len(longer)

    previous_row = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        current_row = [row_no]
        for col, ch_short in enumerate(shorter):
            substitution = previous_row[col] + (ch_long != ch_short)
            deletion = previous_row[col + 1] + 1
            insertion = current_row[col] + 1
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[-1]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def name_similarity(name1: str, name2: str) -> float:
    """Return the case-insensitive similarity of two names, in [0, 1].

    Computed as ``1 - distance / max_len``, where *distance* is the
    Levenshtein distance between the lower-cased names and *max_len* is the
    length of the longer lower-cased name.  1.0 means identical (ignoring
    case); 0.0 is returned when either name is empty.
    """
    if not name1 or not name2:
        return 0.0

    lowered1 = name1.lower()
    lowered2 = name2.lower()
    # Normalise by the lengths of the strings that were actually compared.
    # Lower-casing can change a string's length (e.g. "İ" becomes two code
    # points), and dividing by the original lengths could then push the
    # result below 0.
    max_len = max(len(lowered1), len(lowered2))
    if max_len <= 0:
        return 0.0
    return 1.0 - (_levenshtein(lowered1, lowered2) / max_len)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# Chemistry-aware tokeniser
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
# Ring system names (ordered longest-first for greedy matching)
|
|
111
|
+
_RING_SYSTEMS = [
    "quinoline", "isoquinoline", "quinoxaline", "quinazoline",
    "pyridine", "pyrimidine", "pyrazine", "pyridazine",
    "benzene", "naphthalene", "anthracene",
    "indole", "benzimidazole", "benzothiazole", "benzofuran", "benzoxazole",
    "thiophene", "furan", "pyrrole", "imidazole", "oxazole",
    "thiazole", "triazine", "tetrazole", "triazole", "oxadiazole",
    "morpholine", "piperidine", "piperazine", "pyrrolidine",
    "carbazole", "acridine", "phenanthroline",
    "thienopyrimidine", "isoindoline", "isoindole",
    "carbamate", "benzamide", "acetamide",
    # Additional heterocycles common in drug synthesis
    "pyrazole", "isoxazole", "isothiazole",
    "oxazolidine", "oxazolidinone", "thiazolidine",
    "tetrahydronaphthalene", "dihydronaphthalene",
    "phthalazine", "cinnoline",
    "purine", "xanthine",
    "azetidine", "aziridine", "oxetane", "thietane",
    "diazepine", "oxazepine",
    # Retained names, listed so the tokeniser can split e.g.
    # "dimethylaniline" into "dimethyl" + "aniline"
    "aniline", "phenol", "benzenol",
    "anisole", "benzaldehyde", "acetophenone", "styrene",
]
# Longest first, so the tokeniser's suffix scan is greedy.  The sort is
# stable: names of equal length keep the order they are listed in above.
_RING_SYSTEMS.sort(key=len, reverse=True)

_SUBSTITUENT_PREFIXES = [
    "amino", "bromo", "chloro", "fluoro", "iodo", "nitro",
    "methyl", "ethyl", "propyl", "butyl", "phenyl", "benzyl",
    "methoxy", "ethoxy", "hydroxy", "oxo", "formyl",
    "methoxycarbonyl", "ethoxycarbonyl", "carbamoyl", "carboxy", "cyano",
    "morpholino", "morpholin", "piperidin", "pyrrolidin", "piperazin",
    "benzamido", "acetamido", "acetyl",
    "tert", "sec", "iso", "cyclo",
]
# Same ordering rule as the ring list: longest first, ties in listed order.
_SUBSTITUENT_PREFIXES.sort(key=len, reverse=True)

# Whole-token vocabularies (matched by exact equality in _classify_token).
_MULTIPLIERS = set("di tri tetra penta hexa bis tris".split())
_STEREO = set("r s e z cis trans rac dl meso".split())


_LINKERS = frozenset(
    "yl oxy oyl amido amino thio sulfonyl amine ol one thiol".split()
)
# Functional-group endings that _classify_token splits off a longer token.
_FG_SUFFIXES = tuple("amine amide thiol aldehyde nitrile".split())
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _classify_token(tok: str, out: list) -> None:
    """Break one raw name fragment into classified pieces, appended to *out*.

    Each piece is appended as a ``(token, category)`` tuple.  The checks run
    in a fixed order and the first that applies wins:

    1. a pure locant ("2", "1,3");
    2. a ring name at the end of the fragment (whatever precedes it is
       classified recursively first);
    3. an exact multiplier, stereo descriptor or linker;
    4. an exact substituent prefix;
    5. a substituent, then a functional-group ending, at the end of a
       longer fragment (the head is classified recursively first);
    6. otherwise the fragment is emitted unchanged as 'other'.

    An empty *tok* appends nothing.
    """
    if not tok:
        return

    if re.match(r'^\d+(?:,\d+)*$', tok):
        out.append((tok, 'locant'))
        return

    # _RING_SYSTEMS is ordered longest first, so the first hit is the
    # longest ring name the fragment ends with.
    for ring in _RING_SYSTEMS:
        if tok.endswith(ring):
            head = tok[:len(tok) - len(ring)]
            if head:
                _classify_token(head, out)
            out.append((ring, 'ring'))
            return

    exact_vocabularies = (
        (_MULTIPLIERS, 'multiplier'),
        (_STEREO, 'stereo'),
        (_LINKERS, 'linker'),
    )
    for vocabulary, category in exact_vocabularies:
        if tok in vocabulary:
            out.append((tok, category))
            return

    if tok in _SUBSTITUENT_PREFIXES:
        out.append((tok, 'substituent'))
        return

    # Trailing pieces: every substituent is tried before any functional
    # group ending, e.g. "dimethylphenyl" -> "dimethyl" + "phenyl" and
    # "phenylamine" -> "phenyl" + "amine".
    trailing_vocabularies = (
        (_SUBSTITUENT_PREFIXES, 'substituent'),
        (_FG_SUFFIXES, 'linker'),
    )
    for endings, category in trailing_vocabularies:
        for ending in endings:
            if len(tok) > len(ending) and tok.endswith(ending):
                _classify_token(tok[:-len(ending)], out)
                out.append((ending, category))
                return

    out.append((tok, 'other'))
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _tokenize_name_chem(name: str) -> List[Tuple[str, str]]:
    """Tokenise an IUPAC-style name into ``(token, category)`` tuples.

    The name is lower-cased and stripped, then cut into locant runs
    ("1,3"), letter runs and single non-space characters.  Brackets,
    hyphens and commas are discarded; every other piece is handed to
    ``_classify_token``, which may split it further.  The category is one
    of: locant, ring, substituent, multiplier, stereo, linker, other.
    """
    classified: List[Tuple[str, str]] = []
    discarded = ('(', ')', '-', ',', '[', ']', ' ')
    pieces = re.findall(r'\d+(?:,\d+)*|[a-z]+|\S', name.lower().strip())
    for piece in pieces:
        if piece and piece not in discarded:
            _classify_token(piece, classified)
    return classified
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _chem_tokens_flat(name: str) -> List[str]:
    """Return only the token strings of ``_tokenize_name_chem(name)``, in order."""
    flat: List[str] = []
    for token, _category in _tokenize_name_chem(name):
        flat.append(token)
    return flat
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def chem_token_diff_count(a: str, b: str) -> float:
    """Return a token-level difference cost between two names.

    Both names are tokenised with the chemistry-aware tokeniser and compared
    as multisets:

    * tokens present in both names cancel at no cost;
    * leftover tokens that form a pair from the soft-equivalence table
      (e.g. "phenyl"/"benzene") cost ``_SOFT_EQUIV_COST`` per pair;
    * every token still unmatched after that costs 1.0.
    """
    tokens_a = Counter(_chem_tokens_flat(a))
    tokens_b = Counter(_chem_tokens_flat(b))

    # Counter subtraction keeps only strictly positive counts, i.e. exactly
    # the tokens left over once the exact matches have cancelled.
    ra = tokens_a - tokens_b
    rb = tokens_b - tokens_a

    cost = 0.0
    for tok_a, tok_b, per_a, per_b in _soft_equivalence_pairs(ra, rb):
        pairs = min(ra.get(tok_a, 0) // per_a, rb.get(tok_b, 0) // per_b)
        if pairs <= 0:
            continue
        ra[tok_a] = ra.get(tok_a, 0) - pairs * per_a
        rb[tok_b] = rb.get(tok_b, 0) - pairs * per_b
        # Drop exhausted tokens so later table rows cannot reuse them.
        if ra[tok_a] <= 0:
            ra.pop(tok_a, None)
        if rb[tok_b] <= 0:
            rb.pop(tok_b, None)
        cost += pairs * _SOFT_EQUIV_COST

    # Whatever is left on either side is a hard mismatch.
    cost += sum(ra.values())
    cost += sum(rb.values())
    return cost
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# Soft equivalence: related tokens that should have reduced mismatch cost.
|
|
284
|
+
# Each entry: (token_a, token_b, count_a, count_b)
|
|
285
|
+
# Meaning: 1 of token_a ≈ 1 of token_b (consuming count_a and count_b respectively)
|
|
286
|
+
# Rows are (token_a, token_b, count_a, count_b): count_a copies of token_a
# are treated as equivalent to count_b copies of token_b.  Every row is
# currently one-for-one.  Only pairs where each side is a single token
# after _chem_tokens_flat belong here; multi-token targets such as
# "phenylamine" or "methoxybenzene" are handled by
# _retained_systematic_variants instead.
_SOFT_EQUIV_TABLE = [
    (first, second, 1, 1)
    for first, second in (
        # Ring / substituent forms of the same moiety
        ("benzene", "phenyl"),
        ("naphthalene", "naphthyl"),
        # Retained -> systematic
        ("phenol", "benzenol"),
        # Functional group: suffix form vs. prefix form
        ("amine", "amino"),
        ("ol", "hydroxy"),
        ("one", "oxo"),
        ("thiol", "sulfanyl"),
        # Ester naming
        ("carboxylate", "carboxylic"),
    )
]
# Cost charged per soft-equivalent pair (a hard mismatch costs 1.0 per token).
_SOFT_EQUIV_COST = 0.3
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _soft_equivalence_pairs(ra: dict, rb: dict):
    """Yield the soft-equivalence rows that apply to two residual counts.

    For each row of ``_SOFT_EQUIV_TABLE`` the listed orientation is tried
    first (token_a in *ra*, token_b in *rb*); if there are not enough of
    those tokens, the mirrored orientation is tried.  Each yielded tuple is
    ``(token in ra, token in rb, count for ra, count for rb)``.

    The counts are read lazily, one row at a time, so changes the caller
    makes to *ra* / *rb* between iterations are seen by later rows.
    """
    for left, right, n_left, n_right in _SOFT_EQUIV_TABLE:
        listed_way = ra.get(left, 0) >= n_left and rb.get(right, 0) >= n_right
        if listed_way:
            yield left, right, n_left, n_right
            continue
        if ra.get(right, 0) >= n_right and rb.get(left, 0) >= n_left:
            yield right, left, n_right, n_left
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ---------------------------------------------------------------------------
|
|
319
|
+
# Parent ring extraction
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
# Known ring systems for parent classification.
|
|
323
|
+
# Fused names like "thieno[2,3-d]pyrimidine" are intentionally omitted —
|
|
324
|
+
# they contain "pyrimidine" as substring, so the simpler ring name matches.
|
|
325
|
+
# This avoids false switches when the decomposer reports different levels
|
|
326
|
+
# of specificity for the same scaffold.
|
|
327
|
+
_KNOWN_RINGS = {
    # 6-membered N-heterocycles
    "pyridine", "pyrimidine", "pyrazine", "pyridazine", "triazine",
    "pyran", "thiopyran",
    # 5-membered heterocycles
    "thiophene", "furan", "pyrrole", "imidazole", "oxazole",
    "thiazole", "tetrazole", "pyrazole", "isoxazole", "isothiazole",
    "triazole", "oxadiazole", "thiadiazole",
    "selenophene",
    # Saturated 5-membered
    "pyrrolidine", "oxazolidine", "thiazolidine", "dioxolane",
    # Saturated 6-membered
    "piperidine", "piperazine", "morpholine",
    "dioxane", "dithiane",
    # 3- and 4-membered
    "oxirane", "aziridine", "thiirane",
    "azetidine", "oxetane", "thietane",
    # 7-membered
    "diazepine", "oxazepine", "azepane", "azepine", "oxepane",
    # Benzo-fused N-heterocycles
    "quinoline", "isoquinoline", "quinoxaline", "quinazoline",
    "phthalazine", "cinnoline",
    "indole", "isoindole", "indazole", "indoline", "isoindoline",
    "benzimidazole", "benzotriazole",
    # Benzo-fused O/S heterocycles
    "benzofuran", "benzothiophene", "benzoxazole",
    "benzothiazole", "benzisoxazole", "benzisothiazole",
    "chromene", "chromone", "coumarin", "chroman",
    "benzodioxole", "benzodioxane",
    # Larger fused heterocycles
    "carbazole", "acridine", "phenanthroline",
    "phenothiazine", "phenoxazine", "phenazine", "phenanthridine",
    "purine", "xanthine", "xanthene", "pteridine",
    "naphthyridine", "benzodiazepine",
    # Fused N-rich (common in kinase inhibitors)
    "pyrrolopyrimidine", "pyrazolopyrimidine", "imidazopyridine",
    "pyrrolizine", "indolizine",
    "thienopyridine", "thienopyrimidine",
    # Drug-relevant lactams/imides
    "hydantoin",
    # Saturated fused / partial
    "tetrahydroisoquinoline", "tetrahydroquinoline",
    # Carbocycles — simple
    "benzene", "toluene", "naphthalene", "anthracene",
    "cyclopropane", "cyclobutane",
    "cyclopentane", "cyclopentene", "cyclopentadiene",
    "cyclohexane", "cyclohexene", "cyclohexadiene",
    "cycloheptane", "cyclooctane",
    # Carbocycles — polycyclic
    "indene", "indane", "fluorene", "phenanthrene", "azulene",
    "decalin", "tetralin",
    "adamantane", "norbornane",
    "biphenyl",
}


# Retained IUPAC names that map to a base ring system.
# These are trivially-substituted rings whose retained name doesn't
# contain the base ring string as a substring.
_RETAINED_TO_BASE = {
    # Benzene retained names
    "aniline": "benzene", "phenol": "benzene", "anisole": "benzene",
    "acetophenone": "benzene", "benzaldehyde": "benzene",
    "benzoic acid": "benzene", "styrene": "benzene",
    "catechol": "benzene", "resorcinol": "benzene",
    "hydroquinone": "benzene", "cresol": "benzene",
    "xylene": "benzene", "toluene": "benzene",
    "cumene": "benzene", "mesitylene": "benzene",
    # Naphthalene retained names
    "naphthol": "naphthalene",
    # Saturated/partial naphthalene
    "tetralin": "naphthalene", "decalin": "naphthalene",
    # Indene/indane family
    "indane": "indene",
}

# Elided stems: "quinazoline" -> "quinazolin", etc.  IUPAC drops the
# terminal 'e' before vowel-starting suffixes (-ol, -one, -amine).  Every
# ring also maps to itself.  The rings are visited in sorted order so the
# dict's insertion order is the same on every run (iterating the set
# directly would follow string hashing, which is randomised per process).
_KNOWN_RING_STEMS = {}
for _ring in sorted(_KNOWN_RINGS):
    if _ring.endswith('e'):
        _KNOWN_RING_STEMS[_ring[:-1]] = _ring
    _KNOWN_RING_STEMS[_ring] = _ring

# Pre-sorted versions for hot-path functions (extract_parent_ring, etc.),
# so nothing is re-sorted inside the DP inner loop.  Longest first; names
# of equal length are ordered alphabetically.  The explicit tie-break
# matters: sorting a set by length alone leaves equal-length names in hash
# order, so the first substring hit for a parent containing two
# equal-length rings (or stems, e.g. "morpholin" / "piperidin") could
# differ from one interpreter run to the next.
_KNOWN_RINGS_BY_LEN = sorted(_KNOWN_RINGS, key=lambda r: (-len(r), r))
_KNOWN_RING_STEMS_BY_LEN = sorted(_KNOWN_RING_STEMS, key=lambda s: (-len(s), s))
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _strip_locants(s: str) -> str:
    """Remove IUPAC locant insertions so ring substrings become contiguous.

    A locant insertion is a run of digits, commas, parentheses and the
    indicated-hydrogen letter between two hyphens, for example "-1-",
    "-1,3-" or "-4(3H)-".  The hydrogen letter is accepted in either case
    because callers may pass an already lower-cased name.

    E.g. "cyclohex-1-ene-1-carboxylate" → "cyclohexenecarboxylate"
         "quinazolin-4(3h)-one"         → "quinazolinone"
    """
    return re.sub(r'-[\d,()Hh]+-', '', s)
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def _match_known_ring(text: str) -> Optional[str]:
    """Return the base ring for the first known ring or stem found in *text*.

    Full ring names are tried before elided stems (e.g. "quinazolin"), each
    list longest first.  A hit is passed through ``_RETAINED_TO_BASE`` so
    that e.g. "toluene" reports "benzene".  Returns None when nothing
    matches.
    """
    for ring in _KNOWN_RINGS_BY_LEN:
        if ring in text:
            return _RETAINED_TO_BASE.get(ring, ring)
    for stem in _KNOWN_RING_STEMS_BY_LEN:
        if stem in text:
            ring = _KNOWN_RING_STEMS[stem]
            return _RETAINED_TO_BASE.get(ring, ring)
    return None


def extract_parent_ring(parent: str) -> str:
    """Extract core ring system from a parent name string.

    Checks (in order), on the lower-cased, stripped name:
      1. Known ring names as substrings, longest first
      2. Elided stems (e.g. "quinazolin" for "quinazoline")
      3. Steps 1-2 again on the locant-stripped name
         (handles "cyclohex-1-ene" → "cyclohexene")
      4. Retained name → base ring mapping
      5. "phenyl" in name → benzene (chain compounds with phenyl substituent)
      6. Suffix patterns ("...phenone" / "...phenol" → benzene)
      7. Names starting with "benz" → benzene
      8. Fallback: the lower-cased, stripped parent string itself
    """
    p = parent.lower().strip()

    # 1 + 2. Direct ring / elided-stem match.
    hit = _match_known_ring(p)
    if hit is not None:
        return hit

    # 3. Locant-stripped matching.  Locants are stripped from the
    # original-case text and the result lower-cased afterwards:
    # _strip_locants recognises the indicated-hydrogen "H" in locants such
    # as "-1(2H)-", which would be missed if the name were lower-cased
    # first.
    p_stripped = _strip_locants(parent.strip()).lower()
    if p_stripped != p:
        hit = _match_known_ring(p_stripped)
        if hit is not None:
            return hit

    # 4. Retained names (dict order decides when several are present).
    for retained, base in _RETAINED_TO_BASE.items():
        if retained in p:
            return base

    # 5. Chain compounds carrying a phenyl group, e.g. "1-phenylpropan-1-ol".
    if "phenyl" in p:
        return "benzene"

    # 6. Aryl ketone / phenol style endings.
    if p.endswith(("phenone", "phenol")):
        return "benzene"

    # 7. Remaining "benz..." names.
    if p.startswith("benz"):
        return "benzene"

    # 8. Nothing recognised: the normalised name stands for itself.
    return p
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# ---------------------------------------------------------------------------
|
|
475
|
+
# Post-hoc alignment variant generator
|
|
476
|
+
# ---------------------------------------------------------------------------
|
|
477
|
+
# Expands the candidate name set with IUPAC-equivalent alternatives that
|
|
478
|
+
# the decomposer may not have produced, specifically:
|
|
479
|
+
# 1. Ester naming: "alkyl X-ate" ↔ "X-ic acid alkyl ester"
|
|
480
|
+
# 2. Retained→systematic: "aniline" → "phenylamine", etc.
|
|
481
|
+
# 3. Indicated-H lactam suffix→prefix: "-4(3H)-one" → "4-oxo-"
|
|
482
|
+
|
|
483
|
+
# Alkyl → alkoxy mapping for ester suffix→prefix conversion.
# "methyl X-carboxylate" → "(methoxycarbonyl)X"
_ALKYL_TO_ALKOXY = dict([
    ("methyl", "methoxy"),
    ("ethyl", "ethoxy"),
    ("propyl", "propoxy"),
    ("isopropyl", "isopropoxy"),
    ("butyl", "butoxy"),
    ("tert-butyl", "tert-butoxy"),
    ("isobutyl", "isobutoxy"),
    ("sec-butyl", "sec-butoxy"),
    ("benzyl", "benzyloxy"),
    ("allyl", "allyloxy"),
    ("phenyl", "phenoxy"),
    ("vinyl", "vinyloxy"),
    ("neopentyl", "neopentyloxy"),
    ("cyclopentyl", "cyclopentyloxy"),
    ("cyclohexyl", "cyclohexyloxy"),
])
# Reverse lookup: alkoxy prefix → alkyl group.
_ALKOXY_TO_ALKYL = dict(
    (alkoxy, alkyl) for alkyl, alkoxy in _ALKYL_TO_ALKOXY.items()
)

# Alkyl groups recognised as the leading word of an ester name; these are
# exactly the alkyl groups that have an alkoxy form above.
_COMMON_ALKYL_ESTERS = set(_ALKYL_TO_ALKOXY)
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def _ester_variants(name: str, parent: str) -> List[Tuple[str, str]]:
    """Return alternative spellings of an ester name as (name, parent) pairs.

    Two rewrites are attempted, and both may contribute:

    * "ethyl X-carboxylate"      → "X-carboxylic acid ethyl ester"
      (only when the first word is a common alkyl group; a parent ending
      in "ate" is converted to its "ic acid" form as well)
    * "X-ic acid alkyl ester"    → "alkyl X-ate"
      (the parent is passed through unchanged)

    The result is empty when neither pattern applies.
    """
    found: List[Tuple[str, str]] = []

    # Rewrite 1: "alkyl ...ate" → "...ic acid alkyl ester"
    words = name.split(None, 1)
    if len(words) == 2:
        alkyl, remainder = words
        if alkyl.lower() in _COMMON_ALKYL_ESTERS and remainder.endswith("ate"):
            acid_name = f"{remainder[:-3]}ic acid {alkyl} ester"
            if parent and parent.endswith("ate"):
                acid_parent = parent[:-3] + "ic acid"
            else:
                acid_parent = parent
            found.append((acid_name, acid_parent))

    # Rewrite 2: "...ic acid alkyl ester" → "alkyl ...ate"
    acid_match = re.match(r'^(.+ic acid)\s+(\S+)\s+ester$', name, re.IGNORECASE)
    if acid_match:
        acid_part, ester_alkyl = acid_match.group(1), acid_match.group(2)
        # Dropping the trailing "ic acid" (7 characters) leaves the stem
        # that takes the "ate" ending.
        found.append((ester_alkyl + " " + acid_part[:-7] + "ate", parent))

    return found
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
# Retained IUPAC name → systematic alternative(s).
|
|
538
|
+
# Includes both the base retained name and common derivatives.
|
|
539
|
+
# Insertion order is significant: _retained_systematic_variants emits its
# variants in the order the entries appear here.
_RETAINED_TO_SYSTEMATIC = dict([
    # Benzene derivatives
    ("aniline", "phenylamine"),
    ("phenol", "benzenol"),
    ("anisole", "methoxybenzene"),
    ("benzaldehyde", "benzenecarbaldehyde"),
    ("acetophenone", "1-phenylethanone"),
    ("styrene", "ethenylbenzene"),
    ("catechol", "benzene-1,2-diol"),
    ("resorcinol", "benzene-1,3-diol"),
    ("hydroquinone", "benzene-1,4-diol"),
    ("cresol", "methylphenol"),
    ("toluene", "methylbenzene"),
    ("xylene", "dimethylbenzene"),
    ("cumene", "isopropylbenzene"),
    # Naphthalene derivatives
    ("naphthol", "naphthalenol"),
    # Other retained names (pyridine- and benzene-based)
    ("nicotinamide", "pyridine-3-carboxamide"),
    ("nicotinic acid", "pyridine-3-carboxylic acid"),
    ("salicylaldehyde", "2-hydroxybenzaldehyde"),
    ("salicylic acid", "2-hydroxybenzoic acid"),
])
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def _retained_systematic_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Return retained ↔ systematic spellings of *name* as (name, parent) pairs.

    For every entry of ``_RETAINED_TO_SYSTEMATIC`` whose retained form
    occurs in the name, the first occurrence is replaced by the systematic
    form, e.g. "2,6-dimethylaniline" → "2,6-dimethylphenylamine".  After
    those, the reverse replacement is emitted for every entry whose
    systematic form occurs in the name while its retained form does not.

    The same replacement is applied to *parent* when it contains the
    replaced text.  A parent that does not contain it is returned
    unchanged, and an empty parent is replaced by the variant itself.
    Matching is case-insensitive; text outside the replaced span keeps its
    original case.
    """
    name_lower = name.lower()

    def swap(old: str, new: str) -> Tuple[str, str]:
        """Replace the first *old* by *new* in the name and, if present, the parent."""
        at = name_lower.index(old)
        variant = name[:at] + new + name[at + len(old):]
        if not parent:
            return variant, variant
        parent_lower = parent.lower()
        if old not in parent_lower:
            return variant, parent
        p_at = parent_lower.index(old)
        return variant, parent[:p_at] + new + parent[p_at + len(old):]

    entries = _RETAINED_TO_SYSTEMATIC.items()
    retained_to_systematic = [
        swap(retained, systematic)
        for retained, systematic in entries
        if retained in name_lower
    ]
    systematic_to_retained = [
        swap(systematic, retained)
        for retained, systematic in entries
        if systematic in name_lower and retained not in name_lower
    ]
    return retained_to_systematic + systematic_to_retained
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def _indicated_h_variants(name: str, parent: str) -> List[Tuple[str, str]]:
|
|
611
|
+
"""Generate suffix→prefix variants for indicated-H lactam names.
|
|
612
|
+
|
|
613
|
+
E.g. "6,7-dimethoxyquinazolin-4(3H)-one" → "4-oxo-6,7-dimethoxyquinazoline"
|
|
614
|
+
|
|
615
|
+
The decomposer's suffix→prefix sometimes fails for names with
|
|
616
|
+
indicated-hydrogen notation like (3H), (1H), etc.
|
|
617
|
+
"""
|
|
618
|
+
variants: List[Tuple[str, str]] = []
|
|
619
|
+
|
|
620
|
+
suffix_map = {
|
|
621
|
+
"one": "oxo",
|
|
622
|
+
"ol": "hydroxy",
|
|
623
|
+
"amine": "amino",
|
|
624
|
+
"thione": "thioxo",
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
name_lower = name.lower()
|
|
628
|
+
|
|
629
|
+
for suffix, prefix in suffix_map.items():
|
|
630
|
+
# Look for "STEM-LOCANT(IH)-SUFFIX" where STEM is a known ring stem
|
|
631
|
+
tail_pattern = r'-(\d+)\(\d+[hH]\)-' + re.escape(suffix) + r'$'
|
|
632
|
+
tail_m = re.search(tail_pattern, name_lower)
|
|
633
|
+
if not tail_m:
|
|
634
|
+
continue
|
|
635
|
+
|
|
636
|
+
locant = tail_m.group(1)
|
|
637
|
+
before_locant = name[:tail_m.start()] # everything before "-LOCANT(IH)-SUFFIX"
|
|
638
|
+
|
|
639
|
+
# Find the longest known ring stem at the END of before_locant
|
|
640
|
+
best_stem = None
|
|
641
|
+
best_ring = None
|
|
642
|
+
for stem, ring in _KNOWN_RING_STEMS.items():
|
|
643
|
+
if before_locant.lower().endswith(stem):
|
|
644
|
+
if best_stem is None or len(stem) > len(best_stem):
|
|
645
|
+
best_stem = stem
|
|
646
|
+
best_ring = ring
|
|
647
|
+
|
|
648
|
+
if best_stem:
|
|
649
|
+
# Split: leading substituents + ring
|
|
650
|
+
leading = before_locant[:len(before_locant) - len(best_stem)]
|
|
651
|
+
|
|
652
|
+
# Build variant: "LOCANT-PREFIX-LEADING-RING_FULL"
|
|
653
|
+
# E.g. "4-oxo-6,7-dimethoxyquinazoline"
|
|
654
|
+
if leading:
|
|
655
|
+
variant = locant + "-" + prefix + "-" + leading + best_ring
|
|
656
|
+
else:
|
|
657
|
+
variant = locant + "-" + prefix + best_ring
|
|
658
|
+
variant = re.sub(r'-{2,}', '-', variant)
|
|
659
|
+
variant = variant.strip('-')
|
|
660
|
+
|
|
661
|
+
new_parent = variant
|
|
662
|
+
variants.append((variant, new_parent))
|
|
663
|
+
|
|
664
|
+
return variants
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def _general_suffix_prefix_variants(name: str, parent: str) -> List[Tuple[str, str]]:
|
|
668
|
+
"""Generate suffix→prefix variants for standard IUPAC names.
|
|
669
|
+
|
|
670
|
+
Handles names WITHOUT indicated-H, e.g.:
|
|
671
|
+
"pyridin-2-amine" → "2-aminopyridine"
|
|
672
|
+
"naphthalen-1-ol" → "1-hydroxynapthalene"
|
|
673
|
+
|
|
674
|
+
Complements _indicated_h_variants which handles (NH) notation.
|
|
675
|
+
"""
|
|
676
|
+
variants: List[Tuple[str, str]] = []
|
|
677
|
+
name_lower = name.lower()
|
|
678
|
+
|
|
679
|
+
suffix_map = {
|
|
680
|
+
"amine": "amino",
|
|
681
|
+
"ol": "hydroxy",
|
|
682
|
+
"one": "oxo",
|
|
683
|
+
"thiol": "sulfanyl",
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
for suffix, prefix in suffix_map.items():
|
|
687
|
+
# Pattern: "ring-LOCANT-SUFFIX" at the end
|
|
688
|
+
# E.g. "pyridin-2-amine", "naphthalen-1-ol"
|
|
689
|
+
# Must NOT have indicated-H (handled by _indicated_h_variants)
|
|
690
|
+
pattern = r'-(\d+(?:,\d+)*)-' + re.escape(suffix) + r'$'
|
|
691
|
+
m = re.search(pattern, name_lower)
|
|
692
|
+
if not m:
|
|
693
|
+
continue
|
|
694
|
+
|
|
695
|
+
# Skip if there's an indicated-H right before the suffix
|
|
696
|
+
if re.search(r'\(\d+[hH]\)-' + re.escape(suffix) + r'$', name_lower):
|
|
697
|
+
continue
|
|
698
|
+
|
|
699
|
+
locants = m.group(1)
|
|
700
|
+
before = name[:m.start()] # everything before "-LOCANT-SUFFIX"
|
|
701
|
+
|
|
702
|
+
# Find longest known ring stem at the end of 'before'
|
|
703
|
+
best_stem = None
|
|
704
|
+
best_ring = None
|
|
705
|
+
for stem, ring in _KNOWN_RING_STEMS.items():
|
|
706
|
+
if before.lower().endswith(stem):
|
|
707
|
+
if best_stem is None or len(stem) > len(best_stem):
|
|
708
|
+
best_stem = stem
|
|
709
|
+
best_ring = ring
|
|
710
|
+
|
|
711
|
+
if best_stem:
|
|
712
|
+
leading = before[:len(before) - len(best_stem)]
|
|
713
|
+
if leading:
|
|
714
|
+
# E.g. "6,7-dimethoxypyridin-2-amine" →
|
|
715
|
+
# "2-amino-6,7-dimethoxypyridine"
|
|
716
|
+
variant = locants + "-" + prefix + "-" + leading + best_ring
|
|
717
|
+
else:
|
|
718
|
+
# E.g. "pyridin-2-amine" → "2-aminopyridine"
|
|
719
|
+
variant = locants + "-" + prefix + best_ring
|
|
720
|
+
variant = re.sub(r'-{2,}', '-', variant)
|
|
721
|
+
variant = variant.strip('-')
|
|
722
|
+
variants.append((variant, variant))
|
|
723
|
+
|
|
724
|
+
return variants
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
# ---------------------------------------------------------------------------
|
|
728
|
+
# Extended suffix→prefix for principal characteristic groups
|
|
729
|
+
# ---------------------------------------------------------------------------
|
|
730
|
+
# Suffixes that extend beyond the parent ring stem (carboxylate,
|
|
731
|
+
# carbaldehyde, carboxamide, carbonitrile, carboxylic acid).
|
|
732
|
+
# These follow the same ring_stem-LOCANT-SUFFIX pattern as the simple
|
|
733
|
+
# suffixes handled by _general_suffix_prefix_variants, but the suffix
|
|
734
|
+
# words themselves are longer and sometimes require an alkyl ester prefix.
|
|
735
|
+
|
|
736
|
+
# (suffix, prefix_or_None, needs_alkyl_prefix, wrap_in_parens)
|
|
737
|
+
# When needs_alkyl_prefix is True, the prefix is built from the
|
|
738
|
+
# space-separated first word of the name (e.g. "methyl" → "methoxycarbonyl").
|
|
739
|
+
# Each row is (suffix, prefix_or_None, needs_alkyl_prefix, wrap_in_parens):
#   suffix             – text expected after "-LOCANT-" at the end of a name
#   prefix_or_None     – substituent-prefix equivalent; None when it has to
#                        be derived from the leading alkyl word of the name
#   needs_alkyl_prefix – True when the name must start with an alkyl word
#   wrap_in_parens     – True when the prefix is emitted as "(prefix)"
_EXTENDED_SUFFIX_TABLE: List[Tuple[str, Optional[str], bool, bool]] = [
    ("carboxylic acid", "carboxy", False, False),
    ("carboxamide", "carbamoyl", False, True),
    ("carbaldehyde", "formyl", False, False),
    ("carbonitrile", "cyano", False, False),
    ("carboxylate", None, True, True),  # prefix from alkyl
]
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def _extended_suffix_prefix_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Generate suffix→prefix variants for principal characteristic groups.

    Handles suffixes that extend beyond the parent ring stem.  The
    converted group is always placed first, ahead of any substituents
    that already preceded the ring (assuming the ring stem is listed in
    ``_KNOWN_RING_STEMS``):
        "methyl 2-(methylthio)thieno[2,3-d]pyrimidine-4-carboxylate"
          → "4-(methoxycarbonyl)-2-(methylthio)thieno[2,3-d]pyrimidine"
        "2-(methylthio)thieno[2,3-d]pyrimidine-4-carbaldehyde"
          → "4-formyl-2-(methylthio)thieno[2,3-d]pyrimidine"

    Algorithm mirrors _general_suffix_prefix_variants: find the suffix at
    the end of the name, locate the ring stem before it, and reconstruct
    with the prefix form prepended.

    Args:
        name: Name ending in ``-LOCANTS-SUFFIX`` for a suffix listed in
            ``_EXTENDED_SUFFIX_TABLE``.  Esters must start with an alkyl
            word that is a key of ``_ALKYL_TO_ALKOXY``.
        parent: Unused; the returned parent is the matched ring name.

    Returns:
        List of ``(variant_name, ring_name)`` tuples, possibly empty.
    """
    variants: List[Tuple[str, str]] = []

    for suffix, prefix, needs_alkyl, wrap in _EXTENDED_SUFFIX_TABLE:
        # Work on a potentially trimmed name (alkyl stripped for esters)
        work_name = name

        if needs_alkyl:
            parts = name.split(None, 1)
            if len(parts) != 2:
                continue
            alkyl_key = parts[0].lower()
            if alkyl_key not in _ALKYL_TO_ALKOXY:
                continue
            # "methyl ... carboxylate" → "(methoxycarbonyl)"; the compound
            # prefix is always parenthesised.
            prefix = f"{_ALKYL_TO_ALKOXY[alkyl_key]}carbonyl"
            wrap = True
            work_name = parts[1]

        if prefix is None:
            continue

        # Match "-LOCANT-SUFFIX" at the end.  re.escape copes with the
        # space in multi-word suffixes such as "carboxylic acid".
        pattern = r'-(\d+(?:,\d+)*)-' + re.escape(suffix) + r'$'
        m = re.search(pattern, work_name.lower())
        if not m:
            continue

        locants = m.group(1)
        before = work_name[:m.start()]  # everything before "-LOCANT-SUFFIX"

        # Find longest known ring stem at the end of 'before'
        before_lower = before.lower()
        best_stem = None
        best_ring = None
        for stem, ring in _KNOWN_RING_STEMS.items():
            if before_lower.endswith(stem):
                if best_stem is None or len(stem) > len(best_stem):
                    best_stem = stem
                    best_ring = ring

        if best_stem is None:
            continue

        leading = before[:len(before) - len(best_stem)]
        pref = f"({prefix})" if wrap else prefix

        # Reconstruct: LOCANT-PREFIX-LEADING-RING
        if leading:
            variant = locants + "-" + pref + "-" + leading + best_ring
        else:
            variant = locants + "-" + pref + best_ring
        variant = re.sub(r'-{2,}', '-', variant)
        variant = variant.strip('-')

        variants.append((variant, best_ring))

    return variants
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
def _join_name_fragments(before: str, after: str) -> str:
    """Join the two halves of a name after a prefix group was cut out.

    Hyphens at the seam are dropped; one is re-inserted only when the
    right half starts with a locant (a digit, or a capital-letter locant
    such as ``N-``), which is where a name needs a separating hyphen.
    """
    left = before.rstrip('-')
    right = after.lstrip('-')
    if left and right and re.match(r"\d|[A-Z]'*[-,]", right):
        return left + "-" + right
    return left + right


def _insert_suffix_after_ring(
    core: str, locants: str, suffix: str,
) -> Optional[str]:
    """Insert ``-LOCANTS-SUFFIX`` after the ring stem found in *core*.

    The longest key of ``_KNOWN_RING_STEMS`` occurring anywhere in *core*
    is used, at its last occurrence.  Returns ``None`` when no stem occurs.
    """
    core_lower = core.lower()
    best_stem = None
    best_ring = None
    for stem, ring in _KNOWN_RING_STEMS.items():
        if stem in core_lower:
            if best_stem is None or len(stem) > len(best_stem):
                best_stem = stem
                best_ring = ring

    if best_stem is None:
        return None

    stem_end = core_lower.rfind(best_stem) + len(best_stem)

    # Include trailing 'e' if present (elided stem → full ring)
    if (stem_end < len(core) and core[stem_end].lower() == 'e'
            and best_ring.endswith('e') and not best_stem.endswith('e')):
        stem_end += 1

    return core[:stem_end] + "-" + locants + "-" + suffix + core[stem_end:]


def _extended_prefix_to_suffix_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Generate prefix→suffix variants for principal characteristic groups.

    Reverse of _extended_suffix_prefix_variants:
        "2-(methylthio)-4-(methoxycarbonyl)thieno[2,3-d]pyrimidine"
          → "methyl 2-(methylthio)thieno[2,3-d]pyrimidine-4-carboxylate"
        "4-formyl-2-(methylthio)thieno[2,3-d]pyrimidine"
          → "2-(methylthio)thieno[2,3-d]pyrimidine-4-carbaldehyde"

    Detects known prefix patterns in the name's substituent chain and
    converts them to suffix form appended after the ring stem.

    Args:
        name: Name containing ``LOCANTS-PREFIX`` (carboxy, carbamoyl,
            formyl, cyano) or ``LOCANTS-(alkoxycarbonyl)``.
        parent: Passed through unchanged as the parent of every variant.

    Returns:
        List of ``(variant_name, parent)`` tuples, possibly empty.
    """
    variants: List[Tuple[str, str]] = []
    name_lower = name.lower()

    # (prefix_to_detect, suffix)
    simple_prefix_map = [
        ("carboxy", "carboxylic acid"),
        ("carbamoyl", "carboxamide"),
        ("formyl", "carbaldehyde"),
        ("cyano", "carbonitrile"),
    ]

    # --- Simple prefixes (no alkyl) ---
    for prefix, suffix in simple_prefix_map:
        # Match "LOCANT-(PREFIX)" first, then bare "LOCANT-PREFIX".
        m_pref = None
        for pfx_pat in (re.escape(f"({prefix})"), re.escape(prefix)):
            # The lookahead stops "carboxy" from matching the start of the
            # suffix words "carboxylic acid" / "carboxylate", which would
            # otherwise yield names like "...-4-carboxylic acid-lic acid".
            pat = (r'(\d+(?:,\d+)*)-' + pfx_pat
                   + r'(?!lic acid|late)' + r'[-]?')
            m_pref = re.search(pat, name_lower)
            if m_pref:
                break
        if m_pref is None:
            continue

        locants = m_pref.group(1)

        # Remove the matched prefix group from the name
        core = _join_name_fragments(
            name[:m_pref.start()], name[m_pref.end():])
        core = re.sub(r'-{2,}', '-', core)
        core = core.strip('-')

        variant = _insert_suffix_after_ring(core, locants, suffix)
        if variant is None:
            continue
        variant = re.sub(r'-{2,}', '-', variant)
        variant = variant.strip('-')
        variants.append((variant, parent))

    # --- Ester prefixes: (alkoxycarbonyl) → alkyl ... carboxylate ---
    for alkoxy, alkyl in _ALKOXY_TO_ALKYL.items():
        target = f"({alkoxy}carbonyl)"
        idx = name_lower.find(target.lower())
        if idx < 0:
            continue

        # Find locant directly before the prefix
        m_loc = re.search(r'(\d+(?:,\d+)*)-$', name[:idx])
        if not m_loc:
            continue

        locants = m_loc.group(1)

        # Remove the matched "LOCANT-(alkoxycarbonyl)" from the name
        core = _join_name_fragments(
            name[:m_loc.start()], name[idx + len(target):])
        core = re.sub(r'-{2,}', '-', core)
        core = core.strip('-')

        suffix_part = _insert_suffix_after_ring(core, locants, "carboxylate")
        if suffix_part is None:
            continue

        variant = alkyl + " " + suffix_part
        variant = re.sub(r'-{2,}', '-', variant)
        variants.append((variant, parent))

    return variants
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def _find_locant_group_starts(text: str) -> List[int]:
|
|
960
|
+
"""Find starting positions of top-level locant-prefix groups in *text*.
|
|
961
|
+
|
|
962
|
+
A locant group starts with ``\\d+(,\\d+)*-`` at bracket depth 0.
|
|
963
|
+
"""
|
|
964
|
+
starts: List[int] = []
|
|
965
|
+
i = 0
|
|
966
|
+
depth = 0
|
|
967
|
+
while i < len(text):
|
|
968
|
+
c = text[i]
|
|
969
|
+
if c in '([':
|
|
970
|
+
depth += 1
|
|
971
|
+
i += 1
|
|
972
|
+
elif c in ')]':
|
|
973
|
+
depth -= 1
|
|
974
|
+
i += 1
|
|
975
|
+
elif c.isdigit() and depth == 0:
|
|
976
|
+
j = i
|
|
977
|
+
while j < len(text) and (
|
|
978
|
+
text[j].isdigit() or text[j] == ','):
|
|
979
|
+
j += 1
|
|
980
|
+
if j < len(text) and text[j] == '-':
|
|
981
|
+
starts.append(i)
|
|
982
|
+
i = max(i + 1, j)
|
|
983
|
+
else:
|
|
984
|
+
i += 1
|
|
985
|
+
return starts
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
def _reorder_locant_prefixes(name: str, parent: str) -> Optional[str]:
    """Reorder top-level locant-prefix groups to ascending locant order.

    IUPAC convention requires substituent prefixes to appear in ascending
    locant order.  E.g.:

        "5-(chlorosulfonyl)-2-ethoxybenzoic acid"  (parent "benzoic acid")
        → "2-ethoxy-5-(chlorosulfonyl)benzoic acid"

    *parent* is needed to locate the boundary between the prefix section
    and the parent stem.  Returns the reordered name, or ``None`` if the
    name is already in ascending order or cannot be parsed.

    Groups are ordered by the first number of their locant only
    ("2,6-" sorts as 2); the sort is stable, so groups sharing a first
    locant keep their original relative order.
    """
    if not parent:
        return None

    name_lower = name.lower()
    parent_lower = parent.lower().strip()

    # --- Collect candidate parent-stem positions -------------------------
    # Multiple strategies are needed because the decomposer's parent may
    # include substituent prefixes (e.g. "2-ethoxybenzoic acid" instead
    # of "benzoic acid").  We try all strategies and pick the first
    # candidate that yields ≥ 2 non-ascending locant groups.
    #
    # Every strategy requires idx > 0: rfind returns -1 when the text is
    # absent, and position 0 would leave an empty prefix section.
    candidates: List[int] = []

    # Strategy 1: Literal parent match
    idx = name_lower.rfind(parent_lower)
    if idx > 0:
        candidates.append(idx)

    # Strategy 2: Elided form ("quinazoline" → "quinazolin")
    if parent_lower.endswith('e'):
        idx = name_lower.rfind(parent_lower[:-1])
        if idx > 0:
            candidates.append(idx)

    ring = extract_parent_ring(parent_lower)

    # Strategy 3: Known ring stems derived from the parent ring
    # NOTE(review): nesting reconstructed from a flattened listing; the
    # `break` is assumed to fire only once a stem was actually located.
    if ring and ring != parent_lower:
        for stem in _KNOWN_RING_STEMS_BY_LEN:
            if _KNOWN_RING_STEMS[stem].lower() == ring:
                idx = name_lower.rfind(stem)
                if idx > 0:
                    candidates.append(idx)
                    break

    # Strategy 4: Ring-name marker for retained acid names
    # E.g. "benzoic acid" → ring "benzene" → marker "benz" finds the
    # parent position even though "benzene"/"benzen" aren't in "benzoic".
    # Markers shrink from the full ring name down to three characters.
    if ring and len(ring) >= 3:
        for end in range(len(ring), 2, -1):
            marker = ring[:end]
            idx = name_lower.rfind(marker)
            if idx > 0:
                candidates.append(idx)
                break

    if not candidates:
        return None

    # --- Try each candidate, use the first that yields a reordering ------
    # Sort candidates ascending so smaller (= longer prefix section) first.
    for parent_pos in sorted(set(candidates)):
        prefix_section = name[:parent_pos]
        parent_section = name[parent_pos:]

        group_starts = _find_locant_group_starts(prefix_section)
        if len(group_starts) <= 1:
            continue  # try next candidate

        # Extract (first_locant_value, group_text) for each group
        groups: List[Tuple[int, str]] = []
        for idx_g, start in enumerate(group_starts):
            # A group runs up to the next group's start (or section end).
            end = (group_starts[idx_g + 1]
                   if idx_g + 1 < len(group_starts)
                   else len(prefix_section))
            group_text = prefix_section[start:end]
            m = re.match(r'(\d+)', group_text)
            first_loc = int(m.group(1)) if m else 0
            groups.append((first_loc, group_text))

        # Already in ascending order?
        locs = [g[0] for g in groups]
        if locs == sorted(locs):
            continue  # no reordering needed at this split point

        # Text ahead of the first locant group stays where it is.
        before = prefix_section[:group_starts[0]]
        sorted_groups = sorted(groups, key=lambda g: g[0])

        # Reassemble: strip trailing '-' from each part, join with '-'
        parts = [g[1].rstrip('-') for g in sorted_groups]
        new_prefix = '-'.join(parts)

        result = before + new_prefix + parent_section
        result = re.sub(r'-{2,}', '-', result)
        return result

    return None
|
|
1088
|
+
|
|
1089
|
+
|
|
1090
|
+
# ---------------------------------------------------------------------------
|
|
1091
|
+
# Retained name → substitutive prefix + ring decomposition
|
|
1092
|
+
# ---------------------------------------------------------------------------
|
|
1093
|
+
# Retained names like "aniline" are systematically "aminobenzene" (prefix
|
|
1094
|
+
# + ring). When the retained name appears as a parent with numbered
|
|
1095
|
+
# substituent prefixes (e.g. "4-fluoroaniline"), we can generate the
|
|
1096
|
+
# fully substitutive form "4-fluoro-1-aminobenzene" where the retained
|
|
1097
|
+
# name's defining substituent gets its own locant. This often yields a
|
|
1098
|
+
# closer text match in aligned sequences.
|
|
1099
|
+
|
|
1100
|
+
# (retained_name, substituent_prefix, ring_name, default_locant)
|
|
1101
|
+
# default_locant: position of the defining substituent in the standard
|
|
1102
|
+
# numbering. None means the retained name is used for multiple isomers
|
|
1103
|
+
# (e.g. naphthol can be 1- or 2-).
|
|
1104
|
+
_RETAINED_SUBSTITUTIVE = [
|
|
1105
|
+
("aniline", "amino", "benzene", "1"),
|
|
1106
|
+
("phenol", "hydroxy", "benzene", "1"),
|
|
1107
|
+
("anisole", "methoxy", "benzene", "1"),
|
|
1108
|
+
("thiophenol", "sulfanyl", "benzene", "1"),
|
|
1109
|
+
("naphthol", "hydroxy", "naphthalene", None),
|
|
1110
|
+
]
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
def _retained_to_substitutive_variants(
|
|
1114
|
+
name: str, parent: str,
|
|
1115
|
+
) -> List[Tuple[str, str]]:
|
|
1116
|
+
"""Generate fully substitutive variants from retained parent names.
|
|
1117
|
+
|
|
1118
|
+
"4-fluoroaniline" → "4-fluoro-1-aminobenzene" (parent: benzene)
|
|
1119
|
+
"2,6-dichlorophenol" → "2,6-dichloro-1-hydroxybenzene" (parent: benzene)
|
|
1120
|
+
|
|
1121
|
+
The locant reorder pass later normalises to ascending order:
|
|
1122
|
+
"4-fluoro-1-aminobenzene" → "1-amino-4-fluorobenzene"
|
|
1123
|
+
"""
|
|
1124
|
+
variants: List[Tuple[str, str]] = []
|
|
1125
|
+
name_lower = name.lower()
|
|
1126
|
+
|
|
1127
|
+
for retained, prefix, ring, locant in _RETAINED_SUBSTITUTIVE:
|
|
1128
|
+
if retained not in name_lower:
|
|
1129
|
+
continue
|
|
1130
|
+
if locant is None:
|
|
1131
|
+
continue # skip ambiguous retained names
|
|
1132
|
+
|
|
1133
|
+
idx = name_lower.index(retained)
|
|
1134
|
+
leading = name[:idx] # e.g. "4-fluoro" from "4-fluoroaniline"
|
|
1135
|
+
trailing = name[idx + len(retained):] # e.g. "" (usually empty)
|
|
1136
|
+
|
|
1137
|
+
# Build substitutive form: "leading-LOCANT-PREFIX-ring-trailing"
|
|
1138
|
+
if leading:
|
|
1139
|
+
# Ensure proper hyphenation
|
|
1140
|
+
lead = leading.rstrip('-')
|
|
1141
|
+
variant = f"{lead}-{locant}-{prefix}{ring}{trailing}"
|
|
1142
|
+
else:
|
|
1143
|
+
variant = f"{locant}-{prefix}{ring}{trailing}"
|
|
1144
|
+
|
|
1145
|
+
variant = re.sub(r'-{2,}', '-', variant)
|
|
1146
|
+
new_parent = ring
|
|
1147
|
+
variants.append((variant, new_parent))
|
|
1148
|
+
|
|
1149
|
+
return variants
|
|
1150
|
+
|
|
1151
|
+
|
|
1152
|
+
def _generate_alignment_variants(
    name: str, parent: str,
) -> List[Tuple[str, str]]:
    """Collect every context-free alternative spelling of *name*.

    The ascending-locant reordering (when it changes the name) comes
    first, followed by the output of each rewrite family in a fixed
    order.  Duplicates are not removed.

    Returns:
        List of ``(variant_name, variant_parent)`` tuples.
    """
    collected: List[Tuple[str, str]] = []

    # Locant reordering keeps the caller's parent.
    ascending = _reorder_locant_prefixes(name, parent)
    if ascending and ascending != name:
        collected.append((ascending, parent))

    generators = (
        _ester_variants,
        _extended_suffix_prefix_variants,
        _extended_prefix_to_suffix_variants,
        _retained_systematic_variants,
        _retained_to_substitutive_variants,
        _indicated_h_variants,
        _general_suffix_prefix_variants,
    )
    for make_variants in generators:
        collected.extend(make_variants(name, parent))

    return collected
|
|
1174
|
+
|
|
1175
|
+
|
|
1176
|
+
# ---------------------------------------------------------------------------
|
|
1177
|
+
# Two-pass contextual variant generation
|
|
1178
|
+
# ---------------------------------------------------------------------------
|
|
1179
|
+
|
|
1180
|
+
def _contextual_variants(
    name: str, parent: str, neighbor_name: str,
) -> List[Tuple[str, str]]:
    """Generate variants of *name* targeted to match *neighbor_name* better.

    Analyses the neighbor's naming style and generates matching variants.
    Each rewrite family is consulted at most once per call.

    Args:
        name: The name to produce alternatives for.
        parent: Parent name associated with *name*.
        neighbor_name: The adjacent name whose style should be matched.

    Returns:
        List of ``(variant_name, variant_parent)`` tuples, possibly empty.
    """
    variants: List[Tuple[str, str]] = []
    n_lower = neighbor_name.lower()
    name_lower = name.lower()

    # 1. If neighbor is written as an acid or as an ester, offer the
    #    acid/ester rewrites of our name.  "acid" also covers
    #    "carboxylic acid"; a single call avoids duplicate entries when
    #    several of the keywords occur together.
    if any(word in n_lower for word in ("acid", "carboxylate", "ester")):
        variants.extend(_ester_variants(name, parent))

    # 2. If neighbor uses systematic names, try our retained→systematic
    #    If neighbor uses retained names, try systematic→retained
    for retained, systematic in _RETAINED_TO_SYSTEMATIC.items():
        systematic_lower = systematic.lower()
        neighbor_is_systematic = (
            systematic_lower in n_lower and retained in name_lower)
        neighbor_is_retained = (
            retained in n_lower and systematic_lower in name_lower)
        if neighbor_is_systematic or neighbor_is_retained:
            # _retained_systematic_variants covers both directions.
            variants.extend(_retained_systematic_variants(name, parent))
            break

    # 3. Style matching: suffix vs prefix naming
    # If neighbor uses prefix style (e.g., "4-chloro-..."), generate prefix
    # variants for our names that use suffix style (e.g., "...-4-one")
    n_tokens = set(_chem_tokens_flat(neighbor_name))
    our_tokens = set(_chem_tokens_flat(name))

    prefix_subs = {"amino", "hydroxy", "oxo", "sulfanyl"}
    suffix_subs = {"amine", "ol", "one", "thiol"}

    if n_tokens & prefix_subs and our_tokens & suffix_subs:
        variants.extend(_general_suffix_prefix_variants(name, parent))
        variants.extend(_indicated_h_variants(name, parent))

    # 4. Reverse: if neighbor uses suffix style, try converting our prefix style
    # E.g., neighbor has "pyridin-2-amine", we have "2-aminopyridine"
    if n_tokens & suffix_subs and our_tokens & prefix_subs:
        variants.extend(_prefix_to_suffix_variants(name, parent))

    # 5. Extended principal characteristic group suffix/prefix matching.
    # If neighbor uses prefix-style naming for extended groups (formyl,
    # carboxy, carbamoyl, cyano, alkoxycarbonyl), try our suffix→prefix;
    # and vice versa.
    ext_prefix_tokens = {"formyl", "carboxy", "carbamoyl", "cyano", "carbonyl"}
    ext_suffix_tokens = {"carboxylate", "carbaldehyde", "carboxamide",
                         "carbonitrile", "carboxylic"}
    if n_tokens & ext_prefix_tokens and our_tokens & ext_suffix_tokens:
        variants.extend(_extended_suffix_prefix_variants(name, parent))
    if n_tokens & ext_suffix_tokens and our_tokens & ext_prefix_tokens:
        variants.extend(_extended_prefix_to_suffix_variants(name, parent))

    return variants
|
|
1250
|
+
|
|
1251
|
+
|
|
1252
|
+
def _prefix_to_suffix_variants(
|
|
1253
|
+
name: str, parent: str,
|
|
1254
|
+
) -> List[Tuple[str, str]]:
|
|
1255
|
+
"""Generate prefix→suffix variants (reverse of _general_suffix_prefix_variants).
|
|
1256
|
+
|
|
1257
|
+
E.g., "2-aminopyridine" → "pyridin-2-amine"
|
|
1258
|
+
"4-oxoquinazoline" → "quinazolin-4-one"
|
|
1259
|
+
"""
|
|
1260
|
+
variants: List[Tuple[str, str]] = []
|
|
1261
|
+
name_lower = name.lower()
|
|
1262
|
+
|
|
1263
|
+
prefix_map = {
|
|
1264
|
+
"amino": "amine",
|
|
1265
|
+
"hydroxy": "ol",
|
|
1266
|
+
"oxo": "one",
|
|
1267
|
+
"sulfanyl": "thiol",
|
|
1268
|
+
}
|
|
1269
|
+
|
|
1270
|
+
for prefix, suffix in prefix_map.items():
|
|
1271
|
+
# Pattern: LOCANT-PREFIX-RING at end (or LOCANT-PREFIX-substitutents-RING)
|
|
1272
|
+
# E.g. "2-amino-pyridine", "2-aminopyridine"
|
|
1273
|
+
# We need to find the prefix and ring
|
|
1274
|
+
pat = r'(\d+(?:,\d+)*)-' + re.escape(prefix) + r'[-]?'
|
|
1275
|
+
m = re.search(pat, name_lower)
|
|
1276
|
+
if not m:
|
|
1277
|
+
continue
|
|
1278
|
+
|
|
1279
|
+
locants = m.group(1)
|
|
1280
|
+
after_prefix = name_lower[m.end():] # everything after "N-prefix-"
|
|
1281
|
+
|
|
1282
|
+
# Find a known ring in the remaining part
|
|
1283
|
+
for ring in _KNOWN_RINGS_BY_LEN:
|
|
1284
|
+
if after_prefix == ring or after_prefix.endswith(ring):
|
|
1285
|
+
# Get the elided stem form for the ring (e.g.
|
|
1286
|
+
# "pyridine" → "pyridin"). Prefer the shortest
|
|
1287
|
+
# stem that maps to this ring, which is the elided
|
|
1288
|
+
# form needed for suffix attachment.
|
|
1289
|
+
stem = ring
|
|
1290
|
+
for s in _KNOWN_RING_STEMS_BY_LEN:
|
|
1291
|
+
if _KNOWN_RING_STEMS[s] == ring and len(s) < len(stem):
|
|
1292
|
+
stem = s
|
|
1293
|
+
# Build suffix form: leading-STEM-LOCANT-SUFFIX
|
|
1294
|
+
leading = after_prefix[:len(after_prefix) - len(ring)]
|
|
1295
|
+
variant = leading + stem + "-" + locants + "-" + suffix
|
|
1296
|
+
variant = re.sub(r'-{2,}', '-', variant)
|
|
1297
|
+
variant = variant.strip('-')
|
|
1298
|
+
variants.append((variant, variant))
|
|
1299
|
+
break
|
|
1300
|
+
|
|
1301
|
+
return variants
|
|
1302
|
+
|
|
1303
|
+
|
|
1304
|
+
# ---------------------------------------------------------------------------
|
|
1305
|
+
# DP Viterbi for multi-step sequence alignment
|
|
1306
|
+
# ---------------------------------------------------------------------------
|
|
1307
|
+
|
|
1308
|
+
def _dp_viterbi(
|
|
1309
|
+
names_per_compound: List[List[str]],
|
|
1310
|
+
metric_fn,
|
|
1311
|
+
minimize: bool = True,
|
|
1312
|
+
) -> Tuple[List[str], float]:
|
|
1313
|
+
"""Dynamic programming (Viterbi-style) optimal path.
|
|
1314
|
+
|
|
1315
|
+
O(N * M^2) where N = num compounds, M = max names per compound.
|
|
1316
|
+
Picks one name per compound to optimise the sum of consecutive
|
|
1317
|
+
pairwise metric values.
|
|
1318
|
+
"""
|
|
1319
|
+
N = len(names_per_compound)
|
|
1320
|
+
if N == 0:
|
|
1321
|
+
return [], 0.0
|
|
1322
|
+
|
|
1323
|
+
dp = [{} for _ in range(N)]
|
|
1324
|
+
backptr = [{} for _ in range(N)]
|
|
1325
|
+
|
|
1326
|
+
for j, name in enumerate(names_per_compound[0]):
|
|
1327
|
+
dp[0][j] = 0.0
|
|
1328
|
+
backptr[0][j] = -1
|
|
1329
|
+
|
|
1330
|
+
for i in range(1, N):
|
|
1331
|
+
for j, name_j in enumerate(names_per_compound[i]):
|
|
1332
|
+
best_prev_score = float("inf") if minimize else float("-inf")
|
|
1333
|
+
best_prev_idx = 0
|
|
1334
|
+
for k, name_k in enumerate(names_per_compound[i - 1]):
|
|
1335
|
+
edge = metric_fn(name_k, name_j)
|
|
1336
|
+
cumulative = dp[i - 1][k] + edge
|
|
1337
|
+
if (minimize and cumulative < best_prev_score) or \
|
|
1338
|
+
(not minimize and cumulative > best_prev_score):
|
|
1339
|
+
best_prev_score = cumulative
|
|
1340
|
+
best_prev_idx = k
|
|
1341
|
+
dp[i][j] = best_prev_score
|
|
1342
|
+
backptr[i][j] = best_prev_idx
|
|
1343
|
+
|
|
1344
|
+
if minimize:
|
|
1345
|
+
last_idx = min(dp[N - 1], key=dp[N - 1].get)
|
|
1346
|
+
else:
|
|
1347
|
+
last_idx = max(dp[N - 1], key=dp[N - 1].get)
|
|
1348
|
+
|
|
1349
|
+
total_score = dp[N - 1][last_idx]
|
|
1350
|
+
path = [last_idx]
|
|
1351
|
+
for i in range(N - 1, 0, -1):
|
|
1352
|
+
path.append(backptr[i][path[-1]])
|
|
1353
|
+
path.reverse()
|
|
1354
|
+
|
|
1355
|
+
chosen = [names_per_compound[i][path[i]] for i in range(N)]
|
|
1356
|
+
return chosen, total_score
|
|
1357
|
+
|
|
1358
|
+
|
|
1359
|
+
def _make_parent_penalised_metric(base_metric_fn, name_to_parent: dict,
                                  penalty: float = 100.0):
    """Wrap *base_metric_fn* so that a change of parent ring costs extra.

    The returned callable evaluates ``base_metric_fn(a, b)`` and adds
    *penalty* whenever ``extract_parent_ring`` yields different values for
    the parents registered for *a* and *b* in *name_to_parent* (names
    missing from the mapping are looked up as ``""``).

    Provided *penalty* dominates the base metric, an optimiser using this
    metric avoids parent switches first and uses the base metric only to
    break ties.
    """
    def metric(a: str, b: str) -> float:
        score = base_metric_fn(a, b)
        ring_a = extract_parent_ring(name_to_parent.get(a, ""))
        ring_b = extract_parent_ring(name_to_parent.get(b, ""))
        if ring_a == ring_b:
            return score + 0.0
        return score + penalty

    return metric
|
|
1372
|
+
|
|
1373
|
+
|
|
1374
|
+
# ---------------------------------------------------------------------------
|
|
1375
|
+
# Name diff
|
|
1376
|
+
# ---------------------------------------------------------------------------
|
|
1377
|
+
|
|
1378
|
+
def _tokenize_iupac(name: str) -> List[str]:
|
|
1379
|
+
"""Split an IUPAC name into tokens at dashes, parens, spaces, commas.
|
|
1380
|
+
|
|
1381
|
+
Delimiters are kept as separate tokens so that the reconstructed
|
|
1382
|
+
string ``''.join(tokens)`` equals the original name.
|
|
1383
|
+
"""
|
|
1384
|
+
tokens: List[str] = []
|
|
1385
|
+
buf: List[str] = []
|
|
1386
|
+
for ch in name:
|
|
1387
|
+
if ch in '-() ,':
|
|
1388
|
+
if buf:
|
|
1389
|
+
tokens.append(''.join(buf))
|
|
1390
|
+
buf = []
|
|
1391
|
+
tokens.append(ch)
|
|
1392
|
+
else:
|
|
1393
|
+
buf.append(ch)
|
|
1394
|
+
if buf:
|
|
1395
|
+
tokens.append(''.join(buf))
|
|
1396
|
+
return tokens
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
def _refine_replace(t1: str, t2: str,
|
|
1400
|
+
min_affix: int = 3) -> List[Tuple[str, str, str]]:
|
|
1401
|
+
"""Refine a 'replace' op by stripping the shared prefix/suffix.
|
|
1402
|
+
|
|
1403
|
+
IUPAC substituents are often concatenated without a delimiter
|
|
1404
|
+
(e.g. "bromoquinolin"), so the token-level diff may lump a
|
|
1405
|
+
substituent and its parent into one replace op. Stripping the
|
|
1406
|
+
common head/tail recovers the clean diff.
|
|
1407
|
+
|
|
1408
|
+
Only strips a prefix/suffix if it is at least *min_affix* characters
|
|
1409
|
+
long, to avoid noisy single-character splits (e.g. the shared "e"
|
|
1410
|
+
in "carbamate" / "amine").
|
|
1411
|
+
|
|
1412
|
+
Returns a list of (tag, text1, text2) ops.
|
|
1413
|
+
"""
|
|
1414
|
+
# Common prefix
|
|
1415
|
+
i = 0
|
|
1416
|
+
while i < min(len(t1), len(t2)) and t1[i] == t2[i]:
|
|
1417
|
+
i += 1
|
|
1418
|
+
if i < min_affix:
|
|
1419
|
+
i = 0 # too short — don't split
|
|
1420
|
+
|
|
1421
|
+
# Common suffix (not overlapping prefix)
|
|
1422
|
+
j = 0
|
|
1423
|
+
while (j < min(len(t1), len(t2)) - i
|
|
1424
|
+
and t1[-(j + 1)] == t2[-(j + 1)]):
|
|
1425
|
+
j += 1
|
|
1426
|
+
if j < min_affix:
|
|
1427
|
+
j = 0 # too short — don't split
|
|
1428
|
+
|
|
1429
|
+
prefix = t1[:i]
|
|
1430
|
+
suffix = t1[len(t1) - j:] if j else ""
|
|
1431
|
+
mid1 = t1[i:len(t1) - j] if j else t1[i:]
|
|
1432
|
+
mid2 = t2[i:len(t2) - j] if j else t2[i:]
|
|
1433
|
+
|
|
1434
|
+
ops: List[Tuple[str, str, str]] = []
|
|
1435
|
+
if prefix:
|
|
1436
|
+
ops.append(('equal', prefix, prefix))
|
|
1437
|
+
if mid1 and mid2:
|
|
1438
|
+
ops.append(('replace', mid1, mid2))
|
|
1439
|
+
elif mid1:
|
|
1440
|
+
ops.append(('delete', mid1, ''))
|
|
1441
|
+
elif mid2:
|
|
1442
|
+
ops.append(('insert', '', mid2))
|
|
1443
|
+
if suffix:
|
|
1444
|
+
ops.append(('equal', suffix, suffix))
|
|
1445
|
+
return ops
|
|
1446
|
+
|
|
1447
|
+
|
|
1448
|
+
def name_diff(name1: str, name2: str) -> List[Tuple[str, str, str]]:
    """Compute a token-level diff between two IUPAC names.

    Both names are tokenised with ``_tokenize_iupac`` and the token lists
    are compared with ``difflib.SequenceMatcher`` (``autojunk`` disabled).
    Every ``'replace'`` opcode is passed through ``_refine_replace`` so that
    fused tokens such as ``bromoquinolin`` are reported as ``bromo``
    (changed) followed by ``quinolin`` (equal).

    Returns a list of ``(tag, from_text, to_text)`` tuples where *tag* is
    ``'equal'``, ``'replace'``, ``'delete'`` or ``'insert'`` and the two
    texts are the joined tokens covered by that op.

    Example::

        >>> name_diff('4-fluoropyridine', '4-(piperidin-1-yl)pyridine')
        [('equal', '4-', '4-'),
         ('replace', 'fluoro', '(piperidin-1-yl)'),
         ('equal', 'pyridine', 'pyridine')]
    """
    left = _tokenize_iupac(name1)
    right = _tokenize_iupac(name2)
    matcher = difflib.SequenceMatcher(None, left, right, autojunk=False)

    ops: List[Tuple[str, str, str]] = []
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        old_text = ''.join(left[a_lo:a_hi])
        new_text = ''.join(right[b_lo:b_hi])
        if tag != 'replace':
            ops.append((tag, old_text, new_text))
            continue
        # Narrow the replace down to the characters that actually differ.
        ops.extend(_refine_replace(old_text, new_text))
    return ops
|
|
1481
|
+
|
|
1482
|
+
|
|
1483
|
+
def format_name_diff(name1: str, name2: str) -> str:
    """Summarise the differences between two names as plain text.

    Replacements are rendered as ``old -> new``, deletions as ``(-old)``
    and insertions as ``(+new)``; equal stretches are omitted.  Individual
    changes are joined with `` ; ``.  When ``name_diff`` reports no change
    at all the string ``(identical)`` is returned.
    """
    def _describe(tag: str, old: str, new: str):
        if tag == 'replace':
            return f"{old} -> {new}"
        if tag == 'delete':
            return f"(-{old})"
        if tag == 'insert':
            return f"(+{new})"
        return None  # 'equal' ops contribute nothing

    described = (_describe(tag, old, new)
                 for tag, old, new in name_diff(name1, name2))
    changes = [text for text in described if text is not None]
    if not changes:
        return "(identical)"
    return " ; ".join(changes)
|
|
1499
|
+
|
|
1500
|
+
|
|
1501
|
+
def format_name_diff_html(name1: str, name2: str) -> str:
    """Render the diff between two names as an inline HTML fragment.

    Unchanged text is emitted as escaped plain text.  Removed/old text is
    wrapped in ``<span class="diff-del">`` and added/new text in
    ``<span class="diff-ins">``; a replacement shows old, an arrow span
    (``diff-arrow``) and new.  All name text is passed through
    ``html_mod.escape``.  No enclosing element is added.
    """
    def _span(css_class: str, text: str) -> str:
        return f'<span class="{css_class}">{html_mod.escape(text)}</span>'

    fragments = []
    for tag, old, new in name_diff(name1, name2):
        if tag == 'equal':
            fragments.append(html_mod.escape(old))
        elif tag == 'replace':
            removed = _span("diff-del", old)
            added = _span("diff-ins", new)
            fragments.append(
                removed + f'<span class="diff-arrow">\u2192</span>' + added)
        elif tag == 'delete':
            fragments.append(_span("diff-del", old))
        elif tag == 'insert':
            fragments.append(_span("diff-ins", new))
    return ''.join(fragments)
|
|
1526
|
+
|
|
1527
|
+
|
|
1528
|
+
# ---------------------------------------------------------------------------
|
|
1529
|
+
# Alignment result dataclass
|
|
1530
|
+
# ---------------------------------------------------------------------------
|
|
1531
|
+
|
|
1532
|
+
@dataclass
class AlignmentResult:
    """Outcome of aligning the names of a starting material and its product."""
    sm_smiles: str
    prod_smiles: str
    # Decomposition output for each side (None until populated).
    sm_result: Optional[DecompositionResult] = None
    prod_result: Optional[DecompositionResult] = None

    # Name pairs sharing an identical naming parent:
    # (sm_name, prod_name, shared_parent)
    aligned_pairs: List[Tuple[str, str, str]] = field(default_factory=list)

    # Most similar name pair found; not necessarily one of aligned_pairs.
    best_sm_name: str = ""
    best_prod_name: str = ""
    best_similarity: float = 0.0

    @property
    def is_aligned(self) -> bool:
        """True when at least one exact parent match was found."""
        return len(self.aligned_pairs) >= 1

    @property
    def alignment_quality(self) -> str:
        """Return "ALIGNED", "SEMI-ALIGNED" or "UNALIGNED".

        "ALIGNED" requires an exact parent match; otherwise a best
        similarity of at least 0.5 counts as "SEMI-ALIGNED".
        """
        if self.aligned_pairs:
            return "ALIGNED"
        if self.best_similarity >= 0.5:
            return "SEMI-ALIGNED"
        return "UNALIGNED"
|
|
1561
|
+
|
|
1562
|
+
|
|
1563
|
+
# ---------------------------------------------------------------------------
|
|
1564
|
+
# Core alignment function
|
|
1565
|
+
# ---------------------------------------------------------------------------
|
|
1566
|
+
|
|
1567
|
+
def find_aligned_names(sm_smiles: str, prod_smiles: str,
                       verbose: bool = False,
                       preferred_parent: Optional[str] = None,
                       ) -> AlignmentResult:
    """Find name pairs for an SM/product couple that share a naming parent.

    Parameters
    ----------
    sm_smiles, prod_smiles : str
        SMILES for the starting material and the product.
    verbose : bool
        Forwarded to ``decompose_name``.
    preferred_parent : str, optional
        Substring identifying the naming parent to favour (e.g.
        ``"quinoline"``).  The stem without a trailing ``e`` is matched as
        well, because ``-yl`` forms drop it.  When some candidate pair has
        the preferred parent on BOTH sides and a similarity of at least
        0.30, the most similar such pair is reported as the best pair;
        otherwise the globally most similar pair is used.

    Returns
    -------
    AlignmentResult
        ``aligned_pairs`` lists every (sm_name, prod_name, parent) whose
        lower-cased parent names are identical, excluding the parent
        ``"(canonical)"``.  If either decomposition reports errors, the
        result is returned with only the decomposition results filled in.
    """
    def _candidates(decomp):
        # Canonical name first, then every alternative flagged valid.
        pairs = [(decomp.canonical_name,
                  decomp.canonical_parent or "(unknown)")]
        pairs.extend((alt.name, alt.parent_name)
                     for alt in decomp.alternatives if alt.valid)
        return pairs

    def _group_by_parent(pairs):
        grouped = {}
        for cand_name, cand_parent in pairs:
            grouped.setdefault(cand_parent.lower(), []).append(cand_name)
        return grouped

    result = AlignmentResult(sm_smiles=sm_smiles, prod_smiles=prod_smiles)
    result.sm_result = decompose_name(sm_smiles, verbose=verbose)
    result.prod_result = decompose_name(prod_smiles, verbose=verbose)
    if result.sm_result.errors or result.prod_result.errors:
        return result

    sm_names = _candidates(result.sm_result)
    prod_names = _candidates(result.prod_result)
    sm_by_parent = _group_by_parent(sm_names)
    prod_by_parent = _group_by_parent(prod_names)

    # Exact parent matches, skipping the trivial "(canonical)" parent.
    result.aligned_pairs = [
        (sm_name, prod_name, parent_key)
        for parent_key, sm_group in sm_by_parent.items()
        if parent_key != "(canonical)" and parent_key in prod_by_parent
        for sm_name in sm_group
        for prod_name in prod_by_parent[parent_key]
    ]

    # Preferred-parent search keys: the full word plus its 'e'-less stem.
    stems = []
    wanted = preferred_parent.lower().strip() if preferred_parent else ""
    if wanted:
        stems.append(wanted)
        if wanted.endswith('e'):
            stems.append(wanted[:-1])

    def _mentions_preferred(text: str) -> bool:
        lowered = text.lower()
        return any(stem in lowered for stem in stems)

    # (similarity, sm_name, prod_name) for the overall and preferred winners.
    overall = (0.0, "", "")
    preferred = (0.0, "", "")
    for sm_name, sm_par in sm_names:
        for prod_name, prod_par in prod_names:
            sim = name_similarity(sm_name, prod_name)
            if sim > overall[0]:
                overall = (sim, sm_name, prod_name)
            if (stems
                    and _mentions_preferred(sm_par)
                    and _mentions_preferred(prod_par)
                    and sim > preferred[0]):
                preferred = (sim, sm_name, prod_name)

    # A preferred pair wins only if it exists and clears the 30 % floor.
    use_preferred = preferred[1] and preferred[0] >= 0.30
    winner = preferred if use_preferred else overall
    result.best_similarity, result.best_sm_name, result.best_prod_name = winner
    return result
|
|
1684
|
+
|
|
1685
|
+
|
|
1686
|
+
# ---------------------------------------------------------------------------
|
|
1687
|
+
# Multi-step sequence alignment
|
|
1688
|
+
# ---------------------------------------------------------------------------
|
|
1689
|
+
|
|
1690
|
+
@dataclass
class SequenceAlignmentResult:
    """Result of aligning names across a multi-step synthetic route.

    Produced by ``find_aligned_name_sequence``; the per-compound lists are
    intended to be index-aligned with ``smiles_list``.
    """
    # Input SMILES, one per intermediate, in synthesis order.
    smiles_list: List[str]
    # The name selected for each intermediate.
    chosen_names: List[str]
    # Naming parent registered for each chosen name ("" when none).
    parent_names: List[str]
    # ``extract_parent_ring`` applied to each entry of ``parent_names``.
    parent_rings: List[str]
    # Number of consecutive pairs whose parent rings differ.
    parent_switches: int
    # Sum of ``chem_token_diff_count`` over consecutive chosen names.
    base_score: float
    # Decomposition output per intermediate; None where decomposition failed.
    decomposition_results: List[Optional[DecompositionResult]] = field(
        default_factory=list)
    # Human-readable error messages collected while processing.
    errors: List[str] = field(default_factory=list)

    @property
    def is_fully_aligned(self) -> bool:
        """True when no parent-ring switch occurs anywhere in the sequence."""
        return self.parent_switches == 0
|
|
1706
|
+
|
|
1707
|
+
|
|
1708
|
+
def find_aligned_name_sequence(
    smiles_list: List[str],
    verbose: bool = False,
    parent_penalty: float = 100.0,
    timeout: float = 30.0,
) -> SequenceAlignmentResult:
    """Pick one IUPAC name per intermediate to minimise parent-ring switches.

    Uses a parent-aware Viterbi DP: the objective is to minimise parent
    switches first (penalty >> base metric), then minimise the
    chemistry-aware token diff as tiebreaker.

    The procedure has four stages:

    1. For every SMILES, collect the canonical name, the valid alternatives
       and any round-trip-validated alignment variants.
    2. Pass 1: run the DP over those candidates.
    3. Pass 2: generate contextual variants aimed at each neighbour's
       Pass 1 choice and, if any were added, re-run the DP.
    4. Normalise locant order in the chosen names and compute statistics.

    Parameters
    ----------
    smiles_list : list of str
        SMILES for each intermediate in synthesis order.
    verbose : bool
        Print debug info during decomposition.
    parent_penalty : float
        Penalty added when consecutive names have different parent rings.
        Must be >> max possible base metric value.
    timeout : float
        Per-compound decomposition timeout in seconds.

    Returns
    -------
    SequenceAlignmentResult
        All per-compound lists hold exactly one entry per input SMILES.  A
        compound whose processing raises is represented by the placeholder
        name ``"[<first 30 chars of SMILES>]"``, a ``None`` decomposition
        result and an entry in ``errors``.
    """
    def _effective_parent(name: str, parent: str) -> str:
        """Return the parent string to register for *name*.

        An empty parent (retained names, single decompositions) falls back
        to the name itself.  If ``extract_parent_ring`` finds no ring in
        the parent (it returns the normalised input unchanged) but does
        find one in the name, the name is used instead; this works around a
        decomposer bug where prefix-stripping eats part of the ring name.
        """
        if not parent:
            return name
        if extract_parent_ring(parent) == parent.lower().strip():
            if extract_parent_ring(name) != name.lower().strip():
                return name
        return parent

    names_per_compound: List[List[str]] = []
    name_to_parent: Dict[str, str] = {}
    decomp_results: List[Optional[DecompositionResult]] = []
    errors: List[str] = []
    canonical_smiles: List[Optional[str]] = []  # for variant validation

    for smi in smiles_list:
        # Everything for this compound is computed into locals and committed
        # once, so a failure part-way through cannot leave the parallel
        # lists with extra or misaligned entries.
        try:
            r = decompose_name(smi, verbose=verbose, timeout=timeout)

            all_names = [(r.canonical_name, r.canonical_parent or "")]
            all_names.extend((alt.name, alt.parent_name)
                             for alt in r.alternatives if alt.valid)

            # Alignment variants are round-trip validated: name -> SMILES ->
            # canonical must match the original compound's canonical SMILES.
            expected_canon = _canonical(smi)
            seen_names = {n for n, _ in all_names}
            extra = []
            for n, p in all_names:
                for vn, vp in _generate_alignment_variants(n, p):
                    if vn in seen_names:
                        continue
                    if expected_canon and _validate_variant(vn, expected_canon):
                        extra.append((vn, vp))
                        seen_names.add(vn)
            all_names.extend(extra)

            compound_parents = {n: _effective_parent(n, p)
                                for n, p in all_names}
            decomp_error = (f"{smi[:40]}: {'; '.join(r.errors)}"
                            if r.errors else None)
        except Exception as e:
            fallback = f"[{smi[:30]}]"
            decomp_results.append(None)
            canonical_smiles.append(None)
            names_per_compound.append([fallback])
            name_to_parent[fallback] = ""
            errors.append(f"{smi[:40]}: {e}")
        else:
            decomp_results.append(r)
            canonical_smiles.append(expected_canon)
            names_per_compound.append([n for n, _ in all_names])
            name_to_parent.update(compound_parents)
            if decomp_error is not None:
                errors.append(decomp_error)

    # --- Pass 1: parent-aware DP with chem_token_diff_count as base metric
    penalised_fn = _make_parent_penalised_metric(
        chem_token_diff_count, name_to_parent, parent_penalty)
    pass1_chosen, _total = _dp_viterbi(
        names_per_compound, penalised_fn, minimize=True)

    # --- Pass 2: each compound looks at what its neighbours chose in
    #     Pass 1 and generates targeted variants to match that naming style.
    names_per_compound_p2 = [list(names) for names in names_per_compound]
    added_any = False
    last_index = len(pass1_chosen) - 1
    for i in range(len(pass1_chosen)):
        candidates = names_per_compound_p2[i]
        existing = set(candidates)

        # Default parent for this compound: that of its first name.
        comp_parent = (name_to_parent.get(names_per_compound[i][0], "")
                       if names_per_compound[i] else "")

        ctx_variants: List[Tuple[str, str]] = []
        for neighbour in (i - 1, i + 1):
            if not 0 <= neighbour <= last_index:
                continue
            for n in candidates:
                ctx_variants.extend(
                    _contextual_variants(
                        n, name_to_parent.get(n, comp_parent),
                        pass1_chosen[neighbour]))

        # Add new unique variants (validated against canonical SMILES).
        exp_canon = canonical_smiles[i]
        for vn, vp in ctx_variants:
            if vn in existing:
                continue
            existing.add(vn)  # valid or not, never consider it again
            if exp_canon and not _validate_variant(vn, exp_canon):
                continue
            candidates.append(vn)
            added_any = True
            name_to_parent[vn] = _effective_parent(vn, vp)

    # Re-run DP only if we actually added new variants.
    if added_any:
        penalised_fn_p2 = _make_parent_penalised_metric(
            chem_token_diff_count, name_to_parent, parent_penalty)
        chosen, _total = _dp_viterbi(
            names_per_compound_p2, penalised_fn_p2, minimize=True)
    else:
        chosen = pass1_chosen

    # --- Post-DP: normalise locant order to ascending ----------------------
    # chem_token_diff_count is order-agnostic (multiset), so the DP cannot
    # distinguish "5-X-2-Y-ring" from "2-Y-5-X-ring".  IUPAC convention
    # demands ascending locants, so we normalise here.
    for i, name in enumerate(chosen):
        parent = name_to_parent.get(name, "")
        reordered = _reorder_locant_prefixes(name, parent)
        if reordered and reordered != name:
            exp_canon = canonical_smiles[i]
            if exp_canon is None or _validate_variant(reordered, exp_canon):
                chosen[i] = reordered
                name_to_parent[reordered] = parent

    # Compute actual stats from the final choice.
    parent_names = [name_to_parent.get(n, "") for n in chosen]
    parent_rings = [extract_parent_ring(p) for p in parent_names]
    base_score = 0.0
    switches = 0
    for i in range(len(chosen) - 1):
        base_score += chem_token_diff_count(chosen[i], chosen[i + 1])
        if parent_rings[i] != parent_rings[i + 1]:
            switches += 1

    return SequenceAlignmentResult(
        smiles_list=smiles_list,
        chosen_names=chosen,
        parent_names=parent_names,
        parent_rings=parent_rings,
        parent_switches=switches,
        base_score=base_score,
        decomposition_results=decomp_results,
        errors=errors,
    )
|
|
1891
|
+
|
|
1892
|
+
|
|
1893
|
+
# ---------------------------------------------------------------------------
|
|
1894
|
+
# Molecular diff (MCS-based)
|
|
1895
|
+
# ---------------------------------------------------------------------------
|
|
1896
|
+
|
|
1897
|
+
@dataclass
class FragmentChange:
    """One changed fragment in a molecular diff.

    Pairs the fragment found on the starting-material side with the
    fragment found on the product side, each as a ``[*]``-bearing SMILES
    plus a substituent name.
    """
    sm_frag_smiles: str    # [*]-bearing SMILES from SM side ("" for additions)
    prod_frag_smiles: str  # [*]-bearing SMILES from product side ("" for removals)
    sm_name: str           # substituent name ("fluoro", "H", etc.)
    prod_name: str         # substituent name ("phenyl", etc.)
    change_type: str       # "replace" | "addition" | "removal"
|
|
1905
|
+
|
|
1906
|
+
|
|
1907
|
+
@dataclass
class MolecularDiffResult:
    """Result of MCS-based molecular diff between SM and product."""
    # Input SMILES for starting material and product.
    sm_smiles: str
    prod_smiles: str
    # Fragment-level changes; empty on fallback and for stereo-only diffs.
    changes: List[FragmentChange]
    # Atom count of the maximum common substructure (0 when none was used).
    mcs_num_atoms: int
    # True when the MCS route was abandoned (invalid SMILES, MCS failure,
    # cancellation, or an MCS that is too small).
    fallback_used: bool = False
    # Optional explanatory text for the fallback, e.g. "(invalid SMILES)".
    fallback_text: str = ""
    # True when the MCS covers every atom of both molecules.
    stereo_only: bool = False
|
|
1917
|
+
|
|
1918
|
+
|
|
1919
|
+
def _get_connected_components(mol: Chem.Mol,
                              atom_indices: set) -> List[set]:
    """Partition *atom_indices* into groups connected through bonds of *mol*.

    Two indices belong to the same group when a path of bonded atoms, all
    of them inside *atom_indices*, links them.  Returns one set of atom
    indices per group.
    """
    seen: set = set()
    groups: List[set] = []
    for seed in atom_indices:
        if seed in seen:
            continue
        group: set = set()
        stack = [seed]
        # Depth-first flood fill restricted to atom_indices.
        while stack:
            current = stack.pop()
            if current in seen:
                continue
            seen.add(current)
            group.add(current)
            neighbour_ids = (
                nbr.GetIdx()
                for nbr in mol.GetAtomWithIdx(current).GetNeighbors())
            stack.extend(
                nidx for nidx in neighbour_ids
                if nidx in atom_indices and nidx not in seen)
        groups.append(group)
    return groups
|
|
1942
|
+
|
|
1943
|
+
|
|
1944
|
+
def _extract_fragment_smiles(mol: Chem.Mol, frag_atoms: set,
                             attachments: List[Tuple[int, int]]
                             ) -> str:
    """Build a SMILES for a fragment, with ``[*]`` marking attachment points.

    Args:
        mol: Source molecule.
        frag_atoms: Atom indices (in *mol*) that make up the fragment.
        attachments: ``(frag_atom_idx, core_atom_idx)`` pairs, one per bond
            crossing from the fragment to the MCS core.  A single dummy
            atom is created per distinct core atom.

    Returns:
        A SMILES such as ``"[*]c1ccccc1"`` for a phenyl fragment, or ``""``
        when the fragment cannot be sanitised even with aromaticity
        perception switched off.
    """
    builder = Chem.RWMol()
    index_map: dict = {}  # atom index in mol -> atom index in builder

    # Copy the fragment atoms (element, charge, explicit Hs, aromatic flag).
    for src_idx in sorted(frag_atoms):
        template = mol.GetAtomWithIdx(src_idx)
        clone = Chem.Atom(template.GetAtomicNum())
        clone.SetFormalCharge(template.GetFormalCharge())
        clone.SetNumExplicitHs(template.GetNumExplicitHs())
        clone.SetIsAromatic(template.GetIsAromatic())
        index_map[src_idx] = builder.AddAtom(clone)

    # One [*] dummy per core atom, bonded with the order of the cut bond.
    dummy_for_core: dict = {}  # core atom index -> dummy index in builder
    for frag_idx, core_idx in attachments:
        if core_idx not in dummy_for_core:
            dummy_for_core[core_idx] = builder.AddAtom(Chem.Atom(0))
        crossing = mol.GetBondBetweenAtoms(frag_idx, core_idx)
        order = crossing.GetBondType() if crossing else Chem.BondType.SINGLE
        builder.AddBond(index_map[frag_idx], dummy_for_core[core_idx], order)

    # Bonds internal to the fragment; the index ordering check adds each once.
    for src_idx in frag_atoms:
        for bond in mol.GetAtomWithIdx(src_idx).GetBonds():
            partner = bond.GetOtherAtomIdx(src_idx)
            if src_idx < partner and partner in frag_atoms:
                builder.AddBond(index_map[src_idx], index_map[partner],
                                bond.GetBondType())

    # First a full sanitisation, then one without aromaticity perception.
    attempts = (
        lambda: Chem.SanitizeMol(builder),
        lambda: Chem.SanitizeMol(
            builder,
            Chem.SanitizeFlags.SANITIZE_ALL
            ^ Chem.SanitizeFlags.SANITIZE_SETAROMATICITY),
    )
    for sanitise in attempts:
        try:
            sanitise()
            return Chem.MolToSmiles(builder)
        except Exception:
            continue
    return ""
|
|
2001
|
+
|
|
2002
|
+
|
|
2003
|
+
def _name_fragment(frag_smiles: str) -> str:
    """Return a substituent name for a ``[*]``-bearing fragment SMILES.

    An empty SMILES, or one with no atom heavier than hydrogen, is named
    ``"H"``.  A lone oxygen or sulfur is named by its bond to the dummy
    atom: ``"oxo"`` / ``"thioxo"`` for a double bond, ``"hydroxy"`` /
    ``"sulfanyl"`` otherwise, because the generic substituent namer may not
    distinguish bond order.  Everything else is delegated to
    ``name_fragment_as_substituent``.  The raw SMILES is returned when it
    cannot be parsed or no name is produced.
    """
    if not frag_smiles:
        return "H"

    mol = Chem.MolFromSmiles(frag_smiles)
    if mol is None:
        return frag_smiles

    # Dummy atoms have atomic number 0 and are not counted here.
    heavy_atoms = [a for a in mol.GetAtoms() if a.GetAtomicNum() > 1]
    if not heavy_atoms:
        return "H"

    if len(heavy_atoms) == 1:
        # atomic number -> (name if double-bonded to [*], name otherwise)
        chalcogen_names = {
            8: ("oxo", "hydroxy"),
            16: ("thioxo", "sulfanyl"),
        }
        lone = heavy_atoms[0]
        names = chalcogen_names.get(lone.GetAtomicNum())
        if names is not None:
            double_name, single_name = names
            for bond in lone.GetBonds():
                to_dummy = bond.GetOtherAtom(lone).GetAtomicNum() == 0
                if to_dummy and bond.GetBondTypeAsDouble() == 2.0:
                    return double_name
            return single_name

    named = name_fragment_as_substituent(frag_smiles, verbose=False)
    if named:
        return named
    return frag_smiles
|
|
2035
|
+
|
|
2036
|
+
|
|
2037
|
+
def molecular_diff(sm_smiles: str, prod_smiles: str,
|
|
2038
|
+
min_mcs_ratio: float = 0.4,
|
|
2039
|
+
verbose: bool = False) -> MolecularDiffResult:
|
|
2040
|
+
"""Compute molecular-level diff between SM and product using MCS.
|
|
2041
|
+
|
|
2042
|
+
Finds the Maximum Common Substructure (invariant core), extracts
|
|
2043
|
+
changed fragments from each side, names them as IUPAC substituents,
|
|
2044
|
+
and returns structured diff results.
|
|
2045
|
+
|
|
2046
|
+
Falls back to text diff when MCS is too small.
|
|
2047
|
+
|
|
2048
|
+
Args:
|
|
2049
|
+
sm_smiles: Starting material SMILES.
|
|
2050
|
+
prod_smiles: Product SMILES.
|
|
2051
|
+
min_mcs_ratio: Minimum fraction of smaller molecule covered by MCS.
|
|
2052
|
+
Below this, falls back to text diff.
|
|
2053
|
+
verbose: Print debug info.
|
|
2054
|
+
|
|
2055
|
+
Returns:
|
|
2056
|
+
MolecularDiffResult with list of FragmentChange entries.
|
|
2057
|
+
"""
|
|
2058
|
+
empty = MolecularDiffResult(sm_smiles=sm_smiles, prod_smiles=prod_smiles,
|
|
2059
|
+
changes=[], mcs_num_atoms=0)
|
|
2060
|
+
|
|
2061
|
+
sm_mol = Chem.MolFromSmiles(sm_smiles)
|
|
2062
|
+
prod_mol = Chem.MolFromSmiles(prod_smiles)
|
|
2063
|
+
if sm_mol is None or prod_mol is None:
|
|
2064
|
+
empty.fallback_used = True
|
|
2065
|
+
empty.fallback_text = "(invalid SMILES)"
|
|
2066
|
+
return empty
|
|
2067
|
+
|
|
2068
|
+
sm_n = sm_mol.GetNumAtoms()
|
|
2069
|
+
prod_n = prod_mol.GetNumAtoms()
|
|
2070
|
+
|
|
2071
|
+
# --- MCS computation ---
|
|
2072
|
+
try:
|
|
2073
|
+
mcs = rdFMCS.FindMCS(
|
|
2074
|
+
[sm_mol, prod_mol],
|
|
2075
|
+
threshold=1.0,
|
|
2076
|
+
ringMatchesRingOnly=True,
|
|
2077
|
+
completeRingsOnly=True,
|
|
2078
|
+
atomCompare=rdFMCS.AtomCompare.CompareElements,
|
|
2079
|
+
bondCompare=rdFMCS.BondCompare.CompareOrder,
|
|
2080
|
+
timeout=5,
|
|
2081
|
+
)
|
|
2082
|
+
except Exception:
|
|
2083
|
+
empty.fallback_used = True
|
|
2084
|
+
return empty
|
|
2085
|
+
|
|
2086
|
+
if mcs.canceled or mcs.numAtoms < 3:
|
|
2087
|
+
empty.fallback_used = True
|
|
2088
|
+
return empty
|
|
2089
|
+
|
|
2090
|
+
# --- Quality gate ---
|
|
2091
|
+
smaller = min(sm_n, prod_n)
|
|
2092
|
+
if mcs.numAtoms < min_mcs_ratio * smaller:
|
|
2093
|
+
if verbose:
|
|
2094
|
+
print(f" MCS too small: {mcs.numAtoms}/{smaller} "
|
|
2095
|
+
f"({mcs.numAtoms/smaller:.0%})", file=sys.stderr)
|
|
2096
|
+
empty.fallback_used = True
|
|
2097
|
+
return empty
|
|
2098
|
+
|
|
2099
|
+
# --- Atom mappings ---
|
|
2100
|
+
core = Chem.MolFromSmarts(mcs.smartsString)
|
|
2101
|
+
if core is None:
|
|
2102
|
+
empty.fallback_used = True
|
|
2103
|
+
return empty
|
|
2104
|
+
|
|
2105
|
+
sm_match = sm_mol.GetSubstructMatch(core)
|
|
2106
|
+
prod_match = prod_mol.GetSubstructMatch(core)
|
|
2107
|
+
if not sm_match or not prod_match:
|
|
2108
|
+
empty.fallback_used = True
|
|
2109
|
+
return empty
|
|
2110
|
+
|
|
2111
|
+
sm_core = set(sm_match)
|
|
2112
|
+
prod_core = set(prod_match)
|
|
2113
|
+
|
|
2114
|
+
# --- Stereo-only check ---
|
|
2115
|
+
if mcs.numAtoms == sm_n == prod_n:
|
|
2116
|
+
return MolecularDiffResult(
|
|
2117
|
+
sm_smiles=sm_smiles, prod_smiles=prod_smiles,
|
|
2118
|
+
changes=[], mcs_num_atoms=mcs.numAtoms, stereo_only=True)
|
|
2119
|
+
|
|
2120
|
+
# --- Extract non-MCS atoms ---
|
|
2121
|
+
sm_non_mcs = set(range(sm_n)) - sm_core
|
|
2122
|
+
prod_non_mcs = set(range(prod_n)) - prod_core
|
|
2123
|
+
|
|
2124
|
+
# --- Group into connected components ---
|
|
2125
|
+
sm_comps = _get_connected_components(sm_mol, sm_non_mcs)
|
|
2126
|
+
prod_comps = _get_connected_components(prod_mol, prod_non_mcs)
|
|
2127
|
+
|
|
2128
|
+
if verbose:
|
|
2129
|
+
print(f" MCS: {mcs.numAtoms} atoms. SM changed: {len(sm_non_mcs)} "
|
|
2130
|
+
f"in {len(sm_comps)} frag(s). Prod changed: {len(prod_non_mcs)} "
|
|
2131
|
+
f"in {len(prod_comps)} frag(s).", file=sys.stderr)
|
|
2132
|
+
|
|
2133
|
+
# --- Find attachment points ---
|
|
2134
|
+
# For each component, find bonds from non-MCS to MCS atoms.
|
|
2135
|
+
# Key: MCS core position (index in sm_match/prod_match tuple)
|
|
2136
|
+
# Multiple fragments can attach to the same core atom (e.g. Grignard
|
|
2137
|
+
# addition: C=O → C(OH)(R) produces two product fragments on one atom).
|
|
2138
|
+
def _find_attachments(mol, components, core_set, match_tuple):
|
|
2139
|
+
"""Return {mcs_pos: [(component, [(frag_idx, core_idx), ...]), ...]}."""
|
|
2140
|
+
attach_map: dict = {} # mcs_pos -> list of (comp, atts)
|
|
2141
|
+
for comp in components:
|
|
2142
|
+
atts: List[Tuple[int, int]] = []
|
|
2143
|
+
for atom_idx in comp:
|
|
2144
|
+
for nbr in mol.GetAtomWithIdx(atom_idx).GetNeighbors():
|
|
2145
|
+
nidx = nbr.GetIdx()
|
|
2146
|
+
if nidx in core_set:
|
|
2147
|
+
atts.append((atom_idx, nidx))
|
|
2148
|
+
# Key by MCS core position (to enable pairing)
|
|
2149
|
+
# A component may attach to multiple core atoms; use the first.
|
|
2150
|
+
mcs_positions_seen: set = set()
|
|
2151
|
+
for _, core_idx in atts:
|
|
2152
|
+
mcs_pos = match_tuple.index(core_idx)
|
|
2153
|
+
if mcs_pos not in mcs_positions_seen:
|
|
2154
|
+
mcs_positions_seen.add(mcs_pos)
|
|
2155
|
+
attach_map.setdefault(mcs_pos, []).append((comp, atts))
|
|
2156
|
+
return attach_map
|
|
2157
|
+
|
|
2158
|
+
sm_attach = _find_attachments(sm_mol, sm_comps, sm_core, sm_match)
|
|
2159
|
+
prod_attach = _find_attachments(prod_mol, prod_comps, prod_core, prod_match)
|
|
2160
|
+
|
|
2161
|
+
# --- Pair fragments by shared MCS attachment point ---
|
|
2162
|
+
all_mcs_positions = set(sm_attach.keys()) | set(prod_attach.keys())
|
|
2163
|
+
changes: List[FragmentChange] = []
|
|
2164
|
+
|
|
2165
|
+
for mcs_pos in sorted(all_mcs_positions):
|
|
2166
|
+
sm_list = sm_attach.get(mcs_pos, [])
|
|
2167
|
+
prod_list = prod_attach.get(mcs_pos, [])
|
|
2168
|
+
|
|
2169
|
+
# Extract all fragment SMILES and names for each side
|
|
2170
|
+
sm_frags = []
|
|
2171
|
+
for comp, atts in sm_list:
|
|
2172
|
+
smi = _extract_fragment_smiles(sm_mol, comp, atts)
|
|
2173
|
+
sm_frags.append((smi, _name_fragment(smi) if smi else "H"))
|
|
2174
|
+
prod_frags = []
|
|
2175
|
+
for comp, atts in prod_list:
|
|
2176
|
+
smi = _extract_fragment_smiles(prod_mol, comp, atts)
|
|
2177
|
+
prod_frags.append((smi, _name_fragment(smi) if smi else "H"))
|
|
2178
|
+
|
|
2179
|
+
if sm_frags and prod_frags:
|
|
2180
|
+
# Replacement at this position. Multiple fragments on one
|
|
2181
|
+
# side are part of the same transformation (e.g. Grignard
|
|
2182
|
+
# C=O → C(OH)(R)), so combine all names with " + ".
|
|
2183
|
+
sm_names = " + ".join(n for _, n in sm_frags)
|
|
2184
|
+
prod_names = " + ".join(n for _, n in prod_frags)
|
|
2185
|
+
changes.append(FragmentChange(
|
|
2186
|
+
sm_frag_smiles=sm_frags[0][0],
|
|
2187
|
+
prod_frag_smiles=prod_frags[0][0],
|
|
2188
|
+
sm_name=sm_names, prod_name=prod_names,
|
|
2189
|
+
change_type="replace",
|
|
2190
|
+
))
|
|
2191
|
+
elif sm_frags:
|
|
2192
|
+
# Pure removals (nothing on product side at this position)
|
|
2193
|
+
for smi, name in sm_frags:
|
|
2194
|
+
changes.append(FragmentChange(
|
|
2195
|
+
sm_frag_smiles=smi, prod_frag_smiles="",
|
|
2196
|
+
sm_name=name, prod_name="H",
|
|
2197
|
+
change_type="removal",
|
|
2198
|
+
))
|
|
2199
|
+
elif prod_frags:
|
|
2200
|
+
# Pure additions (nothing on SM side at this position)
|
|
2201
|
+
for smi, name in prod_frags:
|
|
2202
|
+
changes.append(FragmentChange(
|
|
2203
|
+
sm_frag_smiles="", prod_frag_smiles=smi,
|
|
2204
|
+
sm_name="H", prod_name=name,
|
|
2205
|
+
change_type="addition",
|
|
2206
|
+
))
|
|
2207
|
+
|
|
2208
|
+
# --- Post-processing: merge unpaired removals + additions ---
|
|
2209
|
+
# Symmetric molecules (e.g. benzene) can cause the MCS to map
|
|
2210
|
+
# substituted carbons to different positions, so a true substitution
|
|
2211
|
+
# appears as a removal + addition. Merge them into replacements.
|
|
2212
|
+
removals = [c for c in changes if c.change_type == "removal"]
|
|
2213
|
+
additions = [c for c in changes if c.change_type == "addition"]
|
|
2214
|
+
|
|
2215
|
+
if removals and additions:
|
|
2216
|
+
paired_changes = [c for c in changes if c.change_type == "replace"]
|
|
2217
|
+
# Pair removals with additions (1:1, in order)
|
|
2218
|
+
n_pairs = min(len(removals), len(additions))
|
|
2219
|
+
for i in range(n_pairs):
|
|
2220
|
+
paired_changes.append(FragmentChange(
|
|
2221
|
+
sm_frag_smiles=removals[i].sm_frag_smiles,
|
|
2222
|
+
prod_frag_smiles=additions[i].prod_frag_smiles,
|
|
2223
|
+
sm_name=removals[i].sm_name,
|
|
2224
|
+
prod_name=additions[i].prod_name,
|
|
2225
|
+
change_type="replace",
|
|
2226
|
+
))
|
|
2227
|
+
# Keep any leftover unpaired removals/additions
|
|
2228
|
+
for r in removals[n_pairs:]:
|
|
2229
|
+
paired_changes.append(r)
|
|
2230
|
+
for a in additions[n_pairs:]:
|
|
2231
|
+
paired_changes.append(a)
|
|
2232
|
+
changes = paired_changes
|
|
2233
|
+
|
|
2234
|
+
return MolecularDiffResult(
|
|
2235
|
+
sm_smiles=sm_smiles, prod_smiles=prod_smiles,
|
|
2236
|
+
changes=changes, mcs_num_atoms=mcs.numAtoms)
|
|
2237
|
+
|
|
2238
|
+
|
|
2239
|
+
# ---------------------------------------------------------------------------
|
|
2240
|
+
# Molecular diff formatting
|
|
2241
|
+
# ---------------------------------------------------------------------------
|
|
2242
|
+
|
|
2243
|
+
def format_molecular_diff(sm_smiles: str, prod_smiles: str,
                          alignment_result: Optional['AlignmentResult'] = None
                          ) -> str:
    """Render the structural diff between two molecules as plain text.

    The comparison itself is delegated to ``molecular_diff``; this function
    only turns its result into a string such as ``fluoro → phenyl``.

    Args:
        sm_smiles: SMILES of the starting material.
        prod_smiles: SMILES of the product.
        alignment_result: Optional object exposing ``best_sm_name`` and
            ``best_prod_name``.  Only consulted when ``molecular_diff``
            reports ``fallback_used``; when it is falsy the names are
            looked up with ``_quick_name`` instead.

    Returns:
        * the ``format_name_diff`` text diff of the two names when the
          fallback path is taken and both names are non-empty, otherwise
          ``result.fallback_text`` or ``"(no diff available)"``;
        * ``"(stereo change)"`` when the result is flagged ``stereo_only``;
        * ``"(identical)"`` when there is nothing to report;
        * otherwise one ``old → new`` entry per change, joined by `` ; ``.
    """
    diff = molecular_diff(sm_smiles, prod_smiles)

    if diff.fallback_used:
        # Structural diff was not usable: compare names as text instead.
        if alignment_result:
            sm_label = alignment_result.best_sm_name or ""
            prod_label = alignment_result.best_prod_name or ""
        else:
            sm_label = _quick_name(sm_smiles)
            prod_label = _quick_name(prod_smiles)
        if not (sm_label and prod_label):
            return diff.fallback_text or "(no diff available)"
        return format_name_diff(sm_label, prod_label)

    if diff.stereo_only:
        return "(stereo change)"
    if not diff.changes:
        return "(identical)"

    rendered = []
    for change in diff.changes:
        kind = change.change_type
        if kind == "replace":
            before, after = change.sm_name, change.prod_name
        elif kind == "removal":
            # A removed fragment is shown as being replaced by hydrogen.
            before, after = change.sm_name, "H"
        elif kind == "addition":
            # An added fragment is shown as replacing hydrogen.
            before, after = "H", change.prod_name
        else:
            # Unknown change types contribute nothing to the output.
            continue
        rendered.append(f"{before} \u2192 {after}")

    if not rendered:
        return "(identical)"
    return " ; ".join(rendered)
|
|
2281
|
+
|
|
2282
|
+
|
|
2283
|
+
def format_molecular_diff_html(sm_smiles: str, prod_smiles: str,
                               alignment_result: Optional['AlignmentResult'] = None
                               ) -> str:
    """Render the structural diff between two molecules as an HTML snippet.

    HTML counterpart of ``format_molecular_diff``.  Each change becomes
    three adjacent ``<span>`` elements with the CSS classes ``diff-del``,
    ``diff-arrow`` and ``diff-ins``; fragment names are passed through
    ``html_mod.escape`` before being embedded.

    Args:
        sm_smiles: SMILES of the starting material.
        prod_smiles: SMILES of the product.
        alignment_result: Optional object exposing ``best_sm_name`` and
            ``best_prod_name``.  Only consulted when ``molecular_diff``
            reports ``fallback_used``; when it is falsy the names are
            looked up with ``_quick_name`` instead.

    Returns:
        * the ``format_name_diff_html`` output when the fallback path is
          taken and both names are non-empty, otherwise the escaped
          ``result.fallback_text`` or ``"(no diff available)"``;
        * a ``diff-ins`` span reading ``(stereo change)`` when the result
          is flagged ``stereo_only``;
        * the bare text ``"(identical)"`` when there is nothing to report;
        * otherwise the per-change span triples joined by `` ; ``.
    """
    diff = molecular_diff(sm_smiles, prod_smiles)

    if diff.fallback_used:
        # Structural diff was not usable: compare names as text instead.
        if alignment_result:
            sm_label = alignment_result.best_sm_name or ""
            prod_label = alignment_result.best_prod_name or ""
        else:
            sm_label = _quick_name(sm_smiles)
            prod_label = _quick_name(prod_smiles)
        if not (sm_label and prod_label):
            return html_mod.escape(diff.fallback_text or "(no diff available)")
        return format_name_diff_html(sm_label, prod_label)

    if diff.stereo_only:
        return '<span class="diff-ins">(stereo change)</span>'
    if not diff.changes:
        return "(identical)"

    rendered = []
    for change in diff.changes:
        kind = change.change_type
        if kind == "replace":
            deleted = html_mod.escape(change.sm_name)
            inserted = html_mod.escape(change.prod_name)
        elif kind == "removal":
            # A removed fragment is shown as being replaced by hydrogen.
            deleted = html_mod.escape(change.sm_name)
            inserted = "H"
        elif kind == "addition":
            # An added fragment is shown as replacing hydrogen.
            deleted = "H"
            inserted = html_mod.escape(change.prod_name)
        else:
            # Unknown change types contribute nothing to the output.
            continue
        rendered.append(
            f'<span class="diff-del">{deleted}</span>'
            f'<span class="diff-arrow">\u2192</span>'
            f'<span class="diff-ins">{inserted}</span>'
        )

    if not rendered:
        return "(identical)"
    return " ; ".join(rendered)
|
|
2330
|
+
|
|
2331
|
+
|
|
2332
|
+
def _quick_name(smiles: str) -> str:
|
|
2333
|
+
"""Get IUPAC name for a SMILES without full decomposition."""
|
|
2334
|
+
try:
|
|
2335
|
+
from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
|
|
2336
|
+
cs = ChemScriptBridge.get_instance()
|
|
2337
|
+
return cs.get_name(smiles)
|
|
2338
|
+
except Exception:
|
|
2339
|
+
return ""
|
|
2340
|
+
|
|
2341
|
+
|
|
2342
|
+
# Showcase runner and CLI entry points live in chem-pipeline/aligned_namer.py
|