cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,605 @@
|
|
|
1
|
+
"""RDKit-based CDXML fragment utilities.
|
|
2
|
+
|
|
3
|
+
Complements cdxml_utils.py (which provides pure XML geometry — bounding
|
|
4
|
+
boxes, centroids, text bbox, IO) with RDKit-powered chemical operations:
|
|
5
|
+
|
|
6
|
+
- frag_to_mol() — CDXML <fragment> → RDKit Mol (with metadata)
|
|
7
|
+
- frag_to_smiles() — CDXML <fragment> → canonical SMILES
|
|
8
|
+
- frag_to_mw() — CDXML <fragment> → molecular weight
|
|
9
|
+
- frag_to_molblock() — CDXML <fragment> → MOL block (CDXML coords)
|
|
10
|
+
- cleanup_fragment_rdkit() — 2D cleanup with Kabsch orientation preservation
|
|
11
|
+
- set_cdxml_conformer() — Set RDKit conformer from CDXML coordinates
|
|
12
|
+
- rdkit_default_bond_length() — RDKit's default 2D depiction bond length
|
|
13
|
+
- avg_bond_length_from_atoms() — Average bond length from CDXML atom coords
|
|
14
|
+
|
|
15
|
+
Uses shared modules (constants.py for ACS_BOND_LENGTH).
|
|
16
|
+
|
|
17
|
+
All RDKit imports are lazy so this module can be imported even if RDKit
|
|
18
|
+
is not installed (functions will raise ImportError at call time).
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import math
|
|
22
|
+
import xml.etree.ElementTree as ET
|
|
23
|
+
from typing import Dict, List, Optional, Tuple
|
|
24
|
+
|
|
25
|
+
from .constants import ACS_BOND_LENGTH
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Core: CDXML <fragment> → RDKit Mol
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
def frag_to_mol(frag_elem: ET.Element):
    """Convert a CDXML <fragment> to an RDKit Mol with atom metadata.

    Returns ``(mol, atoms_data)`` where *atoms_data* is a list of dicts
    with keys: id, idx, x, y, elem, num_h, is_abbrev, xml.  ``idx`` is the
    atom's index in the returned mol and ``xml`` is the source ``<n>``
    element.

    Nodes whose ``NodeType`` is ``Fragment``, ``GenericNickname``,
    ``Nickname`` or ``Unspecified`` become dummy atoms (element 0) so they
    participate in connectivity but not MCS element matching.
    ``ExternalConnectionPoint`` nodes are skipped, and bonds touching a
    skipped node are dropped.

    Returns ``(None, None)`` if conversion fails, i.e. when a node or bond
    attribute cannot be parsed (missing / non-integer ``id``, ``B``, ``E``,
    ``Element``, ``NumHydrogens`` or ``Order``; a ``p`` that is not exactly
    two numbers) or when RDKit rejects an atom or bond while the graph is
    being built.  A sanitization failure is *not* treated as a conversion
    failure: the mol is returned in best-effort state.

    Raises ImportError if RDKit is not installed (for parseable input).
    """
    # NodeTypes that are NOT real atoms — they become dummy atoms (element 0)
    # or get skipped entirely.
    skip_nodetypes = frozenset({"ExternalConnectionPoint"})
    dummy_nodetypes = frozenset({
        "Fragment",          # Real abbreviation groups (Boc, OTs, Me, etc.)
        "GenericNickname",   # Generic variable groups (R, X, Ar, etc.)
        "Nickname",          # Alternative label form (may or may not be real)
        "Unspecified",       # Uninterpretable text labels
    })

    atoms: List[dict] = []
    id_map: Dict[int, int] = {}
    bonds: List[Tuple[int, int, int]] = []

    # Pure-XML parsing happens first so malformed input is reported through
    # the documented (None, None) contract instead of an uncaught exception.
    try:
        for n in frag_elem.findall("n"):
            nid = int(n.get("id"))
            node_type = n.get("NodeType")
            if node_type in skip_nodetypes:
                continue

            px, py = [float(v) for v in n.get("p", "0 0").split()]
            elem = int(n.get("Element", "6"))  # CDXML omits Element for carbon
            num_h_attr = n.get("NumHydrogens")
            num_h = int(num_h_attr) if num_h_attr is not None else None

            idx = len(atoms)
            id_map[nid] = idx
            atoms.append({
                "id": nid, "idx": idx,
                "x": px, "y": py,
                "elem": elem, "num_h": num_h,
                "is_abbrev": node_type in dummy_nodetypes,
                "xml": n,
            })

        for b in frag_elem.findall("b"):
            bi, ei = int(b.get("B")), int(b.get("E"))
            # Bonds to skipped nodes (external connection points) are dropped.
            if bi in id_map and ei in id_map:
                bonds.append(
                    (id_map[bi], id_map[ei], int(b.get("Order", "1"))))
    except (TypeError, ValueError):
        # int(None) -> TypeError; bad number / wrong arity -> ValueError.
        return None, None

    from rdkit import Chem

    bond_types = {1: Chem.BondType.SINGLE, 2: Chem.BondType.DOUBLE,
                  3: Chem.BondType.TRIPLE}

    try:
        em = Chem.RWMol()
        for a in atoms:
            ra = Chem.Atom(0 if a["is_abbrev"] else a["elem"])
            if a["num_h"] is not None:
                # An explicit hydrogen count in the file overrides RDKit's
                # implicit-H perception for this atom.
                ra.SetNoImplicit(True)
                ra.SetNumExplicitHs(a["num_h"])
            em.AddAtom(ra)

        for bi, ei, order in bonds:
            # Integer orders above 3 fall back to a single bond.
            em.AddBond(bi, ei, bond_types.get(order, Chem.BondType.SINGLE))

        mol = em.GetMol()
    except (RuntimeError, ValueError, TypeError, OverflowError):
        # RDKit refused an atom or bond (e.g. invalid element number,
        # duplicate or self bond).
        return None, None

    # Deliberate best effort: try a full sanitization, then one without the
    # property (valence) check; if both fail the mol is returned as-is so
    # callers can still use its connectivity.
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        try:
            Chem.SanitizeMol(
                mol,
                Chem.SanitizeFlags.SANITIZE_ALL
                ^ Chem.SanitizeFlags.SANITIZE_PROPERTIES,
            )
        except Exception:
            pass

    return mol, atoms
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
# Convenience wrappers
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
|
|
120
|
+
def frag_to_smiles(frag_elem: ET.Element) -> Optional[str]:
    """Return the canonical SMILES for a CDXML <fragment>.

    The fragment is converted with :func:`frag_to_mol` and written out with
    ``Chem.MolToSmiles``.  ``None`` is returned when the conversion yields
    no mol, when SMILES generation raises, or when the SMILES is empty.
    """
    from rdkit import Chem

    converted = frag_to_mol(frag_elem)
    if converted is None or converted[0] is None:
        return None
    mol, _ = converted

    try:
        smiles = Chem.MolToSmiles(mol)
    except Exception:
        return None
    # An empty string is reported as "no SMILES".
    return smiles or None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def frag_to_smiles_resolved(frag_elem: ET.Element) -> Optional[str]:
    """Convert a CDXML <fragment> to SMILES, resolving abbreviation groups.

    Unlike :func:`frag_to_smiles`, which turns abbreviation groups
    (``NodeType="Fragment"``) into ``[*]`` dummy atoms, this function
    attempts to replace each such dummy with the fragment built from the
    SMILES that the superatom table returns for the node's label.  The
    first atom of that SMILES is bonded to the atom the dummy was attached
    to.  Other dummy node types (GenericNickname, Nickname, Unspecified)
    are left as ``[*]``.

    Falls back to :func:`frag_to_smiles` if resolution fails: the superatom
    table cannot be imported, a label is missing or unknown, its SMILES
    does not parse, an abbreviation has anything other than exactly one
    bond, an abbreviation is bonded to another abbreviation that would also
    be replaced, or rebuilding / sanitizing the molecule raises.

    Returns None if the fragment cannot be converted at all.
    """
    from rdkit import Chem

    result = frag_to_mol(frag_elem)
    if result is None or result[0] is None:
        return None
    mol, atoms_data = result

    # Only real abbreviations (NodeType="Fragment") are resolved; generic
    # groups (R, X, Ar — GenericNickname, Nickname, Unspecified) stay as [*].
    abbrev_atoms = [a for a in atoms_data
                    if a["is_abbrev"]
                    and a["xml"].get("NodeType") == "Fragment"]
    if not abbrev_atoms:
        # No abbreviations — standard path
        try:
            smi = Chem.MolToSmiles(mol)
        except Exception:
            return None
        return smi if smi else None

    try:
        from .resolve.superatom_table import get_abbrev_label, lookup_smiles
    except ImportError:
        return frag_to_smiles(frag_elem)

    abbrev_indices = {a["idx"] for a in abbrev_atoms}
    em = Chem.RWMol(mol)

    # Collect replacements in reverse index order so that removing the
    # dummies later (highest index first) never shifts an index still needed.
    replacements = []
    for a in sorted(abbrev_atoms, key=lambda d: d["idx"], reverse=True):
        idx = a["idx"]

        label = get_abbrev_label(a["xml"])
        if not label:
            return frag_to_smiles(frag_elem)  # Can't resolve — fallback

        abbrev_smi = lookup_smiles(label)
        if not abbrev_smi:
            return frag_to_smiles(frag_elem)  # Unknown abbreviation — fallback

        abbrev_mol = Chem.MolFromSmiles(abbrev_smi)
        if abbrev_mol is None:
            return frag_to_smiles(frag_elem)

        # Find the bond connecting dummy to core
        dummy_bonds = list(em.GetAtomWithIdx(idx).GetBonds())
        if len(dummy_bonds) != 1:
            return frag_to_smiles(frag_elem)  # Multi-attachment — too complex

        bond = dummy_bonds[0]
        core_idx = bond.GetOtherAtomIdx(idx)
        if core_idx in abbrev_indices:
            # The "core" atom is itself a dummy that will be deleted, so the
            # expanded groups would end up attached to nothing and the
            # result would be a silently disconnected SMILES.
            return frag_to_smiles(frag_elem)

        replacements.append((idx, core_idx, bond.GetBondType(), abbrev_mol))

    try:
        # Apply replacements (still in reverse order)
        for idx, core_idx, bond_type, abbrev_mol in replacements:
            # Remove bond between dummy and core
            em.RemoveBond(idx, core_idx)

            # Append the abbreviation's atoms; they start at index `offset`.
            offset = em.GetNumAtoms()
            for src in abbrev_mol.GetAtoms():
                new_atom = Chem.Atom(src.GetAtomicNum())
                new_atom.SetFormalCharge(src.GetFormalCharge())
                if src.GetNoImplicit():
                    new_atom.SetNoImplicit(True)
                    new_atom.SetNumExplicitHs(src.GetNumExplicitHs())
                em.AddAtom(new_atom)

            for b in abbrev_mol.GetBonds():
                em.AddBond(offset + b.GetBeginAtomIdx(),
                           offset + b.GetEndAtomIdx(),
                           b.GetBondType())

            # Connect first atom of abbreviation to core
            em.AddBond(core_idx, offset, bond_type)

        # Remove dummy atoms (highest index first — sorted in reverse above)
        for idx, _, _, _ in replacements:
            em.RemoveAtom(idx)

        resolved = em.GetMol()
        Chem.SanitizeMol(resolved)
        smi = Chem.MolToSmiles(resolved)
    except Exception:
        return frag_to_smiles(frag_elem)
    return smi if smi else frag_to_smiles(frag_elem)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def frag_to_smiles_chemscript(frag_elem: ET.Element) -> Optional[str]:
    """Convert a CDXML ``<fragment>`` to SMILES using ChemScript.

    ChemScript (PerkinElmer ChemDraw .NET library) natively understands
    ALL ChemDraw abbreviation groups (Nicknames, Fragments, generic groups)
    and expands them to full structures.  This gives far better results than
    :func:`frag_to_smiles_resolved` for fragments with complex or rare
    abbreviations (NHTrs, PO(OH)₂, Bn, etc.).

    A deep copy of the fragment is wrapped in a minimal
    ``<CDXML><page>…</page></CDXML>`` document, written to a temporary
    ``.cdxml`` file and handed to ``ChemScriptBridge.get_info``.  The
    temporary file is always removed afterwards; *frag_elem* itself is not
    modified.

    Falls back to ``None`` if ChemScript is unavailable or fails.
    Requires ChemDraw 16+ installed on Windows.
    """
    import copy
    import os
    import tempfile

    try:
        from .chemdraw.chemscript_bridge import ChemScriptBridge
    except ImportError:
        return None

    # Wrap the fragment in a minimal CDXML document
    wrapper = ET.Element("CDXML")
    page_el = ET.SubElement(wrapper, "page")
    page_el.append(copy.deepcopy(frag_elem))

    tmp_path = None
    try:
        # delete=False: the file must outlive the handle so the bridge can
        # open it by name.  The name is recorded before anything is written
        # so the cleanup below also runs when a write fails.
        with tempfile.NamedTemporaryFile(
            suffix=".cdxml", delete=False, mode="w", encoding="utf-8"
        ) as tmp:
            tmp_path = tmp.name
            tmp.write('<?xml version="1.0" encoding="UTF-8" ?>')
            tmp.write(ET.tostring(wrapper, encoding="unicode"))

        cs = ChemScriptBridge()
        info = cs.get_info(tmp_path)
        if not info or not info.get("ok"):
            return None

        smi = info.get("smiles")
        if smi:
            return smi
        # Reaction-type response — shouldn't happen for a single fragment
        # but handle gracefully
        reactants = info.get("reactants", [])
        if reactants and reactants[0].get("smiles"):
            return reactants[0]["smiles"]
        return None
    except Exception:
        # Best-effort helper: any ChemScript / I/O failure means "no result".
        return None
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Already gone or still locked — nothing more we can do.
                pass
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def frag_to_mw(frag_elem: ET.Element) -> Optional[float]:
    """Compute molecular weight from a CDXML <fragment>.

    Without dummy (abbreviation-type) nodes the result is simply
    ``Descriptors.MolWt`` of the converted mol.  Otherwise each dummy's
    label is looked up in the superatom table (``superatom_table.py``) and
    its weight, minus 1.008 per bond the dummy makes, is added to the
    weight of the mol.

    Returns None if the fragment cannot be converted, the superatom table
    cannot be imported, a dummy's label cannot be read or is unknown, or
    the weight calculation raises.
    """
    from rdkit.Chem import Descriptors

    converted = frag_to_mol(frag_elem)
    if converted is None or converted[0] is None:
        return None
    mol, atoms_data = converted

    dummies = [entry for entry in atoms_data if entry["is_abbrev"]]
    if not dummies:
        # No abbreviations — straightforward MW
        try:
            return Descriptors.MolWt(mol)
        except Exception:
            return None

    # Superatom-assisted MW: the mol (with dummies) supplies the core weight
    # and every abbreviation contributes its table weight less one hydrogen
    # per attachment bond.
    try:
        from .resolve.superatom_table import get_abbrev_label, lookup_mw
    except ImportError:
        return None

    hydrogen_mass = 1.008
    groups_mw = 0.0
    for entry in dummies:
        label = get_abbrev_label(entry["xml"])
        if label is None:
            return None  # can't read label
        group_mw = lookup_mw(label)
        if group_mw is None:
            return None  # unknown abbreviation

        # Count the bonds that have this dummy at either end.
        position = entry["idx"]
        attachments = 0
        for bond in mol.GetBonds():
            if (bond.GetBeginAtomIdx() == position
                    or bond.GetEndAtomIdx() == position):
                attachments += 1
        groups_mw += group_mw - (attachments * hydrogen_mass)

    try:
        core_mw = Descriptors.MolWt(mol)
    except Exception:
        return None
    return core_mw + groups_mw
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def frag_to_molblock(frag_elem: ET.Element) -> Optional[str]:
    """Return a MOL block for a CDXML <fragment>, keeping the drawn layout.

    The converted mol receives a conformer built from the CDXML node
    coordinates (via :func:`set_cdxml_conformer` with ``scale=1.0``) before
    it is serialized with ``Chem.MolToMolBlock``.

    Returns None if the fragment cannot be converted or serialization
    raises.
    """
    from rdkit import Chem

    converted = frag_to_mol(frag_elem)
    if converted is None or converted[0] is None:
        return None
    mol, atoms_data = converted

    # Attach the drawing coordinates so the MOL block reproduces them.
    set_cdxml_conformer(mol, atoms_data, scale=1.0)

    try:
        molblock = Chem.MolToMolBlock(mol)
    except Exception:
        return None
    return molblock
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# ---------------------------------------------------------------------------
|
|
378
|
+
# 2D Cleanup with orientation preservation
|
|
379
|
+
# ---------------------------------------------------------------------------
|
|
380
|
+
|
|
381
|
+
def _align_to_reference_2d(moving, reference):
    """Rotate *moving* about its centroid to best match *reference*.

    Both arguments are equally long lists of ``(x, y)`` tuples.  The
    rotation angle is ``atan2(sum(cross), sum(dot))`` over the centred
    point pairs (rotation only, no reflection).  The rotated points are
    returned translated onto the centroid of *reference*.
    """
    ref_cx = sum(x for x, y in reference) / len(reference)
    ref_cy = sum(y for x, y in reference) / len(reference)
    mov_cx = sum(x for x, y in moving) / len(moving)
    mov_cy = sum(y for x, y in moving) / len(moving)

    ref_centered = [(x - ref_cx, y - ref_cy) for x, y in reference]
    mov_centered = [(x - mov_cx, y - mov_cy) for x, y in moving]

    dot_sum = 0.0
    cross_sum = 0.0
    for (ox, oy), (nx, ny) in zip(ref_centered, mov_centered):
        dot_sum += nx * ox + ny * oy
        cross_sum += nx * oy - ny * ox
    angle = math.atan2(cross_sum, dot_sum)
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)

    return [(x * cos_a - y * sin_a + ref_cx, x * sin_a + y * cos_a + ref_cy)
            for x, y in mov_centered]


def _restore_component_offsets(components, orig_coords, final_coords):
    """Re-seat the smaller disconnected components, editing *final_coords*.

    *components* is a sequence of atom-index tuples.  Every component other
    than the largest is translated so that its centroid sits at the same
    offset from the largest component's centroid as in *orig_coords*.
    """
    def centroid(points, indices):
        return (sum(points[i][0] for i in indices) / len(indices),
                sum(points[i][1] for i in indices) / len(indices))

    largest = max(components, key=len)
    ocx_main, ocy_main = centroid(orig_coords, largest)
    ncx_main, ncy_main = centroid(final_coords, largest)

    for comp in components:
        if comp is largest:
            continue
        ocx_s, ocy_s = centroid(orig_coords, comp)
        cur_x, cur_y = centroid(final_coords, comp)
        # target = new main centroid + original offset from the main part
        dx = ncx_main + (ocx_s - ocx_main) - cur_x
        dy = ncy_main + (ocy_s - ocy_main) - cur_y
        for idx in comp:
            fx, fy = final_coords[idx]
            final_coords[idx] = (fx + dx, fy + dy)


def _translate_abbrev_children(node, dx, dy):
    """Shift an abbreviation node's inner atoms and text labels by (dx, dy).

    Moves the ``p`` of every ``<n>`` directly inside the node's
    ``<fragment>`` child (if any) and the ``p`` / four-value ``BoundingBox``
    of every ``<t>`` child of the node.
    """
    inner_frag = node.find("fragment")
    if inner_frag is not None:
        for inner_n in inner_frag.findall("n"):
            ip = inner_n.get("p")
            if ip:
                ix, iy = [float(v) for v in ip.split()]
                inner_n.set("p", f"{ix + dx:.4f} {iy + dy:.4f}")

    # NOTE(review): the label loop is applied to every abbreviation-type
    # node, with or without an inner fragment, matching the documented
    # behaviour ("inner fragment atoms and text label are translated").
    for t_elem in node.findall("t"):
        tp = t_elem.get("p")
        if tp:
            tx, ty = [float(v) for v in tp.split()]
            t_elem.set("p", f"{tx + dx:.4f} {ty + dy:.4f}")
        bb = t_elem.get("BoundingBox")
        if bb:
            bvals = [float(v) for v in bb.split()]
            if len(bvals) == 4:
                t_elem.set("BoundingBox",
                           f"{bvals[0]+dx:.4f} {bvals[1]+dy:.4f} "
                           f"{bvals[2]+dx:.4f} {bvals[3]+dy:.4f}")


def cleanup_fragment_rdkit(frag_elem: ET.Element,
                           verbose: bool = False) -> bool:
    """Clean up a single fragment's 2D geometry using RDKit.

    Uses ``AllChem.Compute2DCoords()`` for cleanup, rescales the result so
    its average bond length equals ``ACS_BOND_LENGTH``, then applies a
    Kabsch-style rotation to restore the original orientation and centroid.
    When the mol has several disconnected components (e.g. a salt), the
    smaller ones are moved back to their original offset from the largest.

    Abbreviation-type nodes are included as dummy atoms (element 0) in the
    RDKit mol — they participate in layout but not element matching.  When
    such a node moves, its inner fragment atoms and text label are
    translated by the same delta.

    Modifies *frag_elem* in place.  Returns True if cleanup was applied and
    False (leaving *frag_elem* untouched) when the fragment cannot be
    converted, has fewer than two atoms, RDKit cannot generate coordinates,
    or the generated layout has a (near-)zero average bond length.  With
    *verbose* set, a one-line summary is printed to stderr.
    """
    import copy as _copy
    from rdkit import Chem
    from rdkit.Chem import AllChem

    result = frag_to_mol(frag_elem)
    if result is None or result[0] is None:
        return False
    mol, atoms_data = result

    if mol.GetNumAtoms() < 2:
        return False

    # Save original coordinates
    orig_coords = [(a["x"], a["y"]) for a in atoms_data]

    # Compute new 2D coords on a copy so `mol` keeps its state.
    mol_copy = _copy.deepcopy(mol)
    try:
        AllChem.Compute2DCoords(mol_copy)
    except (RuntimeError, ValueError):
        # frag_to_mol may hand back a mol that failed sanitization; if RDKit
        # cannot lay it out, report "not applied" instead of crashing.
        return False
    conf = mol_copy.GetConformer()

    # Scale new coords to ACS standard bond length
    avg_bl_new = _avg_bond_length_from_conf(mol_copy)
    if avg_bl_new < 1e-6:
        return False
    scale = ACS_BOND_LENGTH / avg_bl_new

    # RDKit space is y-up, CDXML is y-down: flip y while scaling.
    new_coords_cdxml = []
    for i in range(mol_copy.GetNumAtoms()):
        pos = conf.GetAtomPosition(i)
        new_coords_cdxml.append((pos.x * scale, -pos.y * scale))

    # Kabsch: best rotation from new -> original, re-centred on the original
    final_coords = _align_to_reference_2d(new_coords_cdxml, orig_coords)

    # Compute2DCoords places disconnected components (salts, counterions)
    # arbitrarily, so put small components back relative to the main one.
    components = Chem.GetMolFrags(mol)
    if len(components) > 1:
        _restore_component_offsets(components, orig_coords, final_coords)

    # Write back to CDXML — also translate inner abbreviation fragments
    has_abbrev = False
    for atom_d, (fx, fy) in zip(atoms_data, final_coords):
        node = atom_d["xml"]
        node.set("p", f"{fx:.4f} {fy:.4f}")
        if atom_d["is_abbrev"]:
            has_abbrev = True
            _translate_abbrev_children(
                node, fx - atom_d["x"], fy - atom_d["y"])

    if verbose:
        import sys
        frag_id = frag_elem.get("id", "?")
        abbrev_note = " (with abbreviations)" if has_abbrev else ""
        print(f"  [RDKit cleanup] fragment {frag_id}: "
              f"{mol.GetNumAtoms()} atoms{abbrev_note}",
              file=sys.stderr)

    return True
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
# ---------------------------------------------------------------------------
|
|
535
|
+
# Scale / coordinate helpers
|
|
536
|
+
# ---------------------------------------------------------------------------
|
|
537
|
+
|
|
538
|
+
# Memoized result of rdkit_default_bond_length(); None until first computed.
_rdk_bl_cache: Optional[float] = None


def rdkit_default_bond_length() -> float:
    """Return RDKit's default 2D depiction bond length.

    The value is measured once, as the distance between the two atoms of
    ``CC`` after ``AllChem.Compute2DCoords``, and then served from the
    module-level cache on every later call.
    """
    global _rdk_bl_cache
    if _rdk_bl_cache is not None:
        return _rdk_bl_cache

    from rdkit import Chem
    from rdkit.Chem import AllChem

    ethane = Chem.MolFromSmiles("CC")
    AllChem.Compute2DCoords(ethane)
    conformer = ethane.GetConformer()
    first = conformer.GetAtomPosition(0)
    second = conformer.GetAtomPosition(1)
    _rdk_bl_cache = math.sqrt(
        (second.x - first.x) ** 2 + (second.y - first.y) ** 2)
    return _rdk_bl_cache
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def avg_bond_length_from_atoms(atoms_data: List[dict], mol) -> float:
    """Return the mean bond length measured on CDXML atom coordinates.

    Public wrapper that forwards both arguments unchanged to
    ``_avg_bond_length``.
    """
    mean_length = _avg_bond_length(atoms_data, mol)
    return mean_length
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def set_cdxml_conformer(mol, atoms_data: List[dict], scale: float = 1.0):
    """Replace *mol*'s conformers with one built from CDXML coordinates.

    Every entry of *atoms_data* places atom ``idx`` at
    ``(x * scale, -y * scale, 0)``.  The y coordinate is negated because
    the CDXML y-axis points down while RDKit's points up; *scale* converts
    CDXML points (~14.40 pt bond length) to RDKit units (~1.5 unit bond
    length).  Any conformers already on *mol* are discarded.
    """
    from rdkit import Chem
    from rdkit.Geometry import Point3D

    conformer = Chem.Conformer(mol.GetNumAtoms())
    for atom in atoms_data:
        position = Point3D(atom["x"] * scale, -atom["y"] * scale, 0.0)
        conformer.SetAtomPosition(atom["idx"], position)

    mol.RemoveAllConformers()
    mol.AddConformer(conformer, assignId=True)
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
# ---------------------------------------------------------------------------
|
|
580
|
+
# Internal helpers
|
|
581
|
+
# ---------------------------------------------------------------------------
|
|
582
|
+
|
|
583
|
+
def _avg_bond_length(atoms_data: List[dict], mol) -> float:
    """Mean length of *mol*'s bonds, measured on CDXML coordinates.

    Bond end points are looked up in *atoms_data* by RDKit atom index and
    their ``x`` / ``y`` entries are used.  Returns ``ACS_BOND_LENGTH`` when
    the mol has no bonds.
    """
    length_sum = 0.0
    n_bonds = 0
    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        begin, end = atoms_data[begin_idx], atoms_data[end_idx]
        dx = begin["x"] - end["x"]
        dy = begin["y"] - end["y"]
        length_sum += math.sqrt(dx * dx + dy * dy)
        n_bonds += 1

    if not n_bonds:
        return ACS_BOND_LENGTH
    return length_sum / n_bonds
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
def _avg_bond_length_from_conf(mol) -> float:
    """Mean length of *mol*'s bonds, measured in its conformer's x/y plane.

    Returns 1.5 when the mol has no bonds.
    """
    conformer = mol.GetConformer()
    length_sum = 0.0
    n_bonds = 0
    for bond in mol.GetBonds():
        start = conformer.GetAtomPosition(bond.GetBeginAtomIdx())
        stop = conformer.GetAtomPosition(bond.GetEndAtomIdx())
        length_sum += math.sqrt(
            (stop.x - start.x) ** 2 + (stop.y - start.y) ** 2)
        n_bonds += 1

    if not n_bonds:
        return 1.5
    return length_sum / n_bonds
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Render — declarative text-based reaction scheme renderer.
|
|
2
|
+
|
|
3
|
+
Build publication-ready CDXML reaction schemes from YAML or compact text.
|
|
4
|
+
The LLM specifies semantic content (structures, roles, conditions);
|
|
5
|
+
the deterministic renderer handles all spatial layout.
|
|
6
|
+
|
|
7
|
+
Supported layouts: linear, sequential, serpentine, divergent, stacked-rows.
|
|
8
|
+
Supported annotations: run arrows, dashed/failed arrows, compound labels,
|
|
9
|
+
letter conditions.
|
|
10
|
+
|
|
11
|
+
No ChemDraw COM needed — uses RDKit for 2D coordinate generation.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .schema import SchemeDescriptor, StepDescriptor, StructureRef, ArrowContent
|
|
15
|
+
from .renderer import render, render_to_file
|
|
16
|
+
from .parser import parse_yaml
|
|
17
|
+
from .compact_parser import parse_compact_file
|