cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
scheme_aligner.py - Align reaction scheme structures using RDKit MCS.
|
|
4
|
+
|
|
5
|
+
Experimental tool. Uses Maximum Common Substructure (MCS) to find shared
|
|
6
|
+
scaffolds between the product and every other drawn structure (reactants,
|
|
7
|
+
reagents) in a CDXML reaction scheme, then aligns each structure's 2D
|
|
8
|
+
coordinates to match the product's orientation via RDKit's
|
|
9
|
+
GenerateDepictionMatching2DStructure.
|
|
10
|
+
|
|
11
|
+
The product is the reference — everything else aligns to it.
|
|
12
|
+
|
|
13
|
+
Inspired by:
|
|
14
|
+
https://greglandrum.github.io/rdkit-blog/posts/2021-08-07-rgd-and-highlighting.html
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python scheme_aligner.py reaction.cdxml
|
|
18
|
+
python scheme_aligner.py reaction.cdxml -o aligned.cdxml --svg
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import math
|
|
23
|
+
import sys
|
|
24
|
+
import xml.etree.ElementTree as ET
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from ...constants import ACS_BOND_LENGTH
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
from rdkit import Chem, RDLogger
|
|
31
|
+
from rdkit.Chem import AllChem, rdFMCS, rdDepictor
|
|
32
|
+
from rdkit.Chem.Draw import rdMolDraw2D
|
|
33
|
+
from rdkit.Geometry import Point3D
|
|
34
|
+
RDLogger.logger().setLevel(RDLogger.ERROR)
|
|
35
|
+
except ImportError:
|
|
36
|
+
sys.exit("Error: RDKit is required. Activate the LLMChem environment.")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# CDXML parsing
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
def parse_cdxml(path):
    """Load a CDXML document and pull out its fragments and reaction steps.

    Returns a 3-tuple ``(tree, fragments, steps)``:

    * ``tree`` - the parsed ``ElementTree``.
    * ``fragments`` - dict mapping the integer ``id`` of every ``<fragment>``
      that is a direct child of the first ``<page>`` to its XML element.
    * ``steps`` - one dict per ``<step>`` element found anywhere in the
      document, with the keys ``reactants``, ``products``, ``above`` and
      ``below``, each holding a list of integer object ids.

    Exits the process via ``sys.exit`` when the document has no ``<page>``.
    """
    tree = ET.parse(str(path))
    root = tree.getroot()

    page = root.find('.//page')
    if page is None:
        sys.exit("No <page> element in CDXML.")

    # Only fragments sitting directly under the page are indexed.
    fragments = {}
    for frag in page.findall('fragment'):
        fragments[int(frag.get('id'))] = frag

    # (result key, CDXML attribute holding the space-separated ids)
    step_attrs = (
        ('reactants', 'ReactionStepReactants'),
        ('products', 'ReactionStepProducts'),
        ('above', 'ReactionStepObjectsAboveArrow'),
        ('below', 'ReactionStepObjectsBelowArrow'),
    )
    steps = [
        {key: _ids(step.get(attr, '')) for key, attr in step_attrs}
        for step in root.findall('.//step')
    ]

    return tree, fragments, steps
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _ids(s):
|
|
66
|
+
return [int(x) for x in s.split() if x]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Fragment -> RDKit Mol
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
def fragment_to_mol(frag_elem):
    """Convert a CDXML <fragment> to an RDKit Mol (no conformer set).

    Every direct ``<n>`` child except external connection points becomes one
    atom; nodes with ``NodeType="Fragment"`` (abbreviations) become dummy
    atoms (atomic number 0).  A missing ``Element`` attribute means carbon.
    Bonds whose endpoints were both kept are added with their CDXML order.

    Returns ``(mol, atoms_list)`` where ``atoms_list`` has one dict per atom,
    in RDKit atom-index order, with the keys ``id``, ``idx``, ``x``, ``y``,
    ``elem``, ``num_h``, ``is_abbrev`` and ``xml`` (the source ``<n>``
    element, used later to write coordinates back).
    """
    atoms, id_map = [], {}

    for n in frag_elem.findall('n'):
        if n.get('NodeType') == 'ExternalConnectionPoint':
            continue
        nid = int(n.get('id'))

        px, py = [float(v) for v in n.get('p', '0 0').split()]
        num_h_attr = n.get('NumHydrogens')

        idx = len(atoms)
        id_map[nid] = idx
        atoms.append({
            'id': nid, 'idx': idx,
            'x': px, 'y': py,
            'elem': int(n.get('Element', '6')),
            'num_h': int(num_h_attr) if num_h_attr is not None else None,
            'is_abbrev': n.get('NodeType') == 'Fragment',
            'xml': n,
        })

    # CDXML bond orders are not always integers: "1.5" marks an
    # aromatic/delocalised bond and there are textual values such as
    # "dative".  Parse as float and fall back to a single bond for anything
    # unrecognised (the same fallback already used for unknown integers)
    # instead of crashing in int().
    bond_types = {
        1.0: Chem.BondType.SINGLE,
        2.0: Chem.BondType.DOUBLE,
        3.0: Chem.BondType.TRIPLE,
        1.5: Chem.BondType.AROMATIC,
    }
    bonds = []
    for b in frag_elem.findall('b'):
        bi, ei = int(b.get('B')), int(b.get('E'))
        if bi not in id_map or ei not in id_map:
            continue  # bond to a skipped node (external connection point)
        try:
            order = float(b.get('Order', '1'))
        except ValueError:
            order = 1.0
        bonds.append((id_map[bi], id_map[ei],
                      bond_types.get(order, Chem.BondType.SINGLE)))

    em = Chem.RWMol()
    for a in atoms:
        ra = Chem.Atom(0 if a['is_abbrev'] else a['elem'])
        if a['num_h'] is not None:
            # Honour the hydrogen count stored in the drawing.
            ra.SetNoImplicit(True)
            ra.SetNumExplicitHs(a['num_h'])
        em.AddAtom(ra)

    for begin, end, btype in bonds:
        em.AddBond(begin, end, btype)
        if btype == Chem.BondType.AROMATIC:
            # Flag bond and atoms as aromatic so sanitization can kekulize
            # and re-perceive the ring like any other aromatic input.
            em.GetBondBetweenAtoms(begin, end).SetIsAromatic(True)
            em.GetAtomWithIdx(begin).SetIsAromatic(True)
            em.GetAtomWithIdx(end).SetIsAromatic(True)

    mol = em.GetMol()
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Best effort: retry without the property (valence) check so odd
        # drawings still yield a usable molecule.
        try:
            Chem.SanitizeMol(mol,
                             Chem.SanitizeFlags.SANITIZE_ALL ^
                             Chem.SanitizeFlags.SANITIZE_PROPERTIES)
        except Exception:
            # Deliberately tolerated: the molecule is returned unsanitized.
            pass

    return mol, atoms
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
# Scale helpers
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
def avg_bond_length(atoms_data, mol):
    """Mean length of the bonds of *mol*, measured in CDXML coordinates.

    ``atoms_data`` is indexed by RDKit atom index and supplies each atom's
    ``x``/``y`` position.  When the molecule has no bonds the module's
    ``ACS_BOND_LENGTH`` constant is returned instead.
    """
    lengths = []
    for bond in mol.GetBonds():
        begin = atoms_data[bond.GetBeginAtomIdx()]
        end = atoms_data[bond.GetEndAtomIdx()]
        dx = begin['x'] - end['x']
        dy = begin['y'] - end['y']
        lengths.append(math.sqrt(dx * dx + dy * dy))

    if not lengths:
        return ACS_BOND_LENGTH
    return sum(lengths) / len(lengths)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
_rdkit_bl_cache = None


def rdkit_bond_length():
    """Bond length RDKit uses when it lays out ethane in 2D (cached).

    The value is measured once from a freshly generated depiction of ``CC``
    and stored in the module-level ``_rdkit_bl_cache`` for later calls.
    """
    global _rdkit_bl_cache
    if _rdkit_bl_cache is not None:
        return _rdkit_bl_cache

    ethane = Chem.MolFromSmiles('CC')
    AllChem.Compute2DCoords(ethane)
    conformer = ethane.GetConformer()
    first = conformer.GetAtomPosition(0)
    second = conformer.GetAtomPosition(1)
    _rdkit_bl_cache = math.sqrt(
        (second.x - first.x) ** 2 + (second.y - first.y) ** 2)
    return _rdkit_bl_cache
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def set_cdxml_coords(mol, atoms_data, scale=1.0):
    """Replace the conformers of *mol* with one built from CDXML positions.

    Each atom's ``x``/``y`` is multiplied by *scale*; the y coordinate is
    negated on the way in.  z is always 0.
    """
    conformer = Chem.Conformer(mol.GetNumAtoms())
    for atom in atoms_data:
        position = Point3D(atom['x'] * scale, -atom['y'] * scale, 0.0)
        conformer.SetAtomPosition(atom['idx'], position)

    mol.RemoveAllConformers()
    mol.AddConformer(conformer, assignId=True)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# MCS finding
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
def find_mcs(ref_mol, target_mol, timeout=30):
    """Find the maximum common substructure of two molecules.

    Returns ``(mcs_result, atom_map)`` where ``atom_map`` is a list of
    ``(ref_idx, tgt_idx)`` pairs, or ``(None, None)`` when the MCS has fewer
    than 3 atoms, its SMARTS cannot be parsed, or either molecule fails to
    match it.
    """
    result = rdFMCS.FindMCS(
        [ref_mol, target_mol],
        timeout=timeout,
        atomCompare=rdFMCS.AtomCompare.CompareElements,
        bondCompare=rdFMCS.BondCompare.CompareOrder,
        ringMatchesRingOnly=True,
        completeRingsOnly=True,
    )

    # Too small a common core is not worth aligning on.
    core = None
    if result.numAtoms >= 3:
        core = Chem.MolFromSmarts(result.smartsString)
    if core is None:
        return None, None

    # Map both molecules onto the same core; pairing the two matches
    # position by position gives the ref -> target atom correspondence.
    ref_match = ref_mol.GetSubstructMatch(core)
    target_match = target_mol.GetSubstructMatch(core)
    if ref_match and target_match:
        return result, list(zip(ref_match, target_match))
    return None, None
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ---------------------------------------------------------------------------
|
|
207
|
+
# Alignment via GenerateDepictionMatching2DStructure
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
def align_fragment(ref_mol, tgt_mol, atom_map):
    """Redraw *tgt_mol* so its mapped atoms sit on those of *ref_mol*.

    Uses ``rdDepictor.GenerateDepictionMatching2DStructure`` with *atom_map*
    (``(ref_idx, tgt_idx)`` pairs).  ``ref_mol`` must already carry a
    conformer; the conformer of ``tgt_mol`` is modified in place.

    Returns the RMSD over the mapped atoms, in the coordinate units of the
    reference conformer.
    """
    rdDepictor.GenerateDepictionMatching2DStructure(
        tgt_mol, ref_mol, atom_map)

    ref_conf = ref_mol.GetConformer()
    tgt_conf = tgt_mol.GetConformer()

    # Sum of squared 2D distances between mapped atom pairs (expected ~0).
    squared = 0
    for ref_idx, tgt_idx in atom_map:
        ref_pos = ref_conf.GetAtomPosition(ref_idx)
        tgt_pos = tgt_conf.GetAtomPosition(tgt_idx)
        squared += ((ref_pos.x - tgt_pos.x) ** 2 +
                    (ref_pos.y - tgt_pos.y) ** 2)
    return math.sqrt(squared / len(atom_map))
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# ---------------------------------------------------------------------------
|
|
232
|
+
# Coordinate writeback
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
|
|
235
|
+
def _translate_subtree(elem, dx, dy):
|
|
236
|
+
"""Recursively shift all p and BoundingBox attributes by (dx, dy)."""
|
|
237
|
+
p = elem.get('p')
|
|
238
|
+
if p:
|
|
239
|
+
parts = p.split()
|
|
240
|
+
if len(parts) >= 2:
|
|
241
|
+
elem.set('p',
|
|
242
|
+
f"{float(parts[0])+dx:.2f} {float(parts[1])+dy:.2f}")
|
|
243
|
+
|
|
244
|
+
bb = elem.get('BoundingBox')
|
|
245
|
+
if bb:
|
|
246
|
+
parts = bb.split()
|
|
247
|
+
if len(parts) == 4:
|
|
248
|
+
elem.set('BoundingBox',
|
|
249
|
+
f"{float(parts[0])+dx:.2f} {float(parts[1])+dy:.2f} "
|
|
250
|
+
f"{float(parts[2])+dx:.2f} {float(parts[3])+dy:.2f}")
|
|
251
|
+
|
|
252
|
+
for child in elem:
|
|
253
|
+
_translate_subtree(child, dx, dy)
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def write_aligned_coords(frag_elem, mol, atoms_data, scale,
                         original_center):
    """Write the conformer of *mol* back into the fragment's XML nodes.

    Conformer positions are divided by *scale* and y-negated to get back to
    CDXML space, then shifted so their centroid equals *original_center*.
    Each atom's ``p`` attribute is rewritten, its child elements are moved
    by the same per-atom displacement, and the fragment ``BoundingBox`` is
    recomputed from the atom positions with a 15-unit margin.
    """
    conformer = mol.GetConformer()
    to_cdxml = 1.0 / scale

    # Aligned positions expressed in CDXML space (rescaled, y flipped).
    positions = []
    for atom in atoms_data:
        pos = conformer.GetAtomPosition(atom['idx'])
        positions.append((pos.x * to_cdxml, -pos.y * to_cdxml))

    # Offset that puts the aligned centroid back on the original one.
    count = len(positions)
    shift_x = original_center[0] - sum(x for x, _ in positions) / count
    shift_y = original_center[1] - sum(y for _, y in positions) / count

    for atom, (x, y) in zip(atoms_data, positions):
        new_x = x + shift_x
        new_y = y + shift_y
        node = atom['xml']
        node.set('p', f"{new_x:.2f} {new_y:.2f}")

        # Children of the atom node follow the atom.
        move_x = new_x - atom['x']
        move_y = new_y - atom['y']
        for child in node:
            _translate_subtree(child, move_x, move_y)

    # Recompute the fragment BoundingBox from the updated atom positions.
    xs, ys = [], []
    for node in frag_elem.findall('n'):
        if node.get('NodeType') == 'ExternalConnectionPoint':
            continue
        point = node.get('p')
        if not point:
            continue
        coords = point.split()
        xs.append(float(coords[0]))
        ys.append(float(coords[1]))

    if xs and ys:
        margin = 15.0
        frag_elem.set('BoundingBox',
                      f"{min(xs)-margin:.2f} {min(ys)-margin:.2f} "
                      f"{max(xs)+margin:.2f} {max(ys)+margin:.2f}")
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# ---------------------------------------------------------------------------
|
|
304
|
+
# Visualization
|
|
305
|
+
# ---------------------------------------------------------------------------
|
|
306
|
+
|
|
307
|
+
def save_svg(mol, highlight_atoms, label, out_dir, stem):
    """Render *mol* to a 600x450 SVG with *highlight_atoms* highlighted.

    The file is written to ``out_dir / "<stem>-<label>.svg"`` and its path
    is printed.
    """
    canvas = rdMolDraw2D.MolDraw2DSVG(600, 450)
    options = canvas.drawOptions()
    options.addAtomIndices = False
    canvas.DrawMolecule(mol, highlightAtoms=highlight_atoms)
    canvas.FinishDrawing()

    target = out_dir / f"{stem}-{label}.svg"
    target.write_text(canvas.GetDrawingText())
    print(f" SVG: {target}")
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ---------------------------------------------------------------------------
|
|
319
|
+
# Main
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
def _centroid(atoms_data):
|
|
323
|
+
n = len(atoms_data)
|
|
324
|
+
return (sum(a['x'] for a in atoms_data) / n,
|
|
325
|
+
sum(a['y'] for a in atoms_data) / n)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def main(argv=None) -> int:
    """Command-line entry point: align a reaction scheme to its product.

    For every reaction step, the first product that is a known fragment is
    the reference.  Each other fragment of the step (reactants, objects
    above/below the arrow) that shares an MCS with it is redrawn in the
    product's orientation, and the modified document is written to the
    output path.

    A fragment listed in several steps is processed once per step, in step
    order.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 when the input file is missing or contains no
        reaction steps.
    """
    ap = argparse.ArgumentParser(
        description='Align all structures in a reaction scheme to the '
                    'product orientation via RDKit MCS.',
    )
    ap.add_argument('input', help='Input CDXML file with reaction scheme')
    ap.add_argument('-o', '--output',
                    help='Output CDXML (default: <input>-aligned.cdxml)')
    ap.add_argument('--svg', action='store_true',
                    help='Save SVGs showing MCS-highlighted structures')
    ap.add_argument('--timeout', type=int, default=30,
                    help='MCS timeout in seconds (default: 30)')
    args = ap.parse_args(argv)

    inp = Path(args.input)
    if not inp.exists():
        print(f"File not found: {inp}", file=sys.stderr)
        return 1

    out = Path(args.output) if args.output else \
        inp.parent / (inp.stem + '-aligned.cdxml')

    tree, fragments, steps = parse_cdxml(inp)
    if not steps:
        print("No reaction scheme found in CDXML.", file=sys.stderr)
        return 1

    print(f"Input: {inp}")
    print(f"Fragments: {list(fragments.keys())}")
    print(f"Reaction steps: {len(steps)}")

    # With several steps, SVG names carry the step number so one step does
    # not overwrite another step's files; single-step names are unchanged.
    multi_step = len(steps) > 1

    for si, step in enumerate(steps):
        if not step['products']:
            print(f"\nStep {si+1}: no products, skipping.")
            continue

        # --- Product is the reference ---
        # Product ids may point at objects that are not top-level fragments;
        # indexing `fragments` with such an id would raise KeyError.
        prod_id = next(
            (pid for pid in step['products'] if pid in fragments), None)
        if prod_id is None:
            print(f"\nStep {si+1}: no product fragment found, skipping.")
            continue
        prod_mol, prod_atoms = fragment_to_mol(fragments[prod_id])

        # Compute scale from product's bond length
        cdxml_bl = avg_bond_length(prod_atoms, prod_mol)
        if cdxml_bl <= 0:
            # All product atoms coincide: no usable scale (would divide by 0).
            print(f"\nStep {si+1}: product has zero bond length, skipping.")
            continue
        rdk_bl = rdkit_bond_length()
        scale = rdk_bl / cdxml_bl

        # Set product conformer at RDKit scale (the reference for all
        # alignments)
        set_cdxml_coords(prod_mol, prod_atoms, scale)

        print(f"\nStep {si+1}:")
        print(f" Product = reference (fragment {prod_id}): "
              f"{prod_mol.GetNumAtoms()} atoms, "
              f"{prod_mol.GetNumBonds()} bonds")
        print(f" Bond length: CDXML {cdxml_bl:.1f} pts -> "
              f"RDKit {rdk_bl:.2f}")

        svg_prefix = f'step{si+1}-' if multi_step else ''
        if args.svg:
            save_svg(prod_mol, list(range(prod_mol.GetNumAtoms())),
                     f'{svg_prefix}product-ref', out.parent, out.stem)

        # --- Collect all other drawn structures in this step ---
        other_ids = []
        for fid in (step['reactants'] + step['above'] + step['below']):
            if fid in fragments and fid != prod_id and fid not in other_ids:
                other_ids.append(fid)

        for fid in other_ids:
            frag_mol, frag_atoms = fragment_to_mol(fragments[fid])
            if not frag_atoms:
                # Nothing to align, and the centroid would divide by zero.
                print(f"\n Fragment {fid}: no atoms, skipping.")
                continue
            frag_center = _centroid(frag_atoms)

            print(f"\n Fragment {fid}: "
                  f"{frag_mol.GetNumAtoms()} atoms, "
                  f"{frag_mol.GetNumBonds()} bonds")

            # Find MCS with product
            mcs, amap = find_mcs(prod_mol, frag_mol, args.timeout)
            if mcs is None:
                print(" MCS < 3 atoms, skipping.")
                continue

            print(f" MCS: {mcs.numAtoms} atoms, {mcs.numBonds} bonds")

            # Align this fragment to the product
            rmsd = align_fragment(prod_mol, frag_mol, amap)
            print(f" MCS RMSD: {rmsd:.4f}")

            # Write aligned coords back to CDXML
            write_aligned_coords(
                fragments[fid], frag_mol, frag_atoms, scale, frag_center)
            print(" Coordinates updated.")

            if args.svg:
                hl = [ti for ri, ti in amap]
                save_svg(frag_mol, hl, f'{svg_prefix}frag{fid}',
                         out.parent, out.stem)

    tree.write(str(out), xml_declaration=True, encoding='UTF-8')
    print(f"\nOutput: {out}")
    return 0
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|