cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,605 @@
1
+ """RDKit-based CDXML fragment utilities.
2
+
3
+ Complements cdxml_utils.py (which provides pure XML geometry — bounding
4
+ boxes, centroids, text bbox, IO) with RDKit-powered chemical operations:
5
+
6
+ - frag_to_mol() — CDXML <fragment> → RDKit Mol (with metadata)
7
+ - frag_to_smiles() — CDXML <fragment> → canonical SMILES
8
+ - frag_to_mw() — CDXML <fragment> → molecular weight
9
+ - frag_to_molblock() — CDXML <fragment> → MOL block (CDXML coords)
10
+ - cleanup_fragment_rdkit() — 2D cleanup with Kabsch orientation preservation
11
+ - set_cdxml_conformer() — Set RDKit conformer from CDXML coordinates
12
+ - rdkit_default_bond_length() — RDKit's default 2D depiction bond length
13
+ - avg_bond_length_from_atoms() — Average bond length from CDXML atom coords
14
+
15
+ Uses shared modules (constants.py for ACS_BOND_LENGTH).
16
+
17
+ All RDKit imports are lazy so this module can be imported even if RDKit
18
+ is not installed (functions will raise ImportError at call time).
19
+ """
20
+
21
+ import math
22
+ import xml.etree.ElementTree as ET
23
+ from typing import Dict, List, Optional, Tuple
24
+
25
+ from .constants import ACS_BOND_LENGTH
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Core: CDXML <fragment> → RDKit Mol
30
+ # ---------------------------------------------------------------------------
31
+
32
def frag_to_mol(frag_elem: ET.Element):
    """Convert a CDXML <fragment> to an RDKit Mol with atom metadata.

    Returns ``(mol, atoms_data)`` where *atoms_data* is a list of dicts
    with keys: id, idx, x, y, elem, num_h, is_abbrev, xml.

    Abbreviation and generic-label nodes (``NodeType`` of ``Fragment``,
    ``GenericNickname``, ``Nickname`` or ``Unspecified``) become dummy atoms
    (element 0) so they participate in connectivity but not MCS element
    matching.  ``ExternalConnectionPoint`` nodes are skipped, and bonds that
    reference a skipped/unknown node id are dropped.

    Sanitization is best-effort: if the full ``SanitizeMol`` fails, a second
    attempt is made with ``SANITIZE_PROPERTIES`` excluded; if that fails as
    well, the mol is returned as-is.

    Returns ``(None, None)`` if the fragment cannot be parsed, i.e. a node or
    bond has a missing or non-numeric ``id``/``p``/``Element``/
    ``NumHydrogens``/``B``/``E`` attribute or a non-integer ``Order``
    (for example ``"1.5"``).

    Raises ImportError if RDKit is not installed.
    """
    from rdkit import Chem

    # NodeTypes that are NOT real atoms — they become dummy atoms (element 0)
    # or get skipped entirely.
    skip_nodetypes = {"ExternalConnectionPoint"}
    dummy_nodetypes = {
        "Fragment",         # Real abbreviation groups (Boc, OTs, Me, etc.)
        "GenericNickname",  # Generic variable groups (R, X, Ar, etc.)
        "Nickname",         # Alternative label form (may or may not be real)
        "Unspecified",      # Uninterpretable text labels
    }

    atoms: List[dict] = []
    id_map: Dict[int, int] = {}
    bonds: List[Tuple[int, int, int]] = []

    # All attribute parsing happens here so that malformed input is reported
    # through the documented (None, None) return instead of an exception.
    try:
        for n in frag_elem.findall("n"):
            nid = int(n.get("id"))
            node_type = n.get("NodeType")
            if node_type in skip_nodetypes:
                continue

            px, py = [float(v) for v in n.get("p", "0 0").split()]
            elem = int(n.get("Element", "6"))  # CDXML omits Element for carbon
            num_h_attr = n.get("NumHydrogens")
            num_h = int(num_h_attr) if num_h_attr is not None else None

            idx = len(atoms)
            id_map[nid] = idx
            atoms.append({
                "id": nid, "idx": idx,
                "x": px, "y": py,
                "elem": elem, "num_h": num_h,
                "is_abbrev": node_type in dummy_nodetypes,
                "xml": n,
            })

        for b in frag_elem.findall("b"):
            bi, ei = int(b.get("B")), int(b.get("E"))
            if bi in id_map and ei in id_map:
                bonds.append(
                    (id_map[bi], id_map[ei], int(b.get("Order", "1"))))
    except (TypeError, ValueError):
        return None, None

    em = Chem.RWMol()
    for a in atoms:
        ra = Chem.Atom(0 if a["is_abbrev"] else a["elem"])
        if a["num_h"] is not None:
            # Honour the hydrogen count drawn in the CDXML rather than
            # letting RDKit infer implicit hydrogens.
            ra.SetNoImplicit(True)
            ra.SetNumExplicitHs(a["num_h"])
        em.AddAtom(ra)

    bond_types = {
        1: Chem.BondType.SINGLE,
        2: Chem.BondType.DOUBLE,
        3: Chem.BondType.TRIPLE,
    }
    for bi, ei, order in bonds:
        # Integer orders outside 1-3 are treated as single bonds.
        em.AddBond(bi, ei, bond_types.get(order, Chem.BondType.SINGLE))

    mol = em.GetMol()
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        try:
            Chem.SanitizeMol(
                mol,
                Chem.SanitizeFlags.SANITIZE_ALL
                ^ Chem.SanitizeFlags.SANITIZE_PROPERTIES,
            )
        except Exception:
            # Deliberate best-effort: callers still get the unsanitized mol.
            pass

    return mol, atoms
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # Convenience wrappers
118
+ # ---------------------------------------------------------------------------
119
+
120
def frag_to_smiles(frag_elem: ET.Element) -> Optional[str]:
    """Return the SMILES that RDKit writes for a CDXML <fragment>.

    The fragment is converted with :func:`frag_to_mol` and serialized with
    ``Chem.MolToSmiles``.  ``None`` is returned when the conversion yields no
    mol, when SMILES generation raises, or when the SMILES is empty.
    """
    from rdkit import Chem

    converted = frag_to_mol(frag_elem)
    if converted is None:
        return None
    mol = converted[0]
    if mol is None:
        return None

    try:
        smiles = Chem.MolToSmiles(mol)
    except Exception:
        return None
    return smiles or None
135
+
136
+
137
def frag_to_smiles_resolved(frag_elem: ET.Element) -> Optional[str]:
    """Convert a CDXML <fragment> to SMILES, resolving abbreviation groups.

    Unlike :func:`frag_to_smiles`, which turns abbreviation groups
    (``NodeType="Fragment"``) into ``[*]`` dummy atoms, this function
    attempts to replace each such dummy with the fragment SMILES returned by
    the superatom table (``lookup_smiles``).  Generic groups
    (``GenericNickname``, ``Nickname``, ``Unspecified``) are left as ``[*]``.

    Returns ``None`` if the fragment cannot be converted at all.  Falls back
    to :func:`frag_to_smiles` whenever resolution is not possible:

    - the superatom table cannot be imported,
    - a label cannot be read or is unknown, or its SMILES does not parse,
    - an abbreviation has anything other than exactly one bond,
    - an abbreviation is bonded directly to another abbreviation that is
      also being replaced,
    - the assembled molecule fails sanitization or yields an empty SMILES.
    """
    from rdkit import Chem

    result = frag_to_mol(frag_elem)
    if result is None or result[0] is None:
        return None
    mol, atoms_data = result

    # Only real abbreviations (NodeType="Fragment") are resolved; generic
    # groups (R, X, Ar — GenericNickname, Nickname, Unspecified) stay as [*].
    abbrev_atoms = [
        a for a in atoms_data
        if a["is_abbrev"] and a["xml"].get("NodeType") == "Fragment"
    ]
    if not abbrev_atoms:
        # No abbreviations — standard path
        try:
            smi = Chem.MolToSmiles(mol)
        except Exception:
            return None
        return smi if smi else None

    try:
        from .resolve.superatom_table import get_abbrev_label, lookup_smiles
    except ImportError:
        return frag_to_smiles(frag_elem)

    em = Chem.RWMol(mol)
    abbrev_indices = {a["idx"] for a in abbrev_atoms}

    # Collect replacements in reverse index order so that removing the dummy
    # atoms later (highest index first) never shifts a pending index.
    replacements = []
    for a in sorted(abbrev_atoms, key=lambda d: d["idx"], reverse=True):
        idx = a["idx"]

        label = get_abbrev_label(a["xml"])
        if not label:
            return frag_to_smiles(frag_elem)  # Can't resolve — fallback

        abbrev_smi = lookup_smiles(label)
        if not abbrev_smi:
            return frag_to_smiles(frag_elem)  # Unknown abbreviation

        abbrev_mol = Chem.MolFromSmiles(abbrev_smi)
        if abbrev_mol is None:
            return frag_to_smiles(frag_elem)

        # The atoms grafted below are fresh Chem.Atom objects that do not
        # carry aromatic flags, so give the bonds explicit single/double
        # orders first; aromaticity is re-perceived by the final SanitizeMol.
        try:
            Chem.Kekulize(abbrev_mol, clearAromaticFlags=True)
        except Exception:
            return frag_to_smiles(frag_elem)

        # Find the bond connecting dummy to core
        dummy_bonds = list(em.GetAtomWithIdx(idx).GetBonds())
        if len(dummy_bonds) != 1:
            return frag_to_smiles(frag_elem)  # Multi-attachment — too complex

        bond = dummy_bonds[0]
        core_idx = bond.GetOtherAtomIdx(idx)
        if core_idx in abbrev_indices:
            # The neighbour is itself a dummy that will be removed; grafting
            # onto it would leave the expanded group disconnected.
            return frag_to_smiles(frag_elem)

        replacements.append((idx, core_idx, bond.GetBondType(), abbrev_mol))

    # Apply replacements (still in reverse order).  New atoms are appended,
    # so existing indices (including the dummies) are unaffected.
    for idx, core_idx, bond_type, abbrev_mol in replacements:
        em.RemoveBond(idx, core_idx)

        offset = em.GetNumAtoms()
        for src in abbrev_mol.GetAtoms():
            new_atom = Chem.Atom(src.GetAtomicNum())
            new_atom.SetFormalCharge(src.GetFormalCharge())
            if src.GetNoImplicit():
                new_atom.SetNoImplicit(True)
                new_atom.SetNumExplicitHs(src.GetNumExplicitHs())
            em.AddAtom(new_atom)

        for b in abbrev_mol.GetBonds():
            em.AddBond(offset + b.GetBeginAtomIdx(),
                       offset + b.GetEndAtomIdx(),
                       b.GetBondType())

        # The first atom of the abbreviation SMILES is used as the
        # attachment point.
        em.AddBond(core_idx, offset, bond_type)

    # Remove dummy atoms (highest index first — they were sorted in reverse)
    for idx, _, _, _ in replacements:
        em.RemoveAtom(idx)

    try:
        resolved = em.GetMol()
        Chem.SanitizeMol(resolved)
        smi = Chem.MolToSmiles(resolved)
    except Exception:
        return frag_to_smiles(frag_elem)
    return smi if smi else frag_to_smiles(frag_elem)
238
+
239
+
240
def frag_to_smiles_chemscript(frag_elem: ET.Element) -> Optional[str]:
    """Convert a CDXML ``<fragment>`` to SMILES using ChemScript.

    ChemScript (PerkinElmer ChemDraw .NET library) natively understands
    ALL ChemDraw abbreviation groups (Nicknames, Fragments, generic groups)
    and expands them to full structures.  This gives far better results than
    :func:`frag_to_smiles_resolved` for fragments with complex or rare
    abbreviations (NHTrs, PO(OH)₂, Bn, etc.).

    A deep copy of the fragment is wrapped in a minimal
    ``<CDXML><page>…</page></CDXML>`` document, written to a temporary
    ``.cdxml`` file and passed to ``ChemScriptBridge.get_info``.  The
    temporary file is always removed afterwards, including when writing it
    fails part-way.

    Returns the ``"smiles"`` entry of the response, or the first reactant's
    SMILES for a reaction-type response.  Returns ``None`` if the bridge
    cannot be imported, the response is not ``ok``, or any step raises.
    Requires ChemDraw 16+ installed on Windows.
    """
    import copy
    import os
    import tempfile

    try:
        from .chemdraw.chemscript_bridge import ChemScriptBridge
    except ImportError:
        return None

    # Wrap the fragment in a minimal CDXML document
    wrapper = ET.Element("CDXML")
    page_el = ET.SubElement(wrapper, "page")
    page_el.append(copy.deepcopy(frag_elem))

    tmp_path = None
    try:
        # delete=False because the bridge reopens the file by path after it
        # has been closed here; cleanup happens in the finally-clause.
        with tempfile.NamedTemporaryFile(
            suffix=".cdxml", delete=False, mode="w", encoding="utf-8"
        ) as tmp:
            # Record the path before writing so the file is still removed
            # if serialization or the write itself fails.
            tmp_path = tmp.name
            tmp.write('<?xml version="1.0" encoding="UTF-8" ?>')
            tmp.write(ET.tostring(wrapper, encoding="unicode"))

        cs = ChemScriptBridge()
        info = cs.get_info(tmp_path)
        if info and info.get("ok"):
            smi = info.get("smiles")
            if smi:
                return smi
            # Reaction-type response — shouldn't happen for a single fragment
            # but handle gracefully
            reactants = info.get("reactants", [])
            if reactants and reactants[0].get("smiles"):
                return reactants[0]["smiles"]
        return None
    except Exception:
        # Best-effort by contract: any ChemScript failure means "no result".
        return None
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
297
+
298
+
299
def frag_to_mw(frag_elem: ET.Element) -> Optional[float]:
    """Return the molecular weight of a CDXML <fragment>, or None.

    The fragment is converted with :func:`frag_to_mol` and weighed with
    ``Descriptors.MolWt``.  When the mol contains dummy atoms (any entry
    flagged ``is_abbrev``), each one is looked up in the superatom table
    (``superatom_table.py``): its standalone MW, minus 1.008 for every bond
    the dummy has in the mol, is added to the ``MolWt`` of the mol itself.

    Returns ``None`` if conversion fails, the superatom table cannot be
    imported, a label cannot be read (``None``), a label has no known MW,
    or ``MolWt`` raises.
    """
    from rdkit.Chem import Descriptors

    converted = frag_to_mol(frag_elem)
    if converted is None or converted[0] is None:
        return None
    mol, atoms_data = converted

    dummies = [entry for entry in atoms_data if entry["is_abbrev"]]

    # Contribution of the expanded abbreviation groups; stays 0.0 when the
    # fragment has none.
    superatom_mw = 0.0
    if dummies:
        try:
            from .resolve.superatom_table import get_abbrev_label, lookup_mw
        except ImportError:
            return None

        hydrogen_mass = 1.008
        bond_ends = [
            (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
            for bond in mol.GetBonds()
        ]
        for entry in dummies:
            label = get_abbrev_label(entry["xml"])
            if label is None:
                return None  # can't read label
            group_mw = lookup_mw(label)
            if group_mw is None:
                return None  # unknown abbreviation

            # Each bond from the dummy to the rest of the molecule replaces
            # one H on the abbreviation fragment.
            dummy_idx = entry["idx"]
            attached = 0
            for ends in bond_ends:
                if dummy_idx in ends:
                    attached += 1
            superatom_mw += group_mw - (attached * hydrogen_mass)

    try:
        core_mw = Descriptors.MolWt(mol)
    except Exception:
        return None

    return core_mw + superatom_mw
353
+
354
+
355
def frag_to_molblock(frag_elem: ET.Element) -> Optional[str]:
    """Return a MOL block for a CDXML <fragment>, keeping the drawn layout.

    The mol from :func:`frag_to_mol` receives a conformer built from the
    CDXML coordinates (via :func:`set_cdxml_conformer` with ``scale=1.0``)
    before ``Chem.MolToMolBlock`` is called.

    Returns None if conversion yields no mol or MOL block export raises.
    """
    from rdkit import Chem

    converted = frag_to_mol(frag_elem)
    if converted is None:
        return None
    mol, atoms_data = converted
    if mol is None:
        return None

    set_cdxml_conformer(mol, atoms_data, scale=1.0)

    try:
        molblock = Chem.MolToMolBlock(mol)
    except Exception:
        return None
    return molblock
375
+
376
+
377
+ # ---------------------------------------------------------------------------
378
+ # 2D Cleanup with orientation preservation
379
+ # ---------------------------------------------------------------------------
380
+
381
def cleanup_fragment_rdkit(frag_elem: ET.Element,
                           verbose: bool = False) -> bool:
    """Regenerate a fragment's 2D layout with RDKit, keeping its orientation.

    Steps:

    1. Convert the fragment with :func:`frag_to_mol` and run
       ``AllChem.Compute2DCoords()`` on a copy of the mol.
    2. Scale the new coordinates so the average bond length equals
       ``ACS_BOND_LENGTH`` and flip y (RDKit is y-up, CDXML is y-down).
    3. Rotate the new layout about its centroid by the angle that best
       matches the original atom positions (Kabsch-style, rotation only)
       and move it onto the original centroid.
    4. For disconnected components (e.g. salts), shift every component other
       than the largest so that its centroid keeps its original offset from
       the largest component's centroid.
    5. Write the coordinates back to each node's ``p`` attribute.  For dummy
       (abbreviation) nodes, the nodes of a nested ``<fragment>`` and the
       node's ``<t>`` labels (``p`` and ``BoundingBox``) are translated by
       the same delta as the node itself.

    Modifies *frag_elem* in place.  Returns True if cleanup was applied, and
    False if conversion failed, the mol has fewer than two atoms, or the new
    layout has a near-zero average bond length.  With *verbose*, a one-line
    summary is printed to stderr.
    """
    import copy as _copy
    from rdkit import Chem
    from rdkit.Chem import AllChem

    def _mean_xy(points, members=None):
        """Centroid of *points*, optionally restricted to index set *members*."""
        if members is None:
            return (sum(x for x, y in points) / len(points),
                    sum(y for x, y in points) / len(points))
        return (sum(points[i][0] for i in members) / len(members),
                sum(points[i][1] for i in members) / len(members))

    def _translate_p(elem, shift_x, shift_y):
        """Shift the ``p`` attribute of *elem*, if it has one."""
        raw = elem.get("p")
        if raw:
            px, py = [float(v) for v in raw.split()]
            elem.set("p", f"{px + shift_x:.4f} {py + shift_y:.4f}")

    converted = frag_to_mol(frag_elem)
    if converted is None or converted[0] is None:
        return False
    mol, atoms_data = converted

    if mol.GetNumAtoms() < 2:
        return False

    # Original (CDXML-space) coordinates, indexed like the mol's atoms.
    orig_coords = [(entry["x"], entry["y"]) for entry in atoms_data]

    # Fresh 2D layout on a copy, so the caller's mol keeps no conformer.
    layout_mol = _copy.deepcopy(mol)
    AllChem.Compute2DCoords(layout_mol)
    layout_conf = layout_mol.GetConformer()
    rdk_coords = []
    for atom_idx in range(layout_mol.GetNumAtoms()):
        position = layout_conf.GetAtomPosition(atom_idx)
        rdk_coords.append((position.x, position.y))

    # Scale to the ACS standard bond length.
    layout_bond_len = _avg_bond_length_from_conf(layout_mol)
    if layout_bond_len < 1e-6:
        return False
    scale = ACS_BOND_LENGTH / layout_bond_len

    # RDKit space (y-up) → CDXML space (y-down), scaled.
    new_coords_cdxml = [(x * scale, -y * scale) for x, y in rdk_coords]

    # Centre both point sets on their own centroids.
    cx_orig, cy_orig = _mean_xy(orig_coords)
    cx_new, cy_new = _mean_xy(new_coords_cdxml)
    orig_centered = [(x - cx_orig, y - cy_orig) for x, y in orig_coords]
    new_centered = [(x - cx_new, y - cy_new) for x, y in new_coords_cdxml]

    # Optimal rotation angle (new → original) via atan2(cross, dot).
    dot_sum = 0.0
    cross_sum = 0.0
    for (ox, oy), (nx, ny) in zip(orig_centered, new_centered):
        dot_sum += nx * ox + ny * oy
        cross_sum += nx * oy - ny * ox
    angle = math.atan2(cross_sum, dot_sum)
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)

    # Rotate the new layout and place it on the original centroid.
    final_coords = [
        (x * cos_a - y * sin_a + cx_orig, x * sin_a + y * cos_a + cy_orig)
        for x, y in new_centered
    ]

    # Salt products (disconnected components like amine + HCl): RDKit places
    # disconnected pieces arbitrarily, so restore each small component's
    # original offset from the main structure.
    components = Chem.GetMolFrags(mol)
    if len(components) > 1:
        largest = max(components, key=len)
        ocx_main, ocy_main = _mean_xy(orig_coords, largest)
        ncx_main, ncy_main = _mean_xy(final_coords, largest)
        for comp in components:
            if comp is largest:
                continue
            # Original offset of this component from the main component.
            ocx_s, ocy_s = _mean_xy(orig_coords, comp)
            off_x = ocx_s - ocx_main
            off_y = ocy_s - ocy_main
            # Target position minus current position gives the shift.
            cur_x, cur_y = _mean_xy(final_coords, comp)
            shift_x = (ncx_main + off_x) - cur_x
            shift_y = (ncy_main + off_y) - cur_y
            for member in comp:
                fx, fy = final_coords[member]
                final_coords[member] = (fx + shift_x, fy + shift_y)

    # Write back to CDXML — also translate inner abbreviation fragments.
    has_abbrev = False
    for entry, (fx, fy) in zip(atoms_data, final_coords):
        node = entry["xml"]
        node.set("p", f"{fx:.4f} {fy:.4f}")

        if not entry["is_abbrev"]:
            continue

        has_abbrev = True
        dx = fx - entry["x"]
        dy = fy - entry["y"]

        inner_frag = node.find("fragment")
        if inner_frag is not None:
            for inner_n in inner_frag.findall("n"):
                _translate_p(inner_n, dx, dy)

        for t_elem in node.findall("t"):
            _translate_p(t_elem, dx, dy)
            bb = t_elem.get("BoundingBox")
            if bb:
                bvals = [float(v) for v in bb.split()]
                if len(bvals) == 4:
                    t_elem.set("BoundingBox",
                               f"{bvals[0]+dx:.4f} {bvals[1]+dy:.4f} "
                               f"{bvals[2]+dx:.4f} {bvals[3]+dy:.4f}")

    if verbose:
        import sys
        frag_id = frag_elem.get("id", "?")
        abbrev_note = " (with abbreviations)" if has_abbrev else ""
        print(f" [RDKit cleanup] fragment {frag_id}: "
              f"{mol.GetNumAtoms()} atoms{abbrev_note}",
              file=sys.stderr)

    return True
532
+
533
+
534
+ # ---------------------------------------------------------------------------
535
+ # Scale / coordinate helpers
536
+ # ---------------------------------------------------------------------------
537
+
538
# Memoized result of rdkit_default_bond_length(); None until first computed.
_rdk_bl_cache: Optional[float] = None


def rdkit_default_bond_length() -> float:
    """Return the bond length RDKit uses for 2D depictions (cached).

    Measured once as the distance between the two atoms of ``CC`` after
    ``AllChem.Compute2DCoords``; later calls return the stored value.
    """
    global _rdk_bl_cache
    if _rdk_bl_cache is not None:
        return _rdk_bl_cache

    from rdkit import Chem
    from rdkit.Chem import AllChem

    ethane = Chem.MolFromSmiles("CC")
    AllChem.Compute2DCoords(ethane)
    conformer = ethane.GetConformer()
    first = conformer.GetAtomPosition(0)
    second = conformer.GetAtomPosition(1)
    _rdk_bl_cache = math.sqrt(
        (second.x - first.x) ** 2 + (second.y - first.y) ** 2)
    return _rdk_bl_cache
554
+
555
+
556
def avg_bond_length_from_atoms(atoms_data: List[dict], mol) -> float:
    """Return the average bond length measured on CDXML atom coordinates.

    Public wrapper around :func:`_avg_bond_length`: *atoms_data* supplies
    the ``x``/``y`` coordinates and *mol* supplies the bonds.
    """
    mean_length = _avg_bond_length(atoms_data, mol)
    return mean_length
559
+
560
+
561
def set_cdxml_conformer(mol, atoms_data: List[dict], scale: float = 1.0):
    """Replace *mol*'s conformers with one built from CDXML coordinates.

    Each entry of *atoms_data* places atom ``entry["idx"]`` at
    ``(x * scale, -y * scale, 0)``.  The y sign is flipped because the CDXML
    y-axis points down while the RDKit y-axis points up.  The *scale* factor
    converts from CDXML points (~14.40 pt bond length) to RDKit units
    (~1.5 unit bond length).

    All existing conformers on *mol* are removed first; nothing is returned.
    """
    from rdkit import Chem
    from rdkit.Geometry import Point3D

    conformer = Chem.Conformer(mol.GetNumAtoms())
    for entry in atoms_data:
        rd_x = entry["x"] * scale
        rd_y = -entry["y"] * scale
        conformer.SetAtomPosition(entry["idx"], Point3D(rd_x, rd_y, 0.0))

    mol.RemoveAllConformers()
    mol.AddConformer(conformer, assignId=True)
577
+
578
+
579
+ # ---------------------------------------------------------------------------
580
+ # Internal helpers
581
+ # ---------------------------------------------------------------------------
582
+
583
def _avg_bond_length(atoms_data: List[dict], mol) -> float:
    """Mean length of *mol*'s bonds, measured on CDXML atom coordinates.

    Bond endpoints are looked up in *atoms_data* by atom index and the
    Euclidean distance between their ``x``/``y`` values is averaged.
    Returns ``ACS_BOND_LENGTH`` when the mol has no bonds.
    """
    length_sum = 0.0
    n_bonds = 0
    for n_bonds, bond in enumerate(mol.GetBonds(), start=1):
        begin = atoms_data[bond.GetBeginAtomIdx()]
        end = atoms_data[bond.GetEndAtomIdx()]
        delta_x = begin["x"] - end["x"]
        delta_y = begin["y"] - end["y"]
        length_sum += math.sqrt(delta_x * delta_x + delta_y * delta_y)

    if not n_bonds:
        return ACS_BOND_LENGTH
    return length_sum / n_bonds
593
+
594
+
595
def _avg_bond_length_from_conf(mol) -> float:
    """Mean length of *mol*'s bonds, measured on its RDKit conformer.

    Uses only the x/y components of the atom positions returned by
    ``mol.GetConformer()``.  Returns 1.5 when the mol has no bonds.
    """
    conformer = mol.GetConformer()
    length_sum = 0.0
    n_bonds = 0
    for n_bonds, bond in enumerate(mol.GetBonds(), start=1):
        start = conformer.GetAtomPosition(bond.GetBeginAtomIdx())
        stop = conformer.GetAtomPosition(bond.GetEndAtomIdx())
        length_sum += math.sqrt(
            (stop.x - start.x) ** 2 + (stop.y - start.y) ** 2)

    if not n_bonds:
        return 1.5
    return length_sum / n_bonds
@@ -0,0 +1,17 @@
1
+ """Render — declarative text-based reaction scheme renderer.
2
+
3
+ Build publication-ready CDXML reaction schemes from YAML or compact text.
4
+ The LLM specifies semantic content (structures, roles, conditions);
5
+ the deterministic renderer handles all spatial layout.
6
+
7
+ Supported layouts: linear, sequential, serpentine, divergent, stacked-rows.
8
+ Supported annotations: run arrows, dashed/failed arrows, compound labels,
9
+ letter conditions.
10
+
11
+ No ChemDraw COM needed — uses RDKit for 2D coordinate generation.
12
+ """
13
+
14
+ from .schema import SchemeDescriptor, StepDescriptor, StructureRef, ArrowContent
15
+ from .renderer import render, render_to_file
16
+ from .parser import parse_yaml
17
+ from .compact_parser import parse_compact_file