cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1043 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scheme_maker.py -- Build CDXML reaction scheme from reaction JSON (experimental).
|
|
4
|
+
|
|
5
|
+
Takes a reaction JSON file (v1.2 from reaction_parser.py) and produces a
|
|
6
|
+
publication-ready CDXML reaction scheme. The output is equivalent to what
|
|
7
|
+
the current polishing pipeline produces (scheme_polisher_v2 + eln_enrichment
|
|
8
|
+
+ reaction_cleanup), but built from semantic data rather than CDXML surgery.
|
|
9
|
+
|
|
10
|
+
When species have ``original_geometry`` data (stored by reaction_parser v1.2),
|
|
11
|
+
the original CDXML coordinates and abbreviation groups are used by default.
|
|
12
|
+
This preserves the input orientation and re-abbreviates groups like OTs, Boc,
|
|
13
|
+
etc. instead of expanding them to full structures.
|
|
14
|
+
|
|
15
|
+
This tool is EXPERIMENTAL. It coexists with the existing pipeline and does
|
|
16
|
+
not replace it.
|
|
17
|
+
|
|
18
|
+
CLI:
|
|
19
|
+
python scheme_maker.py reaction.json -o scheme.cdxml
|
|
20
|
+
python scheme_maker.py reaction.json --approach chemdraw_mimic --align-mode rdkit
|
|
21
|
+
python scheme_maker.py reaction.json --no-run-arrow --verbose
|
|
22
|
+
|
|
23
|
+
Python API:
|
|
24
|
+
from cdxml_toolkit.render.scheme_maker import build_scheme
|
|
25
|
+
cdxml_path = build_scheme("reaction.json", output="scheme.cdxml")
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import argparse
|
|
29
|
+
import json
|
|
30
|
+
import math
|
|
31
|
+
import os
|
|
32
|
+
import re
|
|
33
|
+
import sys
|
|
34
|
+
import tempfile
|
|
35
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
36
|
+
from xml.etree import ElementTree as ET
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Lazy imports — defer heavy dependencies to call time
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
_HAS_RDKIT = None


def _check_rdkit() -> bool:
    """Report whether RDKit can be imported, probing at most once.

    The first call attempts ``from rdkit import Chem`` and stores the outcome
    in the module-level ``_HAS_RDKIT`` flag; every later call returns that
    cached value without importing again.

    Returns:
        True when the import succeeded, False when it raised ImportError.
    """
    global _HAS_RDKIT
    if _HAS_RDKIT is not None:
        # Already probed on an earlier call.
        return _HAS_RDKIT
    try:
        from rdkit import Chem  # noqa: F401
    except ImportError:
        _HAS_RDKIT = False
    else:
        _HAS_RDKIT = True
    return _HAS_RDKIT
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Logging
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
_verbose = False


def _log(msg: str) -> None:
    """Write *msg* to stderr, but only while the module ``_verbose`` flag is set."""
    if not _verbose:
        return
    print(msg, file=sys.stderr)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Core: SMILES → atom/bond dicts (via structure_from_image)
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
def _smiles_to_mol_data(smiles: str, offset: int = 0) -> Optional[Dict]:
    """Convert a SMILES string to atom/bond dicts via ``smiles_to_coords``.

    This is a thin wrapper: all the work is delegated to
    ``structure_from_image.smiles_to_coords``, which is imported lazily so
    that importing this module does not pull in the heavy dependency.

    Args:
        smiles: SMILES string of the molecule.
        offset: Forwarded to the helper as ``offset_index``.

    Returns:
        Whatever ``smiles_to_coords`` returns. NOTE(review): the helper is
        expected to return a dict with ``"atoms"`` and ``"bonds"`` lists, or
        None on failure -- confirm against its implementation.

    Raises:
        RuntimeError: if ``structure_from_image`` cannot be imported. The
            underlying ImportError is attached as ``__cause__`` so the real
            reason (e.g. a missing third-party dependency of that module)
            stays visible in the traceback.
    """
    try:
        from ..image.structure_from_image import smiles_to_coords
    except ImportError as exc:
        # Chain explicitly: the message alone does not say *why* the import
        # failed, and the root cause is usually what the user needs.
        raise RuntimeError(
            "structure_from_image.py is required (for smiles_to_coords). "
            "Ensure it is in the same directory."
        ) from exc

    return smiles_to_coords(smiles, offset_index=offset)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _normalize_mol(mol_data: Dict, center_x: float = 0.0,
                   center_y: float = 0.0) -> Tuple[List, List]:
    """Run ``coord_normalizer.normalize_coords`` on one molecule.

    The molecule's ``"atoms"`` and ``"bonds"`` lists are handed to the
    normalizer together with the requested centre and ``flip_y=True``.
    NOTE(review): per the original docstring the normalizer rescales to the
    ACS bond length (14.40 pt) -- that happens inside ``normalize_coords``
    and is not visible here.

    Args:
        mol_data: Dict with ``"atoms"`` and ``"bonds"`` entries.
        center_x: Target x centre, forwarded unchanged.
        center_y: Target y centre, forwarded unchanged.

    Returns:
        The normalizer's result (annotated as an ``(atoms, bonds)`` pair).
    """
    from ..coord_normalizer import normalize_coords

    atom_list = mol_data["atoms"]
    bond_list = mol_data["bonds"]
    return normalize_coords(
        atom_list,
        bond_list,
        center_x=center_x,
        center_y=center_y,
        flip_y=True,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# Original geometry → mol_data conversion
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
def _geometry_to_mol_data(geom: Dict[str, Any],
                          offset: int = 0) -> Optional[Dict]:
    """Convert ``original_geometry`` from a SpeciesDescriptor to mol_data.

    The result has ``"atoms"`` and ``"bonds"`` lists. Atoms are renumbered
    ``offset + 1 .. offset + n_atoms`` in input order; bonds that survive
    are numbered contiguously right after the atoms, so every index lies in
    ``offset + 1 .. offset + len(atoms) + len(bonds)``. Callers that advance
    their offset by ``len(atoms) + len(bonds)`` therefore never reuse an
    index, even when some bonds were dropped.

    Each y coordinate is negated so that a later
    ``normalize_coords(flip_y=True)`` restores the original orientation
    (the two negations cancel).

    Args:
        geom: Geometry dict with an ``"atoms"`` list (each atom needs ``x``
            and ``y``; ``id``, ``symbol``, ``num_hydrogens``, ``charge``,
            ``is_abbreviation``, ``is_generic``, ``label``,
            ``label_smiles`` and ``node_type`` are optional) and an optional
            ``"bonds"`` list (each bond needs ``begin`` and ``end``;
            ``order``, ``double_position`` and ``cfg`` are optional).
        offset: Number added to every generated index.

    Returns:
        ``{"atoms": [...], "bonds": [...]}``, or None when *geom* is empty
        or has no atoms.

    Raises:
        KeyError: if an atom lacks ``x``/``y`` or a bond lacks
            ``begin``/``end``.
    """
    if not geom or not geom.get("atoms"):
        return None

    source_atoms = geom["atoms"]
    atoms: List[Dict[str, Any]] = []
    id_remap: Dict[Any, int] = {}  # original atom id -> new index

    for position, src in enumerate(source_atoms):
        new_index = offset + position + 1
        # Atoms without an explicit id are keyed by their list position.
        id_remap[src.get("id", position)] = new_index

        atom_d: Dict[str, Any] = {
            "index": new_index,
            "symbol": src.get("symbol", "C"),
            "x": src["x"],
            "y": -src["y"],  # negate so flip_y=True restores original
        }
        for optional_key in ("num_hydrogens", "charge"):
            if optional_key in src:
                atom_d[optional_key] = src[optional_key]

        if src.get("is_abbreviation"):
            # Abbreviation groups (OTs, Boc, Me, ...)
            atom_d["is_abbreviation"] = True
            atom_d["abbrev_label"] = src.get("label", "?")
            atom_d["abbrev_smiles"] = src.get("label_smiles")
            # Use a placeholder symbol that won't be stripped as explicit H
            atom_d["symbol"] = "X"
        elif src.get("is_generic"):
            # Generic variable groups (R, X, Ar, R1, ...)
            atom_d["is_generic"] = True
            atom_d["generic_label"] = src.get("label", "R")
            atom_d["node_type"] = src.get("node_type", "GenericNickname")
            atom_d["symbol"] = "X"

        atoms.append(atom_d)

    first_bond_index = offset + len(source_atoms) + 1
    bonds: List[Dict[str, Any]] = []
    # A geometry without a "bonds" entry (e.g. a lone atom) simply has none.
    for src_bond in geom.get("bonds") or []:
        begin = id_remap.get(src_bond["begin"])
        end = id_remap.get(src_bond["end"])
        if begin is None or end is None:
            # Bond refers to an atom that is not part of this geometry.
            continue
        bond_d: Dict[str, Any] = {
            # Number by emitted count, not source position, so skipped bonds
            # leave no gaps that could overrun the caller's offset step.
            "index": first_bond_index + len(bonds),
            "order": src_bond.get("order", 1),
            "atom1": begin,
            "atom2": end,
        }
        if "double_position" in src_bond:
            bond_d["double_pos"] = src_bond["double_position"]
        # Preserve stereo config
        if "cfg" in src_bond:
            bond_d["cfg"] = src_bond["cfg"]
        bonds.append(bond_d)

    return {"atoms": atoms, "bonds": bonds}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _species_mol_data(sp, offset: int = 0) -> Optional[Dict]:
    """Build mol_data for one species, favouring its stored geometry.

    If the species carries ``original_geometry`` and it converts
    successfully, that result is returned (and the choice is logged).
    Otherwise the species' SMILES, when present, is converted instead.

    Args:
        sp: Species object exposing ``original_geometry``, ``smiles`` and
            ``name``.
        offset: Index offset forwarded to whichever converter is used.

    Returns:
        The mol_data dict, or None when neither source yields one.
    """
    geometry = sp.original_geometry
    from_geometry = (
        _geometry_to_mol_data(geometry, offset=offset) if geometry else None
    )
    if from_geometry is not None:
        # Stored geometry keeps the input orientation and abbreviations.
        _log(f" Using original geometry for '{sp.name or sp.smiles}'")
        return from_geometry

    if not sp.smiles:
        return None
    # Fallback: generate from SMILES
    return _smiles_to_mol_data(sp.smiles, offset=offset)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Role priority ordering for above-arrow text
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
# Priority: lower number = higher priority = closer to top.
|
|
206
|
+
# Catalyst and ligand are always first (defining the reaction).
|
|
207
|
+
# Remaining reagents cluster around 50. Solvent is last.
|
|
208
|
+
_ROLE_PRIORITY = {
    "catalyst": 10,
    "ligand": 20,
    "coupling_reagent": 40,
    "activating_agent": 41,
    "reducing_agent": 42,
    "oxidant": 43,
    "halogenating_agent": 44,
    "fluorinating_agent": 45,
    "borylating_agent": 46,
    "lewis_acid": 47,
    "protecting_group": 48,
    "deprotecting_agent": 49,
    "acid": 50,
    "base": 51,
    "additive": 55,
    "reagent": 60,
    "reductant": 65,
    "drying_agent": 70,
    "solvent": 80,
}
_DEFAULT_ROLE_PRIORITY = 59  # unknown roles sort just before "reagent"


def _role_priority(role: Optional[str]) -> int:
    """Look up the sort priority for *role*, tolerating spelling variants.

    The role is stripped, lower-cased, and has spaces/hyphens replaced by
    underscores before the lookup, so "Catalyst" and "coupling reagent"
    resolve to the same entries as "catalyst" and "coupling_reagent".
    Anything not in ``_ROLE_PRIORITY`` (including None or "") gets
    ``_DEFAULT_ROLE_PRIORITY``.
    """
    key = (role or "").strip().lower().replace("-", "_").replace(" ", "_")
    return _ROLE_PRIORITY.get(key, _DEFAULT_ROLE_PRIORITY)


def _sort_by_role_priority(
    entries: List[Tuple[str, str, float]],
) -> List[Tuple[str, str, float]]:
    """Sort (text, role_detail, equiv) entries by reagent role priority.

    Order follows ``_ROLE_PRIORITY`` (lower number first):
    Catalyst -> Ligand -> Coupling reagent -> ... -> Acid -> Base -> ... ->
    Solvent. Roles are matched case-insensitively; unknown roles use
    ``_DEFAULT_ROLE_PRIORITY``.

    Within the same priority, lower equivalents come first. This heuristic
    reflects that catalysts/ligands are typically used in smaller amounts
    than stoichiometric reagents. Entries that tie on both keys keep their
    input order (``sorted`` is stable).

    Args:
        entries: ``(text, role_detail, equiv)`` tuples.

    Returns:
        A new sorted list; *entries* itself is not modified.
    """
    return sorted(entries, key=lambda e: (_role_priority(e[1]), e[2]))
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# Temperature tokens: "105 °C", "80°C", "-78 °C", "rt"/"RT", "room temp...",
# "reflux", and simple ranges such as "0 to 25 °C". Case-insensitive.
_TEMP_TOKEN_RE = re.compile(
    r"^-?\d+\.?\d*\s*°?\s*[cCfF]$"
    r"|^rt$|^room\s+temp"
    r"|^reflux$"
    r"|^-?\d+\s*to\s*-?\d+\s*°?\s*[cCfF]$",
    re.IGNORECASE,
)

# Time tokens: a number followed by a unit ("24 h", "30 min", "2 days"), or
# the bare words "overnight" / "o/n" / "on". The bare words are a separate
# alternative so they match WITHOUT a leading number.
_TIME_TOKEN_RE = re.compile(
    r"^(?:\d+\.?\d*\s*"
    r"(?:h|hr|hrs|hours?|min|minutes?|d|days?|s|sec|seconds?)"
    r"|overnight|o/?n)$",
    re.IGNORECASE,
)


def _merge_condition_tokens(condition_lines: List[str]) -> List[str]:
    """Merge temperature and time tokens into a single comma-separated line.

    Input:  ["105 °C", "24 h"]
    Output: ["105 °C, 24 h"]

    Tokens are stripped and empty ones dropped. Every token recognised as a
    temperature or a time (including bare "overnight" / "o/n") is joined,
    in input order, into one line placed first. All other tokens
    (atmosphere, etc.) follow on separate lines, also in input order.

    Args:
        condition_lines: Raw condition tokens.

    Returns:
        A new list: the merged temperature/time line (if any) followed by
        the remaining tokens.
    """
    tokens = [tok.strip() for tok in condition_lines]
    tokens = [tok for tok in tokens if tok]

    def _is_temp_or_time(tok: str) -> bool:
        return bool(_TEMP_TOKEN_RE.match(tok) or _TIME_TOKEN_RE.match(tok))

    temp_time_tokens = [tok for tok in tokens if _is_temp_or_time(tok)]
    other_tokens = [tok for tok in tokens if not _is_temp_or_time(tok)]

    result: List[str] = []
    if temp_time_tokens:
        result.append(", ".join(temp_time_tokens))
    result.extend(other_tokens)
    return result
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# ---------------------------------------------------------------------------
|
|
290
|
+
# Core: Build CDXML from reaction JSON
|
|
291
|
+
# ---------------------------------------------------------------------------
|
|
292
|
+
|
|
293
|
+
def build_scheme(
    input_path: str,
    output: Optional[str] = None,
    approach: str = "chemdraw_mimic",
    align_mode: str = "rdkit",
    run_arrow: bool = True,
    verbose: bool = False,
) -> str:
    """Build a CDXML reaction scheme from a reaction JSON file.

    Pipeline: load the reaction descriptor, split species into reactants,
    products, above-arrow structures and above-arrow text, generate and
    normalize coordinates, assemble an intermediate CDXML in a scratch
    directory, then insert above-arrow structures, format text, align, run
    the final layout into *output* and optionally add a run arrow.

    The scratch directory is removed on every exit path (success, exception
    or ``sys.exit``); removal is best-effort and never raises.

    Args:
        input_path: Path to reaction JSON (from reaction_parser).
        output: Path for output CDXML (default: ``{stem}-scheme.cdxml``
            next to *input_path*).
        approach: Layout approach passed to ``reaction_cleanup.run_cleanup``.
        align_mode: Alignment strategy (rdkit/rxnmapper/kabsch/none).
            ``"none"`` skips alignment; alignment is also skipped when no
            reactant structure was generated.
        run_arrow: Add run arrow with mass/yield if ELN data available.
        verbose: Print diagnostic messages to stderr. Sets the module-wide
            ``_verbose`` flag.

    Returns:
        Path to the output CDXML file.

    Raises:
        SystemExit: (status 1, after printing an error to stderr) when
            RDKit is unavailable, when no product species has a SMILES, or
            when no product structure could be generated.
    """
    global _verbose
    _verbose = verbose

    if not _check_rdkit():
        print("ERROR: RDKit is required for scheme_maker.", file=sys.stderr)
        sys.exit(1)

    # --- Step 1: Load and validate JSON ---
    from ..perception.reaction_parser import ReactionDescriptor

    desc = ReactionDescriptor.from_json(input_path)
    _log(f"Loaded JSON: {desc.experiment}, {len(desc.species)} species, "
         f"version={desc.version}")

    # Validate: need at least one product with SMILES
    products_with_smiles = [
        sp for sp in desc.species
        if sp.role == "product" and sp.smiles
    ]
    if not products_with_smiles:
        print("ERROR: No product species with SMILES found in JSON.",
              file=sys.stderr)
        sys.exit(1)

    # --- Step 2: Partition species into layout groups ---
    reactant_species = []
    product_species = []
    # Each entry is (text, role_detail, equiv) for priority sorting later.
    # equiv is used as a tiebreaker: lower equiv = higher priority (catalysts
    # are typically used in small amounts like 0.05 eq.).
    above_arrow_entries = []      # (text, role_detail, equiv) tuples
    above_arrow_mol_species = []  # structural species above arrow
    condition_lines = []          # condition text (temp, time, atm)

    def _parse_equiv(sp) -> float:
        """Parse csv_equiv to a float for sorting. Missing = 999."""
        if sp.csv_equiv:
            try:
                return float(sp.csv_equiv)
            except (ValueError, TypeError):
                pass
        return 999.0

    def _text_entry(sp, solvent_default: bool) -> Tuple[str, str, float]:
        """Build the (text, role_detail, equiv) tuple for a text species.

        With *solvent_default*, a solvent lacking any role detail is
        labelled "solvent" so it sorts last.
        """
        text = sp.display_text or sp.name or sp.csv_name or "?"
        role_d = sp.role_detail or sp.rxn_insight_role or ""
        if solvent_default and sp.is_solvent and not role_d:
            role_d = "solvent"
        return (text, role_d, _parse_equiv(sp))

    for sp in desc.species:
        # Position is derived from chemical role (no dependency on
        # scheme_position): products on the right, atom-contributing
        # substrates / starting materials on the left, everything else
        # (solvents, non-contributing and unknown roles) above the arrow.
        if sp.role == "product":
            if sp.smiles:
                product_species.append(sp)
            else:
                _log(f" WARNING: Product '{sp.name}' has no SMILES, skipping")
            continue

        atom_contributing = sp.role == "atom_contributing"
        if atom_contributing and (sp.is_substrate or sp.is_sm):
            if sp.smiles:
                reactant_species.append(sp)
            else:
                # No SMILES — convert to text label above arrow
                _log(f" WARNING: Reactant '{sp.name}' has no SMILES, "
                     "converting to text")
                above_arrow_entries.append(_text_entry(sp, False))
            continue

        if (sp.smiles and sp.source in ("fragment", "rxn")
                and atom_contributing):
            # Structural species above arrow (non-substrate atom-contributing
            # reactant, e.g. coupling partner)
            above_arrow_mol_species.append(sp)
        else:
            # Text species above arrow (reagents, catalysts, solvents)
            above_arrow_entries.append(_text_entry(sp, True))

    # Add condition tokens (temp, time, atmosphere)
    condition_lines.extend(desc.conditions)

    # Deduplicate above-arrow entries (case-insensitive), keeping the first
    # occurrence of each label.
    seen_above = set()
    deduped_entries = []
    for txt, role_d, eq in above_arrow_entries:
        key = txt.strip().lower()
        if key not in seen_above:
            seen_above.add(key)
            deduped_entries.append((txt, role_d, eq))
    above_arrow_entries = deduped_entries

    # Sort entries by role priority; within the same priority, lower equiv
    # comes first.
    above_arrow_entries = _sort_by_role_priority(above_arrow_entries)

    above_arrow_texts = [txt for txt, _, _ in above_arrow_entries]

    _log(f" Reactants: {len(reactant_species)}, "
         f"Products: {len(product_species)}, "
         f"Above-arrow text: {len(above_arrow_texts)}, "
         f"Above-arrow structures: {len(above_arrow_mol_species)}, "
         f"Conditions: {len(condition_lines)}")

    # --- Step 3: Generate 2D coords for each structural species ---
    # Prefer original geometry when available (preserves orientation and
    # abbreviation groups like OTs, Boc). Fall back to SMILES-based coords.
    _log("Generating 2D coordinates...")

    # Running index offset so atom/bond indices stay unique across molecules.
    atom_offset = 0
    reactant_mols = []
    for sp in reactant_species:
        mol_data = _species_mol_data(sp, offset=atom_offset)
        if mol_data is None:
            _log(f" WARNING: Could not generate coords for '{sp.name}' "
                 f"(SMILES: {sp.smiles})")
            # Keep the species visible as a text label instead.
            above_arrow_texts.append(sp.display_text or sp.name or "?")
            continue
        atom_offset += len(mol_data["atoms"]) + len(mol_data["bonds"])
        reactant_mols.append(mol_data)

    product_mols = []
    for sp in product_species:
        mol_data = _species_mol_data(sp, offset=atom_offset)
        if mol_data is None:
            _log(f" WARNING: Could not generate coords for product "
                 f"'{sp.name}' (SMILES: {sp.smiles})")
            continue
        atom_offset += len(mol_data["atoms"]) + len(mol_data["bonds"])
        product_mols.append(mol_data)

    above_arrow_mols = []
    for sp in above_arrow_mol_species:
        mol_data = _species_mol_data(sp, offset=atom_offset)
        if mol_data is None:
            _log(f" WARNING: Could not generate coords for '{sp.name}', "
                 "converting to text")
            above_arrow_texts.append(sp.display_text or sp.name or "?")
            continue
        atom_offset += len(mol_data["atoms"]) + len(mol_data["bonds"])
        above_arrow_mols.append(mol_data)

    if not product_mols:
        print("ERROR: No product structures could be generated.",
              file=sys.stderr)
        sys.exit(1)

    # --- Step 4: Normalize coordinates ---
    _log("Normalizing coordinates...")
    from ..coord_normalizer import normalize_reaction

    norm_reactants, norm_products = normalize_reaction(
        reactant_mols, product_mols,
        reactant_start_x=50.0,
        product_start_x=350.0,
        molecule_gap=80.0,
    )

    # --- Step 5: Build conditions dict ---
    # Merge all text into a single below-arrow block. reaction_cleanup
    # puts all <t> elements below the arrow anyway, and multiple <t>
    # elements can overlap. A single merged block with \n-separated
    # lines avoids this. This matches the --merge-conditions behavior
    # of scheme_polisher.
    #
    # Condition tokens (temp + time) are merged onto a single
    # comma-separated line: "105 °C, 24 h".
    merged_conditions = _merge_condition_tokens(condition_lines)
    merged_text = above_arrow_texts + merged_conditions
    conditions = {}
    if merged_text:
        conditions["below"] = merged_text

    _log(f" Conditions: {conditions}")

    # --- Step 6: Assemble initial CDXML ---
    _log("Assembling CDXML...")
    from ..cdxml_builder import build_reaction_cdxml

    cdxml_str = build_reaction_cdxml(
        norm_reactants, norm_products,
        conditions=conditions if conditions else None,
    )

    # Local import: only needed for scratch-directory cleanup.
    import shutil

    # The intermediate file lives in a scratch directory that is removed in
    # the ``finally`` below, so a failure in any later step cannot leak it.
    tmp_dir = tempfile.mkdtemp(prefix="scheme_maker_")
    try:
        tmp_assembled = os.path.join(tmp_dir, "assembled.cdxml")
        with open(tmp_assembled, "w", encoding="utf-8") as f:
            f.write(cdxml_str)

        _log(f" Assembled CDXML: {tmp_assembled}")

        # --- Step 7: Insert above-arrow structures (if any) ---
        if above_arrow_mols:
            _log("Inserting above-arrow structures...")
            _insert_above_arrow_structures(tmp_assembled, above_arrow_mols)

        # --- Step 8: Apply text formatting (subscripts/italics) ---
        _log("Applying text formatting...")
        _apply_text_formatting(tmp_assembled)

        # --- Step 9: Run alignment ---
        if align_mode != "none" and len(reactant_mols) > 0:
            _log(f"Running alignment ({align_mode})...")
            _run_alignment(tmp_assembled, align_mode)

        # --- Step 10: Run reaction_cleanup (final layout) ---
        _log(f"Running layout ({approach})...")
        from ..layout.reaction_cleanup import run_cleanup

        # Determine output path
        if output is None:
            stem = os.path.splitext(os.path.basename(input_path))[0]
            output = os.path.join(os.path.dirname(input_path) or ".",
                                  f"{stem}-scheme.cdxml")

        result = run_cleanup(tmp_assembled, output, approach=approach,
                             verbose=verbose)
        _log(f" Layout complete: {result.get('num_reactants', '?')} reactants, "
             f"{result.get('num_products', '?')} products")

        # --- Step 11: Add run arrow (optional) ---
        if run_arrow and desc.eln_data:
            _log("Adding run arrow...")
            _add_run_arrow(output, desc.eln_data)
    finally:
        # Best-effort cleanup: never let a cleanup problem mask the result
        # or the original exception.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    _log(f"Output: {output}")
    return output
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
# ---------------------------------------------------------------------------
|
|
568
|
+
# Step 7: Insert above-arrow structures
|
|
569
|
+
# ---------------------------------------------------------------------------
|
|
570
|
+
|
|
571
|
+
def _insert_above_arrow_structures(cdxml_path: str,
                                   above_mols: List[Dict]) -> None:
    """Add structural fragments above the reaction arrow of a CDXML file.

    The file is parsed, each molecule in *above_mols* is normalized around a
    point 30 units above the arrow's bounding box (successive molecules are
    placed 60 units further up), turned into fragment XML and inserted into
    the page just before the ``<scheme>`` element (or appended when the page
    has no direct ``<scheme>`` child). The new fragment ids are appended to
    the step's ``ReactionStepObjectsAboveArrow`` attribute and the file is
    rewritten in place.

    Nothing is written when the document has no page, no ``scheme/step`` or
    no arrow.

    Args:
        cdxml_path: CDXML file to modify in place.
        above_mols: mol_data dicts (``"atoms"`` / ``"bonds"``) to insert.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml
    from ..cdxml_builder import _build_fragment, _IDGen  # noqa: private API

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()

    # All three elements are required; bail out quietly if any is missing.
    page = root.find(".//page")
    step = page.find(".//scheme/step") if page is not None else None
    arrow = page.find(".//arrow") if step is not None else None
    if arrow is None:
        return

    # Anchor point: horizontal centre of the arrow, 30 units above its box.
    box = arrow.get("BoundingBox", "0 0 100 300").split()
    if len(box) < 4:
        arrow_cx, arrow_cy = 200.0, 270.0
    else:
        arrow_cx = (float(box[0]) + float(box[2])) / 2.0
        arrow_cy = float(box[1]) - 30.0  # above the arrow

    # New ids start after the largest numeric id already in the document.
    highest_id = 0
    for node in root.iter():
        raw_id = node.get("id")
        if not raw_id:
            continue
        try:
            numeric_id = int(raw_id)
        except ValueError:
            continue
        if numeric_id > highest_id:
            highest_id = numeric_id
    id_gen = _IDGen(start=highest_id + 1)

    existing_ids = step.get("ReactionStepObjectsAboveArrow", "")
    above_ids_list = existing_ids.split() if existing_ids else []

    for stack_level, mol_data in enumerate(above_mols):
        # Normalize, stacking each further molecule 60 units higher.
        atoms, bonds = _normalize_mol(
            mol_data,
            center_x=arrow_cx,
            center_y=arrow_cy - 60.0 * stack_level,
        )

        frag_xml, _, _ = _build_fragment(atoms, bonds, id_gen)
        frag_elem = ET.fromstring(frag_xml)

        # Keep fragments ahead of the scheme element in document order.
        scheme = page.find("scheme")
        if scheme is None:
            page.append(frag_elem)
        else:
            page.insert(list(page).index(scheme), frag_elem)

        frag_id = frag_elem.get("id")
        if frag_id:
            above_ids_list.append(frag_id)

    # Update step metadata
    if above_ids_list:
        step.set("ReactionStepObjectsAboveArrow", " ".join(above_ids_list))

    write_cdxml(tree, cdxml_path)
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# ---------------------------------------------------------------------------
|
|
652
|
+
# Step 8: Apply text formatting
|
|
653
|
+
# ---------------------------------------------------------------------------
|
|
654
|
+
|
|
655
|
+
def _apply_text_formatting(cdxml_path: str) -> None:
    """Rebuild the ``<s>`` runs of standalone captions in a CDXML file.

    Only ``<t>`` elements that are direct children of ``<page>`` are touched;
    ``<t>`` elements nested deeper (e.g. atom labels inside fragments) are
    never visited.  The caption text is split on newlines and every
    non-blank line is handled on its own:

    * lines accepted by ``_is_condition_token`` (when that helper can be
      imported) are written back as a single plain ``<s>`` run;
    * all other lines go through ``build_formatted_s_xml``; if that returns
      nothing, or markup that does not parse, the line is also written back
      as a plain run.

    Blank lines are dropped.  A newline is appended to the last run produced
    so far after every kept line that is not the final entry of the split
    text.  The file is rewritten only if at least one caption was rebuilt.

    Args:
        cdxml_path: Path of the CDXML file to read and overwrite in place.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml

    try:
        from ..text_formatting import build_formatted_s_xml
    except ImportError:
        _log(" text_formatting not available, skipping")
        return

    tree = parse_cdxml(cdxml_path)
    page = tree.getroot().find(".//page")
    if page is None:
        return

    # Optional helper: without it every line is treated as chemical text.
    try:
        from ..perception.reaction_parser import _is_condition_token
    except ImportError:
        _is_condition_token = None

    def plain_run(content, font, size, face):
        # One unformatted <s> run carrying the caption's original style.
        run = ET.Element("s")
        run.set("font", font)
        run.set("size", size)
        run.set("face", face)
        run.text = content
        return run

    def runs_for_line(content, font, size, face):
        # Condition tokens stay plain so they do not pick up subscripts.
        if _is_condition_token is not None and _is_condition_token(content):
            return [plain_run(content, font, size, face)]

        markup = build_formatted_s_xml(content)
        if not markup:
            return [plain_run(content, font, size, face)]

        try:
            holder = ET.fromstring(f"<t>{markup}</t>")
        except ET.ParseError:
            # Unparseable markup: fall back to the raw line.
            return [plain_run(content, font, size, face)]

        produced = []
        for run in holder:
            # Only font and size are back-filled; face is left as generated.
            if not run.get("font"):
                run.set("font", font)
            if not run.get("size"):
                run.set("size", size)
            produced.append(run)
        return produced

    changed = False
    # Iterate over a snapshot because captions are edited while looping.
    for caption in list(page):
        if caption.tag != "t":
            continue

        old_runs = caption.findall("s")
        if not old_runs:
            continue

        text = "".join(run.text or "" for run in old_runs)
        if not text.strip():
            continue

        # The first run supplies the style used for every rebuilt run.
        template = old_runs[0]
        font = template.get("font", "3")
        size = template.get("size", "10")
        face = template.get("face", "1")

        pieces = text.split("\n")
        final_index = len(pieces) - 1
        new_runs = []
        for index, piece in enumerate(pieces):
            stripped = piece.strip()
            if not stripped:
                continue

            new_runs.extend(runs_for_line(stripped, font, size, face))

            # Line break goes onto the text of the most recent run.
            if index < final_index and new_runs:
                tail_run = new_runs[-1]
                tail_run.text = (tail_run.text or "") + "\n"

        if not new_runs:
            continue

        for run in old_runs:
            caption.remove(run)
        caption.extend(new_runs)
        changed = True

    if changed:
        write_cdxml(tree, cdxml_path)
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
# ---------------------------------------------------------------------------
|
|
774
|
+
# Step 9: Run alignment
|
|
775
|
+
# ---------------------------------------------------------------------------
|
|
776
|
+
|
|
777
|
+
def _run_alignment(cdxml_path: str, align_mode: str) -> None:
    """Align reactant structures to match product orientation.

    The requested strategy is tried first and, on any failure, the next one
    in the chain ``rxnmapper`` -> ``rdkit`` -> ``kabsch`` is attempted.  A
    failure of the last stage is only logged.  Any other ``align_mode``
    value (for example ``"none"``) matches no stage, so nothing is aligned
    and the file is left untouched.

    The file is rewritten only when the successful stage reports more than
    zero aligned fragments.

    Args:
        cdxml_path: Path of the CDXML file to read and, if anything was
            aligned, overwrite in place.
        align_mode: Name of the first strategy to try.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml

    tree = parse_cdxml(cdxml_path)

    aligned = 0
    if align_mode == "rxnmapper":
        try:
            from ..layout.alignment import rxnmapper_align_to_product
            aligned = rxnmapper_align_to_product(tree, verbose=_verbose)
            _log(f" RXNMapper aligned {aligned} fragments")
        # Best-effort stage: ImportError is already an Exception subclass,
        # so a single handler covers both a missing backend and a runtime
        # failure inside it.
        except Exception as e:
            _log(f" RXNMapper alignment failed ({e}), falling back to RDKit MCS")
            align_mode = "rdkit"

    if align_mode == "rdkit":
        try:
            from ..layout.alignment import rdkit_align_to_product
            aligned = rdkit_align_to_product(tree, verbose=_verbose)
            _log(f" RDKit MCS aligned {aligned} fragments")
        except Exception as e:
            _log(f" RDKit alignment failed ({e}), falling back to Kabsch")
            align_mode = "kabsch"

    if align_mode == "kabsch":
        try:
            from ..layout.alignment import kabsch_align_to_product
            aligned = kabsch_align_to_product(tree, verbose=_verbose)
            _log(f" Kabsch aligned {aligned} fragments")
        except Exception as e:
            # Last stage: nothing left to fall back to, so just report it.
            _log(f" Kabsch alignment failed: {e}")

    if aligned > 0:
        # Alignment rotates fragments, which invalidates pre-computed
        # DoublePosition values (they are relative to the B->E bond vector,
        # which has rotated).  Strip them -- ChemDraw recomputes correct
        # values automatically via NeedsClean.
        for bond_el in tree.iter("b"):
            if bond_el.get("DoublePosition"):
                del bond_el.attrib["DoublePosition"]
        write_cdxml(tree, cdxml_path)
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
# ---------------------------------------------------------------------------
|
|
822
|
+
# Step 11: Add run arrow with mass/yield
|
|
823
|
+
# ---------------------------------------------------------------------------
|
|
824
|
+
|
|
825
|
+
def _add_run_arrow(cdxml_path: str, eln_data: Dict[str, Any]) -> None:
    """Append a "run" arrow with mass and yield captions to a CDXML file.

    The new arrow is horizontal, spans the same X range as the first
    ``<arrow>`` found on the page, and sits 18 units below the lowest
    fragment atom or page-level caption.  The ``sm_mass`` value is written
    as a right-justified caption left of the arrow tail; ``product_obtained``
    and ``product_yield`` are joined with ", " into a caption right of the
    arrow head.

    Nothing is written when neither ``sm_mass`` nor ``product_obtained`` is
    set, when the file has no ``<page>`` or no ``<arrow>``, or when no
    content with a Y coordinate above zero is found.

    Args:
        cdxml_path: Path of the CDXML file to read and overwrite in place.
        eln_data: Mapping read for the keys ``sm_mass``,
            ``product_obtained`` and ``product_yield``.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml

    sm_mass = eln_data.get("sm_mass", "")
    product_obtained = eln_data.get("product_obtained", "")
    product_yield = eln_data.get("product_yield", "")

    if not (sm_mass or product_obtained):
        _log(" No mass/yield data, skipping run arrow")
        return

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()
    page = root.find(".//page")
    if page is None:
        return

    # The run arrow copies the horizontal extent of the reaction arrow.
    rxn_arrow = page.find(".//arrow")
    if rxn_arrow is None:
        _log(" No reaction arrow found, skipping run arrow")
        return

    tail_3d = rxn_arrow.get("Tail3D", "")
    head_3d = rxn_arrow.get("Head3D", "")
    if tail_3d and head_3d:
        arrow_x1 = float(tail_3d.split()[0])
        arrow_x2 = float(head_3d.split()[0])
    else:
        # No 3D endpoints: fall back to the bounding box X values.
        box = rxn_arrow.get("BoundingBox", "0 0 100 300").split()
        arrow_x1 = float(box[0])
        arrow_x2 = float(box[2])

    # Collect the bottom edge of every fragment atom and page-level caption.
    bottoms = [0.0]
    for child in page:
        if child.tag == "fragment":
            for atom in child.findall("n"):
                coords = atom.get("p", "").split()
                if len(coords) >= 2:
                    bottoms.append(float(coords[1]))
        elif child.tag == "t":
            box_attr = child.get("BoundingBox", "")
            if box_attr:
                box = box_attr.split()
                if len(box) >= 4:
                    bottoms.append(float(box[3]))
            else:
                # Caption without a bounding box: anchor Y plus a margin.
                coords = child.get("p", "").split()
                if len(coords) >= 2:
                    bottoms.append(float(coords[1]) + 5.0)

    max_y = max(bottoms)
    if max_y == 0.0:
        return

    # New ids continue after the largest integer id already in the document.
    used_ids = [0]
    for node in root.iter():
        raw_id = node.get("id")
        if not raw_id:
            continue
        try:
            used_ids.append(int(raw_id))
        except ValueError:
            continue
    id_state = [max(used_ids) + 1]

    def take_id() -> str:
        # Hand out the next unused number as a string and advance.
        value = id_state[0]
        id_state[0] = value + 1
        return str(value)

    arrow_y = max_y + 18.0

    arrow_elem = ET.SubElement(page, "arrow")
    arrow_attrs = (
        ("id", take_id()),
        # Z is drawn from the same running counter as the ids.
        ("Z", take_id()),
        ("BoundingBox",
         f"{arrow_x1:.2f} {arrow_y - 2:.2f} {arrow_x2:.2f} {arrow_y + 2:.2f}"),
        ("FillType", "None"),
        ("ArrowheadHead", "Full"),
        ("ArrowheadType", "Solid"),
        ("Head3D", f"{arrow_x2:.2f} {arrow_y:.2f} 0"),
        ("Tail3D", f"{arrow_x1:.2f} {arrow_y:.2f} 0"),
    )
    for key, value in arrow_attrs:
        arrow_elem.set(key, value)

    # Text baseline should vertically centre on the arrow.
    # Arial 10pt has ~7pt cap height; p (anchor) is at the text baseline,
    # so baseline = arrow_y + 3.5 roughly centres the text on the arrow line.
    text_baseline_y = arrow_y + 3.5
    box_top = text_baseline_y - 8.0
    box_bottom = text_baseline_y + 2.0

    def add_caption(content, anchor_x, box_left, box_right, extra_attrs):
        # Append one <t>/<s> caption to the page at the shared baseline.
        label = ET.SubElement(page, "t")
        label.set("id", take_id())
        label.set("p", f"{anchor_x:.2f} {text_baseline_y:.2f}")
        label.set(
            "BoundingBox",
            f"{box_left:.2f} {box_top:.2f} {box_right:.2f} {box_bottom:.2f}",
        )
        for key, value in extra_attrs:
            label.set(key, value)
        label.set("InterpretChemically", "no")
        run = ET.SubElement(label, "s")
        run.set("font", "3")
        run.set("size", "10")
        run.set("face", "0")
        run.text = content

    # Left caption: SM mass, right-justified against the arrow tail.
    if sm_mass:
        left_width = len(sm_mass) * 5.8
        left_anchor = arrow_x1 - 5.0
        add_caption(
            sm_mass,
            left_anchor,
            left_anchor - left_width,
            left_anchor,
            (("Justification", "Right"), ("CaptionJustification", "Right")),
        )

    # Right caption: "obtained, yield" (comma-separated, no parentheses).
    right_text_parts = []
    if product_obtained:
        right_text_parts.append(product_obtained)
    if product_yield:
        # Remove spaces around the percent sign (e.g. "72 %" -> "72%").
        right_text_parts.append(
            product_yield.replace(" %", "%").replace("% ", "%")
        )
    if right_text_parts:
        right_text = ", ".join(right_text_parts)
        right_width = len(right_text) * 5.8
        right_anchor = arrow_x2 + 5.0
        add_caption(
            right_text,
            right_anchor,
            right_anchor,
            right_anchor + right_width,
            (),
        )

    write_cdxml(tree, cdxml_path)
    _log(f" Run arrow added: {sm_mass} -> {' '.join(right_text_parts)}")
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
# ---------------------------------------------------------------------------
|
|
980
|
+
# CLI
|
|
981
|
+
# ---------------------------------------------------------------------------
|
|
982
|
+
|
|
983
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
|
|
984
|
+
p = argparse.ArgumentParser(
|
|
985
|
+
description="Build CDXML reaction scheme from reaction JSON "
|
|
986
|
+
"(experimental).",
|
|
987
|
+
)
|
|
988
|
+
p.add_argument("input", help="Reaction JSON file (from reaction_parser)")
|
|
989
|
+
p.add_argument("-o", "--output", default=None,
|
|
990
|
+
help="Output CDXML file (default: {stem}-scheme.cdxml)")
|
|
991
|
+
p.add_argument("--approach", default="chemdraw_mimic",
|
|
992
|
+
choices=["chemdraw_mimic", "compact", "bbox_center",
|
|
993
|
+
"arrow_driven", "proportional", "golden_ratio"],
|
|
994
|
+
help="Layout approach (default: chemdraw_mimic)")
|
|
995
|
+
p.add_argument("--align-mode", default="rdkit",
|
|
996
|
+
choices=["rdkit", "rxnmapper", "kabsch", "none"],
|
|
997
|
+
help="Alignment strategy (default: rdkit)")
|
|
998
|
+
p.add_argument("--no-run-arrow", action="store_true",
|
|
999
|
+
help="Skip run arrow even if ELN data is available")
|
|
1000
|
+
p.add_argument("-v", "--verbose", action="store_true")
|
|
1001
|
+
p.add_argument("--json-errors", action="store_true",
|
|
1002
|
+
help="Structured JSON errors to stderr")
|
|
1003
|
+
return p
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
def main() -> None:
    """Run the scheme builder from the command line.

    Parses the arguments, checks that the input file exists, calls
    ``build_scheme`` and prints the resulting output path.  A missing input
    file or any exception from the build is reported on stderr, either as a
    JSON object (with ``--json-errors``) or as an ``ERROR: ...`` line, and
    the process exits with status 1.  With ``--verbose`` a traceback is
    printed as well when the build fails.
    """
    args = _build_arg_parser().parse_args()

    def report(code: str, detail: str) -> None:
        # Single place that decides between JSON and plain-text errors.
        if args.json_errors:
            print(json.dumps({"error": code, "detail": detail}),
                  file=sys.stderr)
        else:
            print(f"ERROR: {detail}", file=sys.stderr)

    if not os.path.isfile(args.input):
        report("file_not_found", f"Input file not found: {args.input}")
        sys.exit(1)

    try:
        result_path = build_scheme(
            input_path=args.input,
            output=args.output,
            approach=args.approach,
            align_mode=args.align_mode,
            run_arrow=not args.no_run_arrow,
            verbose=args.verbose,
        )
        print(f"Output: {result_path}")
    except Exception as exc:
        report("scheme_build_failed", str(exc))
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
if __name__ == "__main__":
|
|
1043
|
+
main()
|