cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2103 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
reaction_from_image.py — Build a full ChemDraw reaction scheme from a screenshot.
|
|
4
|
+
|
|
5
|
+
Takes a screenshot of a reaction scheme (e.g. from SciFinder, a paper, or a patent)
|
|
6
|
+
and produces a CDXML reaction scheme with proper arrow, conditions text, and
|
|
7
|
+
ACS Document 1996 styling.
|
|
8
|
+
|
|
9
|
+
Architecture
|
|
10
|
+
------------
|
|
11
|
+
This is an *orchestration* tool. It does NOT try to auto-detect arrows or OCR
|
|
12
|
+
conditions text from the image — that is unreliable and unnecessary. Instead,
|
|
13
|
+
an LLM (or user) looks at the screenshot and provides a small JSON descriptor
|
|
14
|
+
that tells the tool:
|
|
15
|
+
|
|
16
|
+
1. Which detected structures are reactants vs products (by left-to-right index)
|
|
17
|
+
2. What conditions text goes above / below the arrow
|
|
18
|
+
|
|
19
|
+
The tool then:
|
|
20
|
+
a. Extracts molecular structures from the image via DECIMER
|
|
21
|
+
(delegates to structure_from_image.py)
|
|
22
|
+
b. Assigns them as reactants or products per the descriptor
|
|
23
|
+
c. Lays out the molecules, arrow, and conditions text
|
|
24
|
+
d. Builds a valid CDXML document via cdxml_builder.py
|
|
25
|
+
e. Applies subscript formatting to chemical formulae in conditions
|
|
26
|
+
|
|
27
|
+
Usage
|
|
28
|
+
-----
|
|
29
|
+
Minimal (LLM provides JSON descriptor on stdin):
|
|
30
|
+
python reaction_from_image.py --image scheme.png --descriptor desc.json -o scheme.cdxml
|
|
31
|
+
|
|
32
|
+
Descriptor JSON format:
|
|
33
|
+
{
|
|
34
|
+
"reactant_indices": [0, 1],
|
|
35
|
+
"product_indices": [2],
|
|
36
|
+
"conditions_above": ["Pd2dba3", "BINAP"],
|
|
37
|
+
"conditions_below": ["Dioxane", "24 h, reflux"]
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
- reactant_indices / product_indices refer to the left-to-right order of
|
|
41
|
+
detected structures (0-indexed). The tool extracts all structures first,
|
|
42
|
+
then assigns roles based on these indices.
|
|
43
|
+
- conditions_above / conditions_below are plain text strings.
|
|
44
|
+
- If a condition string matches a known abbreviation (e.g. "Cs2CO3"),
|
|
45
|
+
it is kept verbatim. Unknown abbreviations are reproduced as-is
|
|
46
|
+
(we never trust LLM-generated SMILES for reagents).
|
|
47
|
+
|
|
48
|
+
Abbreviation dictionary
|
|
49
|
+
-----------------------
|
|
50
|
+
A curated dictionary maps common reagent/ligand/catalyst abbreviations to
|
|
51
|
+
themselves (for subscript formatting) or to display names. This is NOT
|
|
52
|
+
for structure resolution — it's only to decide whether a name is "known"
|
|
53
|
+
and how to display it. If an abbreviation isn't in the dictionary, the
|
|
54
|
+
exact text from the descriptor is used verbatim.
|
|
55
|
+
|
|
56
|
+
The dictionary lives in ABBREVIATIONS below and can be extended over time.
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
import argparse
|
|
60
|
+
import json
|
|
61
|
+
import math
|
|
62
|
+
import os
|
|
63
|
+
import re
|
|
64
|
+
import sys
|
|
65
|
+
from copy import deepcopy
|
|
66
|
+
from typing import Dict, List, Optional, Tuple
|
|
67
|
+
from xml.sax.saxutils import escape as xml_escape
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Shared reagent database
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
from ..resolve.reagent_db import get_reagent_db
|
|
74
|
+
from ..text_formatting import needs_subscript, build_formatted_s_xml
|
|
75
|
+
from ..constants import (
|
|
76
|
+
ACS_BOND_LENGTH, EXPAND_SCALE_BOND,
|
|
77
|
+
CDXML_HEADER, CDXML_FOOTER,
|
|
78
|
+
ACS_LABEL_FONT, ACS_LABEL_SIZE, ACS_LABEL_FACE,
|
|
79
|
+
ACS_CAPTION_SIZE, ACS_HASH_SPACING, ACS_MARGIN_WIDTH,
|
|
80
|
+
ACS_LINE_WIDTH, ACS_BOLD_WIDTH, ACS_BOND_LENGTH_STR,
|
|
81
|
+
ACS_BOND_SPACING, ACS_CHAIN_ANGLE_STR,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Resolve abbreviation display text
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def resolve_abbreviation(text: str) -> str:
    """Return the display form of *text* as reported by the reagent database.

    The lookup is delegated entirely to ``resolve_display`` on the shared
    database returned by ``get_reagent_db()``.  Per the module contract,
    unknown abbreviations are expected to come back verbatim (nothing is
    invented or transformed here).
    """
    reagent_db = get_reagent_db()
    return reagent_db.resolve_display(text)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
# Condition classification: chemistry vs. non-chemistry text
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
def _is_non_chemistry_text(text: str) -> bool:
|
|
103
|
+
"""Return True if *text* is a non-chemistry condition string that should
|
|
104
|
+
**always** be rendered as a text label (never expanded to a structure).
|
|
105
|
+
|
|
106
|
+
Examples that return True:
|
|
107
|
+
"24 h, reflux", "120 °C", "rt", "overnight", "10 mol%",
|
|
108
|
+
"1.5 equiv", "N2", "Ar", "sealed tube"
|
|
109
|
+
"""
|
|
110
|
+
t = text.strip()
|
|
111
|
+
tl = t.lower()
|
|
112
|
+
|
|
113
|
+
# Temperature patterns
|
|
114
|
+
if re.search(r'-?\d+\s*°', t):
|
|
115
|
+
return True
|
|
116
|
+
if tl in ("rt", "room temperature", "room temp", "room temp."):
|
|
117
|
+
return True
|
|
118
|
+
|
|
119
|
+
# Time patterns
|
|
120
|
+
if re.search(r'\d+\s*(h|hr|hrs|min|d|days?)\b', tl):
|
|
121
|
+
return True
|
|
122
|
+
if tl in ("overnight", "o/n", "on"):
|
|
123
|
+
return True
|
|
124
|
+
|
|
125
|
+
# Percentage / equivalents / concentration
|
|
126
|
+
if re.search(r'\d+\s*(mol\s*)?%', tl):
|
|
127
|
+
return True
|
|
128
|
+
if re.search(r'[\d.]+\s*equiv', tl):
|
|
129
|
+
return True
|
|
130
|
+
if re.search(r'[\d.]+\s*M\b', t): # case-sensitive M
|
|
131
|
+
return True
|
|
132
|
+
|
|
133
|
+
# Physical conditions (single keywords)
|
|
134
|
+
_PHYS = {
|
|
135
|
+
"reflux", "sealed tube", "microwave", "mw", "ultrasound",
|
|
136
|
+
"sonication", "inert atmosphere", "dark", "hv", "light",
|
|
137
|
+
}
|
|
138
|
+
if tl in _PHYS:
|
|
139
|
+
return True
|
|
140
|
+
|
|
141
|
+
# Inert gas (very short abbreviations)
|
|
142
|
+
if tl in ("n2", "ar", "argon", "nitrogen"):
|
|
143
|
+
return True
|
|
144
|
+
|
|
145
|
+
# Compound phrases with comma → likely mixed ("24 h, reflux")
|
|
146
|
+
if "," in t:
|
|
147
|
+
return True
|
|
148
|
+
|
|
149
|
+
# "then ..." / step instructions
|
|
150
|
+
if tl.startswith("then "):
|
|
151
|
+
return True
|
|
152
|
+
|
|
153
|
+
return False
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
# Expand conditions: resolve names to structures (ChemScript → PubChem)
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
|
|
160
|
+
def _extract_fragment_from_cdxml(cdxml_str: str) -> Optional[Tuple[str, float, float, float, float]]:
|
|
161
|
+
"""Parse a CDXML string and extract the first <fragment> element XML +
|
|
162
|
+
its bounding box. Returns (frag_xml, xmin, ymin, xmax, ymax) or None."""
|
|
163
|
+
import xml.etree.ElementTree as ET
|
|
164
|
+
|
|
165
|
+
if not cdxml_str or "<CDXML" not in cdxml_str:
|
|
166
|
+
return None
|
|
167
|
+
root = ET.fromstring(cdxml_str)
|
|
168
|
+
page_el = root.find("page")
|
|
169
|
+
if page_el is None:
|
|
170
|
+
return None
|
|
171
|
+
frag_el = page_el.find("fragment")
|
|
172
|
+
if frag_el is None:
|
|
173
|
+
return None
|
|
174
|
+
|
|
175
|
+
frag_xml = ET.tostring(frag_el, encoding="unicode")
|
|
176
|
+
xmin, ymin, xmax, ymax = _measure_fragment_xml(frag_xml)
|
|
177
|
+
if xmin == xmax:
|
|
178
|
+
return None
|
|
179
|
+
return (frag_xml, xmin, ymin, xmax, ymax)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _scale_fragment_xml(
    frag_xml: str,
    scale: float,
    xmin: float, ymin: float, xmax: float, ymax: float,
) -> Tuple[str, float, float, float, float]:
    """Scale every ``p`` and ``BoundingBox`` attribute in *frag_xml* by
    *scale*, keeping the centre of the supplied box fixed.

    The rewrite is textual (regex substitution) and values are emitted with
    three decimals.  The box of the result is re-measured with
    ``_measure_fragment_xml``.

    Returns ``(scaled_xml, new_xmin, new_ymin, new_xmax, new_ymax)``.
    """
    pivot_x = (xmin + xmax) / 2.0
    pivot_y = (ymin + ymax) / 2.0

    def _rewrite_point(match: "re.Match") -> str:
        old_x = float(match.group(1))
        old_y = float(match.group(2))
        new_x = pivot_x + (old_x - pivot_x) * scale
        new_y = pivot_y + (old_y - pivot_y) * scale
        return f'p="{new_x:.3f} {new_y:.3f}"'

    def _rewrite_bbox(match: "re.Match") -> str:
        coords = [float(token) for token in match.group(1).split()]
        # Even positions are x values, odd positions are y values.
        moved = (
            pivot_x + (coords[0] - pivot_x) * scale,
            pivot_y + (coords[1] - pivot_y) * scale,
            pivot_x + (coords[2] - pivot_x) * scale,
            pivot_y + (coords[3] - pivot_y) * scale,
        )
        return 'BoundingBox="' + " ".join(f"{value:.3f}" for value in moved) + '"'

    with_points = re.sub(r'\bp="([-\d.]+)\s+([-\d.]+)"', _rewrite_point, frag_xml)
    scaled = re.sub(r'\bBoundingBox="((?:[-\d.]+ ?){4})"', _rewrite_bbox, with_points)
    new_xmin, new_ymin, new_xmax, new_ymax = _measure_fragment_xml(scaled)
    return scaled, new_xmin, new_ymin, new_xmax, new_ymax
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _resolve_condition_to_fragment(
    text: str,
    cs_bridge,
    verbose: bool = False,
) -> Optional[Tuple[str, float, float, float, float]]:
    """Try to turn one condition string into a drawable CDXML fragment.

    Steps, in order:
      1. Canonicalise *text* with ``resolve_abbreviation``; if the result is
         non-chemistry text, stop and return ``None``.
      2. Ask ``cs_bridge.name_to_cdxml`` for a structure and extract its
         first fragment.
      3. If that yields nothing, look the name up with
         ``resolve_name_to_smiles`` and convert the SMILES through
         ``cs_bridge.smiles_to_cdxml``.
      4. If still nothing, return ``None`` so the caller renders plain text.

    Any exception raised in steps 2 or 3 is logged (when *verbose*) and
    treated as "not resolved".  A fragment wider than ``EXPAND_MAX_WIDTH``
    is shrunk to exactly that width.

    Returns ``(fragment_xml, xmin, ymin, xmax, ymax)`` or ``None``.
    """
    def _debug(message: str):
        if verbose:
            print(f"[expand] {message}", file=sys.stderr)

    canonical = resolve_abbreviation(text)

    if _is_non_chemistry_text(canonical):
        _debug(f" '{canonical}' → non-chemistry text, keeping as label")
        return None

    resolved = None

    # Stage 1: ChemScript resolves the name directly.
    try:
        resolved = _extract_fragment_from_cdxml(cs_bridge.name_to_cdxml(canonical))
        if resolved is not None:
            _debug(f" '{canonical}' → ChemScript name OK")
    except Exception as exc:
        _debug(f" '{canonical}' → ChemScript name failed: {exc}")

    # Stage 2: name → SMILES lookup, then ChemScript builds the drawing.
    if resolved is None:
        try:
            from ..resolve.cas_resolver import resolve_name_to_smiles
            smiles = resolve_name_to_smiles(canonical)
            if smiles:
                _debug(f" '{canonical}' → PubChem SMILES: {smiles[:60]}")
                resolved = _extract_fragment_from_cdxml(cs_bridge.smiles_to_cdxml(smiles))
                if resolved is not None:
                    _debug(f" '{canonical}' → PubChem+ChemScript OK")
        except Exception as exc:
            _debug(f" '{canonical}' → PubChem fallback failed: {exc}")

    if resolved is None:
        _debug(f" '{canonical}' → unresolved, keeping as text label")
        return None

    # Keep condition structures from dominating the scheme.
    frag_xml, xmin, ymin, xmax, ymax = resolved
    width = xmax - xmin
    if width > EXPAND_MAX_WIDTH:
        factor = EXPAND_MAX_WIDTH / width
        frag_xml, xmin, ymin, xmax, ymax = _scale_fragment_xml(
            frag_xml, factor, xmin, ymin, xmax, ymax
        )
        _debug(f" '{canonical}' scaled to {factor:.2f}x (w={width:.1f} → {xmax - xmin:.1f})")

    return (frag_xml, xmin, ymin, xmax, ymax)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _resolve_all_conditions(
    conditions: List[str],
    cs_bridge,
    verbose: bool = False,
) -> List[Tuple[str, Optional[Tuple[str, float, float, float, float]]]]:
    """Resolve every condition string, preserving input order.

    Each output entry is ``(display_text, fragment_info_or_None)``:
    ``display_text`` comes from ``resolve_abbreviation`` and the fragment
    info from ``_resolve_condition_to_fragment`` (``None`` when the
    condition should stay a text label).
    """
    return [
        (
            resolve_abbreviation(entry),
            _resolve_condition_to_fragment(entry, cs_bridge, verbose),
        )
        for entry in conditions
    ]
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
# Fragment ID reassignment (prevent collisions with reactant/product IDs)
|
|
304
|
+
# ---------------------------------------------------------------------------
|
|
305
|
+
|
|
306
|
+
def _reassign_fragment_ids(frag_xml: str, ids: "_IDGen") -> Tuple[str, int]:
|
|
307
|
+
"""Rewrite all element IDs in *frag_xml* using *ids* so they are unique
|
|
308
|
+
within the overall CDXML document.
|
|
309
|
+
|
|
310
|
+
Returns ``(new_xml, top_level_fragment_id)``.
|
|
311
|
+
"""
|
|
312
|
+
import xml.etree.ElementTree as ET
|
|
313
|
+
|
|
314
|
+
root = ET.fromstring(frag_xml)
|
|
315
|
+
old_to_new: Dict[str, str] = {}
|
|
316
|
+
|
|
317
|
+
# First pass: assign new IDs
|
|
318
|
+
for el in root.iter():
|
|
319
|
+
old_id = el.get("id")
|
|
320
|
+
if old_id is not None:
|
|
321
|
+
new_id = str(ids.next())
|
|
322
|
+
old_to_new[old_id] = new_id
|
|
323
|
+
|
|
324
|
+
# Second pass: rewrite id, B (bond begin), E (bond end), Z
|
|
325
|
+
for el in root.iter():
|
|
326
|
+
for attr in ("id", "B", "E", "SupersededBy"):
|
|
327
|
+
val = el.get(attr)
|
|
328
|
+
if val and val in old_to_new:
|
|
329
|
+
el.set(attr, old_to_new[val])
|
|
330
|
+
# Z attribute also needs a unique value
|
|
331
|
+
if el.get("Z") is not None:
|
|
332
|
+
el.set("Z", str(ids.next()))
|
|
333
|
+
|
|
334
|
+
top_id = int(old_to_new.get(root.get("id", "0"), "0"))
|
|
335
|
+
new_xml = ET.tostring(root, encoding="unicode")
|
|
336
|
+
return new_xml, top_id
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ---------------------------------------------------------------------------
|
|
340
|
+
# Layout constants (ACS Document 1996)
|
|
341
|
+
# ---------------------------------------------------------------------------
|
|
342
|
+
|
|
343
|
+
# All values below are distances/coordinates in the same units as the CDXML
# coordinates written by this module (presumably points — TODO confirm).
INTER_MOL_GAP = 18.0           # horizontal gap between molecules on same side
ARROW_MARGIN = 20.0            # gap between molecules and arrow ends
ARROW_LENGTH = 80.0            # default arrow shaft length
PAGE_LEFT = 80.0               # left margin for first reactant
VERTICAL_CENTER = 500.0        # y-coordinate for vertical centre of the scheme
CONDITIONS_GAP_ABOVE = 10.0    # clear gap between bottom of above-text and arrow shaft
CONDITIONS_GAP_BELOW = 10.0    # clear gap between arrow shaft and top of below-text
CONDITIONS_LINE_HEIGHT = 12.0  # line height for conditions text
CONDITIONS_DESCENDER = 3.0     # extra space below baseline for descenders (g, p, y)

# Expanded conditions layout constants (conditions drawn as structures)
EXPAND_STRUCTURE_GAP = 10.0    # horizontal gap between adjacent condition structures
EXPAND_ABOVE_CLEARANCE = 12.0  # clearance from arrow shaft to bottom of above-structures
EXPAND_BELOW_CLEARANCE = 12.0  # clearance from arrow shaft to top of below-structures
EXPAND_MAX_WIDTH = 80.0        # max width for a single condition structure before scaling
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
# ---------------------------------------------------------------------------
|
|
361
|
+
# CDXML header helper (uses shared template from constants.py)
|
|
362
|
+
# ---------------------------------------------------------------------------
|
|
363
|
+
|
|
364
|
+
def _format_cdxml_header(bbox: str) -> str:
    """Fill the shared ``CDXML_HEADER`` template.

    *bbox* is substituted for the ``bbox`` field; every other field is taken
    from the ACS Document 1996 style constants imported by this module.
    """
    template_fields = {
        "bbox": bbox,
        "label_font": ACS_LABEL_FONT,
        "label_size": ACS_LABEL_SIZE,
        "label_face": ACS_LABEL_FACE,
        "caption_size": ACS_CAPTION_SIZE,
        "hash_spacing": ACS_HASH_SPACING,
        "margin_width": ACS_MARGIN_WIDTH,
        "line_width": ACS_LINE_WIDTH,
        "bold_width": ACS_BOLD_WIDTH,
        "bond_length": ACS_BOND_LENGTH_STR,
        "bond_spacing": ACS_BOND_SPACING,
        "chain_angle": ACS_CHAIN_ANGLE_STR,
    }
    return CDXML_HEADER.format(**template_fields)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# ---------------------------------------------------------------------------
|
|
383
|
+
# ID generator
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
class _IDGen:
|
|
387
|
+
"""Simple incrementing integer ID generator."""
|
|
388
|
+
def __init__(self, start: int = 1000):
|
|
389
|
+
self._n = start
|
|
390
|
+
|
|
391
|
+
def next(self) -> int:
|
|
392
|
+
v = self._n
|
|
393
|
+
self._n += 1
|
|
394
|
+
return v
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# ---------------------------------------------------------------------------
|
|
398
|
+
# Molecule bounding box helpers
|
|
399
|
+
# ---------------------------------------------------------------------------
|
|
400
|
+
|
|
401
|
+
def _mol_extent(mol: Dict) -> Tuple[float, float, float, float]:
|
|
402
|
+
"""Return (min_x, min_y, max_x, max_y) of atoms."""
|
|
403
|
+
xs = [a["x"] for a in mol["atoms"]]
|
|
404
|
+
ys = [a["y"] for a in mol["atoms"]]
|
|
405
|
+
return min(xs), min(ys), max(xs), max(ys)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _translate_mol(mol: Dict, dx: float, dy: float) -> Dict:
|
|
409
|
+
"""Translate all atom coordinates by (dx, dy). Returns a new dict."""
|
|
410
|
+
mol = deepcopy(mol)
|
|
411
|
+
for a in mol["atoms"]:
|
|
412
|
+
a["x"] += dx
|
|
413
|
+
a["y"] += dy
|
|
414
|
+
return mol
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
# ---------------------------------------------------------------------------
|
|
418
|
+
# Fragment (molecule) XML builder — adapted from cdxml_builder.py
|
|
419
|
+
# ---------------------------------------------------------------------------
|
|
420
|
+
|
|
421
|
+
# Atomic numbers for the element symbols this builder can tag with an
# ``Element`` attribute.  _build_fragment omits ``Element`` for any symbol
# missing from this table (lookup falls back to 0).
ELEMENT_NUMBERS: Dict[str, int] = {
    "H": 1, "B": 5, "C": 6, "N": 7, "O": 8,
    "F": 9, "Si": 14, "P": 15, "S": 16, "Cl": 17,
    "Se": 34, "Br": 35, "I": 53, "Cs": 55,
}

# Symbols whose label box is 7.0 wide instead of 6.0 (see _label_bbox) and
# whose label additionally gets LabelAlignment="Left" in _build_fragment.
WIDE_SYMBOLS = {"Br", "Cl", "Si", "Se", "Cs"}

# Bond "order" value -> text of the CDXML ``Order`` attribute.  ``None``
# means the attribute is left out.  Order 4 -> "1.5" is presumably the
# aromatic bond code — TODO confirm against the upstream extractor.
BOND_ORDER_ATTR: Dict[int, Optional[str]] = {
    1: None, 2: "2", 3: "3", 4: "1.5",
}

# Bond "cfg" value -> CDXML ``Display`` attribute.
# NOTE(review): if cfg follows the MDL molfile bond-stereo codes
# (1 = up, 4 = either, 6 = down) — TODO confirm — then mapping 4 to
# "WedgeBegin" looks suspicious; verify it is intended.
BOND_STEREO_ATTR: Dict[int, str] = {
    1: "WedgeBegin", 4: "WedgeBegin", 6: "WedgedHashBegin",
}
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _label_bbox(x: float, y: float, symbol: str) -> str:
    """Return a ``"left top right bottom"`` box string for an atom label.

    The box is centred horizontally on *x*, is 7.0 wide for symbols in
    ``WIDE_SYMBOLS`` and 6.0 otherwise, and spans from ``y - 7.52`` to *y*.
    All four values are formatted with two decimals.
    """
    width = 7.0 if symbol in WIDE_SYMBOLS else 6.0
    left = x - width / 2.0
    right = left + width
    top = y - 7.52
    return " ".join(f"{value:.2f}" for value in (left, top, right, y))
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _build_fragment(
|
|
449
|
+
atoms: List[Dict],
|
|
450
|
+
bonds: List[Dict],
|
|
451
|
+
ids: _IDGen,
|
|
452
|
+
) -> Tuple[str, int]:
|
|
453
|
+
"""Build a <fragment> XML string. Returns (xml_string, fragment_id)."""
|
|
454
|
+
frag_id = ids.next()
|
|
455
|
+
atom_id_map: Dict[int, int] = {}
|
|
456
|
+
|
|
457
|
+
xs = [a["x"] for a in atoms]
|
|
458
|
+
ys = [a["y"] for a in atoms]
|
|
459
|
+
bb = f"{min(xs):.2f} {min(ys):.2f} {max(xs):.2f} {max(ys):.2f}"
|
|
460
|
+
|
|
461
|
+
lines = [f'<fragment id="{frag_id}" BoundingBox="{bb}" Z="{ids.next()}">']
|
|
462
|
+
|
|
463
|
+
for a in atoms:
|
|
464
|
+
aid = ids.next()
|
|
465
|
+
atom_id_map[a["index"]] = aid
|
|
466
|
+
sym = a.get("symbol", "C")
|
|
467
|
+
ax, ay = a["x"], a["y"]
|
|
468
|
+
z = ids.next()
|
|
469
|
+
attrs = [f'id="{aid}"', f'p="{ax:.2f} {ay:.2f}"', f'Z="{z}"']
|
|
470
|
+
|
|
471
|
+
is_carbon = (sym == "C")
|
|
472
|
+
charge = a.get("charge", 0)
|
|
473
|
+
|
|
474
|
+
if not is_carbon:
|
|
475
|
+
el_num = ELEMENT_NUMBERS.get(sym, 0)
|
|
476
|
+
if el_num:
|
|
477
|
+
attrs.append(f'Element="{el_num}"')
|
|
478
|
+
nh = a.get("num_hydrogens", 0)
|
|
479
|
+
attrs.append(f'NumHydrogens="{nh}"')
|
|
480
|
+
attrs.append('NeedsClean="yes"')
|
|
481
|
+
attrs.append('AS="N"')
|
|
482
|
+
|
|
483
|
+
if charge:
|
|
484
|
+
attrs.append(f'Charge="{charge}"')
|
|
485
|
+
|
|
486
|
+
cfg = a.get("cfg", 0)
|
|
487
|
+
if cfg:
|
|
488
|
+
attrs.append(f'Stereo="{cfg}"')
|
|
489
|
+
|
|
490
|
+
if is_carbon and not charge:
|
|
491
|
+
lines.append(f'<n {" ".join(attrs)}/>')
|
|
492
|
+
else:
|
|
493
|
+
lx = ax - 3.25
|
|
494
|
+
ly = ay + 3.52
|
|
495
|
+
bbox = _label_bbox(ax, ay, sym)
|
|
496
|
+
label_text = xml_escape(sym)
|
|
497
|
+
label_align = ""
|
|
498
|
+
if sym in WIDE_SYMBOLS:
|
|
499
|
+
label_align = ' LabelAlignment="Left"'
|
|
500
|
+
lines.append(f'<n {" ".join(attrs)}>')
|
|
501
|
+
lines.append(
|
|
502
|
+
f'<t p="{lx:.2f} {ly:.2f}" BoundingBox="{bbox}" '
|
|
503
|
+
f'LabelJustification="Left"{label_align}>'
|
|
504
|
+
)
|
|
505
|
+
lines.append(
|
|
506
|
+
f'<s font="3" size="10" color="0" face="96">{label_text}</s>'
|
|
507
|
+
)
|
|
508
|
+
lines.append("</t>")
|
|
509
|
+
lines.append("</n>")
|
|
510
|
+
|
|
511
|
+
for b in bonds:
|
|
512
|
+
bid = ids.next()
|
|
513
|
+
z = ids.next()
|
|
514
|
+
a1 = atom_id_map.get(b["atom1"], 0)
|
|
515
|
+
a2 = atom_id_map.get(b["atom2"], 0)
|
|
516
|
+
order = b.get("order", 1)
|
|
517
|
+
cfg = b.get("cfg", 0)
|
|
518
|
+
attrs = [f'id="{bid}"', f'Z="{z}"', f'B="{a1}"', f'E="{a2}"']
|
|
519
|
+
|
|
520
|
+
order_attr = BOND_ORDER_ATTR.get(order)
|
|
521
|
+
if order_attr:
|
|
522
|
+
attrs.append(f'Order="{order_attr}"')
|
|
523
|
+
|
|
524
|
+
double_pos = b.get("double_pos", "")
|
|
525
|
+
if double_pos:
|
|
526
|
+
attrs.append(f'DoublePosition="{double_pos}"')
|
|
527
|
+
|
|
528
|
+
if cfg and cfg in BOND_STEREO_ATTR:
|
|
529
|
+
attrs.append(f'Display="{BOND_STEREO_ATTR[cfg]}"')
|
|
530
|
+
elif order == 1:
|
|
531
|
+
attrs.append('BS="N"')
|
|
532
|
+
|
|
533
|
+
lines.append(f'<b {" ".join(attrs)}/>')
|
|
534
|
+
|
|
535
|
+
lines.append("</fragment>")
|
|
536
|
+
return "\n".join(lines), frag_id
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
# ---------------------------------------------------------------------------
|
|
540
|
+
# Arrow XML builder
|
|
541
|
+
# ---------------------------------------------------------------------------
|
|
542
|
+
|
|
543
|
+
def _build_arrow(
|
|
544
|
+
tail_x: float, tail_y: float,
|
|
545
|
+
head_x: float, head_y: float,
|
|
546
|
+
ids: _IDGen,
|
|
547
|
+
) -> Tuple[str, int]:
|
|
548
|
+
"""Build an <arrow> element. Returns (xml_string, arrow_id)."""
|
|
549
|
+
aid = ids.next()
|
|
550
|
+
z = ids.next()
|
|
551
|
+
bx1 = min(tail_x, head_x)
|
|
552
|
+
by1 = min(tail_y, head_y) - 4.0
|
|
553
|
+
bx2 = max(tail_x, head_x)
|
|
554
|
+
by2 = max(tail_y, head_y) + 4.0
|
|
555
|
+
cx3 = (tail_x + head_x) / 2.0
|
|
556
|
+
cy3 = tail_y + 100.0
|
|
557
|
+
xml = (
|
|
558
|
+
f'<arrow id="{aid}" '
|
|
559
|
+
f'BoundingBox="{bx1:.2f} {by1:.2f} {bx2:.2f} {by2:.2f}" '
|
|
560
|
+
f'Z="{z}" '
|
|
561
|
+
f'FillType="None" '
|
|
562
|
+
f'ArrowheadHead="Full" '
|
|
563
|
+
f'ArrowheadType="Solid" '
|
|
564
|
+
f'HeadSize="1000" '
|
|
565
|
+
f'ArrowheadCenterSize="875" '
|
|
566
|
+
f'ArrowheadWidth="250" '
|
|
567
|
+
f'Head3D="{head_x:.2f} {head_y:.2f} 0" '
|
|
568
|
+
f'Tail3D="{tail_x:.2f} {tail_y:.2f} 0" '
|
|
569
|
+
f'Center3D="{cx3:.2f} {cy3:.2f} 0" '
|
|
570
|
+
f'MajorAxisEnd3D="{cx3 + 80:.2f} {cy3:.2f} 0" '
|
|
571
|
+
f'MinorAxisEnd3D="{cx3:.2f} {cy3 + 80:.2f} 0"'
|
|
572
|
+
f'/>'
|
|
573
|
+
)
|
|
574
|
+
return xml, aid
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
# ---------------------------------------------------------------------------
|
|
578
|
+
# Conditions text XML builder (with subscript support)
|
|
579
|
+
# ---------------------------------------------------------------------------
|
|
580
|
+
|
|
581
|
+
def _build_conditions_text(
    text_lines: List[str],
    x: float,
    baseline_y: float,
    ids: _IDGen,
) -> Tuple[str, int]:
    """Build a <t> element for conditions text above or below the arrow.

    Each entry in text_lines becomes one line.  Chemical formulae get
    subscript formatting (e.g. Pd2dba3 → Pd₂dba₃).

    Parameters
    ----------
    text_lines : lines of text, top to bottom
    x : horizontal centre of the text block
    baseline_y : y-coordinate for the baseline of the FIRST line
                 (CDXML <t> p="x y" uses first-line baseline)
    ids : shared ID generator; two values are drawn (element id, then Z)

    Returns (xml_string, text_element_id).
    """
    tid = ids.next()
    z = ids.next()

    # Estimate the bounding box from character counts.  An empty list is
    # treated as one (empty) line, so the box never ends up with its
    # bottom edge above its top edge.
    max_chars = max((len(ln) for ln in text_lines), default=1)
    n_lines = max(len(text_lines), 1)
    w = max_chars * 5.8
    ascender = 8.0  # approximate ascender height above baseline

    bx1 = x - w / 2.0
    by1 = baseline_y - ascender
    bx2 = x + w / 2.0
    by2 = baseline_y + (n_lines - 1) * CONDITIONS_LINE_HEIGHT + CONDITIONS_DESCENDER

    # If ANY line needs subscripts, every line is rendered through
    # build_formatted_s_xml, with a plain "\n" run between lines.
    # Otherwise all lines share a single <s> block.
    if any(needs_subscript(ln) for ln in text_lines):
        s_parts = []
        for i, ln in enumerate(text_lines):
            if i > 0:
                # Newline between lines — plain text <s>
                s_parts.append(
                    '<s font="3" size="10" color="0" face="96">\n</s>'
                )
            s_parts.append(build_formatted_s_xml(ln))
        s_xml = "".join(s_parts)
    else:
        text = "\n".join(xml_escape(ln) for ln in text_lines)
        s_xml = f'<s font="3" size="10" color="0" face="96">{text}</s>'

    xml = (
        f'<t id="{tid}" p="{x:.2f} {baseline_y:.2f}" '
        f'BoundingBox="{bx1:.2f} {by1:.2f} {bx2:.2f} {by2:.2f}" '
        f'Z="{z}" '
        f'CaptionJustification="Center" '
        f'Justification="Center" '
        f'LineHeight="auto">'
        f'{s_xml}'
        f'</t>'
    )
    return xml, tid
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
# ---------------------------------------------------------------------------
|
|
651
|
+
# Layout expanded conditions (structures + text) above/below arrow
|
|
652
|
+
# ---------------------------------------------------------------------------
|
|
653
|
+
|
|
654
|
+
# One entry per condition: (display_text, fragment_info_or_None), where
# fragment_info is (fragment_xml, xmin, ymin, xmax, ymax).
ExpandedItems = List[Tuple[str, Optional[Tuple[str, float, float, float, float]]]]
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
def _layout_expanded_conditions(
|
|
658
|
+
resolved_items: "ExpandedItems",
|
|
659
|
+
arrow_tail_x: float,
|
|
660
|
+
arrow_head_x: float,
|
|
661
|
+
arrow_y: float,
|
|
662
|
+
position: str, # "above" or "below"
|
|
663
|
+
ids: "_IDGen",
|
|
664
|
+
verbose: bool = False,
|
|
665
|
+
) -> Tuple[List[str], List[int]]:
|
|
666
|
+
"""Position resolved condition items (structures + text labels) above or
|
|
667
|
+
below the arrow, arranged horizontally.
|
|
668
|
+
|
|
669
|
+
Parameters
|
|
670
|
+
----------
|
|
671
|
+
resolved_items : list of (display_text, fragment_info_or_None)
|
|
672
|
+
arrow_tail_x, arrow_head_x : arrow horizontal extent
|
|
673
|
+
arrow_y : arrow y-coordinate
|
|
674
|
+
position : "above" or "below"
|
|
675
|
+
ids : shared ID generator
|
|
676
|
+
verbose : print debug info
|
|
677
|
+
|
|
678
|
+
Returns
|
|
679
|
+
-------
|
|
680
|
+
(xml_strings, element_ids)
|
|
681
|
+
"""
|
|
682
|
+
def log(msg: str):
|
|
683
|
+
if verbose:
|
|
684
|
+
print(f"[expand-layout] {msg}", file=sys.stderr)
|
|
685
|
+
|
|
686
|
+
if not resolved_items:
|
|
687
|
+
return [], []
|
|
688
|
+
|
|
689
|
+
arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0
|
|
690
|
+
|
|
691
|
+
# --- Compute widths and heights for each item ---
|
|
692
|
+
item_infos = [] # (width, height, is_structure)
|
|
693
|
+
for display_text, frag_info in resolved_items:
|
|
694
|
+
if frag_info is not None:
|
|
695
|
+
_xml, xmin, ymin, xmax, ymax = frag_info
|
|
696
|
+
w = xmax - xmin
|
|
697
|
+
h = ymax - ymin
|
|
698
|
+
item_infos.append((w, h, True))
|
|
699
|
+
else:
|
|
700
|
+
# Estimate text width
|
|
701
|
+
w = max(len(display_text) * 5.8, 20.0)
|
|
702
|
+
h = 12.0 # single-line text height
|
|
703
|
+
item_infos.append((w, h, False))
|
|
704
|
+
|
|
705
|
+
n = len(item_infos)
|
|
706
|
+
total_width = sum(info[0] for info in item_infos) + (n - 1) * EXPAND_STRUCTURE_GAP
|
|
707
|
+
max_height = max(info[1] for info in item_infos)
|
|
708
|
+
|
|
709
|
+
# Starting x so the row is centered over the arrow midpoint
|
|
710
|
+
start_x = arrow_mid_x - total_width / 2.0
|
|
711
|
+
|
|
712
|
+
# --- Compute vertical anchor ---
|
|
713
|
+
if position == "above":
|
|
714
|
+
# Bottom edge of all items at arrow_y - clearance
|
|
715
|
+
items_bottom_y = arrow_y - EXPAND_ABOVE_CLEARANCE
|
|
716
|
+
else:
|
|
717
|
+
# Top edge of all items at arrow_y + clearance
|
|
718
|
+
items_top_y = arrow_y + EXPAND_BELOW_CLEARANCE
|
|
719
|
+
|
|
720
|
+
# --- Place each item ---
|
|
721
|
+
xml_parts: List[str] = []
|
|
722
|
+
id_parts: List[int] = []
|
|
723
|
+
cursor_x = start_x
|
|
724
|
+
|
|
725
|
+
for i, (display_text, frag_info) in enumerate(resolved_items):
|
|
726
|
+
w, h, is_struct = item_infos[i]
|
|
727
|
+
|
|
728
|
+
if is_struct and frag_info is not None:
|
|
729
|
+
frag_xml, xmin, ymin, xmax, ymax = frag_info
|
|
730
|
+
frag_cx = (xmin + xmax) / 2.0
|
|
731
|
+
frag_cy = (ymin + ymax) / 2.0
|
|
732
|
+
target_cx = cursor_x + w / 2.0
|
|
733
|
+
|
|
734
|
+
if position == "above":
|
|
735
|
+
# Place so fragment's ymax = items_bottom_y
|
|
736
|
+
target_cy = items_bottom_y - h / 2.0
|
|
737
|
+
else:
|
|
738
|
+
# Place so fragment's ymin = items_top_y
|
|
739
|
+
target_cy = items_top_y + h / 2.0
|
|
740
|
+
|
|
741
|
+
dx = target_cx - frag_cx
|
|
742
|
+
dy = target_cy - frag_cy
|
|
743
|
+
translated = _translate_fragment_xml(frag_xml, dx, dy)
|
|
744
|
+
final_xml, frag_id = _reassign_fragment_ids(translated, ids)
|
|
745
|
+
xml_parts.append(final_xml)
|
|
746
|
+
id_parts.append(frag_id)
|
|
747
|
+
log(f" Structure '{display_text}' at cx={target_cx:.1f} cy={target_cy:.1f}")
|
|
748
|
+
|
|
749
|
+
else:
|
|
750
|
+
# Text label fallback
|
|
751
|
+
text_cx = cursor_x + w / 2.0
|
|
752
|
+
if position == "above":
|
|
753
|
+
ascender = 8.0
|
|
754
|
+
baseline_y = items_bottom_y - ascender
|
|
755
|
+
else:
|
|
756
|
+
ascender = 8.0
|
|
757
|
+
baseline_y = items_top_y + ascender
|
|
758
|
+
|
|
759
|
+
txt_xml, txt_id = _build_conditions_text(
|
|
760
|
+
[display_text], text_cx, baseline_y, ids
|
|
761
|
+
)
|
|
762
|
+
xml_parts.append(txt_xml)
|
|
763
|
+
id_parts.append(txt_id)
|
|
764
|
+
log(f" Text '{display_text}' at cx={text_cx:.1f} baseline={baseline_y:.1f}")
|
|
765
|
+
|
|
766
|
+
cursor_x += w + EXPAND_STRUCTURE_GAP
|
|
767
|
+
|
|
768
|
+
return xml_parts, id_parts
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
# ---------------------------------------------------------------------------
|
|
772
|
+
# Core: build reaction scheme CDXML
|
|
773
|
+
# ---------------------------------------------------------------------------
|
|
774
|
+
|
|
775
|
+
def build_reaction_scheme(
    structures: List[Dict],
    reactant_indices: List[int],
    product_indices: List[int],
    conditions_above: List[str],
    conditions_below: List[str],
    verbose: bool = False,
    expanded_above: Optional["ExpandedItems"] = None,
    expanded_below: Optional["ExpandedItems"] = None,
) -> str:
    """
    Assemble a CDXML reaction scheme from extracted structures + descriptor.

    Parameters
    ----------
    structures : list of structure dicts from extract_structures_from_image
        Each selected dict must have non-empty "atoms" (dicts with "x"/"y")
        and "bonds"; "smiles" is only used in error messages.
    reactant_indices : which structures (by index) are reactants
    product_indices : which structures (by index) are products
    conditions_above : text lines for above the arrow
    conditions_below : text lines for below the arrow
    verbose : print layout info to stderr
    expanded_above : pre-resolved expanded conditions for above (from --expand)
        When not None it replaces the ``conditions_above`` text label.
    expanded_below : pre-resolved expanded conditions for below (from --expand)
        When not None it replaces the ``conditions_below`` text label.

    Returns
    -------
    CDXML document string

    Raises
    ------
    ValueError
        If an index is out of range, a selected structure has no atoms, or
        no structure at all is placed (nothing to compute a bounding box
        from).
    """
    def log(msg: str):
        if verbose:
            print(f"[reaction_from_image] {msg}", file=sys.stderr)

    # Fail fast: without any molecule or expanded structure the bounding box
    # below would have nothing to measure.
    if (not reactant_indices and not product_indices
            and expanded_above is None and expanded_below is None):
        raise ValueError(
            "Cannot build a reaction scheme: no reactant or product "
            "structures were selected."
        )

    # Validate indices
    n = len(structures)
    for idx in reactant_indices + product_indices:
        if idx < 0 or idx >= n:
            raise ValueError(
                f"Structure index {idx} out of range (0–{n-1}). "
                f"Image yielded {n} structures."
            )

    # Separate reactant and product molecules
    reactant_mols = [structures[i] for i in reactant_indices]
    product_mols = [structures[i] for i in product_indices]

    # Check that all molecules have atoms
    for side, mols, label in [
        (reactant_indices, reactant_mols, "reactant"),
        (product_indices, product_mols, "product"),
    ]:
        for idx, mol in zip(side, mols):
            if not mol.get("atoms"):
                raise ValueError(
                    f"Structure {idx} ({label}) has no atoms — DECIMER may have "
                    f"failed on this region. SMILES: {mol.get('smiles', '(none)')}"
                )

    # ------------------------------------------------------------------
    # Layout: position molecules left-to-right
    #
    #   [Reactant1] [gap] [Reactant2] [margin] →arrow→ [margin] [Product1]
    #
    # All molecules are centred vertically at VERTICAL_CENTER.
    # ------------------------------------------------------------------

    # NOTE: the order of ID allocation below (fragments, conditions, arrow,
    # scheme, step, page) determines the ids in the output; keep it stable.
    ids = _IDGen(1000)
    cursor_x = PAGE_LEFT  # running x position

    # Position reactants
    positioned_reactants: List[Dict] = []
    for mol in reactant_mols:
        x0, y0, x1, y1 = _mol_extent(mol)
        mol_w = x1 - x0
        # Translate: left edge to cursor_x, vertical centre to VERTICAL_CENTER
        dx = cursor_x - x0
        dy = VERTICAL_CENTER - (y0 + y1) / 2.0
        positioned_reactants.append(_translate_mol(mol, dx, dy))
        cursor_x += mol_w + INTER_MOL_GAP
        log(f"Reactant placed at x=[{cursor_x - mol_w - INTER_MOL_GAP:.1f}, {cursor_x - INTER_MOL_GAP:.1f}]")

    # Arrow position: undo the trailing inter-molecule gap, then add the margin
    arrow_tail_x = cursor_x - INTER_MOL_GAP + ARROW_MARGIN
    arrow_head_x = arrow_tail_x + ARROW_LENGTH
    arrow_y = VERTICAL_CENTER

    log(f"Arrow: tail={arrow_tail_x:.1f}, head={arrow_head_x:.1f}, y={arrow_y:.1f}")

    # Position products
    cursor_x = arrow_head_x + ARROW_MARGIN
    positioned_products: List[Dict] = []
    for mol in product_mols:
        x0, y0, x1, y1 = _mol_extent(mol)
        mol_w = x1 - x0
        dx = cursor_x - x0
        dy = VERTICAL_CENTER - (y0 + y1) / 2.0
        positioned_products.append(_translate_mol(mol, dx, dy))
        cursor_x += mol_w + INTER_MOL_GAP
        log(f"Product placed at x=[{cursor_x - mol_w - INTER_MOL_GAP:.1f}, {cursor_x - INTER_MOL_GAP:.1f}]")

    # Resolve abbreviations in conditions text
    resolved_above = [resolve_abbreviation(line) for line in conditions_above]
    resolved_below = [resolve_abbreviation(line) for line in conditions_below]

    log(f"Conditions above: {resolved_above}")
    log(f"Conditions below: {resolved_below}")

    # ------------------------------------------------------------------
    # Build XML elements
    # ------------------------------------------------------------------

    fragment_xmls: List[str] = []
    reactant_frag_ids: List[int] = []
    product_frag_ids: List[int] = []

    for mol in positioned_reactants:
        frag_xml, frag_id = _build_fragment(mol["atoms"], mol["bonds"], ids)
        fragment_xmls.append(frag_xml)
        reactant_frag_ids.append(frag_id)

    for mol in positioned_products:
        frag_xml, frag_id = _build_fragment(mol["atoms"], mol["bonds"], ids)
        fragment_xmls.append(frag_xml)
        product_frag_ids.append(frag_id)

    # Conditions: expanded structures or text labels
    above_xmls: List[str] = []
    below_xmls: List[str] = []
    above_ids: List[int] = []
    below_ids: List[int] = []

    arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0

    if expanded_above is not None:
        # --expand mode: structures + text fallback
        ax, ai = _layout_expanded_conditions(
            expanded_above, arrow_tail_x, arrow_head_x, arrow_y,
            "above", ids, verbose,
        )
        above_xmls.extend(ax)
        above_ids.extend(ai)
    elif resolved_above:
        n_above = len(resolved_above)
        # Raise the first baseline so the LAST line clears the arrow.
        baseline_y = (arrow_y - CONDITIONS_GAP_ABOVE - CONDITIONS_DESCENDER
                      - (n_above - 1) * CONDITIONS_LINE_HEIGHT)
        txt_xml, txt_id = _build_conditions_text(
            resolved_above, arrow_mid_x, baseline_y, ids
        )
        above_xmls.append(txt_xml)
        above_ids.append(txt_id)

    if expanded_below is not None:
        bx, bi = _layout_expanded_conditions(
            expanded_below, arrow_tail_x, arrow_head_x, arrow_y,
            "below", ids, verbose,
        )
        below_xmls.extend(bx)
        below_ids.extend(bi)
    elif resolved_below:
        ascender = 8.0
        baseline_y = arrow_y + CONDITIONS_GAP_BELOW + ascender
        txt_xml, txt_id = _build_conditions_text(
            resolved_below, arrow_mid_x, baseline_y, ids
        )
        below_xmls.append(txt_xml)
        below_ids.append(txt_id)

    # Arrow
    arrow_xml, arrow_id = _build_arrow(
        arrow_tail_x, arrow_y, arrow_head_x, arrow_y, ids
    )

    # Scheme / step
    scheme_id = ids.next()
    step_id = ids.next()
    step_attrs = [
        f'id="{step_id}"',
        f'ReactionStepReactants="{" ".join(str(i) for i in reactant_frag_ids)}"',
        f'ReactionStepProducts="{" ".join(str(i) for i in product_frag_ids)}"',
        f'ReactionStepArrows="{arrow_id}"',
    ]
    if above_ids:
        step_attrs.append(
            f'ReactionStepObjectsAboveArrow="{" ".join(str(i) for i in above_ids)}"'
        )
    if below_ids:
        step_attrs.append(
            f'ReactionStepObjectsBelowArrow="{" ".join(str(i) for i in below_ids)}"'
        )

    scheme_xml = f'<scheme id="{scheme_id}"><step {" ".join(step_attrs)}/></scheme>'

    # ------------------------------------------------------------------
    # Compute overall bounding box
    # ------------------------------------------------------------------
    all_xs: List[float] = []
    all_ys: List[float] = []
    for mol in positioned_reactants + positioned_products:
        for a in mol["atoms"]:
            all_xs.append(a["x"])
            all_ys.append(a["y"])

    # Include expanded condition fragments in bounding box
    # (text labels are not measured; only <fragment> elements are).
    for frag_xml_str in above_xmls + below_xmls:
        if frag_xml_str.lstrip().startswith("<fragment"):
            fx0, fy0, fx1, fy1 = _measure_fragment_xml(frag_xml_str)
            if fx0 != fx1:
                all_xs.extend([fx0, fx1])
                all_ys.extend([fy0, fy1])

    if not all_xs:
        # Previously surfaced as "min() arg is an empty sequence".
        raise ValueError(
            "Cannot compute a bounding box: no reactant, product or expanded "
            "condition structure was placed."
        )

    x_lo, x_hi = min(all_xs), max(all_xs)
    y_lo, y_hi = min(all_ys), max(all_ys)

    margin = 20.0
    doc_bbox = (
        f"{x_lo - margin:.2f} {y_lo - margin:.2f} "
        f"{x_hi + margin:.2f} {y_hi + margin:.2f}"
    )

    # Page size — generous
    page_w = x_hi + 100
    page_h = y_hi + 100
    page_id = ids.next()

    # ------------------------------------------------------------------
    # Assemble document
    # ------------------------------------------------------------------
    parts = [
        _format_cdxml_header(doc_bbox),
        f'<page id="{page_id}" BoundingBox="0 0 {page_w:.0f} {page_h:.0f}" '
        f'HeaderPosition="36" FooterPosition="36" '
        f'PrintTrimMarks="yes" HeightPages="1" WidthPages="2">',
    ]
    parts.extend(fragment_xmls)
    parts.extend(above_xmls)
    parts.extend(below_xmls)
    parts.append(arrow_xml)
    parts.append(scheme_xml)
    parts.append("</page>")
    parts.append(CDXML_FOOTER)

    return "\n".join(parts)
|
|
1016
|
+
|
|
1017
|
+
|
|
1018
|
+
# ---------------------------------------------------------------------------
|
|
1019
|
+
# Fragment XML translation helper (for ChemScript fragments)
|
|
1020
|
+
# ---------------------------------------------------------------------------
|
|
1021
|
+
|
|
1022
|
+
def _translate_fragment_xml(frag_xml: str, dx: float, dy: float) -> str:
|
|
1023
|
+
"""Shift all coordinate attributes in a fragment XML string by (dx, dy).
|
|
1024
|
+
|
|
1025
|
+
Handles: p="x y" and BoundingBox="x1 y1 x2 y2"
|
|
1026
|
+
"""
|
|
1027
|
+
def shift_p(m: "re.Match") -> str:
|
|
1028
|
+
x, y = float(m.group(1)), float(m.group(2))
|
|
1029
|
+
return f'p="{x + dx:.3f} {y + dy:.3f}"'
|
|
1030
|
+
|
|
1031
|
+
def shift_bb(m: "re.Match") -> str:
|
|
1032
|
+
vals = [float(v) for v in m.group(1).split()]
|
|
1033
|
+
shifted = [
|
|
1034
|
+
f"{vals[0] + dx:.3f}", f"{vals[1] + dy:.3f}",
|
|
1035
|
+
f"{vals[2] + dx:.3f}", f"{vals[3] + dy:.3f}",
|
|
1036
|
+
]
|
|
1037
|
+
return f'BoundingBox="{" ".join(shifted)}"'
|
|
1038
|
+
|
|
1039
|
+
frag_xml = re.sub(r'\bp="([-\d.]+)\s+([-\d.]+)"', shift_p, frag_xml)
|
|
1040
|
+
frag_xml = re.sub(r'\bBoundingBox="((?:[-\d.]+ ?){4})"', shift_bb, frag_xml)
|
|
1041
|
+
return frag_xml
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
def _measure_fragment_xml(frag_xml: str) -> Tuple[float, float, float, float]:
|
|
1045
|
+
"""Measure (xmin, ymin, xmax, ymax) from all p="x y" attributes in fragment XML."""
|
|
1046
|
+
xs, ys = [], []
|
|
1047
|
+
for m in re.finditer(r'\bp="([-\d.]+)\s+([-\d.]+)"', frag_xml):
|
|
1048
|
+
xs.append(float(m.group(1)))
|
|
1049
|
+
ys.append(float(m.group(2)))
|
|
1050
|
+
if not xs:
|
|
1051
|
+
return (0, 0, 0, 0)
|
|
1052
|
+
return min(xs), min(ys), max(xs), max(ys)
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
def _best_smiles_component(smiles: str) -> str:
|
|
1056
|
+
"""For a multi-component SMILES (dot-separated), return the largest
|
|
1057
|
+
drug-like component (most heavy atoms, filtering out pure alkyne chains)."""
|
|
1058
|
+
components = smiles.split(".")
|
|
1059
|
+
if len(components) <= 1:
|
|
1060
|
+
return smiles
|
|
1061
|
+
|
|
1062
|
+
best = ""
|
|
1063
|
+
best_score = -1
|
|
1064
|
+
for comp in components:
|
|
1065
|
+
comp = comp.strip()
|
|
1066
|
+
if not comp:
|
|
1067
|
+
continue
|
|
1068
|
+
# Reject pure-alkyne chains
|
|
1069
|
+
if re.fullmatch(r'[C#]+', comp):
|
|
1070
|
+
continue
|
|
1071
|
+
# Score by number of heavy-atom characters
|
|
1072
|
+
score = sum(1 for c in comp if c.isalpha() and c.isupper())
|
|
1073
|
+
if score > best_score:
|
|
1074
|
+
best = comp
|
|
1075
|
+
best_score = score
|
|
1076
|
+
|
|
1077
|
+
return best or smiles
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
# ---------------------------------------------------------------------------
|
|
1081
|
+
# ChemScript cleanup: SMILES → ChemDraw-native fragment XML
|
|
1082
|
+
# ---------------------------------------------------------------------------
|
|
1083
|
+
|
|
1084
|
+
def _open_chemscript_bridge(verbose: bool = False):
|
|
1085
|
+
"""Import and open a ChemScriptBridge instance. Caller must call .close()."""
|
|
1086
|
+
import importlib.util
|
|
1087
|
+
_dir = os.path.dirname(os.path.abspath(__file__))
|
|
1088
|
+
try:
|
|
1089
|
+
spec = importlib.util.spec_from_file_location(
|
|
1090
|
+
"chemscript_bridge", os.path.join(_dir, "chemscript_bridge.py")
|
|
1091
|
+
)
|
|
1092
|
+
csb_mod = importlib.util.module_from_spec(spec)
|
|
1093
|
+
spec.loader.exec_module(csb_mod)
|
|
1094
|
+
except Exception as exc:
|
|
1095
|
+
raise ImportError(
|
|
1096
|
+
f"Could not import chemscript_bridge.py: {exc}\n"
|
|
1097
|
+
"ChemDraw and chemscript_bridge are required."
|
|
1098
|
+
) from exc
|
|
1099
|
+
if verbose:
|
|
1100
|
+
print("[reaction_from_image] Opening ChemScript bridge...",
|
|
1101
|
+
file=sys.stderr)
|
|
1102
|
+
return csb_mod.ChemScriptBridge()
|
|
1103
|
+
|
|
1104
|
+
|
|
1105
|
+
def _chemscript_fragment_xmls(
|
|
1106
|
+
structures: List[Dict],
|
|
1107
|
+
verbose: bool = False,
|
|
1108
|
+
) -> Dict[int, Tuple[str, float, float, float, float]]:
|
|
1109
|
+
"""
|
|
1110
|
+
For each structure with a valid SMILES, produce a ChemScript-cleaned
|
|
1111
|
+
fragment XML string + its bounding box.
|
|
1112
|
+
|
|
1113
|
+
Returns dict: structure_index → (fragment_xml, xmin, ymin, xmax, ymax)
|
|
1114
|
+
"""
|
|
1115
|
+
import xml.etree.ElementTree as ET
|
|
1116
|
+
|
|
1117
|
+
def log(msg: str):
|
|
1118
|
+
if verbose:
|
|
1119
|
+
print(f"[reaction_from_image] {msg}", file=sys.stderr)
|
|
1120
|
+
|
|
1121
|
+
cs = _open_chemscript_bridge(verbose)
|
|
1122
|
+
|
|
1123
|
+
result: Dict[int, Tuple[str, float, float, float, float]] = {}
|
|
1124
|
+
try:
|
|
1125
|
+
for i, entry in enumerate(structures):
|
|
1126
|
+
smiles = entry.get("smiles", "").strip()
|
|
1127
|
+
if not smiles:
|
|
1128
|
+
continue
|
|
1129
|
+
if "." in smiles:
|
|
1130
|
+
smiles = _best_smiles_component(smiles)
|
|
1131
|
+
|
|
1132
|
+
log(f" ChemScript [{i}]: {smiles[:60]}...")
|
|
1133
|
+
try:
|
|
1134
|
+
cdxml_str = cs.smiles_to_cdxml(smiles)
|
|
1135
|
+
except Exception as exc:
|
|
1136
|
+
log(f" ChemScript failed for [{i}]: {exc}")
|
|
1137
|
+
continue
|
|
1138
|
+
|
|
1139
|
+
if not cdxml_str or "<CDXML" not in cdxml_str:
|
|
1140
|
+
log(f" ChemScript returned empty CDXML for [{i}]")
|
|
1141
|
+
continue
|
|
1142
|
+
|
|
1143
|
+
# Parse and extract the first <fragment>
|
|
1144
|
+
root = ET.fromstring(cdxml_str)
|
|
1145
|
+
page_el = root.find("page")
|
|
1146
|
+
if page_el is None:
|
|
1147
|
+
continue
|
|
1148
|
+
frag_el = page_el.find("fragment")
|
|
1149
|
+
if frag_el is None:
|
|
1150
|
+
continue
|
|
1151
|
+
|
|
1152
|
+
frag_xml = ET.tostring(frag_el, encoding="unicode")
|
|
1153
|
+
|
|
1154
|
+
# Measure bounding box from atom positions
|
|
1155
|
+
xmin, ymin, xmax, ymax = _measure_fragment_xml(frag_xml)
|
|
1156
|
+
if xmin == xmax:
|
|
1157
|
+
continue
|
|
1158
|
+
|
|
1159
|
+
result[i] = (frag_xml, xmin, ymin, xmax, ymax)
|
|
1160
|
+
log(f" ChemScript [{i}]: OK, bbox w={xmax-xmin:.1f} h={ymax-ymin:.1f}")
|
|
1161
|
+
finally:
|
|
1162
|
+
cs.close()
|
|
1163
|
+
|
|
1164
|
+
return result
|
|
1165
|
+
|
|
1166
|
+
|
|
1167
|
+
# ---------------------------------------------------------------------------
|
|
1168
|
+
# Build reaction scheme using ChemScript fragments
|
|
1169
|
+
# ---------------------------------------------------------------------------
|
|
1170
|
+
|
|
1171
|
+
def build_reaction_scheme_chemscript(
    structures: List[Dict],
    cs_fragments: Dict[int, Tuple[str, float, float, float, float]],
    reactant_indices: List[int],
    product_indices: List[int],
    conditions_above: List[str],
    conditions_below: List[str],
    verbose: bool = False,
    expanded_above: Optional["ExpandedItems"] = None,
    expanded_below: Optional["ExpandedItems"] = None,
) -> str:
    """
    Assemble a CDXML reaction scheme using ChemScript-cleaned fragment XML.

    Same layout logic as build_reaction_scheme but uses native ChemDraw
    fragments instead of building from atom/bond dicts.

    Parameters
    ----------
    structures : extracted structure dicts (not read here; kept so the
        signature parallels build_reaction_scheme)
    cs_fragments : structure_index → (fragment_xml, xmin, ymin, xmax, ymax)
    reactant_indices : which structures (by index) are reactants
    product_indices : which structures (by index) are products
    conditions_above : text lines for above the arrow
    conditions_below : text lines for below the arrow
    verbose : print layout info to stderr
    expanded_above : pre-resolved expanded conditions for above; when not
        None it replaces the ``conditions_above`` text label
    expanded_below : pre-resolved expanded conditions for below; when not
        None it replaces the ``conditions_below`` text label

    Returns
    -------
    CDXML document string

    Raises
    ------
    ValueError
        If no reactant or product is selected, or a selected index has no
        entry in ``cs_fragments``.
    """
    def log(msg: str):
        if verbose:
            print(f"[reaction_from_image] {msg}", file=sys.stderr)

    # The bounding box is derived from the placed molecules; with none
    # selected this previously failed with "min() arg is an empty sequence".
    if not reactant_indices and not product_indices:
        raise ValueError(
            "Cannot build a reaction scheme: no reactant or product "
            "structures were selected."
        )

    # Validate indices — must have ChemScript fragments for all
    for idx in reactant_indices + product_indices:
        if idx not in cs_fragments:
            raise ValueError(
                f"Structure {idx} has no ChemScript fragment — "
                f"cleanup may have failed for this structure."
            )

    ids = _IDGen(1000)

    def place_row(indices: List[int], start_x: float, label: str):
        """Lay the given fragments out left-to-right starting at start_x.

        Returns (xml_strings, extents, fragment_ids, next_cursor_x).
        """
        row_xmls: List[str] = []
        row_extents: List[Tuple[float, float, float, float]] = []
        row_ids: List[int] = []
        cursor = start_x
        for idx in indices:
            frag_xml, xmin, ymin, xmax, ymax = cs_fragments[idx]
            mol_w = xmax - xmin
            # Left edge at the cursor, vertical centre at VERTICAL_CENTER.
            dx = (cursor + mol_w / 2.0) - (xmin + xmax) / 2.0
            dy = VERTICAL_CENTER - (ymin + ymax) / 2.0
            translated = _translate_fragment_xml(frag_xml, dx, dy)

            # Re-measure to get actual final extent
            row_extents.append(_measure_fragment_xml(translated))

            # Each fragment comes from its own ChemScript document, so its
            # embedded ids are not unique within this scheme.  Re-number
            # them from the shared generator, exactly as the expanded
            # conditions layout does, and use the returned fragment id for
            # the <step> references.
            placed_xml, frag_id = _reassign_fragment_ids(translated, ids)
            row_xmls.append(placed_xml)
            row_ids.append(frag_id)

            log(f"{label} [{idx}] placed at x=[{cursor:.1f}, {cursor + mol_w:.1f}]")
            cursor += mol_w + INTER_MOL_GAP
        return row_xmls, row_extents, row_ids, cursor

    # ------------------------------------------------------------------
    # Layout: translate ChemScript fragments to final positions
    # ------------------------------------------------------------------

    reactant_frag_xmls, reactant_extents, reactant_frag_ids, cursor_x = place_row(
        reactant_indices, PAGE_LEFT, "Reactant"
    )

    # Arrow: undo the trailing inter-molecule gap, then add the margin
    arrow_tail_x = cursor_x - INTER_MOL_GAP + ARROW_MARGIN
    arrow_head_x = arrow_tail_x + ARROW_LENGTH
    arrow_y = VERTICAL_CENTER
    log(f"Arrow: tail={arrow_tail_x:.1f}, head={arrow_head_x:.1f}, y={arrow_y:.1f}")

    product_frag_xmls, product_extents, product_frag_ids, cursor_x = place_row(
        product_indices, arrow_head_x + ARROW_MARGIN, "Product"
    )

    # Resolve abbreviations
    resolved_above = [resolve_abbreviation(line) for line in conditions_above]
    resolved_below = [resolve_abbreviation(line) for line in conditions_below]
    log(f"Conditions above: {resolved_above}")
    log(f"Conditions below: {resolved_below}")

    # Conditions: expanded structures or text labels
    above_xmls: List[str] = []
    below_xmls: List[str] = []
    above_ids: List[int] = []
    below_ids: List[int] = []

    arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0

    if expanded_above is not None:
        ax, ai = _layout_expanded_conditions(
            expanded_above, arrow_tail_x, arrow_head_x, arrow_y,
            "above", ids, verbose,
        )
        above_xmls.extend(ax)
        above_ids.extend(ai)
    elif resolved_above:
        n_above = len(resolved_above)
        # Raise the first baseline so the LAST line clears the arrow.
        baseline_y = (arrow_y - CONDITIONS_GAP_ABOVE - CONDITIONS_DESCENDER
                      - (n_above - 1) * CONDITIONS_LINE_HEIGHT)
        txt_xml, txt_id = _build_conditions_text(
            resolved_above, arrow_mid_x, baseline_y, ids
        )
        above_xmls.append(txt_xml)
        above_ids.append(txt_id)

    if expanded_below is not None:
        bx, bi = _layout_expanded_conditions(
            expanded_below, arrow_tail_x, arrow_head_x, arrow_y,
            "below", ids, verbose,
        )
        below_xmls.extend(bx)
        below_ids.extend(bi)
    elif resolved_below:
        ascender = 8.0
        baseline_y = arrow_y + CONDITIONS_GAP_BELOW + ascender
        txt_xml, txt_id = _build_conditions_text(
            resolved_below, arrow_mid_x, baseline_y, ids
        )
        below_xmls.append(txt_xml)
        below_ids.append(txt_id)

    # Arrow XML
    arrow_xml, arrow_id = _build_arrow(
        arrow_tail_x, arrow_y, arrow_head_x, arrow_y, ids
    )

    # Scheme / step
    scheme_id = ids.next()
    step_id = ids.next()
    step_attrs = [
        f'id="{step_id}"',
        f'ReactionStepReactants="{" ".join(str(i) for i in reactant_frag_ids)}"',
        f'ReactionStepProducts="{" ".join(str(i) for i in product_frag_ids)}"',
        f'ReactionStepArrows="{arrow_id}"',
    ]
    if above_ids:
        step_attrs.append(
            f'ReactionStepObjectsAboveArrow="{" ".join(str(i) for i in above_ids)}"'
        )
    if below_ids:
        step_attrs.append(
            f'ReactionStepObjectsBelowArrow="{" ".join(str(i) for i in below_ids)}"'
        )
    scheme_xml = f'<scheme id="{scheme_id}"><step {" ".join(step_attrs)}/></scheme>'

    # Bounding box
    all_extents = reactant_extents + product_extents
    all_x0 = min(e[0] for e in all_extents)
    all_y0 = min(e[1] for e in all_extents)
    all_x1 = max(e[2] for e in all_extents)
    all_y1 = max(e[3] for e in all_extents)

    # Include expanded condition fragments in bounding box
    # (text labels are not measured; only <fragment> elements are).
    for frag_xml_str in above_xmls + below_xmls:
        if frag_xml_str.lstrip().startswith("<fragment"):
            fx0, fy0, fx1, fy1 = _measure_fragment_xml(frag_xml_str)
            if fx0 != fx1:
                all_x0 = min(all_x0, fx0)
                all_y0 = min(all_y0, fy0)
                all_x1 = max(all_x1, fx1)
                all_y1 = max(all_y1, fy1)

    margin = 20.0
    doc_bbox = (
        f"{all_x0 - margin:.2f} {all_y0 - margin:.2f} "
        f"{all_x1 + margin:.2f} {all_y1 + margin:.2f}"
    )
    page_w = all_x1 + 100
    page_h = all_y1 + 100
    page_id = ids.next()

    # Assemble
    parts = [
        _format_cdxml_header(doc_bbox),
        f'<page id="{page_id}" BoundingBox="0 0 {page_w:.0f} {page_h:.0f}" '
        f'HeaderPosition="36" FooterPosition="36" '
        f'PrintTrimMarks="yes" HeightPages="1" WidthPages="2">',
    ]
    parts.extend(reactant_frag_xmls)
    parts.extend(product_frag_xmls)
    parts.extend(above_xmls)
    parts.extend(below_xmls)
    parts.append(arrow_xml)
    parts.append(scheme_xml)
    parts.append("</page>")
    parts.append(CDXML_FOOTER)

    return "\n".join(parts)
|
|
1385
|
+
|
|
1386
|
+
|
|
1387
|
+
# ---------------------------------------------------------------------------
|
|
1388
|
+
# High-level pipeline: image + descriptor → CDXML
|
|
1389
|
+
# ---------------------------------------------------------------------------
|
|
1390
|
+
|
|
1391
|
+
def reaction_from_image(
    image_path: str,
    descriptor: Dict,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
    cleanup: bool = False,
    expand: bool = False,
) -> str:
    """
    Full pipeline: image + reaction descriptor → CDXML reaction scheme.

    Parameters
    ----------
    image_path : path to screenshot PNG/JPG/PDF
    descriptor : dict with reactant_indices, product_indices, conditions_above/below
    page : PDF page number
    segment : whether to segment the image
    hand_drawn : use hand-drawn DECIMER model
    verbose : print progress to stderr
    merge_gap : pixel gap for merging nearby boxes (None = adaptive)
    cleanup : run ChemScript cleanup on structures (ChemDraw-native quality)
    expand : expand conditions to molecular structures where possible

    Returns
    -------
    CDXML document string

    Notes
    -----
    Condition expansion is best-effort: if resolution raises, the failure is
    reported (when ``verbose``) and the scheme is built from whatever was
    resolved before the error. The ChemScript bridge opened for expansion is
    always closed before the scheme is built.
    """
    # Import structure_from_image (sibling module)
    from . import structure_from_image as sfi

    def _log(msg: str) -> None:
        if verbose:
            print(f"[reaction_from_image] {msg}", file=sys.stderr)

    # Step 1: Extract structures (DECIMER)
    _log(f"Extracting structures from {image_path}...")

    structures = sfi._extract_structures_raw(
        image_path,
        page=page,
        segment=segment,
        hand_drawn=hand_drawn,
        verbose=verbose,
        merge_gap=merge_gap,
    )

    if verbose:
        _log(f"Extracted {len(structures)} structure(s)")
        for i, s in enumerate(structures):
            print(f" [{i}] SMILES={s.get('smiles', '?')}, "
                  f"bbox={s.get('bbox', '?')}, "
                  f"atoms={len(s.get('atoms', []))}",
                  file=sys.stderr)

    reactant_indices = descriptor.get("reactant_indices", [])
    product_indices = descriptor.get("product_indices", [])
    conditions_above = descriptor.get("conditions_above", [])
    conditions_below = descriptor.get("conditions_below", [])

    # Step 2: Optionally expand conditions to structures
    expanded_above = None
    expanded_below = None
    if expand:
        cs_bridge = _open_chemscript_bridge(verbose)
        try:
            _log("Resolving conditions to structures...")
            expanded_above = _resolve_all_conditions(
                conditions_above, cs_bridge, verbose
            )
            expanded_below = _resolve_all_conditions(
                conditions_below, cs_bridge, verbose
            )
        except Exception as exc:
            # Best-effort: an expansion failure must not abort scheme building.
            _log(f"Expand failed: {exc}")
        finally:
            # The bridge is only needed for condition resolution; the cleanup
            # step below does not receive it, so release it here (this mirrors
            # what main() does) instead of leaking it.
            if cs_bridge is not None:
                cs_bridge.close()

    # Step 3: Build reaction scheme
    if cleanup:
        # Use ChemScript for publication-quality structures
        _log("Running ChemScript cleanup...")
        cs_fragments = _chemscript_fragment_xmls(structures, verbose=verbose)

        cdxml = build_reaction_scheme_chemscript(
            structures=structures,
            cs_fragments=cs_fragments,
            reactant_indices=reactant_indices,
            product_indices=product_indices,
            conditions_above=conditions_above,
            conditions_below=conditions_below,
            verbose=verbose,
            expanded_above=expanded_above,
            expanded_below=expanded_below,
        )
    else:
        # Use RDKit coordinates (faster, no ChemDraw dependency)
        cdxml = build_reaction_scheme(
            structures=structures,
            reactant_indices=reactant_indices,
            product_indices=product_indices,
            conditions_above=conditions_above,
            conditions_below=conditions_below,
            verbose=verbose,
            expanded_above=expanded_above,
            expanded_below=expanded_below,
        )

    return cdxml
|
|
1505
|
+
|
|
1506
|
+
|
|
1507
|
+
# ---------------------------------------------------------------------------
|
|
1508
|
+
# High-level pipeline: image + descriptor → JSON (ReactionDescriptor)
|
|
1509
|
+
# ---------------------------------------------------------------------------
|
|
1510
|
+
|
|
1511
|
+
def reaction_from_image_to_json(
    image_path: str,
    descriptor: Dict,
    output_path: Optional[str] = None,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
    use_network: bool = True,
) -> "ReactionDescriptor":
    """
    Full pipeline: image + reaction descriptor → ReactionDescriptor JSON.

    Same extraction as :func:`reaction_from_image`, but returns a
    :class:`ReactionDescriptor` (the standard JSON source of truth) instead
    of a CDXML string.  The agent can then render a scheme downstream via
    the scheme DSL if needed.

    Parameters
    ----------
    image_path : path to screenshot PNG/JPG/PDF
    descriptor : dict with reactant_indices, product_indices, conditions_above/below
    output_path : if given, write JSON to this path
    page : PDF page number
    segment : whether to segment the image
    hand_drawn : use hand-drawn DECIMER model
    verbose : print progress to stderr
    merge_gap : pixel gap for merging nearby boxes (None = adaptive)
    use_network : allow PubChem lookups for condition name resolution

    Returns
    -------
    ReactionDescriptor

    Notes
    -----
    * The reactant with the smallest index is flagged ``is_sm``; the product
      with the smallest index is flagged ``is_dp``.
    * Structures without a SMILES are skipped and reported in ``warnings``.
    * A structure entry lacking an ``"index"`` key is identified by its
      position in the extracted list (previously every such entry collapsed
      onto index 0).
    """
    from ..perception.reaction_parser import (
        ReactionDescriptor, SpeciesDescriptor,
        _resolve_text_label, _compute_all_masses,
    )
    from . import structure_from_image as sfi
    import datetime

    # RDKit is optional: without it the raw SMILES is kept un-canonicalized.
    try:
        from rdkit import Chem
    except ImportError:
        Chem = None

    def _log(msg: str) -> None:
        if verbose:
            print(f"[reaction_from_image_to_json] {msg}", file=sys.stderr)

    # Step 1: Extract structures (DECIMER)
    _log(f"Extracting structures from {image_path}...")
    structures = sfi._extract_structures_raw(
        image_path,
        page=page,
        segment=segment,
        hand_drawn=hand_drawn,
        verbose=verbose,
        merge_gap=merge_gap,
    )
    _log(f"Extracted {len(structures)} structure(s)")

    reactant_indices = set(descriptor.get("reactant_indices") or [])
    product_indices = set(descriptor.get("product_indices") or [])
    # `or []` also covers an explicit JSON null for these keys.
    conditions_above = descriptor.get("conditions_above") or []
    conditions_below = descriptor.get("conditions_below") or []

    # Loop-invariant: the lowest index on each side marks the SM / desired product.
    first_reactant = min(reactant_indices) if reactant_indices else None
    first_product = min(product_indices) if product_indices else None

    atom_keys = ("index", "x", "y", "symbol", "num_hydrogens", "charge")
    bond_keys = ("index", "atom1", "atom2", "order", "cfg", "double_pos")

    db = get_reagent_db()
    species_list: List[SpeciesDescriptor] = []
    warnings: List[str] = []
    sp_idx = 0

    # Step 2: Build SpeciesDescriptor for each extracted structure
    for position, entry in enumerate(structures):
        smiles = (entry.get("smiles") or "").strip()
        # Fall back to the list position when the entry carries no index of
        # its own, so index-less entries stay distinguishable.
        idx = entry.get("index", position)

        if not smiles:
            warnings.append(f"Structure at index {idx}: DECIMER returned no SMILES")
            continue

        # Canonicalize SMILES (keep the raw string if RDKit cannot parse it)
        canon_smiles = smiles
        if Chem is not None:
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                canon_smiles = Chem.MolToSmiles(mol)

        # Determine role from descriptor indices (reactant wins on overlap)
        if idx in reactant_indices:
            role = "atom_contributing"
            is_sm = (idx == first_reactant)  # first reactant = SM
        elif idx in product_indices:
            role = "product"
            is_sm = False
        else:
            role = "non_contributing"
            is_sm = False

        is_dp = (role == "product" and idx == first_product)

        # Display name and role detail from reagent DB (by SMILES)
        display = db.display_for_smiles(canon_smiles) if canon_smiles else None
        name = display or canon_smiles
        role_detail = db.role_for_smiles(canon_smiles) if canon_smiles else None

        # Build original_geometry from extracted atoms/bonds, keeping only
        # the whitelisted keys.
        atoms = entry.get("atoms", [])
        bonds = entry.get("bonds", [])
        original_geometry = None
        if atoms:
            original_geometry = {
                "atoms": [
                    {k: v for k, v in a.items() if k in atom_keys}
                    for a in atoms
                ],
                "bonds": [
                    {k: v for k, v in b.items() if k in bond_keys}
                    for b in bonds
                ],
            }

        species_list.append(SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=canon_smiles,
            name=name,
            role=role,
            role_detail=role_detail,
            classification_method="image_descriptor",
            is_sm=is_sm,
            is_dp=is_dp,
            source="image",
            display_text=name,
            original_geometry=original_geometry,
        ))
        sp_idx += 1

    # Step 3: Resolve conditions text to species or condition strings
    condition_strings: List[str] = []

    for cond_text in list(conditions_above) + list(conditions_below):
        cond_text = cond_text.strip()
        if not cond_text:
            continue

        # Non-chemistry text goes straight to conditions
        if _is_non_chemistry_text(cond_text):
            condition_strings.append(cond_text)
            continue

        # Try to resolve to SMILES
        smi = _resolve_text_label(cond_text, use_network=use_network)
        if not smi:
            # Could not resolve — store as condition text
            condition_strings.append(cond_text)
            continue

        # Get display name and role from reagent DB
        display = db.display_for_name(cond_text) or cond_text
        species_list.append(SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=smi,
            name=display,
            role="non_contributing",
            role_detail=db.role_for_name(cond_text),
            classification_method="name_resolution",
            source="text_label",
            display_text=display,
        ))
        sp_idx += 1

    # Step 4: Compute masses, formulas, adducts for all species
    _compute_all_masses(species_list)

    # Step 5: Build the reaction SMILES (only when both sides are present)
    reaction_smiles = None
    try:
        reactant_smis = [sp.smiles for sp in species_list
                         if sp.role == "atom_contributing" and sp.smiles]
        product_smis = [sp.smiles for sp in species_list
                        if sp.role == "product" and sp.smiles]
        if reactant_smis and product_smis:
            reaction_smiles = (
                ".".join(reactant_smis) + ">>" + ".".join(product_smis)
            )
    except Exception as exc:
        # Stay best-effort, but surface the problem instead of hiding it.
        warnings.append(f"Could not build reaction SMILES: {exc}")

    desc = ReactionDescriptor(
        version="1.3",
        experiment="",
        input_files={"image": os.path.abspath(image_path)},
        reaction_smiles=reaction_smiles,
        species=species_list,
        warnings=warnings,
        metadata={
            "parser_version": "reaction_from_image 1.0",
            "timestamp": datetime.datetime.now().isoformat(),
            "source": "image",
        },
        conditions=condition_strings,
    )

    if output_path:
        desc.to_json(output_path)
        _log(f"Wrote {output_path}")

    return desc
|
|
1727
|
+
|
|
1728
|
+
|
|
1729
|
+
# ---------------------------------------------------------------------------
|
|
1730
|
+
# CLI
|
|
1731
|
+
# ---------------------------------------------------------------------------
|
|
1732
|
+
|
|
1733
|
+
def _structures_json_to_descriptor(
    structures: List[Dict],
    descriptor: Dict,
    source_path: str,
    verbose: bool = False,
    use_network: bool = True,
) -> "ReactionDescriptor":
    """Build a ReactionDescriptor from pre-extracted structures + descriptor.

    Used when --structures-json is combined with --format json in the CLI.

    Parameters
    ----------
    structures : list of structure dicts (``smiles``, optional ``index``,
        ``atoms``, ``bonds``) loaded from a structures JSON file
    descriptor : dict with reactant_indices, product_indices, conditions_above/below
    source_path : path of the structures JSON file; recorded (absolute) under
        ``input_files["structures_json"]``
    verbose : accepted for signature parity with the other pipeline entry
        points; this function currently emits no progress output
    use_network : passed to the condition-name resolver (default ``True``,
        which matches the previously hard-coded behaviour)

    Returns
    -------
    ReactionDescriptor

    Notes
    -----
    A structure entry lacking an ``"index"`` key is identified by its
    position in ``structures`` (previously every such entry collapsed onto
    index 0).
    """
    from ..perception.reaction_parser import (
        ReactionDescriptor, SpeciesDescriptor,
        _resolve_text_label, _compute_all_masses,
    )
    import datetime

    # RDKit is optional: without it the raw SMILES is kept un-canonicalized.
    try:
        from rdkit import Chem
    except ImportError:
        Chem = None

    db = get_reagent_db()
    reactant_indices = set(descriptor.get("reactant_indices") or [])
    product_indices = set(descriptor.get("product_indices") or [])
    # `or []` also covers an explicit JSON null for these keys.
    conditions_above = descriptor.get("conditions_above") or []
    conditions_below = descriptor.get("conditions_below") or []

    # Loop-invariant: the lowest index on each side marks the SM / desired product.
    first_reactant = min(reactant_indices) if reactant_indices else None
    first_product = min(product_indices) if product_indices else None

    atom_keys = ("index", "x", "y", "symbol", "num_hydrogens", "charge")
    bond_keys = ("index", "atom1", "atom2", "order", "cfg", "double_pos")

    species_list: List[SpeciesDescriptor] = []
    warnings: List[str] = []
    sp_idx = 0

    for position, entry in enumerate(structures):
        smiles = (entry.get("smiles") or "").strip()
        # Fall back to the list position when the entry carries no index.
        idx = entry.get("index", position)
        if not smiles:
            warnings.append(f"Structure at index {idx}: no SMILES")
            continue

        # Canonicalize SMILES (keep the raw string if RDKit cannot parse it)
        canon_smiles = smiles
        if Chem is not None:
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                canon_smiles = Chem.MolToSmiles(mol)

        # Role from descriptor indices (reactant wins on overlap)
        if idx in reactant_indices:
            role = "atom_contributing"
            is_sm = (idx == first_reactant)
        elif idx in product_indices:
            role = "product"
            is_sm = False
        else:
            role = "non_contributing"
            is_sm = False

        is_dp = (role == "product" and idx == first_product)

        display = db.display_for_smiles(canon_smiles) if canon_smiles else None
        name = display or canon_smiles
        role_detail = db.role_for_smiles(canon_smiles) if canon_smiles else None

        # Keep only the whitelisted geometry keys of each atom / bond.
        atoms = entry.get("atoms", [])
        bonds = entry.get("bonds", [])
        original_geometry = None
        if atoms:
            original_geometry = {
                "atoms": [
                    {k: v for k, v in a.items() if k in atom_keys}
                    for a in atoms
                ],
                "bonds": [
                    {k: v for k, v in b.items() if k in bond_keys}
                    for b in bonds
                ],
            }

        species_list.append(SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=canon_smiles,
            name=name,
            role=role,
            role_detail=role_detail,
            classification_method="image_descriptor",
            is_sm=is_sm,
            is_dp=is_dp,
            source="image",
            display_text=name,
            original_geometry=original_geometry,
        ))
        sp_idx += 1

    # Conditions: resolvable names become species, everything else stays text.
    condition_strings: List[str] = []
    for cond_text in list(conditions_above) + list(conditions_below):
        cond_text = cond_text.strip()
        if not cond_text:
            continue
        if _is_non_chemistry_text(cond_text):
            condition_strings.append(cond_text)
            continue
        smi = _resolve_text_label(cond_text, use_network=use_network)
        if not smi:
            condition_strings.append(cond_text)
            continue
        display = db.display_for_name(cond_text) or cond_text
        species_list.append(SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=smi,
            name=display,
            role="non_contributing",
            role_detail=db.role_for_name(cond_text),
            classification_method="name_resolution",
            source="text_label",
            display_text=display,
        ))
        sp_idx += 1

    _compute_all_masses(species_list)

    # Reaction SMILES is only emitted when both sides are present.
    reaction_smiles = None
    try:
        r_smis = [sp.smiles for sp in species_list
                  if sp.role == "atom_contributing" and sp.smiles]
        p_smis = [sp.smiles for sp in species_list
                  if sp.role == "product" and sp.smiles]
        if r_smis and p_smis:
            reaction_smiles = ".".join(r_smis) + ">>" + ".".join(p_smis)
    except Exception as exc:
        # Stay best-effort, but surface the problem instead of hiding it.
        warnings.append(f"Could not build reaction SMILES: {exc}")

    return ReactionDescriptor(
        version="1.3",
        experiment="",
        input_files={"structures_json": os.path.abspath(source_path)},
        reaction_smiles=reaction_smiles,
        species=species_list,
        warnings=warnings,
        metadata={
            "parser_version": "reaction_from_image 1.0",
            "timestamp": datetime.datetime.now().isoformat(),
            "source": "image",
        },
        conditions=condition_strings,
    )
|
|
1879
|
+
|
|
1880
|
+
|
|
1881
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the reaction-from-image tool.

    The module docstring is shown verbatim as the help epilog (hence
    ``RawDescriptionHelpFormatter``).  ``--descriptor`` is the only option
    argparse itself enforces; the "--image or --structures-json" requirement
    is validated by the caller after parsing.
    """
    p = argparse.ArgumentParser(
        description=(
            "Build a ChemDraw reaction scheme (CDXML) from a screenshot image. "
            "Requires a JSON descriptor specifying which structures are reactants/products "
            "and what conditions text to include."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    p.add_argument(
        "--image", "-i",
        default=None,
        help="Input image file (PNG/JPG/PDF). Required unless --structures-json is used.",
    )
    p.add_argument(
        "--descriptor", "-d",
        required=True,
        help="JSON descriptor file (or '-' for stdin)",
    )
    p.add_argument(
        "--output", "-o",
        default=None,
        # The default depends on --format and on which input was supplied.
        help=(
            "Output file (default: <input_stem>_scheme.cdxml, or "
            "<input_stem>_reaction.json with --format json)"
        ),
    )
    p.add_argument(
        "--page",
        type=int,
        default=0,
        help="PDF page number, 0-indexed (default: 0)",
    )
    p.add_argument(
        "--no-segment",
        action="store_true",
        help="Don't segment — treat each region as one structure",
    )
    p.add_argument(
        "--hand-drawn",
        action="store_true",
        help="Use DECIMER hand-drawn model",
    )
    p.add_argument(
        "--gap",
        type=int,
        default=None,
        help="Merge gap in pixels for segmentation (default: adaptive)",
    )
    p.add_argument(
        "--cleanup",
        action="store_true",
        help="Run ChemScript cleanup on extracted structures",
    )
    p.add_argument(
        "--expand",
        action="store_true",
        help=(
            "Expand conditions to molecular structures where possible. "
            "Uses ChemScript name resolution and PubChem lookup. "
            "Falls back to text labels for unresolvable conditions."
        ),
    )
    p.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print progress to stderr",
    )
    p.add_argument(
        "--structures-json",
        default=None,
        help=(
            "Path to a pre-extracted structures JSON file "
            "(from structure_from_image.py). Skips DECIMER extraction."
        ),
    )
    p.add_argument(
        "--format",
        choices=["cdxml", "json"],
        default="cdxml",
        help=(
            "Output format (default: cdxml). 'json' produces a "
            "ReactionDescriptor JSON file (same format as cdxml-parse)."
        ),
    )
    return p
|
|
1965
|
+
|
|
1966
|
+
|
|
1967
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: build a CDXML scheme or a reaction JSON.

    Parameters
    ----------
    argv : argument list to parse; ``None`` lets argparse use ``sys.argv[1:]``

    Returns
    -------
    0 on success.  Invalid argument combinations are reported through
    ``parser.error``.

    Notes
    -----
    When ``--structures-json`` is given, the pre-extracted structures are
    used and DECIMER extraction is skipped for *both* output formats, even
    if ``--image`` is also supplied (previously the JSON format ignored
    ``--structures-json`` in that case).  ``--image`` still determines the
    default output file name when present.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate: --image is required unless --structures-json is used
    if args.image is None and args.structures_json is None:
        parser.error("--image is required unless --structures-json is provided")

    # Load descriptor
    if args.descriptor == "-":
        descriptor = json.load(sys.stdin)
    else:
        with open(args.descriptor, encoding="utf-8") as f:
            descriptor = json.load(f)

    want_json = args.format == "json"

    # Output path: derived from the image name if given, else the structures file
    if args.output is None:
        source = args.image if args.image else args.structures_json
        stem = os.path.splitext(os.path.basename(source))[0]
        args.output = stem + ("_reaction.json" if want_json else "_scheme.cdxml")

    # --- JSON output path ---
    if want_json:
        if args.structures_json:
            # Pre-extracted structures: no extraction module import needed.
            with open(args.structures_json, encoding="utf-8") as f:
                structures = json.load(f)
            desc = _structures_json_to_descriptor(
                structures, descriptor, args.structures_json, args.verbose,
            )
            desc.to_json(args.output)
        else:
            # reaction_from_image_to_json() writes the file itself when given
            # output_path, so no second write is needed here.
            reaction_from_image_to_json(
                image_path=args.image,
                descriptor=descriptor,
                output_path=args.output,
                page=args.page,
                segment=not args.no_segment,
                hand_drawn=args.hand_drawn,
                verbose=args.verbose,
                merge_gap=args.gap,
            )

        print(f"Written reaction JSON to {args.output}", file=sys.stderr)
        return 0

    # --- CDXML output path (existing behaviour) ---
    if args.structures_json:
        with open(args.structures_json, encoding="utf-8") as f:
            structures = json.load(f)

        if args.verbose:
            print(f"[reaction_from_image] Loaded {len(structures)} structures "
                  f"from {args.structures_json}", file=sys.stderr)
            for i, s in enumerate(structures):
                print(f" [{i}] SMILES={s.get('smiles', '?')}, "
                      f"atoms={len(s.get('atoms', []))}",
                      file=sys.stderr)

        reactant_indices = descriptor.get("reactant_indices", [])
        product_indices = descriptor.get("product_indices", [])
        conditions_above = descriptor.get("conditions_above", [])
        conditions_below = descriptor.get("conditions_below", [])

        # Resolve conditions to structures if --expand
        expanded_above = None
        expanded_below = None
        if args.expand:
            cs_bridge = _open_chemscript_bridge(args.verbose)
            try:
                if args.verbose:
                    print("[reaction_from_image] Resolving conditions to structures...",
                          file=sys.stderr)
                expanded_above = _resolve_all_conditions(
                    conditions_above, cs_bridge, args.verbose
                )
                expanded_below = _resolve_all_conditions(
                    conditions_below, cs_bridge, args.verbose
                )
            finally:
                # Always release the bridge, even if resolution fails.
                cs_bridge.close()

        if args.cleanup:
            if args.verbose:
                print("[reaction_from_image] Running ChemScript cleanup...",
                      file=sys.stderr)
            cs_fragments = _chemscript_fragment_xmls(structures, verbose=args.verbose)
            cdxml = build_reaction_scheme_chemscript(
                structures=structures,
                cs_fragments=cs_fragments,
                reactant_indices=reactant_indices,
                product_indices=product_indices,
                conditions_above=conditions_above,
                conditions_below=conditions_below,
                verbose=args.verbose,
                expanded_above=expanded_above,
                expanded_below=expanded_below,
            )
        else:
            cdxml = build_reaction_scheme(
                structures=structures,
                reactant_indices=reactant_indices,
                product_indices=product_indices,
                conditions_above=conditions_above,
                conditions_below=conditions_below,
                verbose=args.verbose,
                expanded_above=expanded_above,
                expanded_below=expanded_below,
            )
    else:
        cdxml = reaction_from_image(
            image_path=args.image,
            descriptor=descriptor,
            page=args.page,
            segment=not args.no_segment,
            hand_drawn=args.hand_drawn,
            verbose=args.verbose,
            merge_gap=args.gap,
            cleanup=args.cleanup,
            expand=args.expand,
        )

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(cdxml)

    print(f"Written reaction scheme to {args.output}", file=sys.stderr)
    return 0
|
|
2100
|
+
|
|
2101
|
+
|
|
2102
|
+
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|