cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1394 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
eln_enrichment.py -- Enrich a polished reaction scheme with ELN CSV data.
|
|
4
|
+
|
|
5
|
+
Given a polished CDXML (from scheme_polisher) and a Findmolecule ELN CSV,
|
|
6
|
+
annotates the scheme with:
|
|
7
|
+
- Equivalents on each reagent (text labels and above-arrow structures)
|
|
8
|
+
- A "run arrow" below the scheme showing SM mass and product yield
|
|
9
|
+
|
|
10
|
+
Two-phase design:
|
|
11
|
+
Phase A (before layout): Inject equivalents into text content so that
|
|
12
|
+
text widths are correct for arrow length computation.
|
|
13
|
+
Phase B (after layout): Add run arrow, above-arrow eq labels, and
|
|
14
|
+
side eq labels using finalized positions.
|
|
15
|
+
|
|
16
|
+
Usage (via scheme_polisher_v2.py):
|
|
17
|
+
python scheme_polisher_v2.py input.cdx --eln-csv experiment.csv -o out.cdxml
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import sys
|
|
23
|
+
import xml.etree.ElementTree as ET
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import Dict, List, Optional, Tuple
|
|
26
|
+
from xml.sax.saxutils import escape as xml_escape
|
|
27
|
+
|
|
28
|
+
from ...cdxml_utils import (
|
|
29
|
+
fragment_bbox,
|
|
30
|
+
fragment_bbox_with_label_extension,
|
|
31
|
+
fragment_bottom_has_hanging_label,
|
|
32
|
+
recompute_text_bbox,
|
|
33
|
+
)
|
|
34
|
+
from ...constants import (
|
|
35
|
+
CDXML_FOOTER,
|
|
36
|
+
CDXML_MINIMAL_HEADER,
|
|
37
|
+
MW_MATCH_TOLERANCE,
|
|
38
|
+
MW_MATCH_TOLERANCE_LOOSE,
|
|
39
|
+
)
|
|
40
|
+
from ...text_formatting import build_formatted_s_xml
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Data structures
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
@dataclass
class MatchedReagent:
    """Pairing of one ELN CSV reagent row with the scheme element it matched.

    Fields prefixed ``csv_`` are copied from the CSV row; fields prefixed
    ``scheme_`` describe the matched element of the CDXML scheme.
    """

    csv_name: str
    # Raw equiv string from CSV, e.g. "2.0"
    csv_equiv: str
    # e.g. "2.15 g"
    csv_mass: str
    csv_is_substrate: bool
    csv_mw: float
    # id of the matched <t> or <fragment>
    scheme_element_id: str
    # "reactant", "above_arrow", "below_arrow"
    scheme_position: str
    # Display text on the scheme (e.g. "Cs2CO3")
    scheme_display: str
    is_solvent: bool = False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class EnrichmentData:
    """Container for everything gathered from the CSV and scheme matching.

    Every field has a default, so ``EnrichmentData()`` is a valid empty
    result.
    """

    matches: List[MatchedReagent] = field(default_factory=list)
    # equiv=1.0, is_substrate
    substrate: Optional[MatchedReagent] = None
    # e.g. "2.15 g"
    sm_mass: str = ""
    # e.g. "1.6 g"
    product_obtained: str = ""
    # e.g. "72%"
    product_yield: str = ""
    solvent_names: List[str] = field(default_factory=list)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
# Helpers
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
def _format_equiv(equiv_str: str) -> str:
    """Format equivalents for display: '2.0' -> '2', '0.05' -> '0.05'.

    Whole numbers >= 1 are shown without a decimal point; everything else
    is rendered with the ``g`` format, which strips trailing zeros.

    Parameters
    ----------
    equiv_str : str
        Raw equivalents string (e.g. from the CSV).

    Returns
    -------
    The formatted number, or ``equiv_str`` unchanged when it is not a
    finite number (non-numeric text, empty string, None, "nan", "inf").
    """
    try:
        val = float(equiv_str)
        if val == int(val) and val >= 1:
            return str(int(val))
        # Strip trailing zeros but keep significant decimals
        return f"{val:g}"
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparseable text, or int() of NaN.
        # TypeError: non-string / None input.
        # OverflowError: int() of +/-infinity.
        return equiv_str
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_text_content(el: ET.Element) -> str:
    """Return the joined text of every <s> descendant of *el*, stripped.

    <s> elements with no text (or empty text) contribute nothing.
    """
    pieces = [run.text for run in el.iter("s") if run.text]
    joined = "".join(pieces)
    return joined.strip()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _normalize_name(name: str) -> str:
    """Canonicalize a name for comparison.

    Trims the ends, lowercases, and collapses every run of whitespace
    into a single space.
    """
    trimmed = name.strip()
    lowered = trimmed.lower()
    return re.sub(r'\s+', ' ', lowered)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_max_id(root: ET.Element) -> int:
    """Return the largest integer ``id`` attribute found under *root*.

    *root* itself is included in the scan.  Missing, empty, or
    non-integer ids are ignored; the result is never below 0.
    """
    highest = 0
    for node in root.iter():
        raw = node.get("id", "")
        if not raw:
            continue
        try:
            candidate = int(raw)
        except ValueError:
            continue
        if candidate > highest:
            highest = candidate
    return highest
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _get_max_z(root: ET.Element) -> int:
    """Return the largest integer ``Z`` attribute found under *root*.

    *root* itself is included in the scan.  Missing, empty, or
    non-integer values are ignored; the result is never below 0.
    """
    highest = 0
    for node in root.iter():
        raw = node.get("Z", "")
        if not raw:
            continue
        try:
            candidate = int(raw)
        except ValueError:
            continue
        if candidate > highest:
            highest = candidate
    return highest
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
# Step 1: CSV-to-scheme matching
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
def match_csv_to_scheme(
    root: ET.Element,
    csv_path: str,
    verbose: bool = False,
) -> EnrichmentData:
    """Match CSV reagents/solvents/product to scheme elements.

    Uses two passes:
      1. Name match: the CSV name, raw or resolved through
         reagent_db.resolve_display(), is compared with each scheme text
         line (a trailing "(N eq.)" annotation on the line is ignored).
      2. MW match (only when RDKit can be imported): each still-unmatched
         CSV reactant is paired with the closest-MW unmatched fragment,
         and failing that with the closest-MW text line whose reagent_db
         entry carries SMILES.  Both must be within MW_MATCH_TOLERANCE.

    Parameters
    ----------
    root : ET.Element
        Parsed CDXML root element (after polish_scheme).
    csv_path : str
        Path to Findmolecule ELN CSV file.
    verbose : bool
        Print matching details to stderr.

    Returns
    -------
    EnrichmentData with all matches + product info.  It is empty when the
    CSV cannot be parsed, and holds only solvent/product info when the
    document has no <page>/<scheme>/<step>.
    """
    from ...perception.eln_csv_parser import parse_eln_csv
    from ...resolve.reagent_db import get_reagent_db

    def log(msg: str):
        if verbose:
            print(f" [enrich] {msg}", file=sys.stderr)

    # Parse CSV
    exp = parse_eln_csv(csv_path)
    if exp is None:
        log("WARNING: Could not parse CSV")
        return EnrichmentData()

    db = get_reagent_db()
    enrichment = EnrichmentData()

    # Collect solvent names from CSV
    for s in exp.solvents:
        enrichment.solvent_names.append(_normalize_name(s.name))

    # Product info
    if exp.product:
        enrichment.product_obtained = exp.product.obtained_mass.strip()
        enrichment.product_yield = exp.product.yield_pct.strip()

    # --- Build scheme element inventory ---
    page = root.find("page")
    if page is None:
        return enrichment

    scheme = page.find("scheme")
    step = scheme.find("step") if scheme is not None else None
    if step is None:
        return enrichment

    reactant_ids = step.get("ReactionStepReactants", "").split()
    above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
    below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()

    # id -> element, for direct children of <page> only
    id_to_el: Dict[str, ET.Element] = {}
    for el in page:
        eid = el.get("id", "")
        if eid:
            id_to_el[eid] = el

    # One dict per matchable unit: a single line of a <t>, or a <fragment>.
    scheme_elements: List[Dict] = []

    def _add_element(eid: str, position: str):
        el = id_to_el.get(eid)
        if el is None:
            return
        if el.tag == "t":
            text = _get_text_content(el)
            # For merged text blocks, split into lines
            lines = [l.strip() for l in text.split("\n") if l.strip()]
            for line in lines:
                scheme_elements.append({
                    "element_id": eid,
                    "position": position,
                    "display": line,
                    "tag": "t",
                    "is_line_in_merged": len(lines) > 1,
                })
        elif el.tag == "fragment":
            # Fragments carry no display text here; they are matched by MW.
            frag_mw = _compute_fragment_mw(el)
            scheme_elements.append({
                "element_id": eid,
                "position": position,
                "display": None,
                "tag": "fragment",
                "mw": frag_mw,
                "is_line_in_merged": False,
            })

    for rid in reactant_ids:
        _add_element(rid, "reactant")
    for eid in above_ids:
        _add_element(eid, "above_arrow")
    for eid in below_ids:
        _add_element(eid, "below_arrow")

    def _make_match(reagent, element_id: str, position: str,
                    display: str, is_solvent: bool) -> MatchedReagent:
        # Single place where a CSV row is turned into a MatchedReagent.
        return MatchedReagent(
            csv_name=reagent.name,
            csv_equiv=reagent.equiv,
            csv_mass=reagent.mass,
            csv_is_substrate=reagent.is_substrate,
            csv_mw=reagent.mw,
            scheme_element_id=element_id,
            scheme_position=position,
            scheme_display=display,
            is_solvent=is_solvent,
        )

    # Strips an existing "(N eq.)" annotation from the end of a scheme line.
    equiv_suffix_re = re.compile(r'\s*\([\d.]+\s*eq\.\)\s*$')

    # --- Pass 1: Name match ---
    matched_csv_indices = set()
    matched_scheme_ids = set()

    for i, reagent in enumerate(exp.reactants):
        csv_display = db.resolve_display(reagent.name)
        csv_norm = _normalize_name(csv_display)
        csv_name_norm = _normalize_name(reagent.name)

        for se in scheme_elements:
            # A merged text block shares one element id across its lines,
            # so only single-line elements are blocked once matched.
            if se["element_id"] in matched_scheme_ids and not se["is_line_in_merged"]:
                continue
            if se["tag"] != "t" or se["display"] is None:
                continue

            scheme_display = se["display"]
            scheme_norm = _normalize_name(scheme_display)

            # Compare: resolved display vs scheme text (ignoring existing equiv annotations)
            scheme_clean = equiv_suffix_re.sub('', scheme_norm)

            if csv_norm == scheme_clean or csv_name_norm == scheme_clean:
                enrichment.matches.append(_make_match(
                    reagent,
                    se["element_id"],
                    se["position"],
                    scheme_display,
                    csv_name_norm in enrichment.solvent_names,
                ))
                matched_csv_indices.add(i)
                if not se["is_line_in_merged"]:
                    matched_scheme_ids.add(se["element_id"])
                log(f"Name match: CSV '{reagent.name}' -> scheme '{scheme_display}' "
                    f"(pos={se['position']}, equiv={reagent.equiv})")
                break

    # Also match solvents by name (they appear in scheme text but don't get equiv).
    # This loop only reports the match; it records nothing in `enrichment`.
    for solvent in exp.solvents:
        solv_display = db.resolve_display(solvent.name)
        solv_norm = _normalize_name(solv_display)
        solv_name_norm = _normalize_name(solvent.name)

        for se in scheme_elements:
            if se["tag"] != "t" or se["display"] is None:
                continue
            scheme_norm = _normalize_name(se["display"])
            scheme_clean = equiv_suffix_re.sub('', scheme_norm)
            if solv_norm == scheme_clean or solv_name_norm == scheme_clean:
                log(f"Solvent match: CSV '{solvent.name}' -> scheme '{se['display']}'")
                break

    # --- Pass 2: MW match (fallback for unmatched CSV reactants) ---
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors
        _has_rdkit = True
    except ImportError:
        _has_rdkit = False

    if _has_rdkit:
        for i, reagent in enumerate(exp.reactants):
            if i in matched_csv_indices:
                continue

            csv_mw = reagent.mw
            if csv_mw <= 0:
                continue

            # Try to match against fragment MW — pick closest within window
            best_se = None
            best_delta = MW_MATCH_TOLERANCE  # threshold
            for se in scheme_elements:
                se_id = se["element_id"]
                if se_id in matched_scheme_ids:
                    continue
                if se["tag"] != "fragment":
                    continue
                frag_mw = se.get("mw")
                if frag_mw is None or frag_mw <= 0:
                    continue
                delta = abs(frag_mw - csv_mw)
                if delta < best_delta:
                    best_delta = delta
                    best_se = se
            if best_se is not None:
                se_id = best_se["element_id"]
                frag_mw = best_se["mw"]
                enrichment.matches.append(_make_match(
                    reagent,
                    se_id,
                    best_se["position"],
                    f"fragment_{se_id}",
                    False,
                ))
                matched_csv_indices.add(i)
                matched_scheme_ids.add(se_id)
                log(f"MW match: CSV '{reagent.name}' (MW={csv_mw:.1f}) -> "
                    f"fragment {se_id} (MW={frag_mw:.1f}, delta={best_delta:.2f}, "
                    f"pos={best_se['position']}, equiv={reagent.equiv})")

            # Also try matching against text elements by resolving their
            # display name to SMILES (via reagent_db) and computing MW.
            # Pick closest match within window.
            if i not in matched_csv_indices:
                best_text_se = None
                best_text_mw = None
                best_text_delta = MW_MATCH_TOLERANCE  # threshold
                best_text_display = None
                for se in scheme_elements:
                    if se["tag"] != "t" or se["display"] is None:
                        continue
                    se_id = se["element_id"]
                    scheme_display = se["display"]
                    # Skip a line that some earlier match already claimed
                    already_matched_line = any(
                        existing.scheme_element_id == se_id
                        and existing.scheme_display == scheme_display
                        for existing in enrichment.matches
                    )
                    if already_matched_line:
                        continue
                    # Look up the entry in reagent_db by scheme display name
                    entry = db.entry_for_name(
                        _normalize_name(scheme_display).replace(" ", "")
                    )
                    if entry is None:
                        continue
                    smi_val = entry.get("smiles")
                    if not smi_val:
                        continue
                    smiles_list = smi_val if isinstance(smi_val, list) else [smi_val]
                    for smi in smiles_list:
                        mol = Chem.MolFromSmiles(smi)
                        if mol:
                            # Average MW (MolWt), not monoisotopic: this is
                            # the same quantity the fragment path compares
                            # against the CSV MW.
                            mw = Descriptors.MolWt(mol)
                            delta = abs(mw - csv_mw)
                            if delta < best_text_delta:
                                best_text_delta = delta
                                best_text_se = se
                                best_text_mw = mw
                                best_text_display = scheme_display
                if best_text_se is not None:
                    enrichment.matches.append(_make_match(
                        reagent,
                        best_text_se["element_id"],
                        best_text_se["position"],
                        best_text_display,
                        False,
                    ))
                    matched_csv_indices.add(i)
                    log(f"MW-via-SMILES match: CSV '{reagent.name}' "
                        f"(MW={csv_mw:.1f}) -> scheme '{best_text_display}' "
                        f"(MW={best_text_mw:.1f}, delta={best_text_delta:.2f})")

    # --- Identify substrate (SM for run arrow) ---
    # Candidates are matches flagged is_substrate in the CSV.
    substrate_candidates = [
        m for m in enrichment.matches
        if m.csv_is_substrate
    ]
    if substrate_candidates:
        # Prefer equiv=1.0 substrate; if none, use largest MW
        eq1_substrates = [m for m in substrate_candidates
                          if _format_equiv(m.csv_equiv) == "1"]
        if eq1_substrates:
            enrichment.substrate = max(eq1_substrates, key=lambda m: m.csv_mw)
        else:
            enrichment.substrate = max(substrate_candidates, key=lambda m: m.csv_mw)
        enrichment.sm_mass = enrichment.substrate.csv_mass.strip()
        log(f"Substrate: '{enrichment.substrate.csv_name}' "
            f"(mass={enrichment.sm_mass})")

    # Report unmatched
    for i, reagent in enumerate(exp.reactants):
        if i not in matched_csv_indices:
            log(f"WARNING: Unmatched CSV reactant: '{reagent.name}' "
                f"(MW={reagent.mw})")

    return enrichment
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _compute_fragment_mw(frag: ET.Element) -> Optional[float]:
    """Compute MW from a CDXML fragment element.

    Resolvers are tried in order and the first non-None result wins:
      1. ChemScript SMILES -> RDKit MolWt
      2. RDKit-direct from the CDXML fragment (no ChemScript needed)
      3. Manual atom counting (less accurate for heteroatoms)

    Returns None if fragment has no atoms.
    """
    preferred_resolvers = (
        _compute_fragment_mw_via_smiles,    # Tier 1: ChemScript + RDKit
        _compute_fragment_mw_rdkit_direct,  # Tier 2: RDKit only
    )
    for resolver in preferred_resolvers:
        result = resolver(frag)
        if result is not None:
            return result

    # Tier 3: manual atom counting
    return _compute_fragment_mw_manual(frag)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _compute_fragment_mw_rdkit_direct(frag: ET.Element) -> Optional[float]:
    """Compute MW directly from a CDXML fragment via RDKit (no ChemScript).

    Delegates to rdkit_utils.frag_to_mw().  Best-effort: any failure,
    whether the import or the conversion itself, gives None so that the
    caller can fall back to another method.
    """
    result = None
    try:
        from ...rdkit_utils import frag_to_mw
        result = frag_to_mw(frag)
    except Exception:
        # ImportError is an Exception subclass, so a missing module is
        # handled here as well.
        result = None
    return result
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _compute_fragment_mw_via_smiles(frag: ET.Element) -> Optional[float]:
    """Compute MW via ChemScript SMILES export + RDKit.

    The fragment is wrapped in a minimal CDXML document, written to a
    temporary file, exported to SMILES through ChemScriptBridge, and the
    SMILES is parsed by RDKit.

    Returns
    -------
    Average molecular weight (Descriptors.MolWt), or None when
    ChemScript/RDKit cannot be imported, the export yields no SMILES,
    RDKit cannot parse it, or any step raises.
    """
    try:
        from ...chemdraw.chemscript_bridge import ChemScriptBridge
        from rdkit import Chem
        from rdkit.Chem import Descriptors
    except ImportError:
        return None

    import tempfile

    # Wrap fragment in minimal CDXML document
    frag_xml = ET.tostring(frag, encoding="unicode")
    cdxml_doc = (
        CDXML_MINIMAL_HEADER + "\n<page id=\"1\">\n"
        + frag_xml
        + "\n</page>\n" + CDXML_FOOTER
    )

    tmp_path = None
    try:
        bridge = ChemScriptBridge()
        # Write temp CDXML file for ChemScript.  delete=False because the
        # file is handed to the bridge by name after it has been closed;
        # it is removed in the finally clause below.
        with tempfile.NamedTemporaryFile(
            suffix=".cdxml", delete=False, mode="w", encoding="utf-8"
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(cdxml_doc)

        smiles = bridge.write_data(tmp_path, "chemical/x-smiles")

        if not smiles or not smiles.strip():
            return None

        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is None:
            return None

        # Use average MW (MolWt) to match CSV values, not monoisotopic
        return Descriptors.MolWt(mol)
    except Exception:
        # Best-effort tier: the caller falls back to other MW methods.
        return None
    finally:
        # Always remove the temp file, including when the export raised.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def _compute_fragment_mw_manual(frag: ET.Element) -> Optional[float]:
    """Fallback: compute approximate MW from CDXML atom elements.

    Sums atomic weights of the <n> nodes (a missing ``Element`` attribute
    means carbon) and adds hydrogens:

    * if a node has ``NumHydrogens``, that count is used;
    * otherwise, for carbon only, implicit H is ``4 - sum(bond orders)``,
      floored at 0.  Fractional orders such as "1.5" are honoured;
      unparseable orders count as 1.

    Heteroatom implicit H requires NumHydrogens to be present.  Elements
    missing from the weight table contribute 0.  Nodes with
    ``NodeType="Fragment"`` are replaced by the atoms of their inner
    <fragment>; ``ExternalConnectionPoint`` nodes are skipped.

    Returns
    -------
    The approximate MW, or None if no atoms were found.
    """
    import math

    ATOMIC_WEIGHTS = {
        1: 1.008, 5: 10.81, 6: 12.011, 7: 14.007, 8: 15.999,
        9: 18.998, 14: 28.086, 15: 30.974, 16: 32.065, 17: 35.453,
        35: 79.904, 53: 126.904, 11: 22.990, 19: 39.098,
        46: 106.42, 55: 132.905, 29: 63.546, 30: 65.38,
    }

    def _collect_atoms_bonds(container):
        nodes = {}  # id -> Element
        bonds = []
        for n in container.findall("n"):
            nid = n.get("id", "")
            node_type = n.get("NodeType", "")
            if node_type == "Fragment":
                # Expanded abbreviation: count the inner atoms instead.
                inner = n.find("fragment")
                if inner is not None:
                    inner_nodes, inner_bonds = _collect_atoms_bonds(inner)
                    nodes.update(inner_nodes)
                    bonds.extend(inner_bonds)
                continue
            if node_type == "ExternalConnectionPoint":
                continue
            if nid:
                nodes[nid] = n
        bonds.extend(container.findall("b"))
        return nodes, bonds

    all_nodes, all_bonds = _collect_atoms_bonds(frag)

    # Tally the bond-order sum per atom id in a single pass over the bonds.
    bond_order_sum = {}
    for b in all_bonds:
        try:
            order = float(b.get("Order", "1"))
        except ValueError:
            order = 1.0
        if not math.isfinite(order):
            order = 1.0
        # A set so that a bond whose two ends are the same atom counts once.
        for end_id in {b.get("B"), b.get("E")}:
            if end_id is not None:
                bond_order_sum[end_id] = bond_order_sum.get(end_id, 0.0) + order

    total_mw = 0.0
    atom_count = 0

    for nid, n in all_nodes.items():
        try:
            elem_num = int(n.get("Element", "6"))
        except ValueError:
            elem_num = 6

        total_mw += ATOMIC_WEIGHTS.get(elem_num, 0)
        atom_count += 1

        nh = n.get("NumHydrogens")
        if nh is not None:
            try:
                total_mw += int(nh) * 1.008
            except ValueError:
                pass
        elif elem_num == 6:
            # Carbon valence 4; int() truncates a half-integer remainder.
            implicit_h = max(0, int(4 - bond_order_sum.get(nid, 0.0)))
            total_mw += implicit_h * 1.008

    return total_mw if atom_count > 0 else None
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
# ---------------------------------------------------------------------------
|
|
598
|
+
# Step 1.5: Reposition non-substrate reactant to above-arrow
|
|
599
|
+
# ---------------------------------------------------------------------------
|
|
600
|
+
|
|
601
|
+
def reposition_reactant_above_arrow(
    root: ET.Element,
    csv_path: str,
    verbose: bool = False,
) -> bool:
    """Reassign non-substrate reactant fragments to the above-arrow slot.

    Applies only when the first ``<step>`` lists at least two ``<fragment>``
    reactants and no ``<fragment>`` above the arrow.  The substrate is the
    CSV reactant flagged ``is_substrate`` (or, failing that, the first one
    whose equivalents parse to ~1.0); it is matched to a reactant fragment
    by molecular weight and stays in place.  Every other reactant fragment
    is moved from ``ReactionStepReactants`` to
    ``ReactionStepObjectsAboveArrow``.

    Only the ``<step>`` attributes are rewritten; no coordinates change
    here.

    Parameters
    ----------
    root : ET.Element
        Parsed CDXML root; its ``page/scheme/step`` element is edited
        in place.
    csv_path : str
        Path handed to ``parse_eln_csv``.
    verbose : bool
        When true, progress messages are printed to stderr.

    Returns
    -------
    True if at least one fragment id was moved, False otherwise.
    """
    from ...perception.eln_csv_parser import parse_eln_csv

    def log(msg: str):
        if verbose:
            print(f" [reposition] {msg}", file=sys.stderr)

    def is_unit_equiv(candidate) -> bool:
        # True when the reagent's equivalents parse to a float within
        # 0.01 of 1.0; unparseable values never qualify.
        try:
            value = float(candidate.equiv)
        except (ValueError, TypeError):
            return False
        return abs(value - 1.0) < 0.01

    exp = parse_eln_csv(csv_path)
    if exp is None:
        log("Could not parse CSV")
        return False

    # Walk page -> scheme -> step, bailing out at the first missing level.
    page = root.find("page")
    if page is None:
        return False
    scheme = page.find("scheme")
    if scheme is None:
        return False
    step = scheme.find("step")
    if step is None:
        return False

    reactant_ids = step.get("ReactionStepReactants", "").split()
    above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()

    # Lookup of direct page children by their non-empty id attribute.
    id_to_el: Dict[str, ET.Element] = {}
    for child in page:
        child_id = child.get("id", "")
        if child_id:
            id_to_el[child_id] = child

    def fragments_for(ids):
        # (id, element) pairs for those ids that resolve to a <fragment>.
        pairs = []
        for ident in ids:
            node = id_to_el.get(ident)
            if node is not None and node.tag == "fragment":
                pairs.append((ident, node))
        return pairs

    reactant_frags = fragments_for(reactant_ids)
    above_frags = fragments_for(above_ids)

    # Precondition: 2+ fragment reactants and no fragment above the arrow.
    too_few = len(reactant_frags) < 2
    already_above = len(above_frags) > 0
    if too_few or already_above:
        if too_few:
            log(f"Only {len(reactant_frags)} fragment reactant(s), "
                f"no repositioning needed")
        if already_above:
            log(f"{len(above_frags)} fragment(s) already above arrow, "
                f"no repositioning needed")
        return False

    log(f"Found {len(reactant_frags)} fragment reactant(s), "
        f"0 fragments above arrow")

    # Substrate from the CSV: explicit flag first, then equiv ~= 1.0.
    substrate_mw = None
    substrate_name = None
    flagged = next((r for r in exp.reactants if r.is_substrate), None)
    if flagged is not None:
        substrate_mw = flagged.mw
        substrate_name = flagged.name
    if substrate_mw is None:
        unit = next((r for r in exp.reactants if is_unit_equiv(r)), None)
        if unit is not None:
            substrate_mw = unit.mw
            substrate_name = unit.name
    if substrate_mw is None or substrate_mw <= 0:
        log("Could not identify substrate MW from CSV")
        return False

    log(f"Substrate from CSV: '{substrate_name}' (MW={substrate_mw:.1f})")

    # Closest reactant fragment by MW, accepted only within the loose
    # tolerance.
    substrate_frag_id = None
    best_delta = float("inf")
    for frag_id, frag_node in reactant_frags:
        frag_mw = _compute_fragment_mw(frag_node)
        if frag_mw is None:
            continue
        delta = abs(frag_mw - substrate_mw)
        log(f" Fragment {frag_id}: MW={frag_mw:.1f}, delta={delta:.1f}")
        if delta < best_delta and delta < MW_MATCH_TOLERANCE_LOOSE:
            best_delta = delta
            substrate_frag_id = frag_id

    if substrate_frag_id is None:
        log("Could not match substrate to any reactant fragment by MW")
        return False

    log(f"Substrate matched to fragment {substrate_frag_id} "
        f"(delta={best_delta:.1f})")

    # Everything except the substrate moves to the above-arrow list.
    moved = False
    new_reactant_ids = list(reactant_ids)
    new_above_ids = list(above_ids)
    for frag_id, _frag_node in reactant_frags:
        if frag_id == substrate_frag_id:
            continue
        if frag_id in new_reactant_ids:
            new_reactant_ids.remove(frag_id)
        new_above_ids.append(frag_id)
        log(f"Moving fragment {frag_id} from reactants to above-arrow")
        moved = True

    if moved:
        step.set("ReactionStepReactants", " ".join(new_reactant_ids))
        step.set("ReactionStepObjectsAboveArrow",
                 " ".join(new_above_ids))

    return moved
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
# ---------------------------------------------------------------------------
|
|
755
|
+
# Step 2: Phase A -- Inject equivalents into text content (before layout)
|
|
756
|
+
# ---------------------------------------------------------------------------
|
|
757
|
+
|
|
758
|
+
def enrich_phase_a(
    root: ET.Element,
    enrichment: EnrichmentData,
    merged_text_id: Optional[str],
    verbose: bool = False,
) -> None:
    """Write equivalents into the arrow text labels (edits *root* in place).

    Collects the non-solvent matches positioned above or below the arrow
    that carry a display name, keyed by their normalized display name, and
    hands them to ``_inject_merged`` when *merged_text_id* is truthy or to
    ``_inject_separate`` otherwise.

    Must be called BEFORE layout (compact + reaction_cleanup) so that
    text widths are correct for arrow length computation.
    """
    def log(msg: str):
        if verbose:
            print(f" [enrich-A] {msg}", file=sys.stderr)

    page = root.find("page")
    if page is None:
        return

    # normalized scheme_display -> MatchedReagent, for text-position
    # reagents only (structures on the left/right are labelled in Phase B).
    text_positions = ("below_arrow", "above_arrow")
    match_by_display: Dict[str, MatchedReagent] = {}
    for candidate in enrichment.matches:
        if candidate.scheme_position not in text_positions:
            continue
        if not candidate.scheme_display:
            continue
        if candidate.is_solvent:
            continue
        key = _normalize_name(candidate.scheme_display)
        match_by_display[key] = candidate

    if not match_by_display:
        log("No text-based equiv matches to inject")
        return

    injector_args = (match_by_display, enrichment.solvent_names, log)
    if merged_text_id:
        _inject_merged(page, merged_text_id, *injector_args)
    else:
        _inject_separate(page, *injector_args)
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def _inject_merged(
|
|
803
|
+
page: ET.Element,
|
|
804
|
+
merged_text_id: str,
|
|
805
|
+
match_by_display: Dict[str, 'MatchedReagent'],
|
|
806
|
+
solvent_names: List[str],
|
|
807
|
+
log,
|
|
808
|
+
) -> None:
|
|
809
|
+
"""Inject equiv into a merged text block (single <t> with newlines)."""
|
|
810
|
+
# Find the merged text element
|
|
811
|
+
merged_el = None
|
|
812
|
+
for el in page:
|
|
813
|
+
if el.get("id") == merged_text_id and el.tag == "t":
|
|
814
|
+
merged_el = el
|
|
815
|
+
break
|
|
816
|
+
|
|
817
|
+
if merged_el is None:
|
|
818
|
+
log(f"WARNING: Merged text element id={merged_text_id} not found")
|
|
819
|
+
return
|
|
820
|
+
|
|
821
|
+
# Extract current text content
|
|
822
|
+
full_text = _get_text_content(merged_el)
|
|
823
|
+
lines = full_text.split("\n")
|
|
824
|
+
|
|
825
|
+
# Build new <s> XML for each line
|
|
826
|
+
new_s_parts = []
|
|
827
|
+
for i, line in enumerate(lines):
|
|
828
|
+
line_stripped = line.strip()
|
|
829
|
+
if not line_stripped:
|
|
830
|
+
continue
|
|
831
|
+
|
|
832
|
+
line_norm = _normalize_name(line_stripped)
|
|
833
|
+
# Check if this line is a condition (time/temp) — skip
|
|
834
|
+
is_condition = bool(
|
|
835
|
+
re.search(r'\d+\s*°', line_stripped)
|
|
836
|
+
or re.search(r'\d+\s*[hm](?:\s|$|,)', line_stripped)
|
|
837
|
+
)
|
|
838
|
+
# Check if this line is a solvent — skip equiv
|
|
839
|
+
is_solvent = line_norm in solvent_names
|
|
840
|
+
|
|
841
|
+
matched = match_by_display.get(line_norm)
|
|
842
|
+
is_last_line = (i == len(lines) - 1)
|
|
843
|
+
|
|
844
|
+
if matched and not is_condition and not is_solvent:
|
|
845
|
+
equiv_str = _format_equiv(matched.csv_equiv)
|
|
846
|
+
# Build formatted reagent name (with subscripts/italics)
|
|
847
|
+
reagent_s_xml = build_formatted_s_xml(line_stripped)
|
|
848
|
+
# Append equiv in plain face; newline must be INSIDE <s> text
|
|
849
|
+
if not is_last_line:
|
|
850
|
+
equiv_s_xml = (
|
|
851
|
+
f'<s font="3" size="10" color="0"> '
|
|
852
|
+
f'({equiv_str} eq.)\n</s>'
|
|
853
|
+
)
|
|
854
|
+
else:
|
|
855
|
+
equiv_s_xml = (
|
|
856
|
+
f'<s font="3" size="10" color="0"> '
|
|
857
|
+
f'({equiv_str} eq.)</s>'
|
|
858
|
+
)
|
|
859
|
+
new_s_parts.append(reagent_s_xml + equiv_s_xml)
|
|
860
|
+
log(f" Merged line '{line_stripped}' -> ({equiv_str} eq.)")
|
|
861
|
+
else:
|
|
862
|
+
# Keep original line with its formatting
|
|
863
|
+
reagent_s_xml = build_formatted_s_xml(line_stripped)
|
|
864
|
+
# Newline must be INSIDE <s> text to be preserved in CDXML
|
|
865
|
+
if not is_last_line:
|
|
866
|
+
new_s_parts.append(
|
|
867
|
+
reagent_s_xml
|
|
868
|
+
+ '<s font="3" size="10" color="0">\n</s>'
|
|
869
|
+
)
|
|
870
|
+
else:
|
|
871
|
+
new_s_parts.append(reagent_s_xml)
|
|
872
|
+
|
|
873
|
+
# Clear existing <s> children and rebuild
|
|
874
|
+
for s in list(merged_el.findall("s")):
|
|
875
|
+
merged_el.remove(s)
|
|
876
|
+
|
|
877
|
+
# Parse and insert new <s> elements
|
|
878
|
+
combined_xml = "".join(new_s_parts)
|
|
879
|
+
|
|
880
|
+
# Wrap for parsing
|
|
881
|
+
wrapper = f"<t>{combined_xml}</t>"
|
|
882
|
+
try:
|
|
883
|
+
temp_t = ET.fromstring(wrapper)
|
|
884
|
+
for s in temp_t.findall("s"):
|
|
885
|
+
merged_el.append(s)
|
|
886
|
+
except ET.ParseError as e:
|
|
887
|
+
log(f"WARNING: Failed to rebuild merged text: {e}")
|
|
888
|
+
log(f" XML was: {wrapper[:200]}...")
|
|
889
|
+
return
|
|
890
|
+
|
|
891
|
+
log(f"Rebuilt merged text block (id={merged_text_id})")
|
|
892
|
+
|
|
893
|
+
|
|
894
|
+
def _inject_separate(
|
|
895
|
+
page: ET.Element,
|
|
896
|
+
match_by_display: Dict[str, 'MatchedReagent'],
|
|
897
|
+
solvent_names: List[str],
|
|
898
|
+
log,
|
|
899
|
+
) -> None:
|
|
900
|
+
"""Inject equiv into separate text elements (non-merged mode)."""
|
|
901
|
+
|
|
902
|
+
for el in page.findall("t"):
|
|
903
|
+
text = _get_text_content(el)
|
|
904
|
+
text_norm = _normalize_name(text)
|
|
905
|
+
|
|
906
|
+
# Skip conditions
|
|
907
|
+
if re.search(r'\d+\s*°', text) or re.search(r'\d+\s*[hm](?:\s|$|,)', text):
|
|
908
|
+
continue
|
|
909
|
+
# Skip solvents
|
|
910
|
+
if text_norm in solvent_names:
|
|
911
|
+
continue
|
|
912
|
+
|
|
913
|
+
matched = match_by_display.get(text_norm)
|
|
914
|
+
if matched is None:
|
|
915
|
+
continue
|
|
916
|
+
|
|
917
|
+
equiv_str = _format_equiv(matched.csv_equiv)
|
|
918
|
+
|
|
919
|
+
# Rebuild <s> children
|
|
920
|
+
for s in list(el.findall("s")):
|
|
921
|
+
el.remove(s)
|
|
922
|
+
|
|
923
|
+
reagent_s_xml = build_formatted_s_xml(text)
|
|
924
|
+
equiv_s_xml = (
|
|
925
|
+
f'<s font="3" size="10" color="0"> '
|
|
926
|
+
f'({equiv_str} eq.)</s>'
|
|
927
|
+
)
|
|
928
|
+
wrapper = f"<t>{reagent_s_xml}{equiv_s_xml}</t>"
|
|
929
|
+
try:
|
|
930
|
+
temp_t = ET.fromstring(wrapper)
|
|
931
|
+
for s in temp_t.findall("s"):
|
|
932
|
+
el.append(s)
|
|
933
|
+
except ET.ParseError as e:
|
|
934
|
+
log(f"WARNING: Failed to rebuild text for '{text}': {e}")
|
|
935
|
+
continue
|
|
936
|
+
|
|
937
|
+
# Recompute bounding box
|
|
938
|
+
recompute_text_bbox(el)
|
|
939
|
+
log(f" Separate text '{text}' -> ({equiv_str} eq.)")
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
# ---------------------------------------------------------------------------
|
|
943
|
+
# Step 3: Phase B -- Post-layout additions (run arrow + eq labels)
|
|
944
|
+
# ---------------------------------------------------------------------------
|
|
945
|
+
|
|
946
|
+
def enrich_phase_b(
    root: ET.Element,
    enrichment: EnrichmentData,
    verbose: bool = False,
) -> None:
    """Add run arrow and structural eq labels after layout.

    Must be called AFTER reaction_cleanup has finalized positions.
    Modifies root in-place.

    Steps, in order:

    1. Locate the reaction arrow (first ``<arrow>`` child of the page,
       else the first ``<graphic>`` with an ``ArrowType``) and read its
       tail x, head x and y.  Unreadable coordinates are reported with a
       warning and the function returns without changes.
    2. For each non-solvent ``above_arrow`` match that resolves to a
       ``<fragment>`` and whose formatted equiv is not ``"1"``: shift the
       fragment up if it is closer than 20 pt to the arrow, add a centered
       ``(X eq.)`` label between fragment and arrow, and register the label
       id in ``ReactionStepObjectsAboveArrow``.
    3. For each such ``reactant`` match: add a centered ``(X eq.)`` label
       12 pt below the fragment (no step metadata is touched).
    4. Create the run arrow when ``sm_mass`` or ``product_obtained`` is
       set.  If no run arrow is created but labels were added, the document
       ``BoundingBox`` is refreshed here so it still covers the new labels.

    Parameters
    ----------
    root : ET.Element
        Parsed CDXML root.
    enrichment : EnrichmentData
        Source of ``matches``, ``sm_mass`` and ``product_obtained``.
    verbose : bool
        When true, progress messages are printed to stderr.
    """
    def log(msg: str):
        if verbose:
            print(f" [enrich-B] {msg}", file=sys.stderr)

    page = root.find("page")
    if page is None:
        return

    # --- Find reaction arrow ---
    arrow_el = _phase_b_find_arrow(page)
    if arrow_el is None:
        log("WARNING: No reaction arrow found")
        return

    # Get arrow coordinates
    geometry = _phase_b_arrow_geometry(arrow_el)
    if geometry is None:
        log("WARNING: Cannot determine arrow position")
        return
    arrow_tail_x, arrow_head_x, arrow_y = geometry

    # Ensure tail_x < head_x
    if arrow_tail_x > arrow_head_x:
        arrow_tail_x, arrow_head_x = arrow_head_x, arrow_tail_x

    arrow_cx = (arrow_tail_x + arrow_head_x) / 2.0
    arrow_len = arrow_head_x - arrow_tail_x
    log(f"Arrow: tail={arrow_tail_x:.1f}, head={arrow_head_x:.1f}, "
        f"y={arrow_y:.2f}, len={arrow_len:.1f}")

    # --- Get step metadata for element positions ---
    scheme = page.find("scheme")
    step = scheme.find("step") if scheme is not None else None
    below_ids = (
        step.get("ReactionStepObjectsBelowArrow", "").split()
        if step is not None else []
    )

    id_to_el: Dict[str, ET.Element] = {}
    for el in page:
        eid = el.get("id", "")
        if eid:
            id_to_el[eid] = el

    # --- ID allocation ---
    next_id = _get_max_id(root) + 1
    next_z = _get_max_z(root) + 1
    labels_added = False

    # --- Above-arrow structure eq labels ---
    for m in enrichment.matches:
        if m.scheme_position != "above_arrow" or m.is_solvent:
            continue

        el = id_to_el.get(m.scheme_element_id)
        if el is None or el.tag != "fragment":
            continue

        equiv_str = _format_equiv(m.csv_equiv)
        if equiv_str == "1":
            continue  # Don't show (1 eq.) for 1.0

        # Bottom edge of the fragment as reported by
        # fragment_bbox_with_label_extension.
        frag_bb = fragment_bbox_with_label_extension(el)
        if frag_bb is None:
            continue
        frag_bottom = frag_bb[3]

        # Shift fragment UP to make room for eq label
        # Ensure at least 20pt between fragment bottom and arrow
        gap_needed = 20.0
        current_gap = arrow_y - frag_bottom
        if current_gap < gap_needed:
            shift_up = gap_needed - current_gap
            _shift_fragment(el, 0, -shift_up)
            frag_bb = fragment_bbox_with_label_extension(el)
            frag_bottom = frag_bb[3]
            log(f" Shifted fragment {m.scheme_element_id} up by {shift_up:.1f}pt")

        # Place eq label midway between fragment bottom and arrow,
        # centered on the arrow midpoint (not the fragment center)
        label_y = (frag_bottom + arrow_y) / 2.0 + 3.0  # +3 for baseline offset
        label_text = f"({equiv_str} eq.)"

        eq_label = _create_text_element(
            next_id, next_z, arrow_cx, label_y, label_text,
            justify="Center",
        )
        page.append(eq_label)
        labels_added = True
        # Add to above-arrow objects in step
        if step is not None:
            above_str = step.get("ReactionStepObjectsAboveArrow", "")
            step.set("ReactionStepObjectsAboveArrow",
                     f"{above_str} {next_id}".strip())

        log(f" Above-arrow eq label: '{label_text}' at ({arrow_cx:.1f}, {label_y:.1f})")
        next_id += 1
        next_z += 1

    # --- Left/right side structure eq labels ---
    for m in enrichment.matches:
        if m.scheme_position != "reactant" or m.is_solvent:
            continue

        el = id_to_el.get(m.scheme_element_id)
        if el is None or el.tag != "fragment":
            continue

        equiv_str = _format_equiv(m.csv_equiv)
        if equiv_str == "1":
            continue  # Don't show (1 eq.) for 1.0

        frag_bb = fragment_bbox_with_label_extension(el)
        if frag_bb is None:
            continue
        frag_bottom = frag_bb[3]
        frag_cx = (frag_bb[0] + frag_bb[2]) / 2.0

        # Place label below fragment (no shifting)
        label_y = frag_bottom + 12.0
        label_text = f"({equiv_str} eq.)"

        eq_label = _create_text_element(
            next_id, next_z, frag_cx, label_y, label_text,
            justify="Center",
        )
        page.append(eq_label)
        labels_added = True
        log(f" Side eq label: '{label_text}' below fragment {m.scheme_element_id}")
        next_id += 1
        next_z += 1

    # --- Run arrow ---
    # NOTE(review): a product_yield without product_obtained does not
    # trigger the run arrow here, although _create_run_arrow would render
    # it -- confirm this is intended.
    if enrichment.sm_mass or enrichment.product_obtained:
        # _create_run_arrow refreshes the document BoundingBox itself.
        _create_run_arrow(
            page, root, arrow_tail_x, arrow_head_x, arrow_y,
            enrichment, id_to_el, below_ids,
            next_id, next_z, log,
        )
    elif labels_added:
        # No run arrow, but labels were added (and a fragment may have been
        # shifted): keep the document BoundingBox in sync with the page.
        _update_document_bbox(root, page)


def _phase_b_find_arrow(page: ET.Element) -> Optional[ET.Element]:
    """Return the page's first ``<arrow>`` child, else the first ``<graphic>`` with ``ArrowType``, else None."""
    for el in page:
        if el.tag == "arrow":
            return el
    # Fallback: look for <graphic> with ArrowType
    for el in page:
        if el.tag == "graphic" and el.get("ArrowType"):
            return el
    return None


def _phase_b_arrow_geometry(
    arrow_el: ET.Element,
) -> Optional[Tuple[float, float, float]]:
    """Return ``(tail_x, head_x, y)`` for *arrow_el*, or None when its coordinates are missing or malformed."""
    try:
        if arrow_el.tag == "arrow":
            head3d = arrow_el.get("Head3D", "")
            tail3d = arrow_el.get("Tail3D", "")
            if head3d and tail3d:
                head_parts = head3d.split()
                tail_parts = tail3d.split()
                return (
                    float(tail_parts[0]),
                    float(head_parts[0]),
                    float(head_parts[1]),
                )
            bb = arrow_el.get("BoundingBox", "").split()
            if len(bb) >= 4:
                return (
                    float(bb[0]),
                    float(bb[2]),
                    (float(bb[1]) + float(bb[3])) / 2.0,
                )
            return None

        bb = arrow_el.get("BoundingBox", "").split()
        if len(bb) >= 4:
            # graphic BoundingBox: head_x, y, tail_x, y (reversed)
            return (float(bb[2]), float(bb[0]), float(bb[1]))
        return None
    except (ValueError, IndexError):
        # Non-numeric or too-short coordinate strings: treat as unreadable.
        return None
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def _create_run_arrow(
    page: ET.Element,
    root: ET.Element,
    arrow_tail_x: float,
    arrow_head_x: float,
    arrow_y: float,
    enrichment: EnrichmentData,
    id_to_el: Dict[str, ET.Element],
    below_ids: List[str],
    next_id: int,
    next_z: int,
    log,
) -> None:
    """Append the run arrow plus its SM-mass and product-yield captions.

    The arrow is drawn 20 pt below the lowest ``<fragment>``/``<t>`` on the
    page (never above *arrow_y* + 20), spanning the same x-range as the
    reaction arrow.  A ``<graphic>`` and the ``<arrow>`` that supersedes it
    are appended to *page*; ``enrichment.sm_mass`` is written
    right-justified before the tail and the obtained/yield text
    left-justified after the head.  Finally the document ``BoundingBox`` is
    refreshed.

    *id_to_el* and *below_ids* are accepted but not read in this function.
    *next_id* / *next_z* are the first free id and Z values.
    """
    # Lowest bottom edge of any fragment or text on the page, starting
    # from the reaction arrow's own y.
    content_bottom = arrow_y
    for child in page:
        if child.tag not in ("fragment", "t"):
            continue
        box = _get_element_bbox(child)
        if box and box[3] > content_bottom:
            content_bottom = box[3]

    # Run arrow y position: below all content
    run_arrow_y = content_bottom + 20.0

    log(f" Run arrow at y={run_arrow_y:.1f} "
        f"(content_bottom={content_bottom:.1f})")

    # Two consecutive ids: the old-style <graphic>, then the <arrow>.
    graphic_id = next_id
    arrow_id = next_id + 1
    next_id += 2

    # Create <graphic> element (the old-style reference)
    graphic = ET.SubElement(page, "graphic")
    for key, value in (
        ("id", str(graphic_id)),
        ("SupersededBy", str(arrow_id)),
        ("BoundingBox",
         f"{arrow_head_x:.2f} {run_arrow_y:.2f} "
         f"{arrow_tail_x:.2f} {run_arrow_y:.2f}"),
        ("Z", str(next_z)),
        ("GraphicType", "Line"),
        ("ArrowType", "FullHead"),
        ("HeadSize", "1000"),
    ):
        graphic.set(key, value)
    next_z += 1

    # Create <arrow> element
    bb_top = run_arrow_y - 1.64
    bb_bot = run_arrow_y + 1.52
    # Center3D / MajorAxisEnd3D / MinorAxisEnd3D (cosmetic, approximated)
    cx_3d = (arrow_tail_x + arrow_head_x) / 2.0 + 290.0
    cy_3d = run_arrow_y + 129.0
    half_len = (arrow_head_x - arrow_tail_x) / 2.0

    arrow = ET.SubElement(page, "arrow")
    for key, value in (
        ("id", str(arrow_id)),
        ("BoundingBox",
         f"{arrow_tail_x:.2f} {bb_top:.2f} "
         f"{arrow_head_x:.2f} {bb_bot:.2f}"),
        ("Z", str(next_z)),
        ("FillType", "None"),
        ("ArrowheadHead", "Full"),
        ("ArrowheadType", "Solid"),
        ("HeadSize", "1000"),
        ("ArrowheadCenterSize", "875"),
        ("ArrowheadWidth", "250"),
        ("Head3D", f"{arrow_head_x:.2f} {run_arrow_y:.2f} 0"),
        ("Tail3D", f"{arrow_tail_x:.2f} {run_arrow_y:.2f} 0"),
        ("Center3D", f"{cx_3d:.2f} {cy_3d:.2f} 0"),
        ("MajorAxisEnd3D", f"{cx_3d + half_len:.2f} {cy_3d:.2f} 0"),
        ("MinorAxisEnd3D", f"{cx_3d:.2f} {cy_3d + half_len:.2f} 0"),
    ):
        arrow.set(key, value)
    next_z += 1

    # Shared caption y for both labels (slightly below the arrow line).
    sm_text_y = run_arrow_y + 2.25

    # --- SM mass text (left of run arrow) ---
    if enrichment.sm_mass:
        sm_x = arrow_tail_x - 4.0  # right edge aligned near arrow tail
        page.append(_create_text_element(
            next_id, next_z, sm_x, sm_text_y,
            enrichment.sm_mass,
            justify="Right",
        ))
        log(f" SM mass: '{enrichment.sm_mass}' at x={arrow_tail_x - 4.0:.1f}")
        next_id += 1
        next_z += 1

    # --- Product yield text (right of run arrow) ---
    if enrichment.product_obtained or enrichment.product_yield:
        yield_parts = [
            part
            for part in (enrichment.product_obtained, enrichment.product_yield)
            if part
        ]
        yield_text = ", ".join(yield_parts)

        yield_x = arrow_head_x + 4.0  # left edge aligned near arrow head
        page.append(_create_text_element(
            next_id, next_z, yield_x, sm_text_y,
            yield_text,
            justify="Left",
        ))
        log(f" Product yield: '{yield_text}' at x={arrow_head_x + 4.0:.1f}")
        next_id += 1
        next_z += 1

    # --- Update document BoundingBox ---
    _update_document_bbox(root, page)
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
# ---------------------------------------------------------------------------
|
|
1248
|
+
# Element creation helpers
|
|
1249
|
+
# ---------------------------------------------------------------------------
|
|
1250
|
+
|
|
1251
|
+
def _create_text_element(
|
|
1252
|
+
elem_id: int,
|
|
1253
|
+
z_order: int,
|
|
1254
|
+
x: float,
|
|
1255
|
+
y: float,
|
|
1256
|
+
text: str,
|
|
1257
|
+
justify: str = "Left",
|
|
1258
|
+
) -> ET.Element:
|
|
1259
|
+
"""Create a standalone <t> element with plain text content."""
|
|
1260
|
+
t = ET.Element("t")
|
|
1261
|
+
t.set("id", str(elem_id))
|
|
1262
|
+
t.set("p", f"{x:.2f} {y:.2f}")
|
|
1263
|
+
t.set("Z", str(z_order))
|
|
1264
|
+
t.set("Warning", "Chemical Interpretation is not possible for this label")
|
|
1265
|
+
t.set("LineHeight", "auto")
|
|
1266
|
+
|
|
1267
|
+
if justify == "Center":
|
|
1268
|
+
t.set("CaptionJustification", "Center")
|
|
1269
|
+
t.set("Justification", "Center")
|
|
1270
|
+
elif justify == "Right":
|
|
1271
|
+
t.set("CaptionJustification", "Right")
|
|
1272
|
+
t.set("Justification", "Right")
|
|
1273
|
+
|
|
1274
|
+
s = ET.SubElement(t, "s")
|
|
1275
|
+
s.set("font", "3")
|
|
1276
|
+
s.set("size", "10")
|
|
1277
|
+
s.set("color", "0")
|
|
1278
|
+
s.text = text
|
|
1279
|
+
|
|
1280
|
+
# Compute bounding box
|
|
1281
|
+
char_w = 5.8
|
|
1282
|
+
line_h = 12.0
|
|
1283
|
+
w = len(text) * char_w
|
|
1284
|
+
|
|
1285
|
+
if justify == "Center":
|
|
1286
|
+
x1 = x - w / 2.0
|
|
1287
|
+
x2 = x + w / 2.0
|
|
1288
|
+
elif justify == "Right":
|
|
1289
|
+
x1 = x - w
|
|
1290
|
+
x2 = x
|
|
1291
|
+
else:
|
|
1292
|
+
x1 = x
|
|
1293
|
+
x2 = x + w
|
|
1294
|
+
|
|
1295
|
+
y1 = y - line_h + 3.0
|
|
1296
|
+
y2 = y + 3.0
|
|
1297
|
+
|
|
1298
|
+
t.set("BoundingBox", f"{x1:.2f} {y1:.2f} {x2:.2f} {y2:.2f}")
|
|
1299
|
+
return t
|
|
1300
|
+
|
|
1301
|
+
|
|
1302
|
+
def _shift_fragment(frag: ET.Element, dx: float, dy: float):
|
|
1303
|
+
"""Shift all coordinates in a fragment by (dx, dy)."""
|
|
1304
|
+
for n in frag.iter("n"):
|
|
1305
|
+
p = n.get("p")
|
|
1306
|
+
if p:
|
|
1307
|
+
parts = p.split()
|
|
1308
|
+
if len(parts) >= 2:
|
|
1309
|
+
nx = float(parts[0]) + dx
|
|
1310
|
+
ny = float(parts[1]) + dy
|
|
1311
|
+
n.set("p", f"{nx:.2f} {ny:.2f}")
|
|
1312
|
+
|
|
1313
|
+
for t in frag.iter("t"):
|
|
1314
|
+
p = t.get("p")
|
|
1315
|
+
if p:
|
|
1316
|
+
parts = p.split()
|
|
1317
|
+
if len(parts) >= 2:
|
|
1318
|
+
nx = float(parts[0]) + dx
|
|
1319
|
+
ny = float(parts[1]) + dy
|
|
1320
|
+
t.set("p", f"{nx:.2f} {ny:.2f}")
|
|
1321
|
+
bb = t.get("BoundingBox")
|
|
1322
|
+
if bb:
|
|
1323
|
+
vals = [float(v) for v in bb.split()]
|
|
1324
|
+
if len(vals) >= 4:
|
|
1325
|
+
vals[0] += dx
|
|
1326
|
+
vals[1] += dy
|
|
1327
|
+
vals[2] += dx
|
|
1328
|
+
vals[3] += dy
|
|
1329
|
+
t.set("BoundingBox",
|
|
1330
|
+
" ".join(f"{v:.2f}" for v in vals))
|
|
1331
|
+
|
|
1332
|
+
bb = frag.get("BoundingBox")
|
|
1333
|
+
if bb:
|
|
1334
|
+
vals = [float(v) for v in bb.split()]
|
|
1335
|
+
if len(vals) >= 4:
|
|
1336
|
+
vals[0] += dx
|
|
1337
|
+
vals[1] += dy
|
|
1338
|
+
vals[2] += dx
|
|
1339
|
+
vals[3] += dy
|
|
1340
|
+
frag.set("BoundingBox",
|
|
1341
|
+
" ".join(f"{v:.2f}" for v in vals))
|
|
1342
|
+
|
|
1343
|
+
# Inner fragments (abbreviation groups)
|
|
1344
|
+
for inner in frag.iter("fragment"):
|
|
1345
|
+
if inner is not frag:
|
|
1346
|
+
ib = inner.get("BoundingBox")
|
|
1347
|
+
if ib:
|
|
1348
|
+
vals = [float(v) for v in ib.split()]
|
|
1349
|
+
if len(vals) >= 4:
|
|
1350
|
+
vals[0] += dx
|
|
1351
|
+
vals[1] += dy
|
|
1352
|
+
vals[2] += dx
|
|
1353
|
+
vals[3] += dy
|
|
1354
|
+
inner.set("BoundingBox",
|
|
1355
|
+
" ".join(f"{v:.2f}" for v in vals))
|
|
1356
|
+
|
|
1357
|
+
|
|
1358
|
+
def _get_element_bbox(el: ET.Element) -> Optional[Tuple[float, float, float, float]]:
|
|
1359
|
+
"""Get bounding box for any element."""
|
|
1360
|
+
if el.tag == "fragment":
|
|
1361
|
+
return fragment_bbox_with_label_extension(el)
|
|
1362
|
+
elif el.tag == "t":
|
|
1363
|
+
bb = el.get("BoundingBox", "")
|
|
1364
|
+
if bb:
|
|
1365
|
+
vals = [float(v) for v in bb.split()]
|
|
1366
|
+
if len(vals) >= 4:
|
|
1367
|
+
return (vals[0], vals[1], vals[2], vals[3])
|
|
1368
|
+
# Fallback from p attribute
|
|
1369
|
+
p = el.get("p", "")
|
|
1370
|
+
if p:
|
|
1371
|
+
parts = [float(v) for v in p.split()]
|
|
1372
|
+
text = _get_text_content(el)
|
|
1373
|
+
w = len(text) * 5.8
|
|
1374
|
+
return (parts[0] - w/2, parts[1] - 12.0, parts[0] + w/2, parts[1])
|
|
1375
|
+
return None
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
def _update_document_bbox(root: ET.Element, page: ET.Element):
|
|
1379
|
+
"""Update root CDXML BoundingBox to encompass all content."""
|
|
1380
|
+
min_x = min_y = float('inf')
|
|
1381
|
+
max_x = max_y = float('-inf')
|
|
1382
|
+
|
|
1383
|
+
for el in page:
|
|
1384
|
+
bb = _get_element_bbox(el)
|
|
1385
|
+
if bb is None:
|
|
1386
|
+
continue
|
|
1387
|
+
min_x = min(min_x, bb[0])
|
|
1388
|
+
min_y = min(min_y, bb[1])
|
|
1389
|
+
max_x = max(max_x, bb[2])
|
|
1390
|
+
max_y = max(max_y, bb[3])
|
|
1391
|
+
|
|
1392
|
+
if min_x < float('inf'):
|
|
1393
|
+
root.set("BoundingBox",
|
|
1394
|
+
f"{min_x:.2f} {min_y:.2f} {max_x:.2f} {max_y:.2f}")
|