cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1394 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ eln_enrichment.py -- Enrich a polished reaction scheme with ELN CSV data.
4
+
5
+ Given a polished CDXML (from scheme_polisher) and a Findmolecule ELN CSV,
6
+ annotates the scheme with:
7
+ - Equivalents on each reagent (text labels and above-arrow structures)
8
+ - A "run arrow" below the scheme showing SM mass and product yield
9
+
10
+ Two-phase design:
11
+ Phase A (before layout): Inject equivalents into text content so that
12
+ text widths are correct for arrow length computation.
13
+ Phase B (after layout): Add run arrow, above-arrow eq labels, and
14
+ side eq labels using finalized positions.
15
+
16
+ Usage (via scheme_polisher_v2.py):
17
+ python scheme_polisher_v2.py input.cdx --eln-csv experiment.csv -o out.cdxml
18
+ """
19
+
20
+ import os
21
+ import re
22
+ import sys
23
+ import xml.etree.ElementTree as ET
24
+ from dataclasses import dataclass, field
25
+ from typing import Dict, List, Optional, Tuple
26
+ from xml.sax.saxutils import escape as xml_escape
27
+
28
+ from ...cdxml_utils import (
29
+ fragment_bbox,
30
+ fragment_bbox_with_label_extension,
31
+ fragment_bottom_has_hanging_label,
32
+ recompute_text_bbox,
33
+ )
34
+ from ...constants import (
35
+ CDXML_FOOTER,
36
+ CDXML_MINIMAL_HEADER,
37
+ MW_MATCH_TOLERANCE,
38
+ MW_MATCH_TOLERANCE_LOOSE,
39
+ )
40
+ from ...text_formatting import build_formatted_s_xml
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Data structures
45
+ # ---------------------------------------------------------------------------
46
+
47
@dataclass
class MatchedReagent:
    """Pairing of one ELN CSV reagent row with the scheme element it maps to."""

    # --- values taken from the CSV row ---
    csv_name: str
    csv_equiv: str           # equivalents string as found in the CSV, e.g. "2.0"
    csv_mass: str            # mass string, e.g. "2.15 g"
    csv_is_substrate: bool
    csv_mw: float

    # --- where the reagent lives on the scheme ---
    scheme_element_id: str   # id of the matched <t> or <fragment>
    scheme_position: str     # "reactant", "above_arrow" or "below_arrow"
    scheme_display: str      # text shown on the scheme (e.g. "Cs2CO3")

    is_solvent: bool = False
59
+
60
+
61
@dataclass
class EnrichmentData:
    """Container for everything gathered while matching the CSV to the scheme."""

    # Every CSV reagent that was paired with a scheme element.
    matches: List[MatchedReagent] = field(default_factory=list)
    # The matched starting material (equiv=1.0, is_substrate), if one was found.
    substrate: Optional[MatchedReagent] = None
    sm_mass: str = ""            # starting-material mass, e.g. "2.15 g"
    product_obtained: str = ""   # isolated product mass, e.g. "1.6 g"
    product_yield: str = ""      # product yield, e.g. "72%"
    # Solvent names collected from the CSV.
    solvent_names: List[str] = field(default_factory=list)
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Helpers
74
+ # ---------------------------------------------------------------------------
75
+
76
+ def _format_equiv(equiv_str: str) -> str:
77
+ """Format equivalents for display: '2.0' -> '2', '0.05' -> '0.05'."""
78
+ try:
79
+ val = float(equiv_str)
80
+ if val == int(val) and val >= 1:
81
+ return str(int(val))
82
+ # Strip trailing zeros but keep significant decimals
83
+ formatted = f"{val:g}"
84
+ return formatted
85
+ except (ValueError, TypeError):
86
+ return equiv_str
87
+
88
+
89
+ def _get_text_content(el: ET.Element) -> str:
90
+ """Extract concatenated text from all <s> children of a <t> element."""
91
+ parts = []
92
+ for s in el.iter("s"):
93
+ if s.text:
94
+ parts.append(s.text)
95
+ return "".join(parts).strip()
96
+
97
+
98
+ def _normalize_name(name: str) -> str:
99
+ """Normalize a name for comparison: lowercase, strip whitespace."""
100
+ return re.sub(r'\s+', ' ', name.strip().lower())
101
+
102
+
103
+ def _get_max_id(root: ET.Element) -> int:
104
+ """Find the maximum id attribute value in the entire document."""
105
+ max_id = 0
106
+ for el in root.iter():
107
+ eid = el.get("id", "")
108
+ if eid:
109
+ try:
110
+ max_id = max(max_id, int(eid))
111
+ except ValueError:
112
+ pass
113
+ return max_id
114
+
115
+
116
+ def _get_max_z(root: ET.Element) -> int:
117
+ """Find the maximum Z attribute value in the entire document."""
118
+ max_z = 0
119
+ for el in root.iter():
120
+ z = el.get("Z", "")
121
+ if z:
122
+ try:
123
+ max_z = max(max_z, int(z))
124
+ except ValueError:
125
+ pass
126
+ return max_z
127
+
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # Step 1: CSV-to-scheme matching
131
+ # ---------------------------------------------------------------------------
132
+
133
def match_csv_to_scheme(
    root: ET.Element,
    csv_path: str,
    verbose: bool = False,
) -> EnrichmentData:
    """Match CSV reagents/solvents/product to scheme elements.

    Uses two passes:
    1. Name match via reagent_db.resolve_display()
    2. MW match via RDKit (fallback for CSV names that don't resolve):
       first against fragment MWs, then against text labels whose
       reagent_db entry carries a SMILES.

    Both MW comparisons use the *average* molecular weight so that they
    agree with each other and with the MW values compared from the CSV.

    Parameters
    ----------
    root : ET.Element
        Parsed CDXML root element (after polish_scheme).
    csv_path : str
        Path to Findmolecule ELN CSV file.
    verbose : bool
        Print matching details to stderr.

    Returns
    -------
    EnrichmentData with all matches + product info. An empty
    EnrichmentData is returned when the CSV cannot be parsed; a partially
    filled one (solvents/product only) when the document has no
    page/scheme/step.
    """
    from ...perception.eln_csv_parser import parse_eln_csv
    from ...resolve.reagent_db import get_reagent_db

    def log(msg: str):
        if verbose:
            print(f" [enrich] {msg}", file=sys.stderr)

    # Strips an already-present "(X eq.)" suffix from scheme text so that a
    # previously enriched label still matches its CSV name.
    equiv_suffix_re = re.compile(r'\s*\([\d.]+\s*eq\.\)\s*$')

    # Parse CSV
    exp = parse_eln_csv(csv_path)
    if exp is None:
        log("WARNING: Could not parse CSV")
        return EnrichmentData()

    db = get_reagent_db()
    enrichment = EnrichmentData()

    # Collect solvent names from CSV
    for s in exp.solvents:
        enrichment.solvent_names.append(_normalize_name(s.name))

    # Product info
    if exp.product:
        enrichment.product_obtained = exp.product.obtained_mass.strip()
        enrichment.product_yield = exp.product.yield_pct.strip()

    # --- Build scheme element inventory ---
    page = root.find("page")
    if page is None:
        return enrichment

    scheme = page.find("scheme")
    step = scheme.find("step") if scheme is not None else None
    if step is None:
        return enrichment

    reactant_ids = step.get("ReactionStepReactants", "").split()
    above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
    below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()

    # Build id -> element map (direct children of <page> only)
    id_to_el: Dict[str, ET.Element] = {}
    for el in page:
        eid = el.get("id", "")
        if eid:
            id_to_el[eid] = el

    # One dict per matchable item. A multi-line <t> contributes one entry
    # per non-empty line, all sharing the same element_id.
    scheme_elements: List[Dict] = []

    def _add_element(eid: str, position: str):
        el = id_to_el.get(eid)
        if el is None:
            return
        if el.tag == "t":
            text = _get_text_content(el)
            # For merged text blocks, split into lines
            lines = [l.strip() for l in text.split("\n") if l.strip()]
            for line in lines:
                scheme_elements.append({
                    "element_id": eid,
                    "position": position,
                    "display": line,
                    "tag": "t",
                    "is_line_in_merged": len(lines) > 1,
                })
        elif el.tag == "fragment":
            # Fragments carry no display text; they are matched by MW only.
            frag_mw = _compute_fragment_mw(el)
            scheme_elements.append({
                "element_id": eid,
                "position": position,
                "display": None,
                "tag": "fragment",
                "mw": frag_mw,
                "is_line_in_merged": False,
            })

    for rid in reactant_ids:
        _add_element(rid, "reactant")
    for eid in above_ids:
        _add_element(eid, "above_arrow")
    for eid in below_ids:
        _add_element(eid, "below_arrow")

    # --- Pass 1: Name match ---
    matched_csv_indices = set()
    matched_scheme_ids = set()

    for i, reagent in enumerate(exp.reactants):
        csv_display = db.resolve_display(reagent.name)
        csv_norm = _normalize_name(csv_display)
        csv_name_norm = _normalize_name(reagent.name)

        for se in scheme_elements:
            # A stand-alone element can be claimed once; lines of a merged
            # block share an id, so they are never excluded by id alone.
            if se["element_id"] in matched_scheme_ids and not se["is_line_in_merged"]:
                continue
            if se["tag"] != "t" or se["display"] is None:
                continue

            scheme_display = se["display"]
            scheme_norm = _normalize_name(scheme_display)

            # Compare: resolved display vs scheme text (ignoring existing equiv annotations)
            scheme_clean = equiv_suffix_re.sub('', scheme_norm)

            if csv_norm == scheme_clean or csv_name_norm == scheme_clean:
                match = MatchedReagent(
                    csv_name=reagent.name,
                    csv_equiv=reagent.equiv,
                    csv_mass=reagent.mass,
                    csv_is_substrate=reagent.is_substrate,
                    csv_mw=reagent.mw,
                    scheme_element_id=se["element_id"],
                    scheme_position=se["position"],
                    scheme_display=scheme_display,
                    is_solvent=_normalize_name(reagent.name) in enrichment.solvent_names,
                )
                enrichment.matches.append(match)
                matched_csv_indices.add(i)
                if not se["is_line_in_merged"]:
                    matched_scheme_ids.add(se["element_id"])
                log(f"Name match: CSV '{reagent.name}' -> scheme '{scheme_display}' "
                    f"(pos={se['position']}, equiv={reagent.equiv})")
                break

    # Also match solvents by name (they appear in scheme text but don't get equiv).
    # This loop only produces log output; it records nothing.
    for solvent in exp.solvents:
        solv_display = db.resolve_display(solvent.name)
        solv_norm = _normalize_name(solv_display)
        solv_name_norm = _normalize_name(solvent.name)

        for se in scheme_elements:
            if se["tag"] != "t" or se["display"] is None:
                continue
            scheme_norm = _normalize_name(se["display"])
            scheme_clean = equiv_suffix_re.sub('', scheme_norm)
            if solv_norm == scheme_clean or solv_name_norm == scheme_clean:
                log(f"Solvent match: CSV '{solvent.name}' -> scheme '{se['display']}'")
                break

    # --- Pass 2: MW match (fallback for unmatched CSV reactants) ---
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors
        _has_rdkit = True
    except ImportError:
        _has_rdkit = False

    if _has_rdkit:
        for i, reagent in enumerate(exp.reactants):
            if i in matched_csv_indices:
                continue

            csv_mw = reagent.mw
            if csv_mw <= 0:
                continue

            # Try to match against fragment MW — pick closest within window
            best_se = None
            best_delta = MW_MATCH_TOLERANCE  # threshold
            for se in scheme_elements:
                se_id = se["element_id"]
                if se_id in matched_scheme_ids:
                    continue
                if se["tag"] != "fragment":
                    continue
                frag_mw = se.get("mw")
                if frag_mw is None or frag_mw <= 0:
                    continue
                delta = abs(frag_mw - csv_mw)
                if delta < best_delta:
                    best_delta = delta
                    best_se = se
            if best_se is not None:
                se_id = best_se["element_id"]
                frag_mw = best_se["mw"]
                match = MatchedReagent(
                    csv_name=reagent.name,
                    csv_equiv=reagent.equiv,
                    csv_mass=reagent.mass,
                    csv_is_substrate=reagent.is_substrate,
                    csv_mw=reagent.mw,
                    scheme_element_id=se_id,
                    scheme_position=best_se["position"],
                    scheme_display=f"fragment_{se_id}",
                    is_solvent=False,
                )
                enrichment.matches.append(match)
                matched_csv_indices.add(i)
                matched_scheme_ids.add(se_id)
                log(f"MW match: CSV '{reagent.name}' (MW={csv_mw:.1f}) -> "
                    f"fragment {se_id} (MW={frag_mw:.1f}, delta={best_delta:.2f}, "
                    f"pos={best_se['position']}, equiv={reagent.equiv})")

            # Also try matching against text elements by resolving their
            # display name to SMILES (via reagent_db) and computing MW
            # Pick closest match within window
            if i not in matched_csv_indices:
                best_text_se = None
                best_text_mw = None
                best_text_delta = MW_MATCH_TOLERANCE  # threshold
                best_text_display = None
                for se in scheme_elements:
                    if se["tag"] != "t" or se["display"] is None:
                        continue
                    se_id = se["element_id"]
                    scheme_display = se["display"]
                    # Check not already matched as a line in merged text
                    already_matched_line = False
                    for existing in enrichment.matches:
                        if (existing.scheme_element_id == se_id
                                and existing.scheme_display == scheme_display):
                            already_matched_line = True
                            break
                    if already_matched_line:
                        continue
                    # Look up the entry in reagent_db by scheme display name
                    entry = db.entry_for_name(
                        _normalize_name(scheme_display).replace(" ", "")
                    )
                    if entry is None:
                        continue
                    smi_val = entry.get("smiles")
                    if not smi_val:
                        continue
                    smiles_list = smi_val if isinstance(smi_val, list) else [smi_val]
                    for smi in smiles_list:
                        mol = Chem.MolFromSmiles(smi)
                        if mol:
                            # Average MW (MolWt), not monoisotopic mass:
                            # keeps this path consistent with the fragment
                            # MW path and with the CSV MW it is compared to.
                            mw = Descriptors.MolWt(mol)
                            delta = abs(mw - csv_mw)
                            if delta < best_text_delta:
                                best_text_delta = delta
                                best_text_se = se
                                best_text_mw = mw
                                best_text_display = scheme_display
                if best_text_se is not None:
                    match = MatchedReagent(
                        csv_name=reagent.name,
                        csv_equiv=reagent.equiv,
                        csv_mass=reagent.mass,
                        csv_is_substrate=reagent.is_substrate,
                        csv_mw=reagent.mw,
                        scheme_element_id=best_text_se["element_id"],
                        scheme_position=best_text_se["position"],
                        scheme_display=best_text_display,
                        is_solvent=False,
                    )
                    enrichment.matches.append(match)
                    matched_csv_indices.add(i)
                    log(f"MW-via-SMILES match: CSV '{reagent.name}' "
                        f"(MW={csv_mw:.1f}) -> scheme '{best_text_display}' "
                        f"(MW={best_text_mw:.1f}, delta={best_text_delta:.2f})")

    # --- Identify substrate (SM for run arrow) ---
    # Candidates are matched reagents flagged is_substrate in the CSV.
    # If multiple substrates, use the one with largest MW (main SM).
    substrate_candidates = [
        m for m in enrichment.matches
        if m.csv_is_substrate
    ]
    if substrate_candidates:
        # Prefer equiv=1.0 substrate; if none, use largest MW
        eq1_substrates = [m for m in substrate_candidates
                          if _format_equiv(m.csv_equiv) == "1"]
        if eq1_substrates:
            enrichment.substrate = max(eq1_substrates, key=lambda m: m.csv_mw)
        else:
            enrichment.substrate = max(substrate_candidates, key=lambda m: m.csv_mw)
        enrichment.sm_mass = enrichment.substrate.csv_mass.strip()
        log(f"Substrate: '{enrichment.substrate.csv_name}' "
            f"(mass={enrichment.sm_mass})")

    # Report unmatched
    for i, reagent in enumerate(exp.reactants):
        if i not in matched_csv_indices:
            log(f"WARNING: Unmatched CSV reactant: '{reagent.name}' "
                f"(MW={reagent.mw})")

    return enrichment
441
+
442
+
443
def _compute_fragment_mw(frag: ET.Element) -> Optional[float]:
    """Return the molecular weight of a CDXML <fragment>, or None.

    Resolution order (first non-None answer wins):
    1. ChemScript SMILES export fed to RDKit
    2. RDKit directly on the CDXML fragment (no ChemScript needed)
    3. Manual atom counting (less accurate for heteroatoms)

    The manual tier returns None when the fragment has no atoms.
    """
    preferred_resolvers = (
        _compute_fragment_mw_via_smiles,
        _compute_fragment_mw_rdkit_direct,
    )
    for resolver in preferred_resolvers:
        weight = resolver(frag)
        if weight is not None:
            return weight

    # Last resort: whatever the manual counter says (possibly None).
    return _compute_fragment_mw_manual(frag)
464
+
465
+
466
+ def _compute_fragment_mw_rdkit_direct(frag: ET.Element) -> Optional[float]:
467
+ """Compute MW directly from CDXML fragment via RDKit (no ChemScript).
468
+
469
+ Uses rdkit_utils.frag_to_mw() which converts CDXML atoms/bonds to
470
+ an RDKit Mol and computes average MW. Returns None if the fragment
471
+ contains abbreviation groups (element 0 / dummy atoms).
472
+ """
473
+ try:
474
+ from ...rdkit_utils import frag_to_mw
475
+ return frag_to_mw(frag)
476
+ except ImportError:
477
+ return None
478
+ except Exception:
479
+ return None
480
+
481
+
482
+ def _compute_fragment_mw_via_smiles(frag: ET.Element) -> Optional[float]:
483
+ """Compute MW via ChemScript SMILES export + RDKit."""
484
+ try:
485
+ from ...chemdraw.chemscript_bridge import ChemScriptBridge
486
+ from rdkit import Chem
487
+ from rdkit.Chem import Descriptors
488
+ except ImportError:
489
+ return None
490
+
491
+ import tempfile
492
+
493
+ # Wrap fragment in minimal CDXML document
494
+ frag_xml = ET.tostring(frag, encoding="unicode")
495
+ cdxml_doc = (
496
+ CDXML_MINIMAL_HEADER + "\n<page id=\"1\">\n"
497
+ + frag_xml
498
+ + "\n</page>\n" + CDXML_FOOTER
499
+ )
500
+
501
+ try:
502
+ bridge = ChemScriptBridge()
503
+ # Write temp CDXML file for ChemScript
504
+ tmp = tempfile.NamedTemporaryFile(
505
+ suffix=".cdxml", delete=False, mode="w", encoding="utf-8"
506
+ )
507
+ tmp.write(cdxml_doc)
508
+ tmp.close()
509
+
510
+ smiles = bridge.write_data(tmp.name, "chemical/x-smiles")
511
+ os.unlink(tmp.name)
512
+
513
+ if not smiles or not smiles.strip():
514
+ return None
515
+
516
+ mol = Chem.MolFromSmiles(smiles.strip())
517
+ if mol is None:
518
+ return None
519
+
520
+ # Use average MW (MolWt) to match CSV values, not monoisotopic
521
+ return Descriptors.MolWt(mol)
522
+ except Exception:
523
+ return None
524
+
525
+
526
+ def _compute_fragment_mw_manual(frag: ET.Element) -> Optional[float]:
527
+ """Fallback: compute approximate MW from CDXML atom elements.
528
+
529
+ Counts atoms by element type, adds implicit H from NumHydrogens
530
+ attribute. Only estimates implicit H for carbon (valence 4);
531
+ heteroatom implicit H requires NumHydrogens to be present.
532
+ """
533
+ ATOMIC_WEIGHTS = {
534
+ 1: 1.008, 5: 10.81, 6: 12.011, 7: 14.007, 8: 15.999,
535
+ 9: 18.998, 14: 28.086, 15: 30.974, 16: 32.065, 17: 35.453,
536
+ 35: 79.904, 53: 126.904, 11: 22.990, 19: 39.098,
537
+ 46: 106.42, 55: 132.905, 29: 63.546, 30: 65.38,
538
+ }
539
+
540
+ def _collect_atoms_bonds(container):
541
+ nodes = {} # id -> Element
542
+ bonds = []
543
+ for n in container.findall("n"):
544
+ nid = n.get("id", "")
545
+ node_type = n.get("NodeType", "")
546
+ if node_type == "Fragment":
547
+ inner = n.find("fragment")
548
+ if inner is not None:
549
+ inner_nodes, inner_bonds = _collect_atoms_bonds(inner)
550
+ nodes.update(inner_nodes)
551
+ bonds.extend(inner_bonds)
552
+ continue
553
+ if node_type == "ExternalConnectionPoint":
554
+ continue
555
+ if nid:
556
+ nodes[nid] = n
557
+ bonds.extend(container.findall("b"))
558
+ return nodes, bonds
559
+
560
+ all_nodes, all_bonds = _collect_atoms_bonds(frag)
561
+
562
+ total_mw = 0.0
563
+ atom_count = 0
564
+
565
+ for nid, n in all_nodes.items():
566
+ elem = n.get("Element", "6")
567
+ try:
568
+ elem_num = int(elem)
569
+ except ValueError:
570
+ elem_num = 6
571
+
572
+ weight = ATOMIC_WEIGHTS.get(elem_num, 0)
573
+ total_mw += weight
574
+ atom_count += 1
575
+
576
+ nh = n.get("NumHydrogens")
577
+ if nh is not None:
578
+ try:
579
+ total_mw += int(nh) * 1.008
580
+ except ValueError:
581
+ pass
582
+ elif elem_num == 6:
583
+ bond_count = 0
584
+ for b in all_bonds:
585
+ if b.get("B") == nid or b.get("E") == nid:
586
+ order = b.get("Order", "1")
587
+ try:
588
+ bond_count += int(order)
589
+ except ValueError:
590
+ bond_count += 1
591
+ implicit_h = max(0, 4 - bond_count)
592
+ total_mw += implicit_h * 1.008
593
+
594
+ return total_mw if atom_count > 0 else None
595
+
596
+
597
+ # ---------------------------------------------------------------------------
598
+ # Step 1.5: Reposition non-substrate reactant to above-arrow
599
+ # ---------------------------------------------------------------------------
600
+
601
def reposition_reactant_above_arrow(
    root: ET.Element,
    csv_path: str,
    verbose: bool = False,
) -> bool:
    """Reassign non-substrate reactant fragments to the above-arrow slot.

    Applies when at least two <fragment> reactants are listed left of the
    arrow and no <fragment> is listed above it. The substrate is taken
    from the ELN CSV (first reactant flagged ``is_substrate``; otherwise
    the first one with equiv ~ 1.0) and paired with the reactant fragment
    whose MW is closest, within ``MW_MATCH_TOLERANCE_LOOSE``. Every other
    reactant fragment is moved; the substrate stays on the left.

    Only ``<step>`` metadata is edited (``ReactionStepReactants`` and
    ``ReactionStepObjectsAboveArrow``). Physical repositioning is
    handled downstream by ``reaction_cleanup``'s ``_stack_above_below``.

    Parameters
    ----------
    root : ET.Element
        Parsed CDXML root (after scheme_polisher).
    csv_path : str
        Path to Findmolecule ELN CSV file.
    verbose : bool
        Print details to stderr.

    Returns
    -------
    True if a fragment was repositioned, False otherwise.
    """
    from ...perception.eln_csv_parser import parse_eln_csv

    def log(msg: str):
        if verbose:
            print(f" [reposition] {msg}", file=sys.stderr)

    # The CSV tells us which reagent is the substrate.
    exp = parse_eln_csv(csv_path)
    if exp is None:
        log("Could not parse CSV")
        return False

    # Locate <page>/<scheme>/<step>; bail out quietly if any is missing.
    page = root.find("page")
    if page is None:
        return False
    scheme = page.find("scheme")
    if scheme is None:
        return False
    step = scheme.find("step")
    if step is None:
        return False

    reactant_ids = step.get("ReactionStepReactants", "").split()
    above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()

    # Direct children of <page>, indexed by their (non-empty) id.
    id_to_el: Dict[str, ET.Element] = {
        child.get("id"): child for child in page if child.get("id", "")
    }

    def _fragments_for(ids):
        """Return (id, element) pairs for the ids that resolve to <fragment>."""
        pairs = []
        for ident in ids:
            node = id_to_el.get(ident)
            if node is not None and node.tag == "fragment":
                pairs.append((ident, node))
        return pairs

    reactant_frags = _fragments_for(reactant_ids)
    above_frags = _fragments_for(above_ids)

    # Precondition: 2+ fragment reactants, 0 fragments above the arrow.
    too_few_reactants = len(reactant_frags) < 2
    above_occupied = len(above_frags) > 0
    if too_few_reactants or above_occupied:
        if too_few_reactants:
            log(f"Only {len(reactant_frags)} fragment reactant(s), "
                f"no repositioning needed")
        if above_occupied:
            log(f"{len(above_frags)} fragment(s) already above arrow, "
                f"no repositioning needed")
        return False

    log(f"Found {len(reactant_frags)} fragment reactant(s), "
        f"0 fragments above arrow")

    # Substrate from CSV: explicit flag first ...
    substrate_mw = None
    substrate_name = None
    flagged = next((r for r in exp.reactants if r.is_substrate), None)
    if flagged is not None:
        substrate_mw = flagged.mw
        substrate_name = flagged.name
    # ... then fall back to the first reagent at ~1.0 equivalents.
    if substrate_mw is None:
        for candidate in exp.reactants:
            try:
                eq = float(candidate.equiv)
            except (ValueError, TypeError):
                continue
            if abs(eq - 1.0) < 0.01:
                substrate_mw = candidate.mw
                substrate_name = candidate.name
                break
    if substrate_mw is None or substrate_mw <= 0:
        log("Could not identify substrate MW from CSV")
        return False

    log(f"Substrate from CSV: '{substrate_name}' (MW={substrate_mw:.1f})")

    # Closest-MW reactant fragment within the loose tolerance is the substrate.
    substrate_frag_id = None
    best_delta = float("inf")
    for fid, frag_el in reactant_frags:
        frag_mw = _compute_fragment_mw(frag_el)
        if frag_mw is None:
            continue
        delta = abs(frag_mw - substrate_mw)
        log(f" Fragment {fid}: MW={frag_mw:.1f}, delta={delta:.1f}")
        if delta < best_delta and delta < MW_MATCH_TOLERANCE_LOOSE:
            best_delta = delta
            substrate_frag_id = fid

    if substrate_frag_id is None:
        log("Could not match substrate to any reactant fragment by MW")
        return False

    log(f"Substrate matched to fragment {substrate_frag_id} "
        f"(delta={best_delta:.1f})")

    # Every reactant fragment other than the substrate goes above the arrow.
    remaining_reactants = list(reactant_ids)
    promoted_above = list(above_ids)
    moved = False
    for fid, _frag in reactant_frags:
        if fid == substrate_frag_id:
            continue
        if fid in remaining_reactants:
            remaining_reactants.remove(fid)
        promoted_above.append(fid)
        log(f"Moving fragment {fid} from reactants to above-arrow")
        moved = True

    if moved:
        step.set("ReactionStepReactants", " ".join(remaining_reactants))
        step.set("ReactionStepObjectsAboveArrow",
                 " ".join(promoted_above))

    return moved
752
+
753
+
754
+ # ---------------------------------------------------------------------------
755
+ # Step 2: Phase A -- Inject equivalents into text content (before layout)
756
+ # ---------------------------------------------------------------------------
757
+
758
def enrich_phase_a(
    root: ET.Element,
    enrichment: EnrichmentData,
    merged_text_id: Optional[str],
    verbose: bool = False,
) -> None:
    """Write equivalents into the scheme's text labels (mutates *root*).

    With a merged text id: the <s> runs of that merged block are rebuilt.
    Without one: ' (X eq.)' is appended to each matching <t> element.

    Has to run BEFORE layout (compact + reaction_cleanup) so that text
    widths are already final when arrow length is computed.

    Does nothing when the document has no <page> or when no non-solvent
    match sits above or below the arrow.
    """
    def log(msg: str):
        if verbose:
            print(f" [enrich-A] {msg}", file=sys.stderr)

    page = root.find("page")
    if page is None:
        return

    # Lookup keyed by normalized display text. Only above/below-arrow,
    # non-solvent matches with a display string are eligible; for merged
    # text the key corresponds to a single line of the block. When two
    # matches normalize to the same key, the later one wins.
    arrow_positions = ("below_arrow", "above_arrow")
    match_by_display: Dict[str, MatchedReagent] = {
        _normalize_name(m.scheme_display): m
        for m in enrichment.matches
        if m.scheme_position in arrow_positions
        and m.scheme_display
        and not m.is_solvent
    }

    if not match_by_display:
        log("No text-based equiv matches to inject")
        return

    if merged_text_id:
        _inject_merged(page, merged_text_id, match_by_display,
                       enrichment.solvent_names, log)
        return

    _inject_separate(page, match_by_display,
                     enrichment.solvent_names, log)
800
+
801
+
802
def _inject_merged(
    page: ET.Element,
    merged_text_id: str,
    match_by_display: Dict[str, 'MatchedReagent'],
    solvent_names: List[str],
    log,
) -> None:
    """Inject equiv into a merged text block (single <t> with newlines).

    The text of the ``<t>`` child of *page* whose id is ``merged_text_id``
    is split into lines; blank lines are dropped.  Every remaining line is
    re-rendered with ``build_formatted_s_xml`` and, when its normalized
    form is a key of ``match_by_display`` and the line is neither a
    condition (time/temperature) nor a solvent, followed by a plain
    `` (X eq.)`` run.

    The replacement runs are parsed BEFORE the existing ``<s>`` children
    are removed, so a parse failure leaves the label untouched instead of
    empty.

    Args:
        page: the ``<page>`` element that directly contains the text block.
        merged_text_id: id attribute of the merged ``<t>`` element.
        match_by_display: normalized display name -> matched reagent.
        solvent_names: names compared against the normalized line text.
        log: callable taking one message string.
    """
    merged_el = next(
        (el for el in page
         if el.get("id") == merged_text_id and el.tag == "t"),
        None,
    )
    if merged_el is None:
        log(f"WARNING: Merged text element id={merged_text_id} not found")
        return

    # Drop blank lines up front so that "last line" means the last line that
    # is actually emitted; otherwise trailing blank lines would leave a
    # dangling newline at the end of the rebuilt label.
    stripped_lines = (
        raw.strip() for raw in _get_text_content(merged_el).split("\n")
    )
    content_lines = [line for line in stripped_lines if line]
    last_index = len(content_lines) - 1

    new_s_parts = []
    for index, line in enumerate(content_lines):
        line_norm = _normalize_name(line)
        # Time/temperature lines and solvents never receive equivalents.
        is_condition = bool(
            re.search(r'\d+\s*°', line)
            or re.search(r'\d+\s*[hm](?:\s|$|,)', line)
        )
        is_solvent = line_norm in solvent_names
        matched = match_by_display.get(line_norm)

        # The newline must live INSIDE an <s> text to be preserved in CDXML.
        line_break = "" if index == last_index else "\n"

        if matched and not is_condition and not is_solvent:
            equiv_str = _format_equiv(matched.csv_equiv)
            # Formatted reagent name followed by the equiv in plain face.
            new_s_parts.append(
                build_formatted_s_xml(line)
                + '<s font="3" size="10" color="0"> '
                + f'({equiv_str} eq.){line_break}</s>'
            )
            log(f" Merged line '{line}' -> ({equiv_str} eq.)")
        elif line_break:
            new_s_parts.append(
                build_formatted_s_xml(line)
                + f'<s font="3" size="10" color="0">{line_break}</s>'
            )
        else:
            new_s_parts.append(build_formatted_s_xml(line))

    wrapper = "<t>" + "".join(new_s_parts) + "</t>"
    try:
        rebuilt = ET.fromstring(wrapper)
    except ET.ParseError as e:
        log(f"WARNING: Failed to rebuild merged text: {e}")
        log(f" XML was: {wrapper[:200]}...")
        return

    # Only now that the new runs are known to be well-formed is it safe to
    # discard the old ones.
    for old_run in list(merged_el.findall("s")):
        merged_el.remove(old_run)
    for new_run in rebuilt.findall("s"):
        merged_el.append(new_run)

    log(f"Rebuilt merged text block (id={merged_text_id})")
892
+
893
+
894
def _inject_separate(
    page: ET.Element,
    match_by_display: Dict[str, 'MatchedReagent'],
    solvent_names: List[str],
    log,
) -> None:
    """Inject equiv into separate text elements (non-merged mode).

    Every direct ``<t>`` child of *page* whose normalized text is a key of
    ``match_by_display`` -- and which is neither a condition
    (time/temperature) nor a solvent -- has its ``<s>`` runs replaced by
    the formatted reagent name plus a plain `` (X eq.)`` run.  Its
    bounding box is then recomputed with ``recompute_text_bbox``.

    The replacement runs are parsed BEFORE the existing ``<s>`` children
    are removed, so a parse failure leaves that label untouched instead of
    empty.

    Args:
        page: the ``<page>`` element holding the text labels.
        match_by_display: normalized display name -> matched reagent.
        solvent_names: names compared against the normalized label text.
        log: callable taking one message string.
    """
    for el in page.findall("t"):
        text = _get_text_content(el)
        text_norm = _normalize_name(text)

        # Skip conditions
        if (re.search(r'\d+\s*°', text)
                or re.search(r'\d+\s*[hm](?:\s|$|,)', text)):
            continue
        # Skip solvents
        if text_norm in solvent_names:
            continue

        matched = match_by_display.get(text_norm)
        if matched is None:
            continue

        equiv_str = _format_equiv(matched.csv_equiv)
        wrapper = (
            "<t>"
            + build_formatted_s_xml(text)
            + '<s font="3" size="10" color="0"> '
            + f'({equiv_str} eq.)</s>'
            + "</t>"
        )
        try:
            rebuilt = ET.fromstring(wrapper)
        except ET.ParseError as e:
            log(f"WARNING: Failed to rebuild text for '{text}': {e}")
            continue

        # The new runs parsed cleanly, so the old ones can be replaced.
        for old_run in list(el.findall("s")):
            el.remove(old_run)
        for new_run in rebuilt.findall("s"):
            el.append(new_run)

        # The label is wider now; refresh its bounding box.
        recompute_text_bbox(el)
        log(f" Separate text '{text}' -> ({equiv_str} eq.)")
940
+
941
+
942
+ # ---------------------------------------------------------------------------
943
+ # Step 3: Phase B -- Post-layout additions (run arrow + eq labels)
944
+ # ---------------------------------------------------------------------------
945
+
946
def enrich_phase_b(
    root: ET.Element,
    enrichment: EnrichmentData,
    verbose: bool = False,
) -> None:
    """Add run arrow and structural eq labels after layout.

    Must be called AFTER reaction_cleanup has finalized positions.
    Modifies root in-place.

    Steps:
      1. Locate the reaction arrow (first ``<arrow>`` child of the page,
         else the first ``<graphic>`` carrying an ``ArrowType``) and read
         its horizontal span and y position.
      2. For non-solvent ``above_arrow`` matches that refer to a
         ``<fragment>``, add a centered ``(X eq.)`` label between fragment
         and arrow, shifting the fragment up if the gap is under 20pt.
      3. For non-solvent ``reactant`` matches that refer to a
         ``<fragment>``, add a ``(X eq.)`` label below the fragment.
      4. Create the run arrow when an SM mass or obtained product amount
         is available; otherwise refresh the document bounding box if
         anything was added or moved.

    Labels are omitted when the formatted equivalent is exactly ``"1"``.
    Arrow attributes that are missing or malformed produce a warning and an
    early return rather than an exception.

    Args:
        root: CDXML root element; nothing happens if it has no ``<page>``.
        enrichment: source of ``matches``, ``sm_mass`` and
            ``product_obtained``.
        verbose: when true, progress messages are printed to stderr.
    """
    def log(msg: str):
        if verbose:
            print(f" [enrich-B] {msg}", file=sys.stderr)

    def floats(raw: str) -> List[float]:
        # Whitespace-separated floats; [] if any token is not numeric.
        try:
            return [float(tok) for tok in raw.split()]
        except ValueError:
            return []

    def arrow_span(el: ET.Element) -> Optional[Tuple[float, float, float]]:
        # (tail_x, head_x, y) of the arrow element, or None if unavailable.
        box = floats(el.get("BoundingBox", ""))
        if el.tag == "arrow":
            head = floats(el.get("Head3D", ""))
            tail = floats(el.get("Tail3D", ""))
            if len(head) >= 2 and tail:
                return tail[0], head[0], head[1]
            if len(box) >= 4:
                return box[0], box[2], (box[1] + box[3]) / 2.0
            return None
        if len(box) >= 4:
            # graphic BoundingBox: head_x, y, tail_x, y (reversed)
            return box[2], box[0], box[1]
        return None

    page = root.find("page")
    if page is None:
        return

    # --- Find reaction arrow ---
    arrow_el = next((el for el in page if el.tag == "arrow"), None)
    if arrow_el is None:
        # Fallback: look for <graphic> with ArrowType
        arrow_el = next(
            (el for el in page
             if el.tag == "graphic" and el.get("ArrowType")),
            None,
        )
    if arrow_el is None:
        log("WARNING: No reaction arrow found")
        return

    span = arrow_span(arrow_el)
    if span is None:
        log("WARNING: Cannot determine arrow position")
        return
    arrow_tail_x, arrow_head_x, arrow_y = span

    # Ensure tail_x < head_x
    if arrow_tail_x > arrow_head_x:
        arrow_tail_x, arrow_head_x = arrow_head_x, arrow_tail_x

    arrow_cx = (arrow_tail_x + arrow_head_x) / 2.0
    arrow_len = arrow_head_x - arrow_tail_x
    log(f"Arrow: tail={arrow_tail_x:.1f}, head={arrow_head_x:.1f}, "
        f"y={arrow_y:.2f}, len={arrow_len:.1f}")

    # --- Get step metadata ---
    scheme = page.find("scheme")
    step = scheme.find("step") if scheme is not None else None
    below_ids = (
        step.get("ReactionStepObjectsBelowArrow", "").split()
        if step is not None else []
    )

    id_to_el: Dict[str, ET.Element] = {}
    for el in page:
        eid = el.get("id", "")
        if eid:
            id_to_el[eid] = el

    # --- ID allocation ---
    next_id = _get_max_id(root) + 1
    next_z = _get_max_z(root) + 1

    # Set once a label is appended or a fragment is moved, so the document
    # bounding box can be refreshed even when no run arrow is drawn.
    layout_changed = False

    # --- Above-arrow structure eq labels ---
    for m in enrichment.matches:
        if m.scheme_position != "above_arrow" or m.is_solvent:
            continue

        el = id_to_el.get(m.scheme_element_id)
        if el is None or el.tag != "fragment":
            continue

        equiv_str = _format_equiv(m.csv_equiv)
        if equiv_str == "1":
            continue  # Don't show (1 eq.) for 1.0

        frag_bb = fragment_bbox_with_label_extension(el)
        if frag_bb is None:
            continue
        frag_bottom = frag_bb[3]

        # Shift fragment UP to make room for eq label:
        # ensure at least 20pt between fragment bottom and arrow.
        gap_needed = 20.0
        current_gap = arrow_y - frag_bottom
        if current_gap < gap_needed:
            shift_up = gap_needed - current_gap
            _shift_fragment(el, 0, -shift_up)
            layout_changed = True
            shifted_bb = fragment_bbox_with_label_extension(el)
            if shifted_bb is not None:
                frag_bottom = shifted_bb[3]
            else:
                frag_bottom -= shift_up
            log(f" Shifted fragment {m.scheme_element_id} up by {shift_up:.1f}pt")

        # Place eq label midway between fragment bottom and arrow,
        # centered on the arrow midpoint (not the fragment center)
        label_y = (frag_bottom + arrow_y) / 2.0 + 3.0  # +3 for baseline offset
        label_text = f"({equiv_str} eq.)"

        page.append(_create_text_element(
            next_id, next_z, arrow_cx, label_y, label_text,
            justify="Center",
        ))
        layout_changed = True
        # Add to above-arrow objects in step
        if step is not None:
            above_str = step.get("ReactionStepObjectsAboveArrow", "")
            step.set("ReactionStepObjectsAboveArrow",
                     f"{above_str} {next_id}".strip())

        log(f" Above-arrow eq label: '{label_text}' at ({arrow_cx:.1f}, {label_y:.1f})")
        next_id += 1
        next_z += 1

    # --- Left/right side structure eq labels ---
    for m in enrichment.matches:
        if m.scheme_position not in ("reactant",) or m.is_solvent:
            continue

        el = id_to_el.get(m.scheme_element_id)
        if el is None or el.tag != "fragment":
            continue

        equiv_str = _format_equiv(m.csv_equiv)
        if equiv_str == "1":
            continue  # Don't show (1 eq.) for 1.0

        frag_bb = fragment_bbox_with_label_extension(el)
        if frag_bb is None:
            continue
        frag_bottom = frag_bb[3]
        frag_cx = (frag_bb[0] + frag_bb[2]) / 2.0

        # Place label below fragment (no shifting)
        label_y = frag_bottom + 12.0
        label_text = f"({equiv_str} eq.)"

        page.append(_create_text_element(
            next_id, next_z, frag_cx, label_y, label_text,
            justify="Center",
        ))
        layout_changed = True
        log(f" Side eq label: '{label_text}' below fragment {m.scheme_element_id}")
        next_id += 1
        next_z += 1

    # --- Run arrow ---
    if enrichment.sm_mass or enrichment.product_obtained:
        # _create_run_arrow refreshes the document BoundingBox itself.
        _create_run_arrow(
            page, root, arrow_tail_x, arrow_head_x, arrow_y,
            enrichment, id_to_el, below_ids,
            next_id, next_z, log,
        )
    elif layout_changed:
        # No run arrow, but labels were added or a fragment moved: keep the
        # document BoundingBox in step with the page content.
        _update_document_bbox(root, page)
1131
+
1132
+
1133
def _create_run_arrow(
    page: ET.Element,
    root: ET.Element,
    arrow_tail_x: float,
    arrow_head_x: float,
    arrow_y: float,
    enrichment: EnrichmentData,
    id_to_el: Dict[str, ET.Element],
    below_ids: List[str],
    next_id: int,
    next_z: int,
    log,
) -> None:
    """Append the run arrow plus its SM-mass and product-yield captions.

    The run arrow spans the same x range as the reaction arrow and sits
    20 units below the lowest ``<fragment>``/``<t>`` bounding box on the
    page (never higher than 20 units below ``arrow_y``).  It is written
    twice: as a ``<graphic>`` whose ``SupersededBy`` points at the second
    element, an ``<arrow>``.  The SM mass goes to the left of the tail and
    the obtained amount / yield to the right of the head.  Finally the
    document bounding box is refreshed.

    ``id_to_el`` and ``below_ids`` are accepted but not read here.
    Ids and Z values are taken consecutively from ``next_id``/``next_z``.
    """
    # Lowest bottom edge among fragments and text labels.
    lowest = arrow_y
    for child in page:
        if child.tag not in ("fragment", "t"):
            continue
        box = _get_element_bbox(child)
        if box and box[3] > lowest:
            lowest = box[3]

    # Run arrow y position: below all content
    run_arrow_y = lowest + 20.0
    log(f" Run arrow at y={run_arrow_y:.1f} "
        f"(content_bottom={lowest:.1f})")

    graphic_id, arrow_id = next_id, next_id + 1
    graphic_z, arrow_z = next_z, next_z + 1
    caption_id, caption_z = next_id + 2, next_z + 2

    # <graphic>: the old-style reference element.  Its BoundingBox lists the
    # head x first and the tail x second.
    ET.SubElement(page, "graphic", {
        "id": str(graphic_id),
        "SupersededBy": str(arrow_id),
        "BoundingBox": (f"{arrow_head_x:.2f} {run_arrow_y:.2f} "
                        f"{arrow_tail_x:.2f} {run_arrow_y:.2f}"),
        "Z": str(graphic_z),
        "GraphicType": "Line",
        "ArrowType": "FullHead",
        "HeadSize": "1000",
    })

    # <arrow>: Center3D / MajorAxisEnd3D / MinorAxisEnd3D are cosmetic and
    # only approximated.
    half_len = (arrow_head_x - arrow_tail_x) / 2.0
    cx_3d = (arrow_tail_x + arrow_head_x) / 2.0 + 290.0
    cy_3d = run_arrow_y + 129.0
    ET.SubElement(page, "arrow", {
        "id": str(arrow_id),
        "BoundingBox": (f"{arrow_tail_x:.2f} {run_arrow_y - 1.64:.2f} "
                        f"{arrow_head_x:.2f} {run_arrow_y + 1.52:.2f}"),
        "Z": str(arrow_z),
        "FillType": "None",
        "ArrowheadHead": "Full",
        "ArrowheadType": "Solid",
        "HeadSize": "1000",
        "ArrowheadCenterSize": "875",
        "ArrowheadWidth": "250",
        "Head3D": f"{arrow_head_x:.2f} {run_arrow_y:.2f} 0",
        "Tail3D": f"{arrow_tail_x:.2f} {run_arrow_y:.2f} 0",
        "Center3D": f"{cx_3d:.2f} {cy_3d:.2f} 0",
        "MajorAxisEnd3D": f"{cx_3d + half_len:.2f} {cy_3d:.2f} 0",
        "MinorAxisEnd3D": f"{cx_3d:.2f} {cy_3d + half_len:.2f} 0",
    })

    # Both captions share one baseline, slightly below the arrow.
    caption_y = run_arrow_y + 2.25

    # --- SM mass text (left of run arrow) ---
    if enrichment.sm_mass:
        sm_x = arrow_tail_x - 4.0  # right edge aligned near arrow tail
        page.append(_create_text_element(
            caption_id, caption_z, sm_x, caption_y,
            enrichment.sm_mass,
            justify="Right",
        ))
        log(f" SM mass: '{enrichment.sm_mass}' at x={sm_x:.1f}")
        caption_id += 1
        caption_z += 1

    # --- Product yield text (right of run arrow) ---
    yield_parts = [
        part
        for part in (enrichment.product_obtained, enrichment.product_yield)
        if part
    ]
    if yield_parts:
        yield_text = ", ".join(yield_parts)
        yield_x = arrow_head_x + 4.0  # left edge aligned near arrow head
        page.append(_create_text_element(
            caption_id, caption_z, yield_x, caption_y,
            yield_text,
            justify="Left",
        ))
        log(f" Product yield: '{yield_text}' at x={yield_x:.1f}")

    # --- Update document BoundingBox ---
    _update_document_bbox(root, page)
1245
+
1246
+
1247
+ # ---------------------------------------------------------------------------
1248
+ # Element creation helpers
1249
+ # ---------------------------------------------------------------------------
1250
+
1251
def _create_text_element(
    elem_id: int,
    z_order: int,
    x: float,
    y: float,
    text: str,
    justify: str = "Left",
) -> ET.Element:
    """Create a standalone <t> element with plain text content.

    The element carries one ``<s>`` run (font 3, size 10, color 0) holding
    *text*, and an estimated ``BoundingBox``.

    Args:
        elem_id: value for the ``id`` attribute.
        z_order: value for the ``Z`` attribute.
        x: anchor x written to ``p``; it is the left edge, the center or
            the right edge of the box depending on *justify*.
        y: anchor y written to ``p``; the box starts 9 units above it.
        text: label text; may contain ``"\\n"`` for several lines.
        justify: ``"Center"`` or ``"Right"`` add the matching
            ``CaptionJustification``/``Justification`` attributes; any
            other value is treated as left-justified.

    Returns:
        The new, unattached ``<t>`` element.
    """
    t = ET.Element("t")
    t.set("id", str(elem_id))
    t.set("p", f"{x:.2f} {y:.2f}")
    t.set("Z", str(z_order))
    t.set("Warning", "Chemical Interpretation is not possible for this label")
    t.set("LineHeight", "auto")

    if justify in ("Center", "Right"):
        t.set("CaptionJustification", justify)
        t.set("Justification", justify)

    s = ET.SubElement(t, "s")
    s.set("font", "3")
    s.set("size", "10")
    s.set("color", "0")
    s.text = text

    # Estimated extent: fixed advance per character and fixed line height.
    # Width follows the widest line and height grows by one line height per
    # extra line, so single-line text yields the same box as before.
    char_w = 5.8
    line_h = 12.0
    lines = text.split("\n")
    w = max(len(line) for line in lines) * char_w

    if justify == "Center":
        x1 = x - w / 2.0
        x2 = x + w / 2.0
    elif justify == "Right":
        x1 = x - w
        x2 = x
    else:
        x1 = x
        x2 = x + w

    y1 = y - line_h + 3.0
    y2 = y + 3.0 + (len(lines) - 1) * line_h

    t.set("BoundingBox", f"{x1:.2f} {y1:.2f} {x2:.2f} {y2:.2f}")
    return t
1300
+
1301
+
1302
def _shift_fragment(frag: ET.Element, dx: float, dy: float):
    """Translate a fragment and everything inside it by (dx, dy).

    Updated in place, all written with two decimals:
      * ``p`` of every ``<n>`` and ``<t>`` descendant,
      * ``BoundingBox`` of every ``<t>`` descendant,
      * ``BoundingBox`` of *frag* itself and of every nested ``<fragment>``
        (abbreviation groups).

    A ``p`` with fewer than two values or a ``BoundingBox`` with fewer
    than four values is left unchanged.
    """
    def move_point(el):
        raw = el.get("p")
        if not raw:
            return
        coords = raw.split()
        if len(coords) < 2:
            return
        new_x = float(coords[0]) + dx
        new_y = float(coords[1]) + dy
        el.set("p", f"{new_x:.2f} {new_y:.2f}")

    def move_box(el):
        raw = el.get("BoundingBox")
        if not raw:
            return
        box = [float(v) for v in raw.split()]
        if len(box) < 4:
            return
        # Only the first four values are corner coordinates; anything after
        # them is written back unshifted.
        box[:4] = [v + d for v, d in zip(box, (dx, dy, dx, dy))]
        el.set("BoundingBox", " ".join(f"{v:.2f}" for v in box))

    for node in frag.iter("n"):
        move_point(node)

    for label in frag.iter("t"):
        move_point(label)
        move_box(label)

    move_box(frag)

    # Inner fragments (abbreviation groups); iter() also yields frag itself,
    # which has already been handled above.
    for nested in frag.iter("fragment"):
        if nested is not frag:
            move_box(nested)
1356
+
1357
+
1358
def _get_element_bbox(el: ET.Element) -> Optional[Tuple[float, float, float, float]]:
    """Get bounding box for any element.

    * ``<fragment>``: delegated to ``fragment_bbox_with_label_extension``.
    * ``<t>``: the first four values of its ``BoundingBox`` attribute when
      present; otherwise an estimate derived from ``p`` and the text
      length (5.8 units per character, centered on x, 12 units tall,
      ending at y).
    * anything else: ``None``.

    Returns:
        ``(x1, y1, x2, y2)``, or ``None`` when no box can be determined --
        including a ``<t>`` whose ``p`` holds fewer than two values.
    """
    if el.tag == "fragment":
        return fragment_bbox_with_label_extension(el)

    if el.tag == "t":
        bb = el.get("BoundingBox", "")
        if bb:
            vals = [float(v) for v in bb.split()]
            if len(vals) >= 4:
                return (vals[0], vals[1], vals[2], vals[3])

        # Fallback from p attribute
        p = el.get("p", "")
        if p:
            parts = [float(v) for v in p.split()]
            if len(parts) < 2:
                # Need both x and y to estimate a box.
                return None
            text = _get_text_content(el)
            half_w = len(text) * 5.8 / 2
            return (parts[0] - half_w, parts[1] - 12.0,
                    parts[0] + half_w, parts[1])

    return None
1376
+
1377
+
1378
def _update_document_bbox(root: ET.Element, page: ET.Element):
    """Set the root ``BoundingBox`` to the union of the page's element boxes.

    Only direct children of *page* for which ``_get_element_bbox`` returns
    a box contribute.  If none does, the root attribute is left as it is.
    """
    unbounded = float('inf')
    left = top = unbounded
    right = bottom = -unbounded

    for child in page:
        box = _get_element_bbox(child)
        if box is None:
            continue
        x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
        left = min(left, x1)
        top = min(top, y1)
        right = max(right, x2)
        bottom = max(bottom, y2)

    # left is still infinite when no element contributed a box.
    if not left < unbounded:
        return

    root.set("BoundingBox",
             f"{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}")