cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1043 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ scheme_maker.py -- Build CDXML reaction scheme from reaction JSON (experimental).
4
+
5
+ Takes a reaction JSON file (v1.2 from reaction_parser.py) and produces a
6
+ publication-ready CDXML reaction scheme. The output is equivalent to what
7
+ the current polishing pipeline produces (scheme_polisher_v2 + eln_enrichment
8
+ + reaction_cleanup), but built from semantic data rather than CDXML surgery.
9
+
10
+ When species have ``original_geometry`` data (stored by reaction_parser v1.2),
11
+ the original CDXML coordinates and abbreviation groups are used by default.
12
+ This preserves the input orientation and re-abbreviates groups like OTs, Boc,
13
+ etc. instead of expanding them to full structures.
14
+
15
+ This tool is EXPERIMENTAL. It coexists with the existing pipeline and does
16
+ not replace it.
17
+
18
+ CLI:
19
+ python scheme_maker.py reaction.json -o scheme.cdxml
20
+ python scheme_maker.py reaction.json --approach chemdraw_mimic --align-mode rdkit
21
+ python scheme_maker.py reaction.json --no-run-arrow --verbose
22
+
23
+ Python API:
24
+ from cdxml_toolkit.render.scheme_maker import build_scheme
25
+ cdxml_path = build_scheme("reaction.json", output="scheme.cdxml")
26
+ """
27
+
28
+ import argparse
29
+ import json
30
+ import math
31
+ import os
32
+ import re
33
+ import sys
34
+ import tempfile
35
+ from typing import Any, Dict, List, Optional, Tuple
36
+ from xml.etree import ElementTree as ET
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Lazy imports — defer heavy dependencies to call time
40
+ # ---------------------------------------------------------------------------
41
+
42
+ _HAS_RDKIT = None
43
+
44
+
45
def _check_rdkit() -> bool:
    """Report whether RDKit can be imported, caching the answer.

    The import is attempted only while the module-level ``_HAS_RDKIT`` flag
    is still ``None``; the outcome is stored in that flag and returned
    directly on every later call.

    Returns:
        True if ``from rdkit import Chem`` succeeded, False otherwise.
    """
    global _HAS_RDKIT
    if _HAS_RDKIT is not None:
        # Already probed once -- reuse the cached answer.
        return _HAS_RDKIT

    try:
        from rdkit import Chem  # noqa: F401
    except ImportError:
        _HAS_RDKIT = False
    else:
        _HAS_RDKIT = True
    return _HAS_RDKIT
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Logging
58
+ # ---------------------------------------------------------------------------
59
+
60
+ _verbose = False
61
+
62
+
63
def _log(msg: str) -> None:
    """Print *msg* to stderr, but only while the module is in verbose mode.

    Verbose mode is the module-level ``_verbose`` flag; when it is falsy the
    call is a no-op.
    """
    if not _verbose:
        return
    print(msg, file=sys.stderr)
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Core: SMILES → atom/bond dicts (via structure_from_image)
70
+ # ---------------------------------------------------------------------------
71
+
72
def _smiles_to_mol_data(smiles: str, offset: int = 0) -> Optional[Dict]:
    """Convert a SMILES string to atom/bond dicts with 2D coordinates.

    Thin wrapper around ``structure_from_image.smiles_to_coords``, which is
    imported lazily so the heavy dependency is only loaded when needed.
    According to the original author's note, that helper handles
    Kekulization, explicit H removal, and bond direction annotation.

    Args:
        smiles: SMILES string of the molecule.
        offset: Forwarded to ``smiles_to_coords`` as ``offset_index``.

    Returns:
        Whatever ``smiles_to_coords`` returns: a dict with ``'atoms'`` and
        ``'bonds'`` lists, or None on failure.

    Raises:
        RuntimeError: If ``structure_from_image`` cannot be imported. The
            underlying ``ImportError`` is attached as ``__cause__`` so the
            real reason (e.g. a missing transitive dependency) is not lost.
    """
    try:
        from ..image.structure_from_image import smiles_to_coords
    except ImportError as exc:
        # Chain explicitly: the ImportError usually names the module that
        # is actually missing, which the generic message below does not.
        raise RuntimeError(
            "structure_from_image.py is required (for smiles_to_coords). "
            "Ensure it is in the same directory."
        ) from exc

    return smiles_to_coords(smiles, offset_index=offset)
88
+
89
+
90
def _normalize_mol(mol_data: Dict, center_x: float = 0.0,
                   center_y: float = 0.0) -> Tuple[List, List]:
    """Run ``normalize_coords`` on one molecule with the y-axis flipped.

    Per the original author's note this rescales to the ACS bond length
    (14.40 pt), flips y, and centers the molecule; the actual work happens
    in ``coord_normalizer.normalize_coords``.

    Args:
        mol_data: Dict holding ``"atoms"`` and ``"bonds"`` lists.
        center_x: Target x center passed through to ``normalize_coords``.
        center_y: Target y center passed through to ``normalize_coords``.

    Returns:
        The value returned by ``normalize_coords`` (annotated as an
        ``(atoms, bonds)`` pair).
    """
    from ..coord_normalizer import normalize_coords

    atom_list = mol_data["atoms"]
    bond_list = mol_data["bonds"]
    return normalize_coords(
        atom_list,
        bond_list,
        center_x=center_x,
        center_y=center_y,
        flip_y=True,
    )
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Original geometry → mol_data conversion
103
+ # ---------------------------------------------------------------------------
104
+
105
def _geometry_to_mol_data(geom: Dict[str, Any],
                          offset: int = 0) -> Optional[Dict]:
    """Convert ``original_geometry`` from a SpeciesDescriptor to mol_data.

    The returned dict has ``"atoms"`` and ``"bonds"`` lists in the same
    format that ``smiles_to_coords`` / ``_smiles_to_mol_data`` returns,
    including extra keys for abbreviation and generic groups.

    Coordinates are negated on the y-axis so that a subsequent
    ``normalize_coords(flip_y=True)`` produces correct CDXML-space output
    (the double negation cancels out).

    Args:
        geom: Geometry dict with an ``"atoms"`` list and, optionally, a
            ``"bonds"`` list. Each atom needs ``"x"`` and ``"y"``.
        offset: Added to every generated atom and bond index.

    Returns:
        ``{"atoms": [...], "bonds": [...]}``, or None when *geom* is empty
        or has no atoms. A geometry without a ``"bonds"`` entry (for
        example a single-atom species) yields an empty bond list.
    """
    if not geom or not geom.get("atoms"):
        return None

    source_atoms = geom["atoms"]
    atoms: List[Dict[str, Any]] = []
    id_remap: Dict[Any, int] = {}  # original id -> new 1-based index

    for i, a in enumerate(source_atoms):
        idx = offset + i + 1
        id_remap[a.get("id", i)] = idx

        atom_d: Dict[str, Any] = {
            "index": idx,
            "symbol": a.get("symbol", "C"),
            "x": a["x"],
            "y": -a["y"],  # negate so flip_y=True restores original
        }

        # Optional per-atom properties are copied only when present.
        for key in ("num_hydrogens", "charge"):
            if key in a:
                atom_d[key] = a[key]

        if a.get("is_abbreviation"):
            # Abbreviation groups (OTs, Boc, Me, ...)
            atom_d["is_abbreviation"] = True
            atom_d["abbrev_label"] = a.get("label", "?")
            atom_d["abbrev_smiles"] = a.get("label_smiles")
            # Use a placeholder symbol that won't be stripped as explicit H
            atom_d["symbol"] = "X"
        elif a.get("is_generic"):
            # Generic variable groups (R, X, Ar, R1, ...)
            atom_d["is_generic"] = True
            atom_d["generic_label"] = a.get("label", "R")
            atom_d["node_type"] = a.get("node_type", "GenericNickname")
            atom_d["symbol"] = "X"

        atoms.append(atom_d)

    bonds: List[Dict[str, Any]] = []
    # Bond indices continue after the atom indices; the position in the
    # source list is used, so skipped bonds leave gaps in the numbering.
    bond_index_base = offset + len(source_atoms)
    for j, b in enumerate(geom.get("bonds") or []):
        bi = id_remap.get(b.get("begin"))
        ei = id_remap.get(b.get("end"))
        if bi is None or ei is None:
            # Endpoint missing or not among the atoms -- drop the bond.
            continue
        bond_d: Dict[str, Any] = {
            "index": bond_index_base + j + 1,
            "order": b.get("order", 1),
            "atom1": bi,
            "atom2": ei,
        }
        if "double_position" in b:
            bond_d["double_pos"] = b["double_position"]
        # Preserve stereo config
        if "cfg" in b:
            bond_d["cfg"] = b["cfg"]
        bonds.append(bond_d)

    return {"atoms": atoms, "bonds": bonds}
178
+
179
+
180
def _species_mol_data(sp, offset: int = 0) -> Optional[Dict]:
    """Produce mol_data for one species, favouring its stored geometry.

    A species carrying ``original_geometry`` is converted with
    ``_geometry_to_mol_data`` so its original coordinates and abbreviation
    data are kept. If there is no geometry, or the conversion yields None,
    the species' SMILES is passed to ``_smiles_to_mol_data`` instead.

    Args:
        sp: Species object exposing ``original_geometry``, ``smiles`` and
            ``name`` attributes.
        offset: Index offset handed to whichever converter is used.

    Returns:
        A mol_data dict, or None when the species has neither usable
        geometry nor a SMILES string.
    """
    geometry = sp.original_geometry
    if geometry:
        from_geometry = _geometry_to_mol_data(geometry, offset=offset)
        if from_geometry is not None:
            _log(f" Using original geometry for '{sp.name or sp.smiles}'")
            return from_geometry

    # No usable geometry: fall back to coordinate generation from SMILES.
    smiles = sp.smiles
    if not smiles:
        return None
    return _smiles_to_mol_data(smiles, offset=offset)
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Role priority ordering for above-arrow text
203
+ # ---------------------------------------------------------------------------
204
+
205
+ # Priority: lower number = higher priority = closer to top.
206
+ # Catalyst and ligand are always first (defining the reaction).
207
+ # Remaining reagents cluster around 50. Solvent is last.
208
_ROLE_PRIORITY = dict(
    # Reaction-defining species come first.
    catalyst=10,
    ligand=20,
    # Stoichiometric reagents, clustered in the 40s and 50s.
    coupling_reagent=40,
    activating_agent=41,
    reducing_agent=42,
    oxidant=43,
    halogenating_agent=44,
    fluorinating_agent=45,
    borylating_agent=46,
    lewis_acid=47,
    protecting_group=48,
    deprotecting_agent=49,
    acid=50,
    base=51,
    additive=55,
    # Generic and auxiliary roles.
    reagent=60,
    reductant=65,
    drying_agent=70,
    # Solvent always sorts last.
    solvent=80,
)
# Unknown roles sort just before "reagent".
_DEFAULT_ROLE_PRIORITY = 59
230
+
231
+
232
def _sort_by_role_priority(
    entries: List[Tuple[str, str, float]],
) -> List[Tuple[str, str, float]]:
    """Order ``(text, role_detail, equiv)`` entries for display.

    The primary key is the role's number in ``_ROLE_PRIORITY`` (lower
    sorts first); roles missing from that table use
    ``_DEFAULT_ROLE_PRIORITY``. Entries with the same priority are ordered
    by ascending equivalents, reflecting that catalysts and ligands are
    typically used in smaller amounts than stoichiometric reagents. The
    sort is stable, so full ties keep their input order.

    Args:
        entries: Tuples of display text, role detail, and equivalents.

    Returns:
        A new sorted list; the input is not modified.
    """
    def _rank(entry: Tuple[str, str, float]) -> Tuple[int, float]:
        role_detail, equiv = entry[1], entry[2]
        priority = _ROLE_PRIORITY.get(role_detail, _DEFAULT_ROLE_PRIORITY)
        return (priority, equiv)

    ordered = list(entries)
    ordered.sort(key=_rank)
    return ordered
247
+
248
+
249
# Temperature tokens: "105 °C", "80°C", "-78 °C", "rt", "room temp...",
# "reflux", "0 to 25 °C". Compiled once at import time.
_TEMP_TOKEN_RE = re.compile(
    r"^-?\d+\.?\d*\s*°?\s*[cf]$"
    r"|^rt$|^room\s+temp"
    r"|^reflux$"
    r"|^-?\d+\s*to\s*-?\d+\s*°?\s*[cf]$",
    re.IGNORECASE,
)

# Time tokens: a number followed by a unit ("24 h", "30 min", "2 days"),
# or "overnight" / "o/n" / "on", for which the number is optional.
_TIME_TOKEN_RE = re.compile(
    r"^(?:"
    r"\d+\.?\d*\s*(?:h|hr|hrs|hours?|min|minutes?|d|days?|s|sec|seconds?)"
    r"|(?:\d+\.?\d*\s*)?(?:overnight|o/?n)"
    r")$",
    re.IGNORECASE,
)


def _merge_condition_tokens(condition_lines: List[str]) -> List[str]:
    """Merge temperature and time tokens into a single comma-separated line.

    Input:  ["105 °C", "24 h"]
    Output: ["105 °C, 24 h"]

    Tokens are stripped and empty ones dropped. Every token recognised as
    a temperature or a time is joined, in input order, into one line that
    is placed first. All other condition tokens (atmosphere, etc.) follow
    on separate lines, also in input order.

    A bare "overnight" or "o/n" counts as a time token; previously these
    only matched when preceded by a number, so "rt" and "overnight" were
    never merged.

    Args:
        condition_lines: Raw condition strings.

    Returns:
        A new list: the merged temperature/time line (if any) followed by
        the remaining tokens.
    """
    temp_time_tokens: List[str] = []
    other_tokens: List[str] = []

    for raw in condition_lines:
        tok = raw.strip()
        if not tok:
            continue
        if _TEMP_TOKEN_RE.match(tok) or _TIME_TOKEN_RE.match(tok):
            temp_time_tokens.append(tok)
        else:
            other_tokens.append(tok)

    result: List[str] = []
    if temp_time_tokens:
        result.append(", ".join(temp_time_tokens))
    result.extend(other_tokens)
    return result
287
+
288
+
289
+ # ---------------------------------------------------------------------------
290
+ # Core: Build CDXML from reaction JSON
291
+ # ---------------------------------------------------------------------------
292
+
293
def build_scheme(
    input_path: str,
    output: Optional[str] = None,
    approach: str = "chemdraw_mimic",
    align_mode: str = "rdkit",
    run_arrow: bool = True,
    verbose: bool = False,
) -> str:
    """Build a CDXML reaction scheme from a reaction JSON file.

    Pipeline: load the JSON, partition species into reactants / products /
    above-arrow items, generate coordinates, normalize them, assemble an
    initial CDXML in a temporary directory, optionally insert above-arrow
    structures, format caption text, align, run the final layout into
    *output*, and optionally add a run arrow.

    Args:
        input_path: Path to reaction JSON (v1.2 from reaction_parser).
        output: Path for output CDXML (default: {stem}-scheme.cdxml next
            to the input file).
        approach: Layout approach for reaction_cleanup.
        align_mode: Alignment strategy (rdkit/rxnmapper/kabsch/none).
        run_arrow: Add run arrow with mass/yield if ELN data available.
        verbose: Print diagnostic messages. Sets the module-level
            ``_verbose`` flag, so it also affects later ``_log`` calls.

    Returns:
        Path to the output CDXML file.

    Raises:
        SystemExit: With status 1 (after printing an error to stderr) if
            RDKit is unavailable, no product species has SMILES, or no
            product structure could be generated.
    """
    global _verbose
    _verbose = verbose

    # Function-scope import, in keeping with the module's lazy-import style.
    import shutil

    if not _check_rdkit():
        print("ERROR: RDKit is required for scheme_maker.", file=sys.stderr)
        sys.exit(1)

    # --- Step 1: Load and validate JSON ---
    from ..perception.reaction_parser import ReactionDescriptor

    desc = ReactionDescriptor.from_json(input_path)
    _log(f"Loaded JSON: {desc.experiment}, {len(desc.species)} species, "
         f"version={desc.version}")

    # Validate: need at least one product with SMILES
    if not any(sp.role == "product" and sp.smiles for sp in desc.species):
        print("ERROR: No product species with SMILES found in JSON.",
              file=sys.stderr)
        sys.exit(1)

    # --- Step 2: Partition species into layout groups ---
    reactant_species = []
    product_species = []
    # Each entry is (text, role_detail, equiv) for priority sorting later.
    # equiv is used as a tiebreaker: lower equiv = higher priority (catalysts
    # are typically used in small amounts like 0.05 eq.).
    above_arrow_entries = []       # (text, role_detail, equiv) tuples
    above_arrow_mol_species = []   # structural species above arrow

    def _parse_equiv(sp) -> float:
        """Parse csv_equiv to a float for sorting. Missing = 999."""
        if sp.csv_equiv:
            try:
                return float(sp.csv_equiv)
            except (ValueError, TypeError):
                pass
        return 999.0

    def _text_entry(sp, solvent_fallback: bool) -> Tuple[str, str, float]:
        """Build the (text, role_detail, equiv) tuple for a text label."""
        text = sp.display_text or sp.name or sp.csv_name or "?"
        role_d = sp.role_detail or sp.rxn_insight_role or ""
        if solvent_fallback and sp.is_solvent and not role_d:
            role_d = "solvent"
        return (text, role_d, _parse_equiv(sp))

    # Position is derived from chemical role (no dependency on
    # scheme_position): products on the right, substrate / starting
    # material on the left, everything else above the arrow.
    for sp in desc.species:
        if sp.role == "product":
            if sp.smiles:
                product_species.append(sp)
            else:
                _log(f" WARNING: Product '{sp.name}' has no SMILES, skipping")
            continue

        if sp.role == "atom_contributing" and (sp.is_substrate or sp.is_sm):
            if sp.smiles:
                reactant_species.append(sp)
            else:
                # No SMILES — convert to text label above arrow
                _log(f" WARNING: Reactant '{sp.name}' has no SMILES, "
                     "converting to text")
                above_arrow_entries.append(
                    _text_entry(sp, solvent_fallback=False))
            continue

        if (sp.smiles and sp.source in ("fragment", "rxn")
                and sp.role == "atom_contributing"):
            # Structural species above arrow (non-substrate atom-contributing
            # reactant, e.g. coupling partner)
            above_arrow_mol_species.append(sp)
        else:
            # Text species above arrow (reagents, catalysts, solvents)
            above_arrow_entries.append(_text_entry(sp, solvent_fallback=True))

    # Condition tokens (temp, time, atmosphere) go below the arrow.
    condition_lines = list(desc.conditions)

    # Deduplicate above-arrow entries (case-insensitive), keeping the
    # first occurrence of each text.
    seen_above = set()
    deduped_entries = []
    for txt, role_d, eq in above_arrow_entries:
        key = txt.strip().lower()
        if key not in seen_above:
            seen_above.add(key)
            deduped_entries.append((txt, role_d, eq))

    # Sort by role priority; within the same priority, lower equiv first.
    above_arrow_entries = _sort_by_role_priority(deduped_entries)
    above_arrow_texts = [txt for txt, _, _ in above_arrow_entries]

    _log(f" Reactants: {len(reactant_species)}, "
         f"Products: {len(product_species)}, "
         f"Above-arrow text: {len(above_arrow_texts)}, "
         f"Above-arrow structures: {len(above_arrow_mol_species)}, "
         f"Conditions: {len(condition_lines)}")

    # --- Step 3: Generate 2D coords for each structural species ---
    # Prefer original geometry when available (preserves orientation and
    # abbreviation groups like OTs, Boc). Fall back to SMILES→RDKit→coords.
    _log("Generating 2D coordinates...")

    # Running index offset so atom/bond indices never collide between
    # molecules.
    atom_offset = 0
    reactant_mols = []
    for sp in reactant_species:
        mol_data = _species_mol_data(sp, offset=atom_offset)
        if mol_data is None:
            _log(f" WARNING: Could not generate coords for '{sp.name}' "
                 f"(SMILES: {sp.smiles})")
            above_arrow_texts.append(sp.display_text or sp.name or "?")
            continue
        atom_offset += len(mol_data["atoms"]) + len(mol_data["bonds"])
        reactant_mols.append(mol_data)

    product_mols = []
    for sp in product_species:
        mol_data = _species_mol_data(sp, offset=atom_offset)
        if mol_data is None:
            _log(f" WARNING: Could not generate coords for product "
                 f"'{sp.name}' (SMILES: {sp.smiles})")
            continue
        atom_offset += len(mol_data["atoms"]) + len(mol_data["bonds"])
        product_mols.append(mol_data)

    above_arrow_mols = []
    for sp in above_arrow_mol_species:
        mol_data = _species_mol_data(sp, offset=atom_offset)
        if mol_data is None:
            _log(f" WARNING: Could not generate coords for '{sp.name}', "
                 "converting to text")
            above_arrow_texts.append(sp.display_text or sp.name or "?")
            continue
        atom_offset += len(mol_data["atoms"]) + len(mol_data["bonds"])
        above_arrow_mols.append(mol_data)

    if not product_mols:
        print("ERROR: No product structures could be generated.",
              file=sys.stderr)
        sys.exit(1)

    # --- Step 4: Normalize coordinates ---
    _log("Normalizing coordinates...")
    from ..coord_normalizer import normalize_reaction

    norm_reactants, norm_products = normalize_reaction(
        reactant_mols, product_mols,
        reactant_start_x=50.0,
        product_start_x=350.0,
        molecule_gap=80.0,
    )

    # --- Step 5: Build conditions dict ---
    # Merge all text into a single below-arrow block. reaction_cleanup
    # puts all <t> elements below the arrow anyway, and multiple <t>
    # elements can overlap. A single merged block with \n-separated
    # lines avoids this. This matches the --merge-conditions behavior
    # of scheme_polisher.
    #
    # Condition tokens (temp + time) are merged onto a single
    # comma-separated line: "105 °C, 24 h".
    merged_conditions = _merge_condition_tokens(condition_lines)
    merged_text = above_arrow_texts + merged_conditions
    conditions = {}
    if merged_text:
        conditions["below"] = merged_text

    _log(f" Conditions: {conditions}")

    # --- Step 6: Assemble initial CDXML ---
    _log("Assembling CDXML...")
    from ..cdxml_builder import build_reaction_cdxml

    cdxml_str = build_reaction_cdxml(
        norm_reactants, norm_products,
        conditions=conditions if conditions else None,
    )

    # Intermediate results live in a private temp directory. The
    # try/finally guarantees it is removed even when a later step raises.
    tmp_dir = tempfile.mkdtemp(prefix="scheme_maker_")
    try:
        tmp_assembled = os.path.join(tmp_dir, "assembled.cdxml")
        with open(tmp_assembled, "w", encoding="utf-8") as f:
            f.write(cdxml_str)

        _log(f" Assembled CDXML: {tmp_assembled}")

        # --- Step 7: Insert above-arrow structures (if any) ---
        if above_arrow_mols:
            _log("Inserting above-arrow structures...")
            _insert_above_arrow_structures(tmp_assembled, above_arrow_mols)

        # --- Step 8: Apply text formatting (subscripts/italics) ---
        _log("Applying text formatting...")
        _apply_text_formatting(tmp_assembled)

        # --- Step 9: Run alignment ---
        if align_mode != "none" and len(reactant_mols) > 0:
            _log(f"Running alignment ({align_mode})...")
            _run_alignment(tmp_assembled, align_mode)

        # --- Step 10: Run reaction_cleanup (final layout) ---
        _log(f"Running layout ({approach})...")
        from ..layout.reaction_cleanup import run_cleanup

        # Determine output path
        if output is None:
            stem = os.path.splitext(os.path.basename(input_path))[0]
            output = os.path.join(os.path.dirname(input_path) or ".",
                                  f"{stem}-scheme.cdxml")

        result = run_cleanup(tmp_assembled, output, approach=approach,
                             verbose=verbose)
        _log(f" Layout complete: {result.get('num_reactants', '?')} reactants, "
             f"{result.get('num_products', '?')} products")
    finally:
        # Best-effort cleanup: a leftover temp dir must not fail the build.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # --- Step 11: Add run arrow (optional) ---
    # Operates on the final output file, so the temp dir is not needed.
    if run_arrow and desc.eln_data:
        _log("Adding run arrow...")
        _add_run_arrow(output, desc.eln_data)

    _log(f"Output: {output}")
    return output
565
+
566
+
567
+ # ---------------------------------------------------------------------------
568
+ # Step 7: Insert above-arrow structures
569
+ # ---------------------------------------------------------------------------
570
+
571
def _insert_above_arrow_structures(cdxml_path: str,
                                   above_mols: List[Dict]) -> None:
    """Add structural fragments above the reaction arrow, in place.

    Each molecule in *above_mols* is normalized around a point 30 units
    above the arrow's bounding box (successive molecules are shifted a
    further 60 units up), turned into fragment XML, and inserted into the
    page just before the ``<scheme>`` element (or appended when the page
    has no direct ``<scheme>`` child). The new fragment ids are appended to
    the step's ``ReactionStepObjectsAboveArrow`` attribute and the file at
    *cdxml_path* is rewritten.

    Returns without writing anything if the document has no page, no
    ``scheme/step``, or no arrow.

    Args:
        cdxml_path: CDXML file to modify.
        above_mols: mol_data dicts for the structures to insert.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml
    from ..cdxml_builder import _build_fragment, _IDGen  # noqa: private API

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()

    page = root.find(".//page")
    if page is None:
        return
    step = page.find(".//scheme/step")
    if step is None:
        return
    arrow = page.find(".//arrow")
    if arrow is None:
        return

    # Anchor point: horizontally centered on the arrow, 30 units above it.
    bbox_fields = arrow.get("BoundingBox", "0 0 100 300").split()
    if len(bbox_fields) < 4:
        arrow_cx, arrow_cy = 200.0, 270.0
    else:
        arrow_cx = (float(bbox_fields[0]) + float(bbox_fields[2])) / 2.0
        arrow_cy = float(bbox_fields[1]) - 30.0  # above the arrow

    # New ids start after the largest numeric id already in the document.
    highest_id = 0
    for element in root.iter():
        raw_id = element.get("id")
        if not raw_id:
            continue
        try:
            numeric_id = int(raw_id)
        except ValueError:
            continue
        highest_id = max(highest_id, numeric_id)
    id_gen = _IDGen(start=highest_id + 1)

    above_ids_list = step.get("ReactionStepObjectsAboveArrow", "").split()

    for slot, mol_data in enumerate(above_mols):
        # Stack vertically: each further molecule sits 60 units higher.
        atoms, bonds = _normalize_mol(
            mol_data,
            center_x=arrow_cx,
            center_y=arrow_cy - 60.0 * slot,
        )
        frag_xml, _, _ = _build_fragment(atoms, bonds, id_gen)
        frag_elem = ET.fromstring(frag_xml)

        # Insert before the scheme element when the page has one.
        scheme = page.find("scheme")
        if scheme is None:
            page.append(frag_elem)
        else:
            page.insert(list(page).index(scheme), frag_elem)

        new_id = frag_elem.get("id")
        if new_id:
            above_ids_list.append(new_id)

    # Update step metadata
    if above_ids_list:
        step.set("ReactionStepObjectsAboveArrow", " ".join(above_ids_list))

    write_cdxml(tree, cdxml_path)
649
+
650
+
651
+ # ---------------------------------------------------------------------------
652
+ # Step 8: Apply text formatting
653
+ # ---------------------------------------------------------------------------
654
+
655
def _apply_text_formatting(cdxml_path: str) -> None:
    """Re-style standalone caption text with subscript/italic formatting.

    Only ``<t>`` elements that are direct children of ``<page>`` are
    touched; ``<t>`` inside ``<fragment><n>`` (atom labels) are skipped.
    The caption text is split on newlines and each non-empty line is
    handled on its own: lines that ``_is_condition_token`` flags
    (temperatures, times, atmospheres) stay plain to avoid spurious
    subscripts, all others go through ``build_formatted_s_xml``. Line
    breaks are preserved by appending a newline to the last ``<s>`` of
    each non-final line. The file is rewritten only if a caption changed.

    Does nothing when ``text_formatting`` cannot be imported or the
    document has no page.

    Args:
        cdxml_path: CDXML file to modify in place.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml

    try:
        from ..text_formatting import build_formatted_s_xml
    except ImportError:
        _log(" text_formatting not available, skipping")
        return

    tree = parse_cdxml(cdxml_path)
    page = tree.getroot().find(".//page")
    if page is None:
        return

    # Condition tokens should not be formatted (would get spurious subscripts)
    try:
        from ..perception.reaction_parser import _is_condition_token
    except ImportError:
        _is_condition_token = None

    def _plain_run(content: str, font: str, size: str, face: str):
        """Create an unformatted <s> element carrying *content*."""
        run = ET.Element("s")
        run.set("font", font)
        run.set("size", size)
        run.set("face", face)
        run.text = content
        return run

    modified = False
    for t_elem in list(page):
        if t_elem.tag != "t":
            continue

        s_elems = t_elem.findall("s")
        if not s_elems:
            continue

        text = "".join(s.text or "" for s in s_elems)
        if not text.strip():
            continue

        # Style attributes are taken from the first <s> element.
        template = s_elems[0]
        font = template.get("font", "3")
        size = template.get("size", "10")
        face = template.get("face", "1")

        lines = text.split("\n")
        final_index = len(lines) - 1
        new_runs = []

        for i, raw_line in enumerate(lines):
            line = raw_line.strip()
            if not line:
                continue

            keep_plain = (_is_condition_token is not None
                          and _is_condition_token(line))

            # Try chemical formatting unless the line is a condition
            # token. ``parsed`` stays None whenever the plain fallback
            # is needed (condition token, no markup, or unparsable).
            parsed = None
            if not keep_plain:
                formatted_xml = build_formatted_s_xml(line)
                if formatted_xml:
                    try:
                        parsed = ET.fromstring(f"<t>{formatted_xml}</t>")
                    except ET.ParseError:
                        parsed = None

            if parsed is None:
                new_runs.append(_plain_run(line, font, size, face))
            else:
                for s in parsed:
                    # Fill in font/size only where the formatter left
                    # them unset.
                    if not s.get("font"):
                        s.set("font", font)
                    if not s.get("size"):
                        s.set("size", size)
                    new_runs.append(s)

            # Insert newline between lines (append to last <s> text)
            if i < final_index and new_runs:
                tail = new_runs[-1]
                tail.text = (tail.text or "") + "\n"

        if not new_runs:
            continue

        # Swap the old <s> children for the rebuilt ones.
        for old in list(s_elems):
            t_elem.remove(old)
        for run in new_runs:
            t_elem.append(run)
        modified = True

    if modified:
        write_cdxml(tree, cdxml_path)
771
+
772
+
773
+ # ---------------------------------------------------------------------------
774
+ # Step 9: Run alignment
775
+ # ---------------------------------------------------------------------------
776
+
777
def _run_alignment(cdxml_path: str, align_mode: str) -> None:
    """Align reactant structures to match product orientation.

    Strategies form a fallback chain ``rxnmapper`` -> ``rdkit`` -> ``kabsch``.
    Execution starts at the entry named by *align_mode*; whenever a strategy
    raises (including a failure to import it) the failure is logged and the
    next entry in the chain is tried.  Any other *align_mode* value (for
    example ``"none"``) runs no strategy at all.

    The file is rewritten only when the value reported by the strategy is
    greater than zero.

    Args:
        cdxml_path: Path of the CDXML file; parsed, and overwritten in place
            when an alignment was applied.
        align_mode: Name of the first strategy to attempt.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml

    # Each runner imports its strategy lazily so that a missing optional
    # dependency surfaces as an exception inside the chain, not at call time.
    def _via_rxnmapper(doc):
        from ..layout.alignment import rxnmapper_align_to_product
        return rxnmapper_align_to_product(doc, verbose=_verbose)

    def _via_rdkit(doc):
        from ..layout.alignment import rdkit_align_to_product
        return rdkit_align_to_product(doc, verbose=_verbose)

    def _via_kabsch(doc):
        from ..layout.alignment import kabsch_align_to_product
        return kabsch_align_to_product(doc, verbose=_verbose)

    # (mode name, label used in the success message, runner, failure template)
    chain = (
        ("rxnmapper", "RXNMapper", _via_rxnmapper,
         " RXNMapper alignment failed ({}), falling back to RDKit MCS"),
        ("rdkit", "RDKit MCS", _via_rdkit,
         " RDKit alignment failed ({}), falling back to Kabsch"),
        ("kabsch", "Kabsch", _via_kabsch,
         " Kabsch alignment failed: {}"),
    )

    tree = parse_cdxml(cdxml_path)

    aligned = 0
    mode_names = [entry[0] for entry in chain]
    # An unknown mode starts past the end of the chain, i.e. runs nothing.
    first = mode_names.index(align_mode) if align_mode in mode_names else len(chain)

    for _mode, label, runner, failure_template in chain[first:]:
        try:
            aligned = runner(tree)
            _log(f" {label} aligned {aligned} fragments")
        except Exception as e:
            # Best-effort by design: any failure just moves on to the next
            # strategy.  (ImportError is already covered by Exception.)
            _log(failure_template.format(e))
        else:
            break

    if aligned > 0:
        # Alignment rotates fragments, which invalidates pre-computed
        # DoublePosition values (they are relative to the B→E bond vector,
        # which has rotated). Strip them — ChemDraw recomputes correct
        # values automatically via NeedsClean.
        for bond_el in tree.iter("b"):
            if bond_el.get("DoublePosition"):
                del bond_el.attrib["DoublePosition"]
        write_cdxml(tree, cdxml_path)
819
+
820
+
821
+ # ---------------------------------------------------------------------------
822
+ # Step 11: Add run arrow with mass/yield
823
+ # ---------------------------------------------------------------------------
824
+
825
def _run_arrow_x_extent(rxn_arrow):
    """Return ``(tail_x, head_x)`` for the given ``<arrow>`` element.

    Uses the first component of ``Tail3D`` / ``Head3D`` when both attributes
    are present; otherwise falls back to the first and third values of
    ``BoundingBox`` (defaulting to ``"0 0 100 300"`` when that is absent too).
    """
    tail_3d = rxn_arrow.get("Tail3D", "")
    head_3d = rxn_arrow.get("Head3D", "")
    if tail_3d and head_3d:
        return float(tail_3d.split()[0]), float(head_3d.split()[0])

    bbox = rxn_arrow.get("BoundingBox", "0 0 100 300").split()
    return float(bbox[0]), float(bbox[2])


def _run_arrow_content_bottom(page):
    """Return the largest Y of the page's direct content, or ``None``.

    Looks at the ``p`` attribute of ``<n>`` children of top-level
    ``<fragment>`` elements and, for top-level ``<t>`` elements, at the
    fourth ``BoundingBox`` value (or ``p`` Y + 5.0 when the element has no
    ``BoundingBox``).  ``None`` means no usable coordinate was found.
    """
    candidates = []
    for elem in page:
        if elem.tag == "fragment":
            for node in elem.findall("n"):
                coords = node.get("p", "").split()
                if len(coords) >= 2:
                    candidates.append(float(coords[1]))
        elif elem.tag == "t":
            bbox_attr = elem.get("BoundingBox", "")
            if bbox_attr:
                values = bbox_attr.split()
                if len(values) >= 4:
                    candidates.append(float(values[3]))
            else:
                coords = elem.get("p", "").split()
                if len(coords) >= 2:
                    # No bounding box: pad the anchor Y by a fixed 5.0.
                    candidates.append(float(coords[1]) + 5.0)
    return max(candidates) if candidates else None


def _run_arrow_max_id(root):
    """Return the largest integer ``id`` attribute under *root* (0 if none)."""
    highest = 0
    for el in root.iter():
        eid = el.get("id")
        if eid:
            try:
                highest = max(highest, int(eid))
            except ValueError:
                # Non-numeric ids cannot collide with the integers we allocate.
                pass
    return highest


def _run_arrow_add_label(page, elem_id, text, left_x, right_x, baseline_y,
                         right_justified):
    """Append a plain ``<t>`` caption holding *text* to *page*.

    The caption spans ``left_x``..``right_x`` horizontally.  It is anchored at
    ``right_x`` and marked right-justified when *right_justified* is true,
    otherwise anchored at ``left_x``.
    """
    label = ET.SubElement(page, "t")
    label.set("id", str(elem_id))
    anchor_x = right_x if right_justified else left_x
    label.set("p", f"{anchor_x:.2f} {baseline_y:.2f}")
    label.set(
        "BoundingBox",
        f"{left_x:.2f} {baseline_y - 8.0:.2f} "
        f"{right_x:.2f} {baseline_y + 2.0:.2f}",
    )
    if right_justified:
        label.set("Justification", "Right")
        label.set("CaptionJustification", "Right")
    label.set("InterpretChemically", "no")

    run = ET.SubElement(label, "s")
    run.set("font", "3")
    run.set("size", "10")
    run.set("face", "0")
    run.text = text


def _add_run_arrow(cdxml_path: str, eln_data: Dict[str, Any]) -> None:
    """Add a run arrow below the scheme with SM mass and product yield.

    The run arrow matches the reaction arrow's X-extent and is positioned
    below all existing content (text + structures).  The SM mass is written
    to the left of the arrow tail and the obtained amount / yield to the
    right of the arrow head.

    Nothing is written when there is neither ``sm_mass`` nor
    ``product_obtained``, when the document has no ``<page>`` or no
    ``<arrow>``, or when no positioned content is found on the page.

    Args:
        cdxml_path: Path of the CDXML file; parsed and overwritten in place.
        eln_data: Mapping read for the keys ``sm_mass``,
            ``product_obtained`` and ``product_yield``.
    """
    from ..cdxml_utils import parse_cdxml, write_cdxml

    sm_mass = eln_data.get("sm_mass", "")
    product_obtained = eln_data.get("product_obtained", "")
    product_yield = eln_data.get("product_yield", "")

    if not sm_mass and not product_obtained:
        _log(" No mass/yield data, skipping run arrow")
        return

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()
    page = root.find(".//page")
    if page is None:
        _log(" No page found, skipping run arrow")
        return

    # Find the existing reaction arrow to match its X-extent
    rxn_arrow = page.find(".//arrow")
    if rxn_arrow is None:
        _log(" No reaction arrow found, skipping run arrow")
        return

    arrow_x1, arrow_x2 = _run_arrow_x_extent(rxn_arrow)

    # Find bottom of all content (including text below arrow).  ``None`` is
    # the "nothing found" marker so that content at y <= 0 is still honoured.
    bottom_y = _run_arrow_content_bottom(page)
    if bottom_y is None:
        _log(" No positioned content found, skipping run arrow")
        return

    next_id = _run_arrow_max_id(root) + 1

    # Position run arrow below all content
    arrow_y = bottom_y + 18.0

    # Create arrow element (same X-extent as reaction arrow).  Two fresh
    # numbers are consumed: one for the id, the next for Z.
    arrow_elem = ET.SubElement(page, "arrow")
    arrow_attrs = (
        ("id", str(next_id)),
        ("Z", str(next_id + 1)),
        ("BoundingBox",
         f"{arrow_x1:.2f} {arrow_y - 2:.2f} {arrow_x2:.2f} {arrow_y + 2:.2f}"),
        ("FillType", "None"),
        ("ArrowheadHead", "Full"),
        ("ArrowheadType", "Solid"),
        ("Head3D", f"{arrow_x2:.2f} {arrow_y:.2f} 0"),
        ("Tail3D", f"{arrow_x1:.2f} {arrow_y:.2f} 0"),
    )
    for attr_name, attr_value in arrow_attrs:
        arrow_elem.set(attr_name, attr_value)
    next_id += 2

    # Text baseline should vertically centre on the arrow.
    # Arial 10pt has ~7pt cap height; p (anchor) is at the text baseline,
    # so baseline ≈ arrow_y + 3.5 centres the text on the arrow line.
    text_baseline_y = arrow_y + 3.5

    # Estimated advance per character, used only to size the bounding boxes.
    char_width = 5.8
    # Horizontal gap between an arrow end and its caption.
    gap = 5.0

    # Left label: SM mass (positioned left of arrow tail)
    if sm_mass:
        left_edge_of_arrow = arrow_x1 - gap
        _run_arrow_add_label(
            page, next_id, sm_mass,
            left_x=left_edge_of_arrow - len(sm_mass) * char_width,
            right_x=left_edge_of_arrow,
            baseline_y=text_baseline_y,
            right_justified=True,
        )
        next_id += 1

    # Right label: product obtained + yield (positioned right of arrow head)
    # Format: "1.60 g, 72%" (comma-separated, no parentheses)
    right_text_parts = []
    if product_obtained:
        right_text_parts.append(product_obtained)
    if product_yield:
        # Strip extra whitespace in yield (e.g. "72 %" → "72%")
        right_text_parts.append(
            product_yield.replace(" %", "%").replace("% ", "%")
        )
    if right_text_parts:
        right_text = ", ".join(right_text_parts)
        rx = arrow_x2 + gap
        _run_arrow_add_label(
            page, next_id, right_text,
            left_x=rx,
            right_x=rx + len(right_text) * char_width,
            baseline_y=text_baseline_y,
            right_justified=False,
        )
        next_id += 1

    write_cdxml(tree, cdxml_path)
    _log(f" Run arrow added: {sm_mass} -> {' '.join(right_text_parts)}")
977
+
978
+
979
+ # ---------------------------------------------------------------------------
980
+ # CLI
981
+ # ---------------------------------------------------------------------------
982
+
983
+ def _build_arg_parser() -> argparse.ArgumentParser:
984
+ p = argparse.ArgumentParser(
985
+ description="Build CDXML reaction scheme from reaction JSON "
986
+ "(experimental).",
987
+ )
988
+ p.add_argument("input", help="Reaction JSON file (from reaction_parser)")
989
+ p.add_argument("-o", "--output", default=None,
990
+ help="Output CDXML file (default: {stem}-scheme.cdxml)")
991
+ p.add_argument("--approach", default="chemdraw_mimic",
992
+ choices=["chemdraw_mimic", "compact", "bbox_center",
993
+ "arrow_driven", "proportional", "golden_ratio"],
994
+ help="Layout approach (default: chemdraw_mimic)")
995
+ p.add_argument("--align-mode", default="rdkit",
996
+ choices=["rdkit", "rxnmapper", "kabsch", "none"],
997
+ help="Alignment strategy (default: rdkit)")
998
+ p.add_argument("--no-run-arrow", action="store_true",
999
+ help="Skip run arrow even if ELN data is available")
1000
+ p.add_argument("-v", "--verbose", action="store_true")
1001
+ p.add_argument("--json-errors", action="store_true",
1002
+ help="Structured JSON errors to stderr")
1003
+ return p
1004
+
1005
+
1006
def main() -> None:
    """Command-line entry point: parse arguments and build the scheme.

    Exits with status 1 when the input file does not exist or when
    ``build_scheme`` raises.  Errors go to stderr, as a JSON object when
    ``--json-errors`` is given and as an ``ERROR: ...`` line otherwise.  On
    success the output path is printed to stdout.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    def _report(kind: str, detail: str, plain: str) -> None:
        # Emit one error record on stderr in the format the user asked for.
        if args.json_errors:
            print(json.dumps({"error": kind, "detail": detail}),
                  file=sys.stderr)
        else:
            print(plain, file=sys.stderr)

    input_exists = os.path.isfile(args.input)
    if not input_exists:
        _report(
            "file_not_found",
            f"Input file not found: {args.input}",
            f"ERROR: Input file not found: {args.input}",
        )
        sys.exit(1)

    build_kwargs = {
        "input_path": args.input,
        "output": args.output,
        "approach": args.approach,
        "align_mode": args.align_mode,
        "run_arrow": not args.no_run_arrow,
        "verbose": args.verbose,
    }
    try:
        output = build_scheme(**build_kwargs)
        print(f"Output: {output}")
    except Exception as exc:
        # Top-level boundary: report any failure and turn it into exit code 1.
        _report("scheme_build_failed", str(exc), f"ERROR: {exc}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()