cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1487 @@
1
+ """
2
+ scheme_yaml_writer.py — Generate scheme YAML from reaction_parser JSON.
3
+
4
+ This is the layout-decision layer between perception (reaction_parser) and
5
+ rendering (renderer). It reads a reaction JSON, decides where each species
6
+ goes in the scheme, and writes a YAML file that the renderer can consume.
7
+
8
+ The three decisions made here:
9
+ 1. Structure or text? (atom-contributing → structure; non-contributing → text)
10
+ 2. Position? (substrate → left; other reactant → above arrow; reagents → below)
11
+ 3. Priority ordering of below-arrow text (catalyst > base > solvent > conditions)
12
+
13
+ Usage:
14
+ python -m cdxml_toolkit.render.scheme_yaml_writer reaction.json -o scheme.yaml
15
+
16
+ from cdxml_toolkit.render.scheme_yaml_writer import write_scheme_yaml
17
+ yaml_path = write_scheme_yaml("reaction.json", "scheme.yaml")
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import json
24
+ import os
25
+ import re
26
+ import sys
27
+ from dataclasses import dataclass, field as dc_field
28
+ from typing import Any, Dict, List, Optional, Tuple
29
+
30
+ try:
31
+ import yaml
32
+ except ImportError:
33
+ yaml = None # type: ignore[assignment]
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Role-based priority for below-arrow text ordering
38
+ # ---------------------------------------------------------------------------
39
+
40
+ # Lower number = higher priority (appears first below arrow)
41
+ _ROLE_PRIORITY = {
42
+ "catalyst": 10,
43
+ "ligand": 15,
44
+ "base": 20,
45
+ "coupling_reagent": 25,
46
+ "reducing_agent": 30,
47
+ "oxidant": 30,
48
+ "lewis_acid": 30,
49
+ "activating_agent": 30,
50
+ "halogenating_agent": 30,
51
+ "fluorinating_agent": 30,
52
+ "borylating_agent": 30,
53
+ "protecting_group": 35,
54
+ "deprotecting_agent": 35,
55
+ "acid": 35,
56
+ "additive": 40,
57
+ "reductant": 40,
58
+ "reagent": 45,
59
+ "drying_agent": 50,
60
+ "inorganic_salt": 50,
61
+ "solvent": 80,
62
+ }
63
+
64
+ # Roles that should always be shown as text, never as drawn structures
65
+ _DEMOTE_ROLES = {
66
+ "base", "catalyst", "ligand", "coupling_reagent", "reducing_agent",
67
+ "oxidant", "protecting_group", "deprotecting_agent", "acid",
68
+ "activating_agent", "lewis_acid", "drying_agent", "halogenating_agent",
69
+ "fluorinating_agent", "borylating_agent", "additive", "reductant",
70
+ "reagent", "solvent", "inorganic_salt",
71
+ }
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # Public API
76
+ # ---------------------------------------------------------------------------
77
+
78
def write_scheme_yaml(
    json_path: str,
    output_path: str,
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> str:
    """Read reaction JSON, make layout decisions, write YAML file.

    The layout dict is produced by :func:`build_scheme_yaml_dict` so that the
    file-writing and dict-returning entry points cannot drift apart; this
    function only adds the write step.

    Parameters
    ----------
    json_path : str
        Path to reaction_parser JSON file.
    output_path : str
        Path where the YAML will be written.
    layout : str
        Layout type: "linear", "sequential", or "auto".
    include_run_arrows : bool
        If True and ELN data has SM mass + product amount, include run_arrows.
    use_eln_labels : bool
        If True, label products with the ELN experiment name instead of
        sequential numbers.

    Returns
    -------
    str
        The absolute path to the written YAML file.
    """
    yaml_dict = build_scheme_yaml_dict(
        json_path,
        layout=layout,
        include_run_arrows=include_run_arrows,
        use_eln_labels=use_eln_labels,
    )

    # Serialization is handled by the module's _write_yaml_file helper.
    _write_yaml_file(yaml_dict, output_path)

    return os.path.abspath(output_path)
129
+
130
+
131
def build_scheme_yaml_dict(
    json_path: str,
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> Dict[str, Any]:
    """Read reaction JSON and return the YAML dict (without writing to disk).

    Useful for programmatic access when you want to inspect or modify
    the dict before writing.

    Parameters
    ----------
    json_path : str
        Path to reaction_parser JSON file.
    layout : str
        Layout type, forwarded to ``_build_yaml_dict``.
    include_run_arrows : bool
        Forwarded to ``_build_yaml_dict``.
    use_eln_labels : bool
        If True, products are labelled with the JSON's ``experiment`` value,
        falling back to the JSON file name (without extension) when that
        value is missing, null or empty.

    Returns
    -------
    dict
        The scheme dict built by ``_build_yaml_dict``.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ``or`` (rather than a .get default) also covers keys that are present
    # but null in the JSON, matching how eln_data is already handled.
    species = data.get("species") or []
    eln_data = data.get("eln_data") or {}
    conditions = data.get("conditions") or []

    product_label = None
    if use_eln_labels:
        file_stem = os.path.splitext(os.path.basename(json_path))[0]
        # A null/empty experiment would otherwise become product_label=None,
        # which silently switches the output back to sequential numbering.
        product_label = str(data.get("experiment") or file_stem)

    return _build_yaml_dict(
        species,
        conditions,
        eln_data,
        layout=layout,
        include_run_arrows=include_run_arrows,
        product_label=product_label,
    )
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Core layout logic
163
+ # ---------------------------------------------------------------------------
164
+
165
+ def _build_yaml_dict(
166
+ species: List[Dict[str, Any]],
167
+ conditions: List[str],
168
+ eln_data: Dict[str, Any],
169
+ layout: str = "auto",
170
+ include_run_arrows: bool = True,
171
+ product_label: Optional[str] = None,
172
+ ) -> Dict[str, Any]:
173
+ """Build the complete YAML dict from reaction data.
174
+
175
+ This is where all three layout decisions are made:
176
+ 1. Structure or text? (atom_contributing → draw; else → text)
177
+ 2. Position? (substrate left, other reactant above, reagents below)
178
+ 3. Priority ordering of below-arrow text
179
+
180
+ Parameters
181
+ ----------
182
+ product_label : str, optional
183
+ When set, only products get this label (ELN mode).
184
+ Substrates and above-arrow structures are unlabelled.
185
+ """
186
+ # --- Classify species into scheme positions ---
187
+ substrates = [] # left of arrow (drawn structures)
188
+ above_structures = [] # above arrow (drawn structures)
189
+ above_text = [] # above arrow (text, e.g. "(1.2 eq)")
190
+ below_text_items = [] # below arrow (text with priority for sorting)
191
+ products = [] # right of arrow (drawn structures)
192
+
193
+ # Track species that will be drawn as structures (need StructureRef entries)
194
+ drawn_ids = set()
195
+
196
+ for sp in species:
197
+ sp_id = sp.get("id", "")
198
+ role = sp.get("role", "")
199
+ role_detail = (sp.get("role_detail") or "").lower()
200
+ is_sm = sp.get("is_sm", False)
201
+ is_dp = sp.get("is_dp", False)
202
+ is_substrate = sp.get("is_substrate", False)
203
+ is_solvent = sp.get("is_solvent", False)
204
+ atom_contributing = (role == "atom_contributing")
205
+ source = sp.get("source", "")
206
+ # Use raw name for labels (display_text may already have equiv appended)
207
+ name = sp.get("name", "")
208
+ display = sp.get("display_text") or name
209
+ smiles = sp.get("smiles")
210
+
211
+ # Check is_dp flag as well as role for product identification
212
+ if role == "product" or is_dp:
213
+ if smiles:
214
+ products.append(sp_id)
215
+ drawn_ids.add(sp_id)
216
+
217
+ # Check is_sm/is_substrate flags — but only honor them when the
218
+ # species is atom-contributing or unclassified. The CSV may mark
219
+ # reagents like n-BuLi as "substrate" even though RXNMapper says
220
+ # they are non-contributing. Role classification wins.
221
+ elif (is_substrate or is_sm) and (atom_contributing or role in ("", "unclassified")):
222
+ if smiles:
223
+ substrates.append(sp_id)
224
+ drawn_ids.add(sp_id)
225
+ # else: CSV-only SM entry with no structure — skip (the actual
226
+ # structure should be a separate atom_contributing species)
227
+
228
+ elif atom_contributing:
229
+ # Atom-contributing species are drawn as structures above arrow
230
+ above_structures.append(sp_id)
231
+ drawn_ids.add(sp_id)
232
+ # Add equiv text below the structure
233
+ equiv = sp.get("csv_equiv")
234
+ if equiv:
235
+ above_text.append(f"({equiv} eq)")
236
+
237
+ elif is_solvent:
238
+ # Solvents → below arrow text, low priority
239
+ priority = _ROLE_PRIORITY.get("solvent", 80)
240
+ below_text_items.append((priority, display))
241
+
242
+ else:
243
+ # Non-contributing / unclassified species → text below arrow
244
+ # Check if it should be demoted (drawn structure → text)
245
+ should_demote = role_detail in _DEMOTE_ROLES
246
+ has_structure = source in ("fragment", "rxn") and smiles
247
+
248
+ if has_structure and not should_demote:
249
+ # Unusual: non-contributing but not a known reagent type.
250
+ # Draw it above the arrow as a structure.
251
+ above_structures.append(sp_id)
252
+ drawn_ids.add(sp_id)
253
+ else:
254
+ # Text label below arrow — use raw name + format equiv here
255
+ priority = _ROLE_PRIORITY.get(role_detail, 50)
256
+ label = _format_label_with_equiv(name, display, sp.get("csv_equiv"))
257
+ if not label:
258
+ continue
259
+ below_text_items.append((priority, label))
260
+
261
+ # If no substrate was identified but there are atom-contributing species
262
+ # above the arrow, promote the most likely one to substrate (left of arrow).
263
+ # Prefer the one with csv_equiv closest to 1.0, else the first one.
264
+ if not substrates and above_structures:
265
+ above_set = set(above_structures)
266
+ best_id = above_structures[0]
267
+ best_diff = float("inf")
268
+ for sp in species:
269
+ sp_id_check = sp.get("id")
270
+ if sp_id_check in above_set:
271
+ eq = sp.get("csv_equiv")
272
+ if eq:
273
+ try:
274
+ diff = abs(float(eq) - 1.0)
275
+ if diff < best_diff:
276
+ best_diff = diff
277
+ best_id = sp_id_check
278
+ except (ValueError, TypeError):
279
+ pass
280
+ above_structures.remove(best_id)
281
+ substrates.append(best_id)
282
+ # Rebuild above_text from remaining above_structures
283
+ above_text = []
284
+ remaining = set(above_structures)
285
+ for sp in species:
286
+ if sp.get("id") in remaining:
287
+ eq = sp.get("csv_equiv")
288
+ if eq:
289
+ above_text.append(f"({eq} eq)")
290
+
291
+ # Sort below-arrow text by priority
292
+ below_text_items.sort(key=lambda x: x[0])
293
+ below_text = [item[1] for item in below_text_items]
294
+
295
+ # Add conditions (temperature, time, atmosphere) at end
296
+ if conditions:
297
+ below_text.extend(_normalize_conditions(conditions))
298
+
299
+ # --- Build structures dict ---
300
+ # Assign compound labels to substrates and products.
301
+ # Above-arrow structures (reagents) don't get numbered.
302
+ # When product_label is set (ELN mode), only products get labels.
303
+ label_map: Dict[str, str] = {}
304
+ if product_label is not None:
305
+ # ELN mode: only products get the provided label
306
+ for sid in products:
307
+ label_map[sid] = product_label
308
+ else:
309
+ # Default mode: sequential numbers 1, 2, 3, ...
310
+ label_order: List[str] = []
311
+ for sid in substrates:
312
+ if sid not in label_order:
313
+ label_order.append(sid)
314
+ for sid in products:
315
+ if sid not in label_order:
316
+ label_order.append(sid)
317
+ label_counter = 1
318
+ for sid in label_order:
319
+ label_map[sid] = str(label_counter)
320
+ label_counter += 1
321
+
322
+ structures = {}
323
+ for sp in species:
324
+ sp_id = sp.get("id", "")
325
+ if sp_id not in drawn_ids:
326
+ continue
327
+ smiles = sp.get("smiles")
328
+ if not smiles:
329
+ continue
330
+ entry: Dict[str, Any] = {"smiles": smiles}
331
+ # Compound number label below structure (substrates + products only)
332
+ if sp_id in label_map:
333
+ entry["label"] = label_map[sp_id]
334
+ structures[sp_id] = entry
335
+
336
+ # --- Build step ---
337
+ step: Dict[str, Any] = {}
338
+ if substrates:
339
+ step["substrates"] = substrates
340
+ if products:
341
+ step["products"] = products
342
+
343
+ above: Dict[str, Any] = {}
344
+ if above_structures:
345
+ above["structures"] = above_structures
346
+ if above_text:
347
+ above["text"] = above_text
348
+ if above:
349
+ step["above_arrow"] = above
350
+
351
+ if below_text:
352
+ step["below_arrow"] = {"text": below_text}
353
+
354
+ # --- Determine layout ---
355
+ if layout == "auto":
356
+ layout = "linear" # single step for now; multi-step will be "sequential"
357
+
358
+ # --- Build top-level YAML dict ---
359
+ yaml_dict: Dict[str, Any] = {
360
+ "structures": structures,
361
+ "steps": [step],
362
+ "layout": layout,
363
+ }
364
+
365
+ # --- Run arrows ---
366
+ # Run arrows already display yield in their output label, so only add
367
+ # yield_ to the step when run arrows are NOT present (avoids duplication).
368
+ run_arrows_added = False
369
+ if include_run_arrows and eln_data:
370
+ run_arrows = _build_run_arrows(eln_data)
371
+ if run_arrows:
372
+ yaml_dict["run_arrows"] = run_arrows
373
+ run_arrows_added = True
374
+
375
+ if not run_arrows_added:
376
+ yield_pct = eln_data.get("product_yield", "").strip()
377
+ if yield_pct:
378
+ yield_str = yield_pct.rstrip("%").strip()
379
+ step["yield_"] = f"{yield_str}%"
380
+
381
+ return yaml_dict
382
+
383
+
384
+ # ---------------------------------------------------------------------------
385
+ # Run arrows
386
+ # ---------------------------------------------------------------------------
387
+
388
+ def _build_run_arrows(eln_data: Dict[str, Any]) -> Optional[List[Dict[str, Any]]]:
389
+ """Build run_arrows list from ELN data (SM mass → product yield)."""
390
+ sm_mass = eln_data.get("sm_mass", "").strip()
391
+ product_obtained = eln_data.get("product_obtained", "").strip()
392
+ product_yield = eln_data.get("product_yield", "").strip()
393
+
394
+ if not sm_mass or not product_obtained:
395
+ return None
396
+
397
+ input_label = sm_mass if _has_unit(sm_mass) else f"{sm_mass} g"
398
+ obtained_str = (product_obtained if _has_unit(product_obtained)
399
+ else f"{product_obtained} g")
400
+ if product_yield:
401
+ yield_clean = product_yield.rstrip("%").strip()
402
+ output_label = f"{obtained_str}, {yield_clean}% yield"
403
+ else:
404
+ output_label = obtained_str
405
+
406
+ return [{
407
+ "step": 1,
408
+ "runs": [{"input": input_label, "output": output_label}],
409
+ }]
410
+
411
+
412
+ def _format_label_with_equiv(
413
+ name: str, display: str, csv_equiv: Optional[str],
414
+ ) -> str:
415
+ """Build a text label, adding equiv only if not already present.
416
+
417
+ Uses raw ``name`` as the base label, falling back to ``display``.
418
+ Avoids duplicating "(X eq)" when ``display_text`` already contains it.
419
+ """
420
+ base = name or display
421
+ if not base:
422
+ return ""
423
+ # If equiv data exists and the label doesn't already mention "eq"
424
+ if csv_equiv and "eq" not in base.lower():
425
+ return f"{base} ({csv_equiv} eq)"
426
+ # If display has equiv but name doesn't, use display as-is
427
+ if "eq" in display.lower():
428
+ return display
429
+ return base
430
+
431
+
432
+ def _normalize_conditions(conditions: List[str]) -> List[str]:
433
+ """Normalize condition strings for display.
434
+
435
+ Fixes temperature formatting:
436
+ "80 C" → "80 °C"
437
+ "105C" → "105 °C"
438
+ "80°C" → "80 °C"
439
+ "-78 °C" → "-78 °C" (no change)
440
+ """
441
+ result = []
442
+ for cond in conditions:
443
+ # "80 C" or "-78 C" → "80 °C" (missing degree symbol)
444
+ cond = re.sub(r"(-?\d+\.?\d*)\s+C\b", r"\1 °C", cond)
445
+ # "80C" or "105C" → "80 °C" (no space, no degree)
446
+ cond = re.sub(r"(-?\d+\.?\d*)C\b", r"\1 °C", cond)
447
+ # "80°C" → "80 °C" (no space before degree)
448
+ cond = re.sub(r"(-?\d+\.?\d*)°C", r"\1 °C", cond)
449
+ result.append(cond)
450
+ return result
451
+
452
+
453
+ def _has_unit(value: str) -> bool:
454
+ """Check if a mass string already contains a unit (g, mg, mL, etc.)."""
455
+ return bool(re.search(r"\d\s*(g|mg|kg|mL|µL|L)\b", value))
456
+
457
+
458
+ def _merge_eln_labels(experiments: List[str]) -> str:
459
+ """Merge multiple ELN experiment names into a compact label.
460
+
461
+ If all share a common prefix (e.g. "KL-7001-"), uses compact form:
462
+ "KL-7001-001/003/004/009". Otherwise joins with ", ".
463
+ """
464
+ if not experiments:
465
+ return ""
466
+ if len(experiments) == 1:
467
+ return experiments[0]
468
+
469
+ # Try to find common prefix up to the last dash
470
+ parts = [exp.rsplit("-", 1) for exp in experiments if "-" in exp]
471
+ if len(parts) == len(experiments):
472
+ prefixes = set(p[0] for p in parts)
473
+ if len(prefixes) == 1:
474
+ prefix = parts[0][0]
475
+ suffixes = [p[1] for p in parts]
476
+ return prefix + "-" + "/".join(suffixes)
477
+ return ", ".join(experiments)
478
+
479
+
480
+ # ---------------------------------------------------------------------------
481
+ # Multi-reaction merge — data structures
482
+ # ---------------------------------------------------------------------------
483
+
484
@dataclass
class ReactionSummary:
    """Condensed view of a single reaction JSON, used to decide how
    several reactions relate to each other when merging."""

    # Caller-supplied index of this reaction.
    index: int
    # Path of the JSON file the summary was read from.
    json_path: str
    # Experiment name.
    experiment: str
    # SMILES of the starting material and of the product ("" if unknown).
    sm_smiles: str
    dp_smiles: str
    # Per-reagent data, each keyed by species id.
    reagent_smiles: Dict[str, str]
    reagent_names: Dict[str, str]
    reagent_equivs: Dict[str, str]
    # Every usable SMILES found in the reaction.
    all_smiles: set
    # Raw sections carried over from the JSON.
    species: List[Dict[str, Any]]
    conditions: List[str]
    eln_data: Dict[str, Any]
499
+
500
+
501
@dataclass
class MergePlan:
    """Describes how N reaction JSONs are combined into one merged scheme."""

    # Each inner list holds the reaction indices of one parallel group.
    parallel_groups: List[List[int]]
    # Each chain is an ordered list of indices into ``parallel_groups``.
    chains: List[List[int]]
    # Indices into ``parallel_groups`` that belong to no chain.
    unrelated_groups: List[int]

    def describe(self) -> str:
        """Return a one-line, human-readable summary of the plan.

        Chains read "Chain: a -> b+c", unrelated groups read
        "Unrelated: a+b"; entries are separated by "; ". An empty plan
        yields "Single reaction".
        """
        def chain_token(group_index: int) -> str:
            members = self.parallel_groups[group_index]
            if len(members) > 1:
                return "+".join(str(m) for m in members)
            return str(members[0])

        pieces: List[str] = [
            "Chain: " + " -> ".join([chain_token(gi) for gi in chain])
            for chain in self.chains
        ]
        pieces.extend(
            "Unrelated: " + "+".join(str(m) for m in self.parallel_groups[gi])
            for gi in self.unrelated_groups
        )
        if not pieces:
            return "Single reaction"
        return "; ".join(pieces)
521
+
522
+
523
+ # ---------------------------------------------------------------------------
524
+ # Multi-reaction merge — SMILES matching
525
+ # ---------------------------------------------------------------------------
526
+
527
+ def _canonicalize(smiles: str) -> str:
528
+ """Return RDKit canonical SMILES, or original string if RDKit fails."""
529
+ if not smiles:
530
+ return ""
531
+ try:
532
+ from rdkit import Chem
533
+ mol = Chem.MolFromSmiles(smiles)
534
+ if mol is not None:
535
+ return Chem.MolToSmiles(mol)
536
+ except Exception:
537
+ pass
538
+ return smiles
539
+
540
+
541
+ def _strip_salts(smiles: str) -> str:
542
+ """Strip small fragments (counterions) from multi-component SMILES.
543
+
544
+ Keeps only the largest fragment by heavy atom count. This handles
545
+ common salt forms (HCl, TFA, Na+, etc.) that differ between ELN
546
+ entries for the same compound.
547
+ """
548
+ if not smiles or "." not in smiles:
549
+ return smiles
550
+ try:
551
+ from rdkit import Chem
552
+ frags = smiles.split(".")
553
+ best = smiles
554
+ best_size = 0
555
+ for frag in frags:
556
+ mol = Chem.MolFromSmiles(frag)
557
+ if mol is not None:
558
+ size = mol.GetNumHeavyAtoms()
559
+ if size > best_size:
560
+ best_size = size
561
+ best = Chem.MolToSmiles(mol)
562
+ return best
563
+ except Exception:
564
+ return smiles
565
+
566
+
567
+ def _smiles_match(a: str, b: str) -> bool:
568
+ """Check if two SMILES represent the same molecule, tolerating salt forms."""
569
+ if not a or not b:
570
+ return False
571
+ if a == b:
572
+ return True
573
+ return _strip_salts(a) == _strip_salts(b)
574
+
575
+
576
def _extract_reaction_summary(index: int, json_path: str) -> ReactionSummary:
    """Load a reaction JSON and extract the key data for merge classification.

    Solvents are excluded from the reagent set (checked via role_detail or
    is_solvent flag). Species whose SMILES is empty or the unresolved
    placeholder "?" contribute nothing: they are not recorded as starting
    material, product, reagent or in ``all_smiles``.

    When several species qualify as starting material (or product), the last
    one in the list wins.

    Parameters
    ----------
    index : int
        Stored on the summary as ``index``.
    json_path : str
        Path to the reaction JSON file.

    Returns
    -------
    ReactionSummary
        ``experiment`` falls back to the JSON file name (without extension)
        when the JSON has no usable ``experiment`` value.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ``or`` also covers keys that are present but null in the JSON.
    species = data.get("species") or []
    conditions = data.get("conditions") or []
    eln_data = data.get("eln_data") or {}
    file_stem = os.path.splitext(os.path.basename(json_path))[0]
    experiment = str(data.get("experiment") or file_stem)

    sm_smiles = ""
    dp_smiles = ""
    reagent_smiles: Dict[str, str] = {}
    reagent_names: Dict[str, str] = {}
    reagent_equivs: Dict[str, str] = {}
    all_smiles: set = set()

    for sp in species:
        smi = _canonicalize(sp.get("smiles", ""))
        # "?" marks an unresolved structure. Keeping it as a real SMILES
        # would let two unresolved species "match" each other later on.
        if not smi or smi == "?":
            continue
        all_smiles.add(smi)

        sp_id = sp.get("id", "")
        is_sm = sp.get("is_sm")
        is_dp = sp.get("is_dp")
        is_solvent = (sp.get("is_solvent", False)
                      or (sp.get("role_detail") or "").lower() == "solvent")

        if is_sm and not is_solvent:
            sm_smiles = smi

        if is_dp or sp.get("role") == "product":
            dp_smiles = smi
        elif not is_solvent and not is_sm:
            reagent_smiles[sp_id] = smi
            reagent_names[sp_id] = (sp.get("name") or sp.get("display_text")
                                    or sp_id)
            equiv = sp.get("csv_equiv")
            if equiv:
                reagent_equivs[sp_id] = str(equiv)

    return ReactionSummary(
        index=index, json_path=json_path, experiment=experiment,
        sm_smiles=sm_smiles, dp_smiles=dp_smiles,
        reagent_smiles=reagent_smiles, reagent_names=reagent_names,
        reagent_equivs=reagent_equivs, all_smiles=all_smiles,
        species=species, conditions=conditions, eln_data=eln_data,
    )
626
+
627
+
628
+ # ---------------------------------------------------------------------------
629
+ # Multi-reaction merge — pair classification
630
+ # ---------------------------------------------------------------------------
631
+
632
def _classify_pair(a: ReactionSummary, b: ReactionSummary) -> str:
    """Classify the relationship between two reactions.

    Returns "parallel", "sequential_ab", "sequential_ba", or "unrelated".

    Parallel requires: same SM + same DP + at least one shared non-solvent
    reagent. Same SM+DP with no shared reagent = different chemistry and is
    reported as "unrelated".

    Sequential means one reaction's product is the other's starting
    material ("sequential_ab": A's product feeds B). If that direct match
    fails, a reaction's product found anywhere among the other's species
    also counts, unless both reactions make the same product: in that case
    the hit would just be the other reaction's own product, so such pairs
    are "unrelated".

    Salt forms are tolerated: free amine and HCl salt are treated as the
    same molecule for all SMILES comparisons.
    """
    def product_among(dp: str, pool) -> bool:
        # Salt-tolerant membership test; the stripped product is computed
        # once rather than per candidate.
        if not dp:
            return False
        dp_core = _strip_salts(dp)
        return any(dp == smi or dp_core == _strip_salts(smi) for smi in pool)

    sm_match = _smiles_match(a.sm_smiles, b.sm_smiles)
    dp_match = _smiles_match(a.dp_smiles, b.dp_smiles)

    if sm_match and dp_match:
        shared = set(a.reagent_smiles.values()) & set(b.reagent_smiles.values())
        return "parallel" if shared else "unrelated"

    # Direct DP→SM match (salt-tolerant)
    if _smiles_match(a.dp_smiles, b.sm_smiles):
        return "sequential_ab"
    if _smiles_match(b.dp_smiles, a.sm_smiles):
        return "sequential_ba"

    # Two different routes to the same product: all_smiles contains each
    # reaction's own product, so the fallback below would always hit and
    # mislabel the pair as sequential.
    if dp_match:
        return "unrelated"

    # Fallback: one reaction's product appears somewhere in the other
    # reaction (handles an unresolved SM, or the molecule being listed as
    # a reagent).
    if product_among(a.dp_smiles, b.all_smiles):
        return "sequential_ab"
    if product_among(b.dp_smiles, a.all_smiles):
        return "sequential_ba"

    return "unrelated"
674
+
675
+
676
def _build_merge_plan(summaries: List[ReactionSummary]) -> MergePlan:
    """Work out how N reactions should be merged.

    Steps:
    1. Classify every pair of reactions.
    2. Cluster "parallel" pairs into groups (union-find).
    3. Turn sequential relations into directed edges between groups.
    4. Split the groups into connected components, ignoring direction.
    5. Topologically order each component into one chain; if that is
       impossible (cycle), the component is listed in ascending index order.

    Groups without any sequential edge are reported as unrelated. With no
    sequential edges at all, a single group becomes the chain ``[0]`` and
    several groups are all reported as unrelated.
    """
    count = len(summaries)
    if count == 1:
        return MergePlan(parallel_groups=[[0]], chains=[[0]], unrelated_groups=[])

    # 1. Pairwise classification, keyed (i, j) with i < j.
    relation: Dict[Tuple[int, int], str] = {
        (i, j): _classify_pair(summaries[i], summaries[j])
        for i in range(count)
        for j in range(i + 1, count)
    }

    # 2. Union-find over reactions; only "parallel" pairs are merged.
    parent = list(range(count))

    def find(x: int) -> int:
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    for (i, j), kind in relation.items():
        if kind != "parallel":
            continue
        root_i, root_j = find(i), find(j)
        if root_i != root_j:
            parent[root_i] = root_j

    by_root: Dict[int, List[int]] = {}
    for i in range(count):
        by_root.setdefault(find(i), []).append(i)
    groups = list(by_root.values())
    n_groups = len(groups)

    group_of: Dict[int, int] = {
        reaction: gi for gi, members in enumerate(groups) for reaction in members
    }

    # 3. Directed edges "producer group -> consumer group".
    seq_edges: set = set()
    for (i, j), kind in relation.items():
        src, dst = group_of[i], group_of[j]
        if src == dst:
            continue
        if kind == "sequential_ab":
            seq_edges.add((src, dst))
        elif kind == "sequential_ba":
            seq_edges.add((dst, src))

    if not seq_edges:
        if n_groups == 1:
            return MergePlan(parallel_groups=groups, chains=[[0]],
                             unrelated_groups=[])
        return MergePlan(parallel_groups=groups, chains=[],
                         unrelated_groups=list(range(n_groups)))

    neighbours: Dict[int, set] = {g: set() for g in range(n_groups)}
    successors: Dict[int, List[int]] = {g: [] for g in range(n_groups)}
    for src, dst in seq_edges:
        neighbours[src].add(dst)
        neighbours[dst].add(src)
        successors[src].append(dst)

    # 4. Connected components (breadth-first, direction ignored). Groups
    #    without neighbours never start or join a component.
    seen: set = set()
    components: List[set] = []
    for start in range(n_groups):
        if start in seen or not neighbours[start]:
            continue
        members: set = set()
        frontier = [start]
        cursor = 0
        while cursor < len(frontier):
            node = frontier[cursor]
            cursor += 1
            if node in seen:
                continue
            seen.add(node)
            members.add(node)
            frontier.extend(nb for nb in neighbours[node] if nb not in seen)
        components.append(members)

    # 5. Kahn's algorithm restricted to each component.
    chains: List[List[int]] = []
    for members in components:
        pending: Dict[int, int] = {g: 0 for g in members}
        for g in members:
            for nb in successors[g]:
                if nb in members:
                    pending[nb] += 1

        ready = [g for g in members if pending[g] == 0]
        order: List[int] = []
        head = 0
        while head < len(ready):
            node = ready[head]
            head += 1
            order.append(node)
            for nb in successors[node]:
                if nb in members:
                    pending[nb] -= 1
                    if pending[nb] == 0:
                        ready.append(nb)

        if len(order) != len(members):
            order = sorted(members)  # cycle fallback
        chains.append(order)

    linked: set = set()
    for members in components:
        linked.update(members)
    unrelated = [gi for gi in range(n_groups) if gi not in linked]

    return MergePlan(
        parallel_groups=groups,
        chains=chains,
        unrelated_groups=unrelated,
    )
816
+
817
+
818
+ # ---------------------------------------------------------------------------
819
+ # Multi-reaction merge — parallel merge helpers
820
+ # ---------------------------------------------------------------------------
821
+
822
def _pick_template(
    summaries: List[ReactionSummary],
    group_indices: List[int],
) -> int:
    """Choose the template reaction for a parallel group.

    The template is the group member whose set of reagent SMILES occurs
    most often within the group (each member counts itself), so that as
    few runs as possible need a run-arrow note.  When several members
    tie, the one listed first in ``group_indices`` wins.

    Returns the index (into ``summaries``) of the chosen reaction.
    """
    if len(group_indices) <= 1:
        return group_indices[0]

    # Reagent "signature" of every member: the set of its reagent SMILES.
    signature = {
        ri: frozenset(summaries[ri].reagent_smiles.values())
        for ri in group_indices
    }

    # How many members of the group carry each signature.
    frequency: Dict[frozenset, int] = {}
    for ri in group_indices:
        key = signature[ri]
        frequency[key] = frequency.get(key, 0) + 1

    # max() returns the first maximal item, which preserves the
    # "earliest member wins a tie" rule.
    return max(group_indices, key=lambda ri: frequency[signature[ri]])
849
+
850
+
851
def _diff_reagents(
    summaries: List[ReactionSummary],
    group_indices: List[int],
) -> Tuple[bool, Dict[int, Optional[str]]]:
    """Describe how each parallel run's reagents deviate from the template.

    The template is the reaction returned by ``_pick_template`` for this
    group, which is not necessarily the first member.  A run's note lists
    only the reagents that are present in THAT run but absent from the
    template; reagents the run is missing, and equivalent differences for
    shared reagents, are not reported here (the latter are handled by
    range notation on the main arrow).

    Each note fragment is ``"<name> (<equiv> eq)"``, or just ``"<name>"``
    when no equivalents are recorded.  If a reagent has no recorded name
    its SMILES is used instead, so a fragment is never blank.

    Returns
    -------
    (all_identical, notes)
        ``all_identical`` is True when no run contains a reagent beyond
        the template's; ``notes`` is then empty.  Otherwise ``notes`` maps
        every reaction index in the group to its note string, or to None
        for runs without extra reagents.
    """
    if len(group_indices) <= 1:
        return True, {}

    # Per-reaction fingerprint: {reagent SMILES: (name, equiv)}
    per_reaction: Dict[int, Dict[str, Tuple[str, str]]] = {}
    for ri in group_indices:
        s = summaries[ri]
        per_reaction[ri] = {
            smi: (s.reagent_names.get(sp_id, ""),
                  s.reagent_equivs.get(sp_id, ""))
            for sp_id, smi in s.reagent_smiles.items()
        }

    template_ri = _pick_template(summaries, group_indices)
    template_smiles = set(per_reaction[template_ri])

    has_differences = False
    notes: Dict[int, Optional[str]] = {}
    for ri in group_indices:
        # Reagents in this run but NOT in the template
        extra = set(per_reaction[ri]) - template_smiles
        if not extra:
            notes[ri] = None
            continue

        has_differences = True
        parts: List[str] = []
        for smi in sorted(extra):
            name, equiv = per_reaction[ri][smi]
            # An unnamed reagent would otherwise render as "" or " (2 eq)".
            label = name or smi
            parts.append(f"{label} ({equiv} eq)" if equiv else label)
        notes[ri] = ", ".join(parts)

    if not has_differences:
        return True, {}
    return False, notes
905
+
906
+
907
def _equiv_range(
    summaries: List[ReactionSummary],
    group_indices: List[int],
    smiles: str,
) -> str:
    """Summarise one reagent's equivalents across a set of parallel runs.

    Every reagent entry whose SMILES equals ``smiles`` is looked up in the
    runs named by ``group_indices``.  Equivalents that are empty or cannot
    be parsed as a float are ignored.

    Returns "" when no numeric value was found, the single value when all
    runs agree, and "low\u2013high" (en-dash) otherwise, e.g. "1.1\u20131.5".
    Numbers are rendered with the ``g`` format, so "1.10" becomes "1.1".
    """
    distinct: set = set()
    for ri in group_indices:
        summary = summaries[ri]
        for sp_id, smi in summary.reagent_smiles.items():
            if smi != smiles:
                continue
            raw = summary.reagent_equivs.get(sp_id, "")
            if not raw:
                continue
            try:
                distinct.add(float(raw))
            except ValueError:
                # Non-numeric equivalents (free text) are skipped.
                continue

    if not distinct:
        return ""

    ordered = sorted(distinct)
    if len(ordered) == 1:
        return f"{ordered[0]:g}"
    return f"{ordered[0]:g}\u2013{ordered[-1]:g}"
933
+
934
+
935
+ # ---------------------------------------------------------------------------
936
+ # Multi-reaction merge — combined YAML generation
937
+ # ---------------------------------------------------------------------------
938
+
939
def _namespace_species_id(reaction_index: int, sp_id: str) -> str:
    """Return ``sp_id`` prefixed with ``rxn<reaction_index>_``.

    The prefix keeps species IDs from different reactions from colliding
    once their structures are merged into a single scheme.
    """
    return "rxn{}_{}".format(reaction_index, sp_id)
942
+
943
+
944
def _apply_namespace(
    yaml_dict: Dict[str, Any],
    reaction_index: int,
    remap: Dict[str, str],
) -> Dict[str, Any]:
    """Namespace every structure ID of a single-reaction YAML dict.

    Each key of ``yaml_dict["structures"]`` is prefixed via
    ``_namespace_species_id``; if the prefixed ID appears in ``remap`` the
    mapped ID is used instead (this is how shared intermediates end up
    under one ID).  The same translation is applied to the ``substrates``,
    ``products`` and ``above_arrow.structures`` lists of every step; IDs
    that are not structure keys are passed through unchanged.

    Returns a new top-level dict with new step dicts.  The structure
    payloads themselves are reused, not copied, and the input dict is not
    modified.
    """
    translation: Dict[str, str] = {}  # original ID -> final ID
    renamed: Dict[str, Any] = {}
    for original_id, payload in yaml_dict.get("structures", {}).items():
        namespaced = _namespace_species_id(reaction_index, original_id)
        target = remap.get(namespaced, namespaced)
        translation[original_id] = target
        renamed[target] = payload

    def translate(ids: List[str]) -> List[str]:
        return [translation.get(sid, sid) for sid in ids]

    rewritten_steps = []
    for step in yaml_dict.get("steps", []):
        clone = dict(step)
        for key in ("substrates", "products"):
            if key in clone:
                clone[key] = translate(clone[key])
        if "above_arrow" in clone:
            # Copy before editing so the caller's step is left untouched.
            arrow = dict(clone["above_arrow"])
            if "structures" in arrow:
                arrow["structures"] = translate(arrow["structures"])
            clone["above_arrow"] = arrow
        rewritten_steps.append(clone)

    return {**yaml_dict, "structures": renamed, "steps": rewritten_steps}
985
+
986
+
987
def _build_run_entry_from_eln(
    eln_data: Dict[str, Any],
    allow_partial: bool = False,
) -> Optional[Dict[str, Any]]:
    """Build a single run arrow entry dict from ELN data.

    Reads ``sm_mass``, ``product_obtained`` and ``product_yield`` from
    ``eln_data``.  A missing key, a ``None`` value and a blank string are
    all treated as "not recorded"; other non-string values are converted
    with ``str()``.

    Parameters
    ----------
    eln_data : dict
        ELN fields for one reaction.
    allow_partial : bool
        When True, create an entry even if only sm_mass is available
        (output will be empty).  Used for merged schemes where every
        reaction should get a run arrow.

    Returns
    -------
    dict or None
        ``{"input": ..., "output": ...}``, or None when ``sm_mass`` is not
        recorded, or when ``product_obtained`` is not recorded and
        ``allow_partial`` is False.  Masses for which ``_has_unit`` is
        false get `` g`` appended.  When a yield is recorded the output
        reads ``"<obtained>, <yield>% yield"``.
    """
    def _field(key: str) -> str:
        # .get(key, "") only covers a missing key; a key that is present
        # with a None or numeric value would break a bare .strip().
        value = eln_data.get(key)
        return "" if value is None else str(value).strip()

    sm_mass = _field("sm_mass")
    product_obtained = _field("product_obtained")
    product_yield = _field("product_yield")

    if not sm_mass:
        return None
    if not product_obtained and not allow_partial:
        return None

    input_label = sm_mass if _has_unit(sm_mass) else f"{sm_mass} g"

    if product_obtained:
        obtained_str = (product_obtained if _has_unit(product_obtained)
                        else f"{product_obtained} g")
        if product_yield:
            # Normalise "85", "85%" and "85 %" to the same "85% yield".
            yield_clean = product_yield.rstrip("%").strip()
            output_label = f"{obtained_str}, {yield_clean}% yield"
        else:
            output_label = obtained_str
    else:
        output_label = ""

    return {"input": input_label, "output": output_label}
1023
+
1024
+
1025
def _update_below_arrow_with_ranges(
    step_dict: Dict[str, Any],
    summaries: List[ReactionSummary],
    group_indices: List[int],
    template: ReactionSummary,
) -> None:
    """Rewrite equivalents in ``below_arrow`` text as ranges, in place.

    A text line is rewritten when it contains "eq" and the name of a
    template species whose equivalents differ across the runs in
    ``group_indices``.  Every "(... eq)" group in that line is then
    replaced by the range, e.g. "Cs2CO3 (1.5\u20132.0 eq)".  Only the first
    qualifying species is applied to a line; other lines are kept as-is.

    Does nothing when the step has no ``below_arrow`` or no text lines.
    """
    below = step_dict.get("below_arrow")
    if not below:
        return
    original_lines = below.get("text", [])
    if not original_lines:
        return

    def with_range(line: str) -> str:
        for sp in template.species:
            name = sp.get("name", "")
            if not name or name not in line or "eq" not in line:
                continue
            smi = _canonicalize(sp.get("smiles", ""))
            if not smi:
                continue
            span = _equiv_range(summaries, group_indices, smi)
            # An en-dash means the value actually varies between runs.
            if span and "\u2013" in span:
                return re.sub(r"\([^)]*eq\)", f"({span} eq)", line)
        return line

    below["text"] = [with_range(line) for line in original_lines]
1064
+
1065
+
1066
def _update_above_arrow_with_ranges(
    step_dict: Dict[str, Any],
    summaries: List[ReactionSummary],
    group_indices: List[int],
    template: ReactionSummary,
) -> None:
    """Rewrite equivalents in ``above_arrow`` text as ranges, in place.

    Text lines are paired with ``above_arrow.structures`` by position: the
    i-th line is taken to describe the i-th structure ID.  A line that
    contains "eq" is replaced wholesale by "(<low>\u2013<high> eq)" when the
    matching template species has equivalents that differ across the runs
    in ``group_indices``.  Every other line is kept unchanged.

    Does nothing when the step has no ``above_arrow`` or no text lines.
    """
    above = step_dict.get("above_arrow")
    if not above:
        return
    text_lines = above.get("text", [])
    if not text_lines:
        return

    structure_ids = above.get("structures", [])

    def range_text(position: int) -> Optional[str]:
        """Return the replacement for line ``position``, or None to keep it."""
        if position >= len(structure_ids):
            return None
        sid = structure_ids[position]
        species = next(
            (sp for sp in template.species if sp.get("id") == sid), None)
        if not species:
            return None
        smi = _canonicalize(species.get("smiles", ""))
        if not smi:
            return None
        span = _equiv_range(summaries, group_indices, smi)
        # An en-dash means the value actually varies between runs.
        if span and "\u2013" in span:
            return f"({span} eq)"
        return None

    updated = []
    for position, line in enumerate(text_lines):
        replacement = range_text(position) if "eq" in line else None
        updated.append(line if replacement is None else replacement)

    above["text"] = updated
1102
+
1103
+
1104
def build_merged_scheme_yaml_dict(
    json_paths: List[str],
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> Dict[str, Any]:
    """Build a combined YAML dict from multiple reaction JSONs.

    Detects parallel reactions (same SM + DP + shared reagents) and sequential
    chains (product of A = SM of B), and produces a merged scheme.

    Parameters
    ----------
    json_paths : list of str
        Reaction JSON files, one per reaction.
    layout : str
        "auto" picks "stacked-rows" when the merge plan has more than one
        section (chain or unrelated group), "sequential" for a single
        multi-step chain and "linear" otherwise.  Any other value is used
        as given, except that more than one section always yields a
        stacked-rows result.
    include_run_arrows : bool
        When True, a ``run_arrows`` entry is added for every step that has
        at least one run with a recorded starting-material mass.
    use_eln_labels : bool
        When True, products keep the ELN-derived label produced by
        ``_build_yaml_dict``; otherwise labelled structures are renumbered
        with one counter that runs across the whole scheme.

    Returns
    -------
    dict
        Always has ``structures`` and ``layout``, plus either ``steps``
        (flat layouts) or ``sections`` (stacked rows).  ``wrap`` is set to
        "none" for sequential layouts and ``run_arrows`` is present when
        any were built.

    Notes
    -----
    Reactions whose SM and DP SMILES are identical are dropped before
    planning (unless that would drop everything), so the reaction indices
    used for namespacing are positions in the FILTERED list, not in
    ``json_paths``.
    """
    all_summaries = [_extract_reaction_summary(i, p)
                     for i, p in enumerate(json_paths)]

    # Filter out degenerate reactions (SM == DP, e.g. solubility tests,
    # control experiments).  These have no meaningful reaction to display.
    summaries = [s for s in all_summaries
                 if not (s.sm_smiles and s.dp_smiles
                         and s.sm_smiles == s.dp_smiles)]
    if not summaries:
        summaries = all_summaries  # fallback: don't filter everything out

    plan = _build_merge_plan(summaries)

    # --- Determine shared intermediates (per chain) ---
    # remap: namespaced SM id in the later group -> namespaced DP id in the
    # earlier group, so both steps reference one structure.
    remap: Dict[str, str] = {}
    for chain in plan.chains:
        for gi_a, gi_b in zip(chain, chain[1:]):
            # Use each group's TEMPLATE reaction: _build_group builds and
            # namespaces its structures from the template, so IDs taken
            # from any other member would never match.
            ri_a = _pick_template(summaries, plan.parallel_groups[gi_a])
            ri_b = _pick_template(summaries, plan.parallel_groups[gi_b])
            sa, sb = summaries[ri_a], summaries[ri_b]

            # Find the DP species in A that links to B
            dp_id_a = next(
                (sp["id"] for sp in sa.species
                 if (sp.get("is_dp") or sp.get("role") == "product")
                 and _canonicalize(sp.get("smiles", "")) == sa.dp_smiles),
                None,
            )
            # Find the SM species in B that matches A's product.
            # Try direct SM match first; if SM SMILES is unresolved, find any
            # species in B whose SMILES equals A's DP.
            sm_id_b = next(
                (sp["id"] for sp in sb.species
                 if sp.get("is_sm")
                 and _canonicalize(sp.get("smiles", "")) == sa.dp_smiles),
                None,
            )
            if sm_id_b is None:
                # Fallback: any species in B with matching SMILES
                sm_id_b = next(
                    (sp["id"] for sp in sb.species
                     if _canonicalize(sp.get("smiles", "")) == sa.dp_smiles),
                    None,
                )
            if dp_id_a and sm_id_b:
                canonical = _namespace_species_id(ri_a, dp_id_a)
                replaced = _namespace_species_id(ri_b, sm_id_b)
                remap[replaced] = canonical

    # Filled while assembling; the closures below read the live contents.
    all_structures: Dict[str, Any] = {}
    run_arrows_list: List[Dict[str, Any]] = []

    # --- Build per-group YAML dicts ---
    def _build_group(
        group_indices: List[int],
        step_number: int,
        label_start: int,
    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]], int]:
        """Build YAML structures + step(s) for one parallel group.

        ``step_number`` is currently unused.

        Returns (structures_dict, [step_dict], next_label).
        """
        template_ri = _pick_template(summaries, group_indices)
        template = summaries[template_ri]

        # ELN label for products (when use_eln_labels is enabled)
        plabel = None
        if use_eln_labels:
            exps = [summaries[ri].experiment for ri in group_indices]
            plabel = _merge_eln_labels(exps)

        # Build single-reaction dict using existing logic
        single = _build_yaml_dict(
            template.species, template.conditions, template.eln_data,
            layout="linear", include_run_arrows=False,
            product_label=plabel,
        )

        # Range notation for parallel groups — apply BEFORE namespacing
        # so that species IDs in above_arrow.structures match template.species
        if len(group_indices) > 1:
            _update_below_arrow_with_ranges(
                single["steps"][0], summaries, group_indices, template)
            _update_above_arrow_with_ranges(
                single["steps"][0], summaries, group_indices, template)

        # Namespace IDs
        ns = _apply_namespace(single, template_ri, remap)
        structures = ns["structures"]
        step = ns["steps"][0]

        # Relabel: skip IDs already in all_structures (shared intermediates
        # get their label from the group that first produced them).
        if use_eln_labels:
            # ELN mode: labels already set by _build_yaml_dict (product_label)
            for _sid in list(structures.keys()):
                if _sid in all_structures:
                    del structures[_sid]
        else:
            # Default mode: relabel with global counter
            label_counter = label_start
            for _sid in list(structures.keys()):
                if _sid in all_structures:
                    del structures[_sid]
                    continue
                entry = structures[_sid]
                if "label" in entry:
                    entry["label"] = str(label_counter)
                    label_counter += 1
            label_start = label_counter

        return structures, [step], label_start

    def _build_group_run_arrows(
        group_indices: List[int],
        step_number: int,
        include: bool,
    ) -> Optional[Dict[str, Any]]:
        """Build run_arrows entry for one parallel group.

        Every reaction in the group gets a run arrow, even if the ELN
        data only has sm_mass (no product_obtained).  This ensures all
        runs are visible, with deviation notes shown where applicable.
        """
        if not include:
            return None

        all_identical, notes = _diff_reagents(summaries, group_indices)
        runs: List[Dict[str, Any]] = []
        for ri in group_indices:
            entry = _build_run_entry_from_eln(
                summaries[ri].eln_data, allow_partial=True)
            if entry:
                if not all_identical and notes.get(ri):
                    entry["note"] = notes[ri]
                runs.append(entry)

        if runs:
            return {"step": step_number, "runs": runs}
        return None

    def _emit_group(
        gi: int,
        step_number: int,
        next_label: int,
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Build group ``gi`` and register its structures and run arrows.

        Steps without substrates and products are discarded; if none
        remain, nothing is registered (the label counter still advances).

        Returns (valid_steps, next_label).
        """
        grp = plan.parallel_groups[gi]
        structs, steps, next_label = _build_group(
            grp, step_number, next_label)
        valid_steps = [s for s in steps
                       if s.get("substrates") or s.get("products")]
        if valid_steps:
            all_structures.update(structs)
            ra = _build_group_run_arrows(grp, step_number, include_run_arrows)
            if ra:
                run_arrows_list.append(ra)
        return valid_steps, next_label

    # --- Determine overall layout ---
    num_chains = len(plan.chains)
    num_unrelated = len(plan.unrelated_groups)
    num_sections = num_chains + num_unrelated

    if layout == "auto":
        if num_sections > 1:
            layout = "stacked-rows"
        elif num_chains == 1 and len(plan.chains[0]) > 1:
            layout = "sequential"
        else:
            layout = "linear"

    # --- Assemble ---
    label_counter = 1
    step_num = 1  # advances only for groups that contributed a step

    if layout == "stacked-rows" or num_sections > 1:
        # Each chain becomes a section; each unrelated group becomes a section
        sections: List[Dict[str, Any]] = []

        for chain in plan.chains:
            chain_steps: List[Dict[str, Any]] = []
            for gi in chain:
                valid_steps, label_counter = _emit_group(
                    gi, step_num, label_counter)
                if not valid_steps:
                    continue
                chain_steps.extend(valid_steps)
                step_num += 1
            if chain_steps:
                sec: Dict[str, Any] = {"steps": chain_steps}
                if len(chain_steps) > 1:
                    sec["layout"] = "sequential"
                sections.append(sec)

        # Each unrelated group as its own section
        for gi in plan.unrelated_groups:
            valid_steps, label_counter = _emit_group(
                gi, step_num, label_counter)
            if not valid_steps:
                continue
            step_num += 1
            sections.append({"steps": valid_steps})

        # If only 1 section survived, collapse to flat sequential layout
        if len(sections) == 1:
            flat_steps = sections[0].get("steps", [])
            flat_layout = "sequential" if len(flat_steps) > 1 else "linear"
            yaml_dict: Dict[str, Any] = {
                "structures": all_structures,
                "steps": flat_steps,
                "layout": flat_layout,
            }
        else:
            yaml_dict = {
                "structures": all_structures,
                "sections": sections,
                "layout": "stacked-rows",
            }
    else:
        # Linear or sequential: single chain, flat steps list
        all_steps: List[Dict[str, Any]] = []
        # Use the single chain if available, otherwise unrelated groups
        group_order = plan.chains[0] if plan.chains else plan.unrelated_groups
        for gi in group_order:
            valid_steps, label_counter = _emit_group(
                gi, step_num, label_counter)
            if not valid_steps:
                continue
            all_steps.extend(valid_steps)
            step_num += 1

        yaml_dict = {
            "structures": all_structures,
            "steps": all_steps,
            "layout": layout,
        }

    # Prevent auto-wrapping; merged schemes should render as-is
    if yaml_dict.get("layout") == "sequential":
        yaml_dict["wrap"] = "none"

    if run_arrows_list:
        yaml_dict["run_arrows"] = run_arrows_list

    return yaml_dict
1369
+
1370
+
1371
def write_merged_scheme_yaml(
    json_paths: List[str],
    output_path: str,
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> str:
    """Merge several reaction JSONs into one scheme and write it as YAML.

    The merged dict comes from ``build_merged_scheme_yaml_dict`` (called
    with the same ``layout``, ``include_run_arrows`` and ``use_eln_labels``)
    and is written to ``output_path`` with ``_write_yaml_file``.

    Returns the absolute path of the written file.
    """
    merged = build_merged_scheme_yaml_dict(
        json_paths,
        layout=layout,
        include_run_arrows=include_run_arrows,
        use_eln_labels=use_eln_labels,
    )
    _write_yaml_file(merged, output_path)
    return os.path.abspath(output_path)
1388
+
1389
+
1390
+ # ---------------------------------------------------------------------------
1391
+ # YAML output
1392
+ # ---------------------------------------------------------------------------
1393
+
1394
def _write_yaml_file(data: Dict[str, Any], path: str) -> None:
    """Serialise ``data`` to ``path`` as UTF-8 text.

    When the module-level ``yaml`` name is not None the data is dumped
    with PyYAML in block style, keeping key order and writing non-ASCII
    characters unescaped.  Otherwise the data is written as indented
    JSON, still under whatever file name the caller chose.
    """
    with open(path, "w", encoding="utf-8") as handle:
        if yaml is None:
            # PyYAML unavailable: emit JSON instead, which YAML parsers
            # can also read (JSON is essentially a subset of YAML).
            json.dump(data, handle, indent=2, ensure_ascii=False)
            return
        yaml.dump(data, handle, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)
1407
+
1408
+
1409
+ # ---------------------------------------------------------------------------
1410
+ # CLI
1411
+ # ---------------------------------------------------------------------------
1412
+
1413
def main():
    """Command-line entry point: turn reaction JSON files into scheme YAML.

    With one input, a single scheme is written to ``--output`` (default
    ``<stem>-scheme.yaml`` next to the input) and its path is printed.
    With several inputs, an individual ``<stem>-scheme.yaml`` is written
    next to each input and, unless ``--no-merge`` is given, a merged
    scheme is written to ``--output`` (default ``merged-scheme.yaml`` next
    to the first input); only the merged path is printed to stdout.
    ``--verbose`` additionally reports every written file on stderr.

    Exits with status 1 if any input path does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Generate scheme YAML from one or more reaction_parser JSON files.",
    )
    parser.add_argument("json_paths", nargs="+",
                        help="One or more reaction parser JSON files")
    parser.add_argument("-o", "--output", default=None,
                        help="Output YAML path (default: auto-generated)")
    parser.add_argument("--layout", default="auto",
                        help="Layout: linear, sequential, stacked-rows, auto")
    parser.add_argument("--no-run-arrows", action="store_true",
                        help="Suppress run arrows")
    parser.add_argument("--no-merge", action="store_true",
                        help="Process each JSON individually (skip merge)")
    parser.add_argument("--eln-labels", action="store_true",
                        help="Label products with ELN experiment names "
                             "instead of sequential numbers")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    # Fail fast on the first input that is missing.
    missing = next(
        (jp for jp in args.json_paths if not os.path.exists(jp)), None)
    if missing is not None:
        print(f"Error: {missing} not found", file=sys.stderr)
        sys.exit(1)

    def beside(json_path: str, filename: str) -> str:
        """Path of ``filename`` in the directory that holds ``json_path``."""
        return os.path.join(os.path.dirname(json_path) or ".", filename)

    def individual_path(json_path: str) -> str:
        """Default per-reaction output path: ``<stem>-scheme.yaml``."""
        stem = os.path.splitext(os.path.basename(json_path))[0]
        return beside(json_path, f"{stem}-scheme.yaml")

    # Options shared by every writer call below.
    options = dict(
        layout=args.layout,
        include_run_arrows=not args.no_run_arrows,
        use_eln_labels=args.eln_labels,
    )

    if len(args.json_paths) == 1:
        # Single input: existing behavior
        jp = args.json_paths[0]
        output = args.output if args.output is not None else individual_path(jp)
        result = write_scheme_yaml(jp, output, **options)
        if args.verbose:
            print(f"Written: {result}", file=sys.stderr)
        print(result)
        return

    # Multiple inputs: produce individual YAMLs + merged YAML
    for jp in args.json_paths:
        result = write_scheme_yaml(jp, individual_path(jp), **options)
        if args.verbose:
            print(f"Individual: {result}", file=sys.stderr)

    if args.no_merge:
        return

    output = args.output
    if output is None:
        output = beside(args.json_paths[0], "merged-scheme.yaml")
    merged = write_merged_scheme_yaml(args.json_paths, output, **options)
    if args.verbose:
        print(f"Merged: {merged}", file=sys.stderr)
    print(merged)
1484
+
1485
+
1486
# Run the command-line interface only when this file is executed as a
# script; importing the module must not trigger it.
if __name__ == "__main__":
    main()