cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1487 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scheme_yaml_writer.py — Generate scheme YAML from reaction_parser JSON.
|
|
3
|
+
|
|
4
|
+
This is the layout-decision layer between perception (reaction_parser) and
|
|
5
|
+
rendering (renderer). It reads a reaction JSON, decides where each species
|
|
6
|
+
goes in the scheme, and writes a YAML file that the renderer can consume.
|
|
7
|
+
|
|
8
|
+
The three decisions made here:
|
|
9
|
+
1. Structure or text? (atom-contributing → structure; non-contributing → text)
|
|
10
|
+
2. Position? (substrate → left; other reactant → above arrow; reagents → below)
|
|
11
|
+
3. Priority ordering of below-arrow text (catalyst > base > solvent > conditions)
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
python experiments/scheme_dsl/scheme_yaml_writer.py reaction.json -o scheme.yaml
|
|
15
|
+
|
|
16
|
+
from cdxml_toolkit.render.scheme_yaml_writer import write_scheme_yaml
|
|
17
|
+
yaml_path = write_scheme_yaml("reaction.json", "scheme.yaml")
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import re
|
|
26
|
+
import sys
|
|
27
|
+
from dataclasses import dataclass, field as dc_field
|
|
28
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
import yaml
|
|
32
|
+
except ImportError:
|
|
33
|
+
yaml = None # type: ignore[assignment]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Role-based priority for below-arrow text ordering
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
# Sort weight for each ``role_detail`` value when ordering below-arrow text.
# Lower number = higher priority (appears first below arrow)
# Several roles deliberately share a weight; the sort that consumes these
# values is stable, so ties keep their original species order.
# NOTE(review): "reductant" (40) and "reducing_agent" (30) look like synonyms
# but carry different weights — confirm this is intended.
_ROLE_PRIORITY = {
    "catalyst": 10,
    "ligand": 15,
    "base": 20,
    "coupling_reagent": 25,
    "reducing_agent": 30,
    "oxidant": 30,
    "lewis_acid": 30,
    "activating_agent": 30,
    "halogenating_agent": 30,
    "fluorinating_agent": 30,
    "borylating_agent": 30,
    "protecting_group": 35,
    "deprotecting_agent": 35,
    "acid": 35,
    "additive": 40,
    "reductant": 40,
    "reagent": 45,
    "drying_agent": 50,
    "inorganic_salt": 50,
    "solvent": 80,
}
|
|
63
|
+
|
|
64
|
+
# Roles that should always be shown as text, never as drawn structures
# Membership is tested against the lower-cased ``role_detail`` of a species;
# a species whose role_detail is in this set is rendered as a text label even
# when a structure (SMILES) is available for it.
_DEMOTE_ROLES = {
    "base", "catalyst", "ligand", "coupling_reagent", "reducing_agent",
    "oxidant", "protecting_group", "deprotecting_agent", "acid",
    "activating_agent", "lewis_acid", "drying_agent", "halogenating_agent",
    "fluorinating_agent", "borylating_agent", "additive", "reductant",
    "reagent", "solvent", "inorganic_salt",
}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
# Public API
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
def write_scheme_yaml(
    json_path: str,
    output_path: str,
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> str:
    """Read reaction JSON, make layout decisions, write YAML file.

    The JSON loading and layout decisions are delegated to
    :func:`build_scheme_yaml_dict` so that the "build" and "build + write"
    entry points always agree on how the input is interpreted.

    Parameters
    ----------
    json_path : str
        Path to reaction_parser JSON file.
    output_path : str
        Path where the YAML will be written.
    layout : str
        Layout type: "linear", "sequential", or "auto".
    include_run_arrows : bool
        If True and ELN data has SM mass + product obtained, include
        run_arrows.
    use_eln_labels : bool
        If True, label products with the ELN experiment name instead of
        sequential numbers.

    Returns
    -------
    str
        The absolute path to the written YAML file.
    """
    yaml_dict = build_scheme_yaml_dict(
        json_path,
        layout=layout,
        include_run_arrows=include_run_arrows,
        use_eln_labels=use_eln_labels,
    )

    _write_yaml_file(yaml_dict, output_path)

    return os.path.abspath(output_path)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def build_scheme_yaml_dict(
    json_path: str,
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> Dict[str, Any]:
    """Read reaction JSON and return the YAML dict (without writing to disk).

    Useful for programmatic access when you want to inspect or modify
    the dict before writing.

    Parameters
    ----------
    json_path : str
        Path to reaction_parser JSON file.
    layout : str
        Layout type passed through to the layout builder.
    include_run_arrows : bool
        Whether run arrows may be generated from the ELN data.
    use_eln_labels : bool
        If True, products are labelled with the JSON's ``experiment`` value,
        falling back to the JSON file's stem when that value is missing or
        empty.

    Returns
    -------
    dict
        The scheme dict produced by ``_build_yaml_dict``.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ``or`` rather than a .get() default: a key that is present but null
    # must be treated the same as a missing key.
    species = data.get("species") or []
    eln_data = data.get("eln_data") or {}
    conditions = data.get("conditions") or []

    product_label = None
    if use_eln_labels:
        # An empty/null "experiment" would otherwise leave product_label
        # falsy/None and silently switch back to sequential numbering.
        product_label = (
            data.get("experiment")
            or os.path.splitext(os.path.basename(json_path))[0]
        )

    return _build_yaml_dict(species, conditions, eln_data,
                            layout=layout,
                            include_run_arrows=include_run_arrows,
                            product_label=product_label)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# Core layout logic
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
def _build_yaml_dict(
|
|
166
|
+
species: List[Dict[str, Any]],
|
|
167
|
+
conditions: List[str],
|
|
168
|
+
eln_data: Dict[str, Any],
|
|
169
|
+
layout: str = "auto",
|
|
170
|
+
include_run_arrows: bool = True,
|
|
171
|
+
product_label: Optional[str] = None,
|
|
172
|
+
) -> Dict[str, Any]:
|
|
173
|
+
"""Build the complete YAML dict from reaction data.
|
|
174
|
+
|
|
175
|
+
This is where all three layout decisions are made:
|
|
176
|
+
1. Structure or text? (atom_contributing → draw; else → text)
|
|
177
|
+
2. Position? (substrate left, other reactant above, reagents below)
|
|
178
|
+
3. Priority ordering of below-arrow text
|
|
179
|
+
|
|
180
|
+
Parameters
|
|
181
|
+
----------
|
|
182
|
+
product_label : str, optional
|
|
183
|
+
When set, only products get this label (ELN mode).
|
|
184
|
+
Substrates and above-arrow structures are unlabelled.
|
|
185
|
+
"""
|
|
186
|
+
# --- Classify species into scheme positions ---
|
|
187
|
+
substrates = [] # left of arrow (drawn structures)
|
|
188
|
+
above_structures = [] # above arrow (drawn structures)
|
|
189
|
+
above_text = [] # above arrow (text, e.g. "(1.2 eq)")
|
|
190
|
+
below_text_items = [] # below arrow (text with priority for sorting)
|
|
191
|
+
products = [] # right of arrow (drawn structures)
|
|
192
|
+
|
|
193
|
+
# Track species that will be drawn as structures (need StructureRef entries)
|
|
194
|
+
drawn_ids = set()
|
|
195
|
+
|
|
196
|
+
for sp in species:
|
|
197
|
+
sp_id = sp.get("id", "")
|
|
198
|
+
role = sp.get("role", "")
|
|
199
|
+
role_detail = (sp.get("role_detail") or "").lower()
|
|
200
|
+
is_sm = sp.get("is_sm", False)
|
|
201
|
+
is_dp = sp.get("is_dp", False)
|
|
202
|
+
is_substrate = sp.get("is_substrate", False)
|
|
203
|
+
is_solvent = sp.get("is_solvent", False)
|
|
204
|
+
atom_contributing = (role == "atom_contributing")
|
|
205
|
+
source = sp.get("source", "")
|
|
206
|
+
# Use raw name for labels (display_text may already have equiv appended)
|
|
207
|
+
name = sp.get("name", "")
|
|
208
|
+
display = sp.get("display_text") or name
|
|
209
|
+
smiles = sp.get("smiles")
|
|
210
|
+
|
|
211
|
+
# Check is_dp flag as well as role for product identification
|
|
212
|
+
if role == "product" or is_dp:
|
|
213
|
+
if smiles:
|
|
214
|
+
products.append(sp_id)
|
|
215
|
+
drawn_ids.add(sp_id)
|
|
216
|
+
|
|
217
|
+
# Check is_sm/is_substrate flags — but only honor them when the
|
|
218
|
+
# species is atom-contributing or unclassified. The CSV may mark
|
|
219
|
+
# reagents like n-BuLi as "substrate" even though RXNMapper says
|
|
220
|
+
# they are non-contributing. Role classification wins.
|
|
221
|
+
elif (is_substrate or is_sm) and (atom_contributing or role in ("", "unclassified")):
|
|
222
|
+
if smiles:
|
|
223
|
+
substrates.append(sp_id)
|
|
224
|
+
drawn_ids.add(sp_id)
|
|
225
|
+
# else: CSV-only SM entry with no structure — skip (the actual
|
|
226
|
+
# structure should be a separate atom_contributing species)
|
|
227
|
+
|
|
228
|
+
elif atom_contributing:
|
|
229
|
+
# Atom-contributing species are drawn as structures above arrow
|
|
230
|
+
above_structures.append(sp_id)
|
|
231
|
+
drawn_ids.add(sp_id)
|
|
232
|
+
# Add equiv text below the structure
|
|
233
|
+
equiv = sp.get("csv_equiv")
|
|
234
|
+
if equiv:
|
|
235
|
+
above_text.append(f"({equiv} eq)")
|
|
236
|
+
|
|
237
|
+
elif is_solvent:
|
|
238
|
+
# Solvents → below arrow text, low priority
|
|
239
|
+
priority = _ROLE_PRIORITY.get("solvent", 80)
|
|
240
|
+
below_text_items.append((priority, display))
|
|
241
|
+
|
|
242
|
+
else:
|
|
243
|
+
# Non-contributing / unclassified species → text below arrow
|
|
244
|
+
# Check if it should be demoted (drawn structure → text)
|
|
245
|
+
should_demote = role_detail in _DEMOTE_ROLES
|
|
246
|
+
has_structure = source in ("fragment", "rxn") and smiles
|
|
247
|
+
|
|
248
|
+
if has_structure and not should_demote:
|
|
249
|
+
# Unusual: non-contributing but not a known reagent type.
|
|
250
|
+
# Draw it above the arrow as a structure.
|
|
251
|
+
above_structures.append(sp_id)
|
|
252
|
+
drawn_ids.add(sp_id)
|
|
253
|
+
else:
|
|
254
|
+
# Text label below arrow — use raw name + format equiv here
|
|
255
|
+
priority = _ROLE_PRIORITY.get(role_detail, 50)
|
|
256
|
+
label = _format_label_with_equiv(name, display, sp.get("csv_equiv"))
|
|
257
|
+
if not label:
|
|
258
|
+
continue
|
|
259
|
+
below_text_items.append((priority, label))
|
|
260
|
+
|
|
261
|
+
# If no substrate was identified but there are atom-contributing species
|
|
262
|
+
# above the arrow, promote the most likely one to substrate (left of arrow).
|
|
263
|
+
# Prefer the one with csv_equiv closest to 1.0, else the first one.
|
|
264
|
+
if not substrates and above_structures:
|
|
265
|
+
above_set = set(above_structures)
|
|
266
|
+
best_id = above_structures[0]
|
|
267
|
+
best_diff = float("inf")
|
|
268
|
+
for sp in species:
|
|
269
|
+
sp_id_check = sp.get("id")
|
|
270
|
+
if sp_id_check in above_set:
|
|
271
|
+
eq = sp.get("csv_equiv")
|
|
272
|
+
if eq:
|
|
273
|
+
try:
|
|
274
|
+
diff = abs(float(eq) - 1.0)
|
|
275
|
+
if diff < best_diff:
|
|
276
|
+
best_diff = diff
|
|
277
|
+
best_id = sp_id_check
|
|
278
|
+
except (ValueError, TypeError):
|
|
279
|
+
pass
|
|
280
|
+
above_structures.remove(best_id)
|
|
281
|
+
substrates.append(best_id)
|
|
282
|
+
# Rebuild above_text from remaining above_structures
|
|
283
|
+
above_text = []
|
|
284
|
+
remaining = set(above_structures)
|
|
285
|
+
for sp in species:
|
|
286
|
+
if sp.get("id") in remaining:
|
|
287
|
+
eq = sp.get("csv_equiv")
|
|
288
|
+
if eq:
|
|
289
|
+
above_text.append(f"({eq} eq)")
|
|
290
|
+
|
|
291
|
+
# Sort below-arrow text by priority
|
|
292
|
+
below_text_items.sort(key=lambda x: x[0])
|
|
293
|
+
below_text = [item[1] for item in below_text_items]
|
|
294
|
+
|
|
295
|
+
# Add conditions (temperature, time, atmosphere) at end
|
|
296
|
+
if conditions:
|
|
297
|
+
below_text.extend(_normalize_conditions(conditions))
|
|
298
|
+
|
|
299
|
+
# --- Build structures dict ---
|
|
300
|
+
# Assign compound labels to substrates and products.
|
|
301
|
+
# Above-arrow structures (reagents) don't get numbered.
|
|
302
|
+
# When product_label is set (ELN mode), only products get labels.
|
|
303
|
+
label_map: Dict[str, str] = {}
|
|
304
|
+
if product_label is not None:
|
|
305
|
+
# ELN mode: only products get the provided label
|
|
306
|
+
for sid in products:
|
|
307
|
+
label_map[sid] = product_label
|
|
308
|
+
else:
|
|
309
|
+
# Default mode: sequential numbers 1, 2, 3, ...
|
|
310
|
+
label_order: List[str] = []
|
|
311
|
+
for sid in substrates:
|
|
312
|
+
if sid not in label_order:
|
|
313
|
+
label_order.append(sid)
|
|
314
|
+
for sid in products:
|
|
315
|
+
if sid not in label_order:
|
|
316
|
+
label_order.append(sid)
|
|
317
|
+
label_counter = 1
|
|
318
|
+
for sid in label_order:
|
|
319
|
+
label_map[sid] = str(label_counter)
|
|
320
|
+
label_counter += 1
|
|
321
|
+
|
|
322
|
+
structures = {}
|
|
323
|
+
for sp in species:
|
|
324
|
+
sp_id = sp.get("id", "")
|
|
325
|
+
if sp_id not in drawn_ids:
|
|
326
|
+
continue
|
|
327
|
+
smiles = sp.get("smiles")
|
|
328
|
+
if not smiles:
|
|
329
|
+
continue
|
|
330
|
+
entry: Dict[str, Any] = {"smiles": smiles}
|
|
331
|
+
# Compound number label below structure (substrates + products only)
|
|
332
|
+
if sp_id in label_map:
|
|
333
|
+
entry["label"] = label_map[sp_id]
|
|
334
|
+
structures[sp_id] = entry
|
|
335
|
+
|
|
336
|
+
# --- Build step ---
|
|
337
|
+
step: Dict[str, Any] = {}
|
|
338
|
+
if substrates:
|
|
339
|
+
step["substrates"] = substrates
|
|
340
|
+
if products:
|
|
341
|
+
step["products"] = products
|
|
342
|
+
|
|
343
|
+
above: Dict[str, Any] = {}
|
|
344
|
+
if above_structures:
|
|
345
|
+
above["structures"] = above_structures
|
|
346
|
+
if above_text:
|
|
347
|
+
above["text"] = above_text
|
|
348
|
+
if above:
|
|
349
|
+
step["above_arrow"] = above
|
|
350
|
+
|
|
351
|
+
if below_text:
|
|
352
|
+
step["below_arrow"] = {"text": below_text}
|
|
353
|
+
|
|
354
|
+
# --- Determine layout ---
|
|
355
|
+
if layout == "auto":
|
|
356
|
+
layout = "linear" # single step for now; multi-step will be "sequential"
|
|
357
|
+
|
|
358
|
+
# --- Build top-level YAML dict ---
|
|
359
|
+
yaml_dict: Dict[str, Any] = {
|
|
360
|
+
"structures": structures,
|
|
361
|
+
"steps": [step],
|
|
362
|
+
"layout": layout,
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
# --- Run arrows ---
|
|
366
|
+
# Run arrows already display yield in their output label, so only add
|
|
367
|
+
# yield_ to the step when run arrows are NOT present (avoids duplication).
|
|
368
|
+
run_arrows_added = False
|
|
369
|
+
if include_run_arrows and eln_data:
|
|
370
|
+
run_arrows = _build_run_arrows(eln_data)
|
|
371
|
+
if run_arrows:
|
|
372
|
+
yaml_dict["run_arrows"] = run_arrows
|
|
373
|
+
run_arrows_added = True
|
|
374
|
+
|
|
375
|
+
if not run_arrows_added:
|
|
376
|
+
yield_pct = eln_data.get("product_yield", "").strip()
|
|
377
|
+
if yield_pct:
|
|
378
|
+
yield_str = yield_pct.rstrip("%").strip()
|
|
379
|
+
step["yield_"] = f"{yield_str}%"
|
|
380
|
+
|
|
381
|
+
return yaml_dict
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
# Run arrows
|
|
386
|
+
# ---------------------------------------------------------------------------
|
|
387
|
+
|
|
388
|
+
def _build_run_arrows(eln_data: Dict[str, Any]) -> Optional[List[Dict[str, Any]]]:
|
|
389
|
+
"""Build run_arrows list from ELN data (SM mass → product yield)."""
|
|
390
|
+
sm_mass = eln_data.get("sm_mass", "").strip()
|
|
391
|
+
product_obtained = eln_data.get("product_obtained", "").strip()
|
|
392
|
+
product_yield = eln_data.get("product_yield", "").strip()
|
|
393
|
+
|
|
394
|
+
if not sm_mass or not product_obtained:
|
|
395
|
+
return None
|
|
396
|
+
|
|
397
|
+
input_label = sm_mass if _has_unit(sm_mass) else f"{sm_mass} g"
|
|
398
|
+
obtained_str = (product_obtained if _has_unit(product_obtained)
|
|
399
|
+
else f"{product_obtained} g")
|
|
400
|
+
if product_yield:
|
|
401
|
+
yield_clean = product_yield.rstrip("%").strip()
|
|
402
|
+
output_label = f"{obtained_str}, {yield_clean}% yield"
|
|
403
|
+
else:
|
|
404
|
+
output_label = obtained_str
|
|
405
|
+
|
|
406
|
+
return [{
|
|
407
|
+
"step": 1,
|
|
408
|
+
"runs": [{"input": input_label, "output": output_label}],
|
|
409
|
+
}]
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _format_label_with_equiv(
|
|
413
|
+
name: str, display: str, csv_equiv: Optional[str],
|
|
414
|
+
) -> str:
|
|
415
|
+
"""Build a text label, adding equiv only if not already present.
|
|
416
|
+
|
|
417
|
+
Uses raw ``name`` as the base label, falling back to ``display``.
|
|
418
|
+
Avoids duplicating "(X eq)" when ``display_text`` already contains it.
|
|
419
|
+
"""
|
|
420
|
+
base = name or display
|
|
421
|
+
if not base:
|
|
422
|
+
return ""
|
|
423
|
+
# If equiv data exists and the label doesn't already mention "eq"
|
|
424
|
+
if csv_equiv and "eq" not in base.lower():
|
|
425
|
+
return f"{base} ({csv_equiv} eq)"
|
|
426
|
+
# If display has equiv but name doesn't, use display as-is
|
|
427
|
+
if "eq" in display.lower():
|
|
428
|
+
return display
|
|
429
|
+
return base
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _normalize_conditions(conditions: List[str]) -> List[str]:
|
|
433
|
+
"""Normalize condition strings for display.
|
|
434
|
+
|
|
435
|
+
Fixes temperature formatting:
|
|
436
|
+
"80 C" → "80 °C"
|
|
437
|
+
"105C" → "105 °C"
|
|
438
|
+
"80°C" → "80 °C"
|
|
439
|
+
"-78 °C" → "-78 °C" (no change)
|
|
440
|
+
"""
|
|
441
|
+
result = []
|
|
442
|
+
for cond in conditions:
|
|
443
|
+
# "80 C" or "-78 C" → "80 °C" (missing degree symbol)
|
|
444
|
+
cond = re.sub(r"(-?\d+\.?\d*)\s+C\b", r"\1 °C", cond)
|
|
445
|
+
# "80C" or "105C" → "80 °C" (no space, no degree)
|
|
446
|
+
cond = re.sub(r"(-?\d+\.?\d*)C\b", r"\1 °C", cond)
|
|
447
|
+
# "80°C" → "80 °C" (no space before degree)
|
|
448
|
+
cond = re.sub(r"(-?\d+\.?\d*)°C", r"\1 °C", cond)
|
|
449
|
+
result.append(cond)
|
|
450
|
+
return result
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def _has_unit(value: str) -> bool:
|
|
454
|
+
"""Check if a mass string already contains a unit (g, mg, mL, etc.)."""
|
|
455
|
+
return bool(re.search(r"\d\s*(g|mg|kg|mL|µL|L)\b", value))
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _merge_eln_labels(experiments: List[str]) -> str:
|
|
459
|
+
"""Merge multiple ELN experiment names into a compact label.
|
|
460
|
+
|
|
461
|
+
If all share a common prefix (e.g. "KL-7001-"), uses compact form:
|
|
462
|
+
"KL-7001-001/003/004/009". Otherwise joins with ", ".
|
|
463
|
+
"""
|
|
464
|
+
if not experiments:
|
|
465
|
+
return ""
|
|
466
|
+
if len(experiments) == 1:
|
|
467
|
+
return experiments[0]
|
|
468
|
+
|
|
469
|
+
# Try to find common prefix up to the last dash
|
|
470
|
+
parts = [exp.rsplit("-", 1) for exp in experiments if "-" in exp]
|
|
471
|
+
if len(parts) == len(experiments):
|
|
472
|
+
prefixes = set(p[0] for p in parts)
|
|
473
|
+
if len(prefixes) == 1:
|
|
474
|
+
prefix = parts[0][0]
|
|
475
|
+
suffixes = [p[1] for p in parts]
|
|
476
|
+
return prefix + "-" + "/".join(suffixes)
|
|
477
|
+
return ", ".join(experiments)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
# ---------------------------------------------------------------------------
|
|
481
|
+
# Multi-reaction merge — data structures
|
|
482
|
+
# ---------------------------------------------------------------------------
|
|
483
|
+
|
|
484
|
+
@dataclass
class ReactionSummary:
    """Extracted summary of one reaction JSON for merge classification."""
    # Index passed in by the caller that built this summary.
    index: int
    # Path of the reaction JSON this summary was read from.
    json_path: str
    # Experiment name: the JSON's "experiment" value, defaulting to the
    # JSON file's stem when the key is absent.
    experiment: str
    # Canonical SMILES of the species flagged ``is_sm`` ("" if none found).
    sm_smiles: str
    # Canonical SMILES of the species flagged ``is_dp`` or with role
    # "product" ("" if none found).
    dp_smiles: str
    # Remaining non-solvent species that have a SMILES, keyed by species id.
    reagent_smiles: Dict[str, str]    # {species_id: canonical_smiles}
    reagent_names: Dict[str, str]     # {species_id: display_name}
    reagent_equivs: Dict[str, str]    # {species_id: equiv_str}
    all_smiles: set                   # all valid canonical SMILES in this reaction
    # Raw "species", "conditions" and "eln_data" sections of the JSON.
    species: List[Dict[str, Any]]
    conditions: List[str]
    eln_data: Dict[str, Any]
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
@dataclass
class MergePlan:
    """Plan describing how N reaction JSONs combine into one merged scheme."""
    parallel_groups: List[List[int]]    # groups of reaction indices
    chains: List[List[int]]             # independent sequential chains (each is a topo-sorted list of group indices)
    unrelated_groups: List[int]         # indices into parallel_groups with no chain link

    def describe(self) -> str:
        """Return a one-line, human-readable summary of the plan.

        Each chain is rendered as "Chain: a -> b+c -> d" and each unrelated
        group as "Unrelated: x+y"; the pieces are joined with "; ".  A plan
        with neither chains nor unrelated groups reads "Single reaction".
        """
        groups = self.parallel_groups
        pieces: List[str] = []

        for chain in self.chains:
            hops = [
                str(groups[gi][0]) if len(groups[gi]) <= 1
                else "+".join(map(str, groups[gi]))
                for gi in chain
            ]
            pieces.append("Chain: " + " -> ".join(hops))

        pieces.extend(
            "Unrelated: " + "+".join(map(str, groups[gi]))
            for gi in self.unrelated_groups
        )

        if not pieces:
            return "Single reaction"
        return "; ".join(pieces)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
# ---------------------------------------------------------------------------
|
|
524
|
+
# Multi-reaction merge — SMILES matching
|
|
525
|
+
# ---------------------------------------------------------------------------
|
|
526
|
+
|
|
527
|
+
def _canonicalize(smiles: str) -> str:
|
|
528
|
+
"""Return RDKit canonical SMILES, or original string if RDKit fails."""
|
|
529
|
+
if not smiles:
|
|
530
|
+
return ""
|
|
531
|
+
try:
|
|
532
|
+
from rdkit import Chem
|
|
533
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
534
|
+
if mol is not None:
|
|
535
|
+
return Chem.MolToSmiles(mol)
|
|
536
|
+
except Exception:
|
|
537
|
+
pass
|
|
538
|
+
return smiles
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _strip_salts(smiles: str) -> str:
|
|
542
|
+
"""Strip small fragments (counterions) from multi-component SMILES.
|
|
543
|
+
|
|
544
|
+
Keeps only the largest fragment by heavy atom count. This handles
|
|
545
|
+
common salt forms (HCl, TFA, Na+, etc.) that differ between ELN
|
|
546
|
+
entries for the same compound.
|
|
547
|
+
"""
|
|
548
|
+
if not smiles or "." not in smiles:
|
|
549
|
+
return smiles
|
|
550
|
+
try:
|
|
551
|
+
from rdkit import Chem
|
|
552
|
+
frags = smiles.split(".")
|
|
553
|
+
best = smiles
|
|
554
|
+
best_size = 0
|
|
555
|
+
for frag in frags:
|
|
556
|
+
mol = Chem.MolFromSmiles(frag)
|
|
557
|
+
if mol is not None:
|
|
558
|
+
size = mol.GetNumHeavyAtoms()
|
|
559
|
+
if size > best_size:
|
|
560
|
+
best_size = size
|
|
561
|
+
best = Chem.MolToSmiles(mol)
|
|
562
|
+
return best
|
|
563
|
+
except Exception:
|
|
564
|
+
return smiles
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _smiles_match(a: str, b: str) -> bool:
|
|
568
|
+
"""Check if two SMILES represent the same molecule, tolerating salt forms."""
|
|
569
|
+
if not a or not b:
|
|
570
|
+
return False
|
|
571
|
+
if a == b:
|
|
572
|
+
return True
|
|
573
|
+
return _strip_salts(a) == _strip_salts(b)
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def _extract_reaction_summary(index: int, json_path: str) -> ReactionSummary:
    """Load a reaction JSON and extract the key data for merge classification.

    Solvents are excluded from the reagent set (checked via role_detail or
    is_solvent flag).

    The "?" placeholder SMILES is treated as "no structure" everywhere, not
    only for ``all_smiles``: otherwise two reactions whose starting material
    or product is unresolved would appear to share that molecule.

    Parameters
    ----------
    index : int
        Index stored on the returned summary.
    json_path : str
        Path of the reaction JSON to read.

    Returns
    -------
    ReactionSummary
        Summary holding the SM / product SMILES, the non-solvent reagents
        and the raw species, conditions and ELN data.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ``or`` so that keys present with a null value behave like missing keys.
    species = data.get("species") or []
    conditions = data.get("conditions") or []
    eln_data = data.get("eln_data") or {}
    experiment = (data.get("experiment")
                  or os.path.splitext(os.path.basename(json_path))[0])

    sm_smiles = ""
    dp_smiles = ""
    reagent_smiles: Dict[str, str] = {}
    reagent_names: Dict[str, str] = {}
    reagent_equivs: Dict[str, str] = {}
    all_smiles: set = set()

    for sp in species:
        smi = _canonicalize(sp.get("smiles", ""))
        if smi.strip() in ("", "?"):
            # Unresolved structure: must not take part in any matching.
            smi = ""
        sp_id = sp.get("id", "")
        is_solvent = (sp.get("is_solvent", False)
                      or (sp.get("role_detail") or "").lower() == "solvent")

        if smi:
            all_smiles.add(smi)

        # When several species qualify, the last one seen wins.
        if sp.get("is_sm") and smi and not is_solvent:
            sm_smiles = smi
        if (sp.get("is_dp") or sp.get("role") == "product") and smi:
            dp_smiles = smi
        elif not is_solvent and smi and not sp.get("is_sm") and not sp.get("is_dp"):
            reagent_smiles[sp_id] = smi
            reagent_names[sp_id] = (sp.get("name") or sp.get("display_text")
                                    or sp_id)
            equiv = sp.get("csv_equiv")
            if equiv:
                reagent_equivs[sp_id] = str(equiv)

    return ReactionSummary(
        index=index, json_path=json_path, experiment=experiment,
        sm_smiles=sm_smiles, dp_smiles=dp_smiles,
        reagent_smiles=reagent_smiles, reagent_names=reagent_names,
        reagent_equivs=reagent_equivs, all_smiles=all_smiles,
        species=species, conditions=conditions, eln_data=eln_data,
    )
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
# ---------------------------------------------------------------------------
|
|
629
|
+
# Multi-reaction merge — pair classification
|
|
630
|
+
# ---------------------------------------------------------------------------
|
|
631
|
+
|
|
632
|
+
def _classify_pair(a: ReactionSummary, b: ReactionSummary) -> str:
    """Classify the relationship between two reactions.

    Returns one of "parallel", "sequential_ab", "sequential_ba" or
    "unrelated".

    * parallel      -- same SM and same DP and at least one shared reagent
                       SMILES.  Same SM + DP with no shared reagent is
                       treated as different chemistry ("unrelated").
    * sequential_ab -- A's product is consumed by B.
    * sequential_ba -- B's product is consumed by A.

    SM/DP comparisons go through ``_smiles_match`` and the fallback scan
    also compares ``_strip_salts`` forms, so salt forms of the same
    molecule are tolerated.
    """
    sm_match = _smiles_match(a.sm_smiles, b.sm_smiles)
    dp_match = _smiles_match(a.dp_smiles, b.dp_smiles)

    if sm_match and dp_match:
        shared = set(a.reagent_smiles.values()) & set(b.reagent_smiles.values())
        return "parallel" if shared else "unrelated"

    # Direct DP -> SM match (salt-tolerant)
    if _smiles_match(a.dp_smiles, b.sm_smiles):
        return "sequential_ab"
    if _smiles_match(b.dp_smiles, a.sm_smiles):
        return "sequential_ba"

    # The fallback below looks for one reaction's product anywhere in the
    # other reaction's species.  ``all_smiles`` also contains the other
    # reaction's own product, so when both reactions give the same product
    # (from different starting materials) the scan would always succeed and
    # report a bogus sequential link.  Such pairs are not sequential.
    if dp_match:
        return "unrelated"

    def _appears_in(product: str, other_smiles) -> bool:
        """True if *product* (or its salt-stripped form) is in *other_smiles*."""
        stripped = _strip_salts(product)
        return any(product == smi or stripped == _strip_salts(smi)
                   for smi in other_smiles)

    # Fallback: handles cases where the SM SMILES is unresolved but the
    # molecule appears as a reagent or is otherwise present in the reaction.
    if a.dp_smiles and _appears_in(a.dp_smiles, b.all_smiles):
        return "sequential_ab"
    if b.dp_smiles and _appears_in(b.dp_smiles, a.all_smiles):
        return "sequential_ba"

    return "unrelated"
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _build_merge_plan(summaries: List[ReactionSummary]) -> MergePlan:
    """Work out how a set of reactions should be merged into one scheme.

    Steps:
      1. Classify every unordered pair of reactions.
      2. Cluster "parallel" pairs into groups with a union-find.
      3. Turn sequential pair classifications into directed edges
         between groups.
      4. Split the groups that have edges into connected components.
      5. Topologically order each component into one chain.

    The returned ``MergePlan`` holds the parallel groups (lists of
    indices into *summaries*), the chains (lists of group indices) and
    the indices of groups that take part in no sequential link.
    """
    count = len(summaries)
    if count == 1:
        return MergePlan(
            parallel_groups=[[0]], chains=[[0]],
            unrelated_groups=[],
        )

    # 1. Pairwise classification, keyed by (i, j) with i < j.
    pair_kind: Dict[Tuple[int, int], str] = {
        (i, j): _classify_pair(summaries[i], summaries[j])
        for i in range(count)
        for j in range(i + 1, count)
    }

    # 2. Union-find over reactions; only "parallel" pairs are joined.
    parent = list(range(count))

    def _root(x: int) -> int:
        # Path halving: point each visited node at its grandparent.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    for (i, j), kind in pair_kind.items():
        if kind != "parallel":
            continue
        root_i, root_j = _root(i), _root(j)
        if root_i != root_j:
            parent[root_i] = root_j

    members: Dict[int, List[int]] = {}
    for i in range(count):
        members.setdefault(_root(i), []).append(i)
    groups = list(members.values())

    group_of: Dict[int, int] = {
        ri: gi for gi, grp in enumerate(groups) for ri in grp
    }

    # 3. Directed edges between distinct groups.
    group_count = len(groups)
    edges: set = set()
    for (i, j), kind in pair_kind.items():
        gi, gj = group_of[i], group_of[j]
        if gi == gj:
            continue
        if kind == "sequential_ab":
            edges.add((gi, gj))
        elif kind == "sequential_ba":
            edges.add((gj, gi))

    if not edges:
        # No sequential links at all: either one group, or all unrelated.
        if group_count == 1:
            return MergePlan(
                parallel_groups=groups, chains=[[0]],
                unrelated_groups=[],
            )
        return MergePlan(
            parallel_groups=groups, chains=[],
            unrelated_groups=list(range(group_count)),
        )

    # 4. Connected components of the undirected view of the edge set.
    neighbours: Dict[int, set] = {g: set() for g in range(group_count)}
    successors: Dict[int, List[int]] = {g: [] for g in range(group_count)}
    for src, dst in edges:
        neighbours[src].add(dst)
        neighbours[dst].add(src)
        successors[src].append(dst)

    seen: set = set()
    components: List[set] = []
    for start in range(group_count):
        if start in seen or not neighbours[start]:
            continue
        component: set = set()
        frontier = [start]
        while frontier:  # breadth-first walk
            node = frontier.pop(0)
            if node in seen:
                continue
            seen.add(node)
            component.add(node)
            frontier.extend(nb for nb in neighbours[node] if nb not in seen)
        components.append(component)

    # 5. Kahn's algorithm inside each component -> one chain per component.
    chains: List[List[int]] = []
    for component in components:
        pending: Dict[int, int] = {g: 0 for g in component}
        for g in component:
            for nb in successors[g]:
                if nb in component:
                    pending[nb] += 1
        ready = [g for g in component if pending[g] == 0]
        order: List[int] = []
        while ready:
            node = ready.pop(0)
            order.append(node)
            for nb in successors[node]:
                if nb not in component:
                    continue
                pending[nb] -= 1
                if pending[nb] == 0:
                    ready.append(nb)
        if len(order) != len(component):
            # A cycle left some nodes unplaced; fall back to index order.
            order = sorted(component)
        chains.append(order)

    # Groups never reached by the walk have no sequential link.
    unrelated = [g for g in range(group_count) if g not in seen]

    return MergePlan(
        parallel_groups=groups,
        chains=chains,
        unrelated_groups=unrelated,
    )
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
# ---------------------------------------------------------------------------
|
|
819
|
+
# Multi-reaction merge — parallel merge helpers
|
|
820
|
+
# ---------------------------------------------------------------------------
|
|
821
|
+
|
|
822
|
+
def _pick_template(
|
|
823
|
+
summaries: List[ReactionSummary],
|
|
824
|
+
group_indices: List[int],
|
|
825
|
+
) -> int:
|
|
826
|
+
"""Pick the best template reaction for a parallel group.
|
|
827
|
+
|
|
828
|
+
Returns the index (into summaries) of the reaction whose reagent
|
|
829
|
+
SMILES set is shared by the most other reactions in the group.
|
|
830
|
+
This minimizes the number of run-arrow notes needed.
|
|
831
|
+
"""
|
|
832
|
+
if len(group_indices) <= 1:
|
|
833
|
+
return group_indices[0]
|
|
834
|
+
|
|
835
|
+
reagent_sets = {}
|
|
836
|
+
for ri in group_indices:
|
|
837
|
+
s = summaries[ri]
|
|
838
|
+
reagent_sets[ri] = frozenset(s.reagent_smiles.values())
|
|
839
|
+
|
|
840
|
+
best_ri = group_indices[0]
|
|
841
|
+
best_count = 0
|
|
842
|
+
for ri in group_indices:
|
|
843
|
+
count = sum(1 for other in group_indices
|
|
844
|
+
if reagent_sets[other] == reagent_sets[ri])
|
|
845
|
+
if count > best_count:
|
|
846
|
+
best_count = count
|
|
847
|
+
best_ri = ri
|
|
848
|
+
return best_ri
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
def _diff_reagents(
|
|
852
|
+
summaries: List[ReactionSummary],
|
|
853
|
+
group_indices: List[int],
|
|
854
|
+
) -> Tuple[bool, Dict[int, Optional[str]]]:
|
|
855
|
+
"""Compare reagents across parallel reactions against the optimal template.
|
|
856
|
+
|
|
857
|
+
Each run is compared against the template reaction (first in group).
|
|
858
|
+
Notes only show reagents that are in THIS run but NOT in the template
|
|
859
|
+
(i.e. what's different about this particular run). Equiv differences
|
|
860
|
+
for shared reagents are handled by range notation on the main arrow.
|
|
861
|
+
|
|
862
|
+
Returns (all_identical, {reaction_index: note_string_or_None}).
|
|
863
|
+
"""
|
|
864
|
+
if len(group_indices) <= 1:
|
|
865
|
+
return True, {}
|
|
866
|
+
|
|
867
|
+
# Build per-reaction fingerprint: {canonical_smiles: (name, equiv)}
|
|
868
|
+
per_reaction: Dict[int, Dict[str, Tuple[str, str]]] = {}
|
|
869
|
+
for ri in group_indices:
|
|
870
|
+
s = summaries[ri]
|
|
871
|
+
fp: Dict[str, Tuple[str, str]] = {}
|
|
872
|
+
for sp_id, smi in s.reagent_smiles.items():
|
|
873
|
+
equiv = s.reagent_equivs.get(sp_id, "")
|
|
874
|
+
name = s.reagent_names.get(sp_id, "")
|
|
875
|
+
fp[smi] = (name, equiv)
|
|
876
|
+
per_reaction[ri] = fp
|
|
877
|
+
|
|
878
|
+
template_ri = _pick_template(summaries, group_indices)
|
|
879
|
+
template_smiles = set(per_reaction[template_ri].keys())
|
|
880
|
+
|
|
881
|
+
# Check if any run has a different reagent set than the template
|
|
882
|
+
has_differences = False
|
|
883
|
+
notes: Dict[int, Optional[str]] = {}
|
|
884
|
+
for ri in group_indices:
|
|
885
|
+
run_smiles = set(per_reaction[ri].keys())
|
|
886
|
+
# Reagents in this run but NOT in the template
|
|
887
|
+
extra = run_smiles - template_smiles
|
|
888
|
+
if extra:
|
|
889
|
+
has_differences = True
|
|
890
|
+
parts: List[str] = []
|
|
891
|
+
for smi in sorted(extra):
|
|
892
|
+
name, equiv = per_reaction[ri][smi]
|
|
893
|
+
if equiv:
|
|
894
|
+
parts.append(f"{name} ({equiv} eq)")
|
|
895
|
+
else:
|
|
896
|
+
parts.append(name)
|
|
897
|
+
notes[ri] = ", ".join(parts)
|
|
898
|
+
else:
|
|
899
|
+
notes[ri] = None
|
|
900
|
+
|
|
901
|
+
if not has_differences:
|
|
902
|
+
return True, {}
|
|
903
|
+
|
|
904
|
+
return False, notes
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
def _equiv_range(
|
|
908
|
+
summaries: List[ReactionSummary],
|
|
909
|
+
group_indices: List[int],
|
|
910
|
+
smiles: str,
|
|
911
|
+
) -> str:
|
|
912
|
+
"""Compute range notation for equivalents of one reagent across parallel runs.
|
|
913
|
+
|
|
914
|
+
Returns e.g. "1.1\u20131.5" (en-dash) if they differ, single value if same.
|
|
915
|
+
"""
|
|
916
|
+
values: List[float] = []
|
|
917
|
+
for ri in group_indices:
|
|
918
|
+
s = summaries[ri]
|
|
919
|
+
for sp_id, smi in s.reagent_smiles.items():
|
|
920
|
+
if smi == smiles:
|
|
921
|
+
eq = s.reagent_equivs.get(sp_id, "")
|
|
922
|
+
if eq:
|
|
923
|
+
try:
|
|
924
|
+
values.append(float(eq))
|
|
925
|
+
except ValueError:
|
|
926
|
+
pass
|
|
927
|
+
if not values:
|
|
928
|
+
return ""
|
|
929
|
+
unique = sorted(set(values))
|
|
930
|
+
if len(unique) == 1:
|
|
931
|
+
return f"{unique[0]:g}"
|
|
932
|
+
return f"{unique[0]:g}\u2013{unique[-1]:g}"
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
# ---------------------------------------------------------------------------
|
|
936
|
+
# Multi-reaction merge — combined YAML generation
|
|
937
|
+
# ---------------------------------------------------------------------------
|
|
938
|
+
|
|
939
|
+
def _namespace_species_id(reaction_index: int, sp_id: str) -> str:
|
|
940
|
+
"""Prefix a species ID with reaction index to avoid collisions."""
|
|
941
|
+
return f"rxn{reaction_index}_{sp_id}"
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
def _apply_namespace(
|
|
945
|
+
yaml_dict: Dict[str, Any],
|
|
946
|
+
reaction_index: int,
|
|
947
|
+
remap: Dict[str, str],
|
|
948
|
+
) -> Dict[str, Any]:
|
|
949
|
+
"""Namespace all structure IDs in a single-reaction YAML dict.
|
|
950
|
+
|
|
951
|
+
Returns a new dict with namespaced structure keys and step references.
|
|
952
|
+
Applies the remap table for shared intermediates.
|
|
953
|
+
"""
|
|
954
|
+
old_structures = yaml_dict.get("structures", {})
|
|
955
|
+
new_structures: Dict[str, Any] = {}
|
|
956
|
+
id_map: Dict[str, str] = {} # old_id -> final_id
|
|
957
|
+
|
|
958
|
+
for old_id, struct_data in old_structures.items():
|
|
959
|
+
ns_id = _namespace_species_id(reaction_index, old_id)
|
|
960
|
+
final_id = remap.get(ns_id, ns_id)
|
|
961
|
+
id_map[old_id] = final_id
|
|
962
|
+
new_structures[final_id] = struct_data
|
|
963
|
+
|
|
964
|
+
def _remap_ids(id_list: List[str]) -> List[str]:
|
|
965
|
+
return [id_map.get(sid, sid) for sid in id_list]
|
|
966
|
+
|
|
967
|
+
new_steps = []
|
|
968
|
+
for step in yaml_dict.get("steps", []):
|
|
969
|
+
new_step = dict(step)
|
|
970
|
+
if "substrates" in new_step:
|
|
971
|
+
new_step["substrates"] = _remap_ids(new_step["substrates"])
|
|
972
|
+
if "products" in new_step:
|
|
973
|
+
new_step["products"] = _remap_ids(new_step["products"])
|
|
974
|
+
if "above_arrow" in new_step:
|
|
975
|
+
above = dict(new_step["above_arrow"])
|
|
976
|
+
if "structures" in above:
|
|
977
|
+
above["structures"] = _remap_ids(above["structures"])
|
|
978
|
+
new_step["above_arrow"] = above
|
|
979
|
+
new_steps.append(new_step)
|
|
980
|
+
|
|
981
|
+
result = dict(yaml_dict)
|
|
982
|
+
result["structures"] = new_structures
|
|
983
|
+
result["steps"] = new_steps
|
|
984
|
+
return result
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def _build_run_entry_from_eln(
|
|
988
|
+
eln_data: Dict[str, Any],
|
|
989
|
+
allow_partial: bool = False,
|
|
990
|
+
) -> Optional[Dict[str, Any]]:
|
|
991
|
+
"""Build a single run arrow entry dict from ELN data.
|
|
992
|
+
|
|
993
|
+
Parameters
|
|
994
|
+
----------
|
|
995
|
+
allow_partial : bool
|
|
996
|
+
When True, create an entry even if only sm_mass is available
|
|
997
|
+
(output will be empty). Used for merged schemes where every
|
|
998
|
+
reaction should get a run arrow.
|
|
999
|
+
"""
|
|
1000
|
+
sm_mass = eln_data.get("sm_mass", "").strip()
|
|
1001
|
+
product_obtained = eln_data.get("product_obtained", "").strip()
|
|
1002
|
+
product_yield = eln_data.get("product_yield", "").strip()
|
|
1003
|
+
|
|
1004
|
+
if not sm_mass:
|
|
1005
|
+
return None
|
|
1006
|
+
if not product_obtained and not allow_partial:
|
|
1007
|
+
return None
|
|
1008
|
+
|
|
1009
|
+
input_label = sm_mass if _has_unit(sm_mass) else f"{sm_mass} g"
|
|
1010
|
+
|
|
1011
|
+
if product_obtained:
|
|
1012
|
+
obtained_str = (product_obtained if _has_unit(product_obtained)
|
|
1013
|
+
else f"{product_obtained} g")
|
|
1014
|
+
if product_yield:
|
|
1015
|
+
yield_clean = product_yield.rstrip("%").strip()
|
|
1016
|
+
output_label = f"{obtained_str}, {yield_clean}% yield"
|
|
1017
|
+
else:
|
|
1018
|
+
output_label = obtained_str
|
|
1019
|
+
else:
|
|
1020
|
+
output_label = ""
|
|
1021
|
+
|
|
1022
|
+
return {"input": input_label, "output": output_label}
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
def _update_below_arrow_with_ranges(
|
|
1026
|
+
step_dict: Dict[str, Any],
|
|
1027
|
+
summaries: List[ReactionSummary],
|
|
1028
|
+
group_indices: List[int],
|
|
1029
|
+
template: ReactionSummary,
|
|
1030
|
+
) -> None:
|
|
1031
|
+
"""Replace individual equiv values with range notation in below_arrow text.
|
|
1032
|
+
|
|
1033
|
+
For parallel groups, reagents that vary across runs get range notation
|
|
1034
|
+
(e.g., "Cs2CO3 (1.5\u20132.0 eq)").
|
|
1035
|
+
"""
|
|
1036
|
+
below = step_dict.get("below_arrow")
|
|
1037
|
+
if not below:
|
|
1038
|
+
return
|
|
1039
|
+
text_lines = below.get("text", [])
|
|
1040
|
+
if not text_lines:
|
|
1041
|
+
return
|
|
1042
|
+
|
|
1043
|
+
new_lines = []
|
|
1044
|
+
for line in text_lines:
|
|
1045
|
+
updated = False
|
|
1046
|
+
for sp in template.species:
|
|
1047
|
+
name = sp.get("name", "")
|
|
1048
|
+
if not name or name not in line or "eq" not in line:
|
|
1049
|
+
continue
|
|
1050
|
+
smi = _canonicalize(sp.get("smiles", ""))
|
|
1051
|
+
if not smi:
|
|
1052
|
+
continue
|
|
1053
|
+
range_str = _equiv_range(summaries, group_indices, smi)
|
|
1054
|
+
if range_str and "\u2013" in range_str:
|
|
1055
|
+
new_line = re.sub(
|
|
1056
|
+
r"\([^)]*eq\)", f"({range_str} eq)", line)
|
|
1057
|
+
new_lines.append(new_line)
|
|
1058
|
+
updated = True
|
|
1059
|
+
break
|
|
1060
|
+
if not updated:
|
|
1061
|
+
new_lines.append(line)
|
|
1062
|
+
|
|
1063
|
+
below["text"] = new_lines
|
|
1064
|
+
|
|
1065
|
+
|
|
1066
|
+
def _update_above_arrow_with_ranges(
|
|
1067
|
+
step_dict: Dict[str, Any],
|
|
1068
|
+
summaries: List[ReactionSummary],
|
|
1069
|
+
group_indices: List[int],
|
|
1070
|
+
template: ReactionSummary,
|
|
1071
|
+
) -> None:
|
|
1072
|
+
"""Replace equiv values in above_arrow text with range notation."""
|
|
1073
|
+
above = step_dict.get("above_arrow")
|
|
1074
|
+
if not above:
|
|
1075
|
+
return
|
|
1076
|
+
text_lines = above.get("text", [])
|
|
1077
|
+
if not text_lines:
|
|
1078
|
+
return
|
|
1079
|
+
|
|
1080
|
+
# Above-arrow text entries are typically "(X eq)" for each above structure.
|
|
1081
|
+
# Find the corresponding species by position.
|
|
1082
|
+
above_structs = above.get("structures", [])
|
|
1083
|
+
new_lines = []
|
|
1084
|
+
for i, line in enumerate(text_lines):
|
|
1085
|
+
if "eq" not in line:
|
|
1086
|
+
new_lines.append(line)
|
|
1087
|
+
continue
|
|
1088
|
+
# Find the SMILES for the i-th above structure
|
|
1089
|
+
if i < len(above_structs):
|
|
1090
|
+
sid = above_structs[i]
|
|
1091
|
+
sp = next((s for s in template.species if s.get("id") == sid), None)
|
|
1092
|
+
if sp:
|
|
1093
|
+
smi = _canonicalize(sp.get("smiles", ""))
|
|
1094
|
+
if smi:
|
|
1095
|
+
range_str = _equiv_range(summaries, group_indices, smi)
|
|
1096
|
+
if range_str and "\u2013" in range_str:
|
|
1097
|
+
new_lines.append(f"({range_str} eq)")
|
|
1098
|
+
continue
|
|
1099
|
+
new_lines.append(line)
|
|
1100
|
+
|
|
1101
|
+
above["text"] = new_lines
|
|
1102
|
+
|
|
1103
|
+
|
|
1104
|
+
def build_merged_scheme_yaml_dict(
    json_paths: List[str],
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> Dict[str, Any]:
    """Build a combined YAML dict from multiple reaction JSONs.

    Detects parallel reactions (same SM + DP + shared reagents) and
    sequential chains (product of A = SM of B), and produces a merged
    scheme.

    Parameters
    ----------
    json_paths : list of str
        Reaction-parser JSON files, one per reaction.
    layout : str
        "auto" picks "stacked-rows" when there is more than one section
        (chain or unrelated group), "sequential" for a single multi-step
        chain and "linear" otherwise.  More than one section always
        yields the stacked-rows form, whatever value is passed.
    include_run_arrows : bool
        When True, a ``run_arrows`` list is added for groups that have
        usable ELN data.
    use_eln_labels : bool
        When True, products are labelled with the merged ELN experiment
        names instead of a running number.

    Returns
    -------
    dict
        ``structures`` plus either ``steps`` or ``sections``, ``layout``,
        and optionally ``wrap`` and ``run_arrows``.
    """
    all_summaries = [_extract_reaction_summary(i, p)
                     for i, p in enumerate(json_paths)]

    # Filter out degenerate reactions (SM == DP, e.g. solubility tests,
    # control experiments).  These have no meaningful reaction to display.
    summaries = [s for s in all_summaries
                 if not (s.sm_smiles and s.dp_smiles
                         and s.sm_smiles == s.dp_smiles)]

    if not summaries:
        summaries = all_summaries  # fallback: don't filter everything out

    plan = _build_merge_plan(summaries)

    # --- Determine shared intermediates (per chain) ---
    remap: Dict[str, str] = {}
    for chain in plan.chains:
        for gi_a, gi_b in zip(chain, chain[1:]):
            # Use the same representative that _build_group uses to
            # namespace the group's structure IDs.  Taking the first
            # member of the group instead would produce remap keys that
            # never match whenever the template is a different member.
            ri_a = _pick_template(summaries, plan.parallel_groups[gi_a])
            ri_b = _pick_template(summaries, plan.parallel_groups[gi_b])
            sa, sb = summaries[ri_a], summaries[ri_b]

            # Find the DP species in A that links to B
            dp_id_a = next(
                (sp["id"] for sp in sa.species
                 if (sp.get("is_dp") or sp.get("role") == "product")
                 and _canonicalize(sp.get("smiles", "")) == sa.dp_smiles),
                None,
            )
            # Find the SM species in B that matches A's product.
            # Try direct SM match first; if SM SMILES is unresolved, find
            # any species in B whose SMILES equals A's DP.
            sm_id_b = next(
                (sp["id"] for sp in sb.species
                 if sp.get("is_sm")
                 and _canonicalize(sp.get("smiles", "")) == sa.dp_smiles),
                None,
            )
            if sm_id_b is None:
                # Fallback: any species in B with matching SMILES
                sm_id_b = next(
                    (sp["id"] for sp in sb.species
                     if _canonicalize(sp.get("smiles", "")) == sa.dp_smiles),
                    None,
                )
            if dp_id_a and sm_id_b:
                canonical = _namespace_species_id(ri_a, dp_id_a)
                replaced = _namespace_species_id(ri_b, sm_id_b)
                remap[replaced] = canonical

    # --- Build per-group YAML dicts ---
    def _build_group(
        group_indices: List[int],
        step_number: int,
        label_start: int,
    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]], int]:
        """Build YAML structures + step(s) for one parallel group.

        ``step_number`` is accepted for symmetry with
        ``_build_group_run_arrows`` but is not used here.

        Returns (structures_dict, [step_dict], next_label).
        """
        template_ri = _pick_template(summaries, group_indices)
        template = summaries[template_ri]

        # ELN label for products (when use_eln_labels is enabled)
        plabel = None
        if use_eln_labels:
            exps = [summaries[ri].experiment for ri in group_indices]
            plabel = _merge_eln_labels(exps)

        # Build single-reaction dict using existing logic
        single = _build_yaml_dict(
            template.species, template.conditions, template.eln_data,
            layout="linear", include_run_arrows=False,
            product_label=plabel,
        )

        # Range notation for parallel groups -- apply BEFORE namespacing
        # so that species IDs in above_arrow.structures match
        # template.species
        if len(group_indices) > 1:
            _update_below_arrow_with_ranges(
                single["steps"][0], summaries, group_indices, template)
            _update_above_arrow_with_ranges(
                single["steps"][0], summaries, group_indices, template)

        # Namespace IDs
        ns = _apply_namespace(single, template_ri, remap)
        structures = ns["structures"]
        step = ns["steps"][0]

        # Relabel: skip IDs already in all_structures (shared
        # intermediates get their label from the group that first
        # produced them).
        if use_eln_labels:
            # ELN mode: labels already set by _build_yaml_dict
            for _sid in list(structures.keys()):
                if _sid in all_structures:
                    del structures[_sid]
        else:
            # Default mode: relabel with global counter
            label_counter = label_start
            for _sid in list(structures.keys()):
                if _sid in all_structures:
                    del structures[_sid]
                    continue
                entry = structures[_sid]
                if "label" in entry:
                    entry["label"] = str(label_counter)
                    label_counter += 1
            label_start = label_counter

        return structures, [step], label_start

    def _build_group_run_arrows(
        group_indices: List[int],
        step_number: int,
        include: bool,
    ) -> Optional[Dict[str, Any]]:
        """Build run_arrows entry for one parallel group.

        Every reaction in the group gets a run arrow, even if the ELN
        data only has sm_mass (no product_obtained).  This ensures all
        runs are visible, with deviation notes shown where applicable.
        """
        if not include:
            return None

        all_identical, notes = _diff_reagents(summaries, group_indices)
        runs: List[Dict[str, Any]] = []
        for ri in group_indices:
            entry = _build_run_entry_from_eln(
                summaries[ri].eln_data, allow_partial=True)
            if entry:
                if not all_identical and notes.get(ri):
                    entry["note"] = notes[ri]
                runs.append(entry)

        if runs:
            return {"step": step_number, "runs": runs}
        return None

    # --- Determine overall layout ---
    num_chains = len(plan.chains)
    num_unrelated = len(plan.unrelated_groups)
    num_sections = num_chains + num_unrelated

    if layout == "auto":
        if num_sections > 1:
            layout = "stacked-rows"
        elif num_chains == 1 and len(plan.chains[0]) > 1:
            layout = "sequential"
        else:
            layout = "linear"

    # --- Assemble ---
    all_structures: Dict[str, Any] = {}
    run_arrows_list: List[Dict[str, Any]] = []

    if layout == "stacked-rows" or num_sections > 1:
        # Each chain becomes a section; each unrelated group becomes a
        # section
        sections: List[Dict[str, Any]] = []
        label_counter = 1
        global_step = 1

        for chain in plan.chains:
            chain_steps: List[Dict[str, Any]] = []
            for gi in chain:
                grp = plan.parallel_groups[gi]
                structs, steps, label_counter = _build_group(
                    grp, global_step, label_counter)
                valid_steps = [s for s in steps
                               if s.get("substrates") or s.get("products")]
                if not valid_steps:
                    continue
                all_structures.update(structs)
                chain_steps.extend(valid_steps)
                ra = _build_group_run_arrows(
                    grp, global_step, include_run_arrows)
                if ra:
                    run_arrows_list.append(ra)
                global_step += 1
            if chain_steps:
                sec: Dict[str, Any] = {"steps": chain_steps}
                if len(chain_steps) > 1:
                    sec["layout"] = "sequential"
                sections.append(sec)

        # Each unrelated group as its own section
        for gi in plan.unrelated_groups:
            grp = plan.parallel_groups[gi]
            structs, steps, label_counter = _build_group(
                grp, global_step, label_counter)
            valid_steps = [s for s in steps
                           if s.get("substrates") or s.get("products")]
            if not valid_steps:
                continue
            all_structures.update(structs)
            sec = {"steps": valid_steps}
            ra = _build_group_run_arrows(grp, global_step, include_run_arrows)
            if ra:
                run_arrows_list.append(ra)
            global_step += 1
            sections.append(sec)

        # If only 1 section survived, collapse to flat sequential layout
        if len(sections) == 1:
            flat_steps = sections[0].get("steps", [])
            flat_layout = "sequential" if len(flat_steps) > 1 else "linear"
            yaml_dict: Dict[str, Any] = {
                "structures": all_structures,
                "steps": flat_steps,
                "layout": flat_layout,
            }
        else:
            yaml_dict = {
                "structures": all_structures,
                "sections": sections,
                "layout": "stacked-rows",
            }
    else:
        # Linear or sequential: single chain, flat steps list
        all_steps: List[Dict[str, Any]] = []
        label_counter = 1
        step_num = 1
        # Use the single chain if available, otherwise unrelated groups
        group_order = plan.chains[0] if plan.chains else plan.unrelated_groups
        for gi in group_order:
            grp = plan.parallel_groups[gi]
            structs, steps, label_counter = _build_group(
                grp, step_num, label_counter)
            valid_steps = [s for s in steps
                           if s.get("substrates") or s.get("products")]
            if not valid_steps:
                continue
            all_structures.update(structs)
            all_steps.extend(valid_steps)
            ra = _build_group_run_arrows(grp, step_num, include_run_arrows)
            if ra:
                run_arrows_list.append(ra)
            step_num += 1

        yaml_dict = {
            "structures": all_structures,
            "steps": all_steps,
            "layout": layout,
        }

    # Prevent auto-wrapping; merged schemes should render as-is
    if yaml_dict.get("layout") == "sequential":
        yaml_dict["wrap"] = "none"

    if run_arrows_list:
        yaml_dict["run_arrows"] = run_arrows_list

    return yaml_dict
|
|
1369
|
+
|
|
1370
|
+
|
|
1371
|
+
def write_merged_scheme_yaml(
    json_paths: List[str],
    output_path: str,
    layout: str = "auto",
    include_run_arrows: bool = True,
    use_eln_labels: bool = False,
) -> str:
    """Merge several reaction JSONs into one scheme and write it to disk.

    The merged dict comes from ``build_merged_scheme_yaml_dict`` (all
    options are passed through unchanged) and is written to
    *output_path* by ``_write_yaml_file``.

    Returns the absolute path of the written file.
    """
    options = {
        "layout": layout,
        "include_run_arrows": include_run_arrows,
        "use_eln_labels": use_eln_labels,
    }
    merged = build_merged_scheme_yaml_dict(json_paths, **options)
    _write_yaml_file(merged, output_path)
    return os.path.abspath(output_path)
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
# ---------------------------------------------------------------------------
|
|
1391
|
+
# YAML output
|
|
1392
|
+
# ---------------------------------------------------------------------------
|
|
1393
|
+
|
|
1394
|
+
def _write_yaml_file(data: Dict[str, Any], path: str) -> None:
    """Serialise *data* to *path* as UTF-8 text.

    When the module-level ``yaml`` name is not None the dict is dumped
    with PyYAML in block style, keeping key order and unicode characters.
    Otherwise the dict is written as indented JSON, which YAML parsers
    can read as well.
    """
    with open(path, "w", encoding="utf-8") as handle:
        if yaml is None:
            # Fallback: JSON under a .yaml name (JSON is valid YAML).
            json.dump(data, handle, indent=2, ensure_ascii=False)
        else:
            yaml.dump(data, handle, default_flow_style=False,
                      allow_unicode=True, sort_keys=False)
|
|
1407
|
+
|
|
1408
|
+
|
|
1409
|
+
# ---------------------------------------------------------------------------
|
|
1410
|
+
# CLI
|
|
1411
|
+
# ---------------------------------------------------------------------------
|
|
1412
|
+
|
|
1413
|
+
def _default_scheme_path(json_path: str) -> str:
    """Return ``<stem>-scheme.yaml`` located in the same directory as *json_path*."""
    stem = os.path.splitext(os.path.basename(json_path))[0]
    # A bare filename has an empty dirname; "." keeps the result explicitly
    # relative to the current directory.
    return os.path.join(os.path.dirname(json_path) or ".", f"{stem}-scheme.yaml")


def main(argv=None):
    """Run the scheme-YAML command-line interface.

    One input: a single YAML is written to ``-o`` if given, otherwise to
    ``<stem>-scheme.yaml`` beside the input, and its path is printed on
    stdout.

    Several inputs: one ``<stem>-scheme.yaml`` is written beside each input;
    then, unless ``--no-merge`` is set, a merged YAML is written to ``-o``
    (default: ``merged-scheme.yaml`` beside the first input) and its path is
    printed on stdout.

    ``-v`` adds progress lines on stderr.

    Args:
        argv: Argument strings to parse.  ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, so ``main()`` keeps working as a
            script / console entry point while ``main([...])`` allows
            programmatic use.

    Exits with status 1 as soon as an input path is found not to exist.
    """
    parser = argparse.ArgumentParser(
        description="Generate scheme YAML from one or more reaction_parser JSON files.",
    )
    parser.add_argument("json_paths", nargs="+",
                        help="One or more reaction parser JSON files")
    parser.add_argument("-o", "--output", default=None,
                        help="Output YAML path (default: auto-generated)")
    parser.add_argument("--layout", default="auto",
                        help="Layout: linear, sequential, stacked-rows, auto")
    parser.add_argument("--no-run-arrows", action="store_true",
                        help="Suppress run arrows")
    parser.add_argument("--no-merge", action="store_true",
                        help="Process each JSON individually (skip merge)")
    parser.add_argument("--eln-labels", action="store_true",
                        help="Label products with ELN experiment names "
                             "instead of sequential numbers")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)

    # Validate every input up front so no output is written on a bad call.
    for jp in args.json_paths:
        if not os.path.exists(jp):
            print(f"Error: {jp} not found", file=sys.stderr)
            sys.exit(1)

    # Options shared by every writer call below.
    writer_options = {
        "layout": args.layout,
        "include_run_arrows": not args.no_run_arrows,
        "use_eln_labels": args.eln_labels,
    }

    if len(args.json_paths) == 1:
        # Single input: one YAML, honouring -o when supplied.
        jp = args.json_paths[0]
        output = args.output if args.output is not None else _default_scheme_path(jp)
        result = write_scheme_yaml(jp, output, **writer_options)
        if args.verbose:
            print(f"Written: {result}", file=sys.stderr)
        print(result)
        return

    # Multiple inputs: an individual YAML next to each JSON first.
    for jp in args.json_paths:
        result = write_scheme_yaml(jp, _default_scheme_path(jp), **writer_options)
        if args.verbose:
            print(f"Individual: {result}", file=sys.stderr)

    if args.no_merge:
        # NOTE(review): in this mode -o is not used and nothing is printed on
        # stdout (individual paths only appear on stderr with -v) -- confirm
        # this is intended.
        return

    output = args.output
    if output is None:
        output = os.path.join(
            os.path.dirname(args.json_paths[0]) or ".", "merged-scheme.yaml")
    merged = write_merged_scheme_yaml(args.json_paths, output, **writer_options)
    if args.verbose:
        print(f"Merged: {merged}", file=sys.stderr)
    print(merged)
|
|
1484
|
+
|
|
1485
|
+
|
|
1486
|
+
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|