cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,706 @@
|
|
|
1
|
+
"""
|
|
2
|
+
parser.py — Parse YAML scheme files into SchemeDescriptor dataclasses.
|
|
3
|
+
|
|
4
|
+
Validates structure references, layout keywords, and produces clear error
|
|
5
|
+
messages for malformed input.
|
|
6
|
+
|
|
7
|
+
A ``_normalize_scheme_data`` preprocessing pass converts common LLM-generated
|
|
8
|
+
patterns (inline structures, ``reagents`` key, ``species`` alias, bare SMILES
|
|
9
|
+
refs, etc.) into the canonical format before the main parsing logic runs.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
import re
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, List, Optional, Union
|
|
18
|
+
|
|
19
|
+
import yaml
|
|
20
|
+
|
|
21
|
+
from .schema import (
|
|
22
|
+
ArrowContent,
|
|
23
|
+
RunArrowEntry,
|
|
24
|
+
SchemeDescriptor,
|
|
25
|
+
SectionDescriptor,
|
|
26
|
+
StepDescriptor,
|
|
27
|
+
StepRunArrows,
|
|
28
|
+
StructureRef,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Keywords accepted for the scheme-level ``layout`` and ``wrap`` fields and
# for the per-step ``arrow_style`` field.
VALID_LAYOUTS = {
    "linear", "sequential", "divergent", "stacked-rows",
    "numbered-parallel", "convergent",
}
VALID_WRAPS = {"repeat", "serpentine", "none"}
VALID_ARROW_STYLES = {"solid", "dashed", "failed"}

# Unambiguous SMILES syntax characters (never appear in plain identifiers)
_SMILES_SYNTAX_RE = re.compile(r'[=()\[\]#@\\\/]')

# Letters that appear in English words / abbreviations but are NOT valid
# SMILES atom symbols or bond/ring characters.
#   SMILES-valid letters:  B C F H I K L M N O P R S V  (uppercase)
#                          b c n o p s                  (aromatic lowercase)
#                          r l                          (part of Br, Cl)
#   "Word-only" letters:   A D E G J Q T U W X Y Z      (uppercase)
#                          a d e f g h i j k m q t u v w x y z  (lowercase,
#                              i.e. everything except c n o p s b r l)
# The lowercase class includes ``f`` so that it matches the list above;
# without it, words such as "for" pass the word-letter filter.
_WORD_ONLY_LETTER_RE = re.compile(r'[ADEGJQTUWXYZadefghijkmqtuvwxyz]')

# Pure uppercase organic-element chains of 3+ chars with no underscores,
# hyphens, or digits (e.g. "CCO", "CCCC", "COC", "CCOCC")
_ORGANIC_CHAIN_RE = re.compile(r'^[BCFIKLMNOPRSV]{3,}$')
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class SchemeParseError(Exception):
    """Signals that scheme YAML is malformed or breaks a schema rule."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def parse_yaml(source: Union[str, Path]) -> SchemeDescriptor:
    """
    Build a SchemeDescriptor from a YAML file or a YAML string.

    Parameters
    ----------
    source : str or Path
        A Path, or a string ending in .yaml/.yml, is read from disk.
        Any other string is taken to be the YAML text itself.

    Returns
    -------
    SchemeDescriptor

    Raises
    ------
    SchemeParseError
        When the YAML cannot be parsed or fails validation.
    """
    raw_text = _load_yaml_text(source)
    try:
        loaded = yaml.safe_load(raw_text)
    except yaml.YAMLError as exc:
        raise SchemeParseError(f"Invalid YAML syntax: {exc}") from exc

    if not isinstance(loaded, dict):
        raise SchemeParseError("Top-level YAML must be a mapping (dict)")

    # The scheme may sit at the top level or be nested under 'scheme:'.
    nested = loaded.get("scheme")
    scheme_data = nested if isinstance(nested, dict) else loaded

    # LLM-friendly shorthand is rewritten to canonical form before parsing.
    return _parse_scheme(_normalize_scheme_data(scheme_data))
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
# Normalization helpers
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
def _smiles_id(smiles: str, existing: Dict[str, Any], counter: List[int]) -> str:
    """
    Return a deterministic, collision-safe structure ID for a SMILES string.

    The ID is ``s_`` plus the first 8 hex chars of the SHA-1 of the SMILES.
    If that key already exists in *existing* for the same SMILES it is
    reused.  If it exists for a different SMILES (hash collision), a
    sequential ``struct_N`` name is returned instead.

    Parameters
    ----------
    smiles : str
        SMILES text to derive the ID from.
    existing : dict
        Already-registered structures; values are either a mapping with a
        ``smiles`` key or a bare SMILES string.  Not modified here.
    counter : list of int
        One-element mutable counter supplying ``N`` for ``struct_N``.  It is
        advanced past every candidate that is tried.

    Returns
    -------
    str
        A key that is either unused in *existing* or already holds *smiles*.
    """
    token = "s_" + hashlib.sha1(smiles.encode()).hexdigest()[:8]
    if token not in existing:
        return token

    # Same SMILES seen before: hand back the key it was registered under.
    entry = existing[token]
    existing_smiles = entry.get("smiles") if isinstance(entry, dict) else entry
    if existing_smiles == smiles:
        return token

    # True collision: fall back to a sequential name.  Skip names that are
    # already taken (``struct_N`` keys are also produced elsewhere in this
    # module), otherwise the fallback would alias an unrelated structure.
    while True:
        candidate = f"struct_{counter[0]}"
        counter[0] += 1
        if candidate not in existing:
            return candidate
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# A single SMILES bracket atom: optional isotope, an element symbol, then any
# mix of chirality marks, hydrogen count, charge and atom-class digits.
_BRACKET_ATOM_RE = re.compile(r'\[\d*[A-Za-z][a-z]?[@H\d+\-:]*\]')


def _looks_like_smiles(s: str) -> bool:
    """
    Return True if *s* looks like a SMILES string rather than a structure ID.

    Rejection rules, applied first:

    * ``_`` or a space anywhere in *s*;
    * ``-`` when *s* has none of the unambiguous SMILES syntax characters;
    * any letter matched by ``_WORD_ONLY_LETTER_RE`` outside a bracket atom.
      Bracket atoms (``[Na+]``, ``[Si]``, ``[Pd]``, ``[C@@H]``) are removed
      before this test because a bracket may hold any element symbol, so
      the letters inside it say nothing about whether *s* is a word.

    Positive evidence, any one of which is sufficient:

    1. *s* is a single character from ``BCFIKLMNOPRSV``.
    2. *s* contains unambiguous SMILES syntax: ``=``, ``(``, ``)``, ``[``,
       ``]``, ``#``, ``@``, ``\\``, ``/``.
    3. *s* contains a lowercase letter (aromatic atom or the second letter
       of a two-character element).
    4. *s* is an uppercase chain of 3+ characters matched by
       ``_ORGANIC_CHAIN_RE`` that contains ``C`` (``CCO``, ``COC``).
    """
    # Identifier punctuation.
    if "_" in s or " " in s:
        return False
    has_syntax = _SMILES_SYNTAX_RE.search(s) is not None
    if "-" in s and not has_syntax:
        return False

    # Word-only letters outside bracket atoms mark a plain name/abbreviation.
    outside_brackets = _BRACKET_ATOM_RE.sub("", s)
    if _WORD_ONLY_LETTER_RE.search(outside_brackets):
        return False

    if len(s) == 1 and s in "BCFIKLMNOPRSV":
        return True  # single organic element
    if has_syntax:
        return True
    # Only SMILES-compatible characters remain.  Require a lowercase letter
    # or a pure uppercase chain of length >= 3 containing a carbon.
    if re.search(r'[a-z]', s):
        return True
    return bool(_ORGANIC_CHAIN_RE.match(s)) and "C" in s
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _normalize_entry_list(
    entries: Any,
    structures: Dict[str, Any],
    counter: List[int],
) -> List[str]:
    """
    Normalise a substrate/product/above-arrow structures list.

    Each entry may be:
      - A plain scalar: converted with ``str`` and kept as a reference
        (it may be an existing ID or bare SMILES).
      - A dict describing a structure inline.  Its ID is the ``id`` field if
        given, else an ID derived from ``smiles``, else ``name``, else a
        fresh ``struct_N``.  The ``smiles``, ``name``, ``label``, ``file``
        and ``cdxml_id`` fields are registered in *structures* under that ID
        unless the ID is already present.

    A dict that carries nothing to register (for example only ``id``) yields
    just the reference.  It is not stored as a string value, because a
    string value in *structures* is read as a SMILES by ``_parse_structure``.

    Parameters
    ----------
    entries : list, scalar, dict or None
        A single entry is treated as a one-element list; None as empty.
    structures : dict
        Structure registry; updated in place.
    counter : list of int
        One-element mutable counter used for ``struct_N`` names.

    Returns
    -------
    list of str
        One structure ID per entry.
    """
    if entries is None:
        return []
    if not isinstance(entries, list):
        entries = [entries]

    result: List[str] = []
    for entry in entries:
        if not isinstance(entry, dict):
            result.append(str(entry))
            continue

        smiles = entry.get("smiles")
        name = entry.get("name")
        sid = entry.get("id")
        if sid:
            sid = str(sid)
        elif smiles:
            sid = _smiles_id(smiles, structures, counter)
        elif name:
            sid = str(name)
        else:
            # Nothing to name it by: take the next unused sequential name so
            # the entry cannot alias a structure registered earlier.
            while True:
                sid = f"struct_{counter[0]}"
                counter[0] += 1
                if sid not in structures:
                    break

        if sid not in structures:
            struct_def: Dict[str, Any] = {}
            for field in ("smiles", "name", "label", "file", "cdxml_id"):
                value = entry.get(field)
                if value is not None and value != "":
                    struct_def[field] = value
            if struct_def:
                structures[sid] = struct_def
        result.append(sid)

    return result
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _extend_arrow(
    step: Dict[str, Any],
    arrow_key: str,
    field: str,
    values: List[Any],
) -> None:
    """Append *values* to ``step[arrow_key][field]``, keeping existing content."""
    arrow = step.get(arrow_key)
    if arrow is None:
        arrow = step[arrow_key] = {}
    elif isinstance(arrow, str):
        # A bare string is accepted by _parse_arrow_content as one text line.
        arrow = step[arrow_key] = {"text": [arrow]}
    elif isinstance(arrow, list):
        # The list form is merged item by item in _parse_arrow_content, so
        # one more mapping item carries the new values.
        arrow.append({field: list(values)})
        return
    elif not isinstance(arrow, dict):
        return  # invalid type: left as-is for _parse_arrow_content to reject

    current = arrow.get(field)
    if current is None:
        current = arrow[field] = []
    elif not isinstance(current, list):
        current = arrow[field] = [current]
    current.extend(values)


def _normalize_ref_list(
    entries: Any,
    structures: Dict[str, Any],
    counter: List[int],
) -> List[str]:
    """Register inline dicts, then bare SMILES strings, and return the IDs."""
    ids = _normalize_entry_list(entries, structures, counter)
    return _register_bare_smiles(ids, structures, counter)


def _normalize_step(
    step: Any,
    structures: Dict[str, Any],
    counter: List[int],
) -> Any:
    """Return a normalised copy of one step; non-mappings pass through."""
    if not isinstance(step, dict):
        return step
    step = dict(step)  # shallow copy so we can mutate

    # 5. ``reactants`` alias for ``substrates``
    if "reactants" in step and "substrates" not in step:
        step["substrates"] = step.pop("reactants")

    # 3 & 4. Inline / bare-SMILES substrates and products
    for side in ("substrates", "products"):
        if side in step:
            step[side] = _normalize_ref_list(step[side], structures, counter)

    # 6. ``text`` as string, and 3. inline structs inside arrow content
    for arrow_key in ("above_arrow", "below_arrow"):
        arrow = step.get(arrow_key)
        if not isinstance(arrow, dict):
            continue
        arrow = dict(arrow)
        if isinstance(arrow.get("text"), str):
            arrow["text"] = [arrow["text"]]
        if "structures" in arrow:
            arrow["structures"] = _normalize_ref_list(
                arrow["structures"], structures, counter
            )
        step[arrow_key] = arrow

    # 7. ``reagents`` key: distribute into above_arrow / below_arrow
    if "reagents" in step:
        reagents = step.pop("reagents")
        if reagents is None:
            reagents = []
        elif not isinstance(reagents, list):
            reagents = [reagents]
        for reagent in reagents:
            if not isinstance(reagent, dict):
                # Plain string reagent: below_arrow text
                _extend_arrow(step, "below_arrow", "text", [str(reagent)])
                continue
            reg_ids = _normalize_ref_list([reagent], structures, counter)
            if reagent.get("above_arrow", False):
                _extend_arrow(step, "above_arrow", "structures", reg_ids)
                continue
            # Rendered as text using the display name, else the ID
            labels = []
            for rid in reg_ids:
                entry = structures.get(rid, {})
                name = entry.get("name") if isinstance(entry, dict) else None
                labels.append(name or rid)
            _extend_arrow(step, "below_arrow", "text", labels)

    return step


def _normalize_scheme_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalise LLM-friendly YAML patterns into the canonical form expected by
    ``_parse_scheme``.

    The input is deep-copied first; the copy is modified and returned, so
    *data* itself is left untouched.  The function is intended to be
    idempotent on already-canonical input.

    Changes applied
    ---------------
    1. Top-level ``species`` (or ``substrates``) is renamed to ``structures``
       when ``structures`` is absent.
    2. ``structures`` given as a list is converted to a keyed mapping; a
       null ``structures`` becomes an empty mapping.
    3. Inline structure dicts inside ``substrates``/``products``/
       arrow ``structures`` are auto-registered and replaced with IDs.
    4. Bare SMILES strings in those lists are auto-registered.
    5. ``reactants`` is accepted as an alias for ``substrates`` inside steps.
    6. ``text`` scalar is wrapped in a list inside ``above_arrow``/``below_arrow``.
    7. ``reagents`` inside a step is distributed into ``above_arrow``
       structures (reagent dicts with a truthy ``above_arrow`` flag) or
       ``below_arrow`` text (everything else).  Existing arrow content given
       as a string or list is kept and extended rather than causing the
       reagents to be dropped.
    8. Steps inside ``sections`` get the same treatment and share one
       ``struct_N`` counter with the top-level steps.

    If ``structures`` is neither a mapping, a list nor null, the steps are
    left untouched so that ``_parse_scheme`` can report the type error.
    """
    import copy
    data = copy.deepcopy(data)

    # 1. Accept ``species`` / ``substrates`` as alias for ``structures``
    for alias in ("species", "substrates"):
        if alias in data and "structures" not in data:
            data["structures"] = data.pop(alias)
            break

    # 2. Accept ``structures`` as a list (convert to keyed mapping)
    raw_structs = data.get("structures")
    if isinstance(raw_structs, list):
        converted: Dict[str, Any] = {}
        for idx, item in enumerate(raw_structs):
            if isinstance(item, dict):
                key = str(item.get("id", f"struct_{idx}"))
                # Drop the redundant 'id' key from the stored value.
                val = {k: v for k, v in item.items() if k != "id"}
                converted[key] = val if val else item.get("smiles", f"struct_{idx}")
            else:
                converted[f"struct_{idx}"] = str(item)
        data["structures"] = converted

    # ``structures:`` with no value loads as None; membership tests and
    # registration below need a real mapping.
    if data.get("structures") is None:
        data["structures"] = {}
    structures = data["structures"]
    if not isinstance(structures, dict):
        return data

    counter = [0]  # mutable counter shared across steps and sections

    raw_steps = data.get("steps", [])
    if not isinstance(raw_steps, list):
        raw_steps = []
    data["steps"] = [_normalize_step(s, structures, counter) for s in raw_steps]

    # Steps inside sections register into the same mapping with the same
    # counter, so generated names stay unique across the whole scheme.
    raw_sections = data.get("sections", [])
    if isinstance(raw_sections, list):
        for sec in raw_sections:
            if isinstance(sec, dict) and isinstance(sec.get("steps"), list):
                sec["steps"] = [
                    _normalize_step(s, structures, counter) for s in sec["steps"]
                ]

    return data
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _register_bare_smiles(
    ids: List[str],
    structures: Dict[str, Any],
    counter: List[int],
) -> List[str]:
    """
    Replace bare SMILES strings in *ids* with registered structure IDs.

    A string that is not already a key of *structures* and that
    ``_looks_like_smiles`` accepts is given an ID by ``_smiles_id`` and
    stored as ``{"smiles": ...}``.  When ``_smiles_id`` returns a key that
    is already registered (the same SMILES was seen before), the existing
    entry is kept as it is, so a name or label recorded earlier is not lost.
    Every other item is passed through unchanged.

    Parameters
    ----------
    ids : list of str
        References to examine.
    structures : dict
        Structure registry; updated in place.
    counter : list of int
        One-element mutable counter forwarded to ``_smiles_id``.

    Returns
    -------
    list of str
        *ids* with each bare SMILES replaced by its structure ID.
    """
    result: List[str] = []
    for sid in ids:
        is_new_smiles = (
            isinstance(sid, str)
            and sid not in structures
            and _looks_like_smiles(sid)
        )
        if not is_new_smiles:
            result.append(sid)
            continue
        new_id = _smiles_id(sid, structures, counter)
        structures.setdefault(new_id, {"smiles": sid})
        result.append(new_id)
    return result
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _load_yaml_text(source: Union[str, Path]) -> str:
    """Load YAML text from a file path or return the raw string.

    A ``Path`` is always read from disk (UTF-8).  A string is treated as a
    path only when it is a single line ending in ``.yaml`` or ``.yml``.
    Multi-line text is always returned as-is, so a YAML document whose last
    value happens to end in ``.yaml`` is not mistaken for a file name.

    Raises whatever ``Path.read_text`` raises when the file cannot be read.
    """
    if isinstance(source, Path):
        return source.read_text(encoding="utf-8")
    if (
        isinstance(source, str)
        and "\n" not in source
        and "\r" not in source
        and source.endswith((".yaml", ".yml"))
    ):
        return Path(source).read_text(encoding="utf-8")
    return source
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _parse_scheme(data: Dict[str, Any]) -> SchemeDescriptor:
    """Parse the scheme-level dict into a SchemeDescriptor.

    Reads ``source``, ``structures``, ``sections``, ``steps``, ``layout``,
    ``wrap``, ``steps_per_row``, ``title``, ``run_arrows`` and
    ``condition_key`` from *data*.  A null value for ``structures``,
    ``sections``, ``steps`` or ``run_arrows`` is treated as empty.

    When no ``source`` is given, every structure reference used by a step is
    passed through ``_validate_refs``, which adds undeclared references to
    the structures mapping as bare entries.

    Raises
    ------
    SchemeParseError
        If a field has the wrong type, ``layout``/``wrap`` is not a known
        keyword, ``steps_per_row`` is not an integer >= 1, or there are
        neither steps nor sections.
    """
    # --- Source (reaction_parser JSON) ---
    source = data.get("source")
    if source is not None:
        source = str(source)

    # --- Structures ---
    raw_structs = data.get("structures", {})
    if raw_structs is None:
        raw_structs = {}
    if not isinstance(raw_structs, dict):
        raise SchemeParseError("'structures' must be a mapping")
    structures = {}
    for key, val in raw_structs.items():
        key = str(key)
        structures[key] = _parse_structure(key, val)

    # --- Sections (for stacked-rows layout) ---
    raw_sections = data.get("sections", [])
    if raw_sections is None:
        raw_sections = []
    if not isinstance(raw_sections, list):
        raise SchemeParseError("'sections' must be a list")
    sections = [_parse_section(i, s) for i, s in enumerate(raw_sections)]

    # --- Steps ---
    raw_steps = data.get("steps", [])
    if raw_steps is None:
        raw_steps = []
    if not isinstance(raw_steps, list):
        raise SchemeParseError("'steps' must be a list")
    if not raw_steps and not sections:
        raise SchemeParseError("At least one step is required (or use 'sections' for stacked-rows)")
    steps = [_parse_step(i, s) for i, s in enumerate(raw_steps)]

    # --- Validate structure refs in steps and sections ---
    # Skipped entirely when a source is present: per the original note, refs
    # may then be resolved from the source JSON at render time.
    def _validate_step_refs(step_list, context_prefix=""):
        for i, step in enumerate(step_list):
            prefix = f"{context_prefix}step {i+1}"
            _validate_refs(f"{prefix} substrates", step.substrates, structures)
            _validate_refs(f"{prefix} products", step.products, structures)
            if step.above_arrow:
                _validate_refs(
                    f"{prefix} above_arrow.structures",
                    step.above_arrow.structures,
                    structures,
                )
            if step.below_arrow:
                _validate_refs(
                    f"{prefix} below_arrow.structures",
                    step.below_arrow.structures,
                    structures,
                )

    if not source:
        _validate_step_refs(steps)
        for sec_idx, sec in enumerate(sections):
            _validate_step_refs(sec.steps, f"section {sec_idx+1} ")

    # --- Layout ---
    layout = str(data.get("layout", "linear"))
    if layout not in VALID_LAYOUTS:
        raise SchemeParseError(
            f"Invalid layout '{layout}'. Must be one of: {sorted(VALID_LAYOUTS)}"
        )

    # --- Wrap ---
    wrap = str(data.get("wrap", "repeat"))
    if wrap not in VALID_WRAPS:
        raise SchemeParseError(
            f"Invalid wrap '{wrap}'. Must be one of: {sorted(VALID_WRAPS)}"
        )

    # --- Steps per row ---
    steps_per_row = data.get("steps_per_row")
    if steps_per_row is not None:
        try:
            steps_per_row = int(steps_per_row)
        except (TypeError, ValueError) as e:
            # Report bad values as a parse error instead of a raw ValueError.
            raise SchemeParseError(
                f"steps_per_row must be an integer, got {steps_per_row!r}"
            ) from e
        if steps_per_row < 1:
            raise SchemeParseError("steps_per_row must be >= 1")

    # --- Title ---
    title = data.get("title")
    if title is not None:
        title = str(title)

    # --- Run arrows ---
    raw_runs = data.get("run_arrows", [])
    if raw_runs is None:
        raw_runs = []
    if not isinstance(raw_runs, list):
        raise SchemeParseError("'run_arrows' must be a list")
    run_arrows = [_parse_run_arrows(r) for r in raw_runs]

    # --- Condition key ---
    condition_key = data.get("condition_key")
    if condition_key is not None and not isinstance(condition_key, dict):
        raise SchemeParseError("'condition_key' must be a mapping")

    return SchemeDescriptor(
        source=source,
        structures=structures,
        steps=steps,
        layout=layout,
        wrap=wrap,
        steps_per_row=steps_per_row,
        title=title,
        run_arrows=run_arrows,
        condition_key=condition_key,
        sections=sections,
    )
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def _parse_section(index: int, data: Any) -> SectionDescriptor:
    """Turn one ``sections`` entry into a SectionDescriptor.

    *index* is the zero-based position of the section; error messages use
    the one-based number.  Raises SchemeParseError when the entry is not a
    mapping or its ``steps`` is not a non-empty list.
    """
    ordinal = index + 1
    if not isinstance(data, dict):
        raise SchemeParseError(
            f"Section {ordinal} must be a mapping, got {type(data).__name__}"
        )

    section_steps = data.get("steps", [])
    if not isinstance(section_steps, list):
        raise SchemeParseError(f"Section {ordinal} 'steps' must be a list")
    if not section_steps:
        raise SchemeParseError(f"Section {ordinal} must have at least one step")

    raw_label = data.get("label")
    return SectionDescriptor(
        label=None if raw_label is None else str(raw_label),
        steps=[_parse_step(pos, item) for pos, item in enumerate(section_steps)],
        layout=str(data.get("layout", "linear")),
    )
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _parse_structure(key: str, val: Any) -> StructureRef:
    """Build the StructureRef for one ``structures`` entry.

    A string value is shorthand for a SMILES.  A mapping supplies the
    optional ``smiles``, ``name``, ``file``, ``cdxml_id`` and ``label``
    fields.  Anything else raises SchemeParseError.
    """
    if isinstance(val, dict):
        optional = {
            field: val.get(field)
            for field in ("smiles", "name", "file", "cdxml_id", "label")
        }
        return StructureRef(id=key, **optional)
    if isinstance(val, str):
        return StructureRef(id=key, smiles=val)
    raise SchemeParseError(
        f"Structure '{key}' must be a mapping or SMILES string, got {type(val).__name__}"
    )
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _parse_step(index: int, data: Any) -> StepDescriptor:
    """Turn one ``steps`` entry into a StepDescriptor.

    *index* is zero-based; error messages use the one-based step number.
    Raises SchemeParseError when the entry is not a mapping, has no
    substrate or no product, or fails arrow/arrow-style validation.
    """
    step_no = index + 1
    if not isinstance(data, dict):
        raise SchemeParseError(f"Step {step_no} must be a mapping, got {type(data).__name__}")

    substrate_ids = _as_str_list(data.get("substrates", []), f"step {step_no} substrates")
    product_ids = _as_str_list(data.get("products", []), f"step {step_no} products")
    if not substrate_ids:
        raise SchemeParseError(f"Step {step_no} must have at least one substrate")
    if not product_ids:
        raise SchemeParseError(f"Step {step_no} must have at least one product")

    arrows = {
        side: _parse_arrow_content(data.get(side), f"step {step_no} {side}")
        for side in ("above_arrow", "below_arrow")
    }
    style = _validate_arrow_style(data.get("arrow_style", "solid"), index)

    return StepDescriptor(
        substrates=substrate_ids,
        products=product_ids,
        above_arrow=arrows["above_arrow"],
        below_arrow=arrows["below_arrow"],
        yield_=data.get("yield"),
        number=data.get("number"),
        id=data.get("id"),
        arrow_style=style,
    )
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
def _parse_arrow_content(data: Any, context: str) -> Optional[ArrowContent]:
    """Parse above_arrow or below_arrow content.

    Accepted shapes:

    * ``None``: no content, returns None.
    * a string: a single line of text.
    * a mapping with optional ``structures`` and ``text`` entries.
    * a list mixing mappings and scalars.  Mapping items contribute their
      ``structures`` / ``structure`` values as structure references and
      their ``text`` values as text; values under any other key are added
      as text via ``str``.  Scalar items (strings and numbers) are taken as
      structure references.

    Parameters
    ----------
    data : Any
        Raw arrow content from the step mapping.
    context : str
        Label used in error messages.

    Returns
    -------
    ArrowContent or None
        None when there are neither structures nor text.

    Raises
    ------
    SchemeParseError
        If *data* is of an unsupported type.
    """
    if data is None:
        return None

    # Normalize: if arrow content is a list, merge into a single dict.
    # LLMs commonly write:  above_arrow: [{structures: [...]}, "text"]
    # or:                   above_arrow: [acid_id]
    if isinstance(data, list):
        merged: Dict[str, List[Any]] = {"structures": [], "text": []}
        for item in data:
            if isinstance(item, dict):
                for k, v in item.items():
                    if k in ("structures", "structure"):
                        target = merged["structures"]
                    elif k == "text":
                        target = merged["text"]
                    else:
                        # Unknown key: keep its value as a text line.
                        merged["text"].append(str(v))
                        continue
                    # A scalar and a list are both accepted under either
                    # spelling; a null value contributes nothing.
                    if isinstance(v, list):
                        target.extend(v)
                    elif v is not None:
                        target.append(v)
            elif isinstance(item, str):
                # Bare string in list: could be structure ID or text;
                # treated as a structure reference.
                merged["structures"].append(item)
            elif isinstance(item, (int, float)) and not isinstance(item, bool):
                # Numeric compound labels load as numbers from YAML.
                merged["structures"].append(str(item))
        data = merged

    # Normalize: if arrow content is a bare string, treat as text
    if isinstance(data, str):
        data = {"text": [data]}

    if not isinstance(data, dict):
        raise SchemeParseError(
            f"'{context}' must be a mapping (e.g. {{structures: [ID], text: ['conditions']}}), "
            f"got {type(data).__name__}. "
            f"Correct format: above_arrow: {{structures: [ReagentID], text: ['HATU']}}"
        )
    structures = _as_str_list(data.get("structures", []), f"{context}.structures")
    text = _as_str_list(data.get("text", []), f"{context}.text")
    if not structures and not text:
        return None
    return ArrowContent(structures=structures, text=text)
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
def _parse_run_arrows(data: Any) -> StepRunArrows:
    """Parse a run_arrows entry.

    The entry must be a mapping with an integer ``step`` and an optional
    ``runs`` list.  Each run is a mapping with a required ``input``, an
    optional ``output`` (empty when absent or null) and an optional
    ``note``.

    Raises
    ------
    SchemeParseError
        If the entry or a run is not a mapping, ``step`` is missing or not
        an integer, ``runs`` is not a list, or a run has no ``input``.
    """
    if not isinstance(data, dict):
        raise SchemeParseError("run_arrows entry must be a mapping")
    step = data.get("step")
    if step is None:
        raise SchemeParseError("run_arrows entry must have a 'step' field")
    try:
        step = int(step)
    except (TypeError, ValueError) as e:
        # Report bad values as a parse error instead of a raw ValueError.
        raise SchemeParseError(
            f"run_arrows 'step' must be an integer, got {step!r}"
        ) from e

    raw_runs = data.get("runs", [])
    if not isinstance(raw_runs, list):
        raise SchemeParseError(f"run_arrows step {step} 'runs' must be a list")

    runs = []
    for r in raw_runs:
        if not isinstance(r, dict):
            raise SchemeParseError("run entry must be a mapping")
        inp = r.get("input")
        # Only a missing/empty input is an error; a numeric label such as 0
        # is a legitimate value.
        if inp is None or inp == "":
            raise SchemeParseError("run entry must have an 'input' field")
        out = r.get("output")
        note = r.get("note")
        runs.append(RunArrowEntry(
            input_label=str(inp),
            # ``output: null`` must not turn into the literal text "None".
            output_label="" if out is None else str(out),
            note=None if note is None else str(note),
        ))
    return StepRunArrows(step=step, runs=runs)
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def _validate_arrow_style(style: str, step_idx: int) -> str:
    """Return *style* as a string if it is a known arrow style.

    *step_idx* is the zero-based step index, used (as a one-based number)
    in the SchemeParseError raised for an unknown style.
    """
    normalized = str(style)
    if normalized in VALID_ARROW_STYLES:
        return normalized
    raise SchemeParseError(
        f"Step {step_idx+1}: invalid arrow_style '{normalized}'. "
        f"Must be one of: {sorted(VALID_ARROW_STYLES)}"
    )
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def _validate_refs(
    context: str,
    refs: List[str],
    structures: Dict[str, StructureRef],
) -> None:
    """Make sure every entry of *refs* has a key in *structures*.

    Nothing is rejected: a reference that is not declared yet is added to
    *structures* in place as a bare ``StructureRef(id=ref)``.  Per the
    original note, the renderer then attempts to resolve it through
    reagent_db (name to SMILES lookup).  *context* is accepted but not
    used by this function.
    """
    for ref in refs:
        if ref in structures:
            continue
        # Undeclared: register a bare placeholder for later resolution.
        structures[ref] = StructureRef(id=ref)
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
def _as_str_list(val: Any, context: str) -> List[str]:
    """Return *val* as a list of strings.

    None gives an empty list, a string gives a one-element list, and a list
    has each item converted with ``str``.  Any other type raises
    SchemeParseError, naming *context* in the message.
    """
    if val is None:
        return []
    if isinstance(val, str):
        return [val]
    if not isinstance(val, list):
        raise SchemeParseError(f"'{context}' must be a list or string, got {type(val).__name__}")
    return list(map(str, val))
|