cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,706 @@
1
+ """
2
+ parser.py — Parse YAML scheme files into SchemeDescriptor dataclasses.
3
+
4
+ Validates structure references, layout keywords, and produces clear error
5
+ messages for malformed input.
6
+
7
+ A ``_normalize_scheme_data`` preprocessing pass converts common LLM-generated
8
+ patterns (inline structures, ``reagents`` key, ``species`` alias, bare SMILES
9
+ refs, etc.) into the canonical format before the main parsing logic runs.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ import re
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional, Union
18
+
19
+ import yaml
20
+
21
+ from .schema import (
22
+ ArrowContent,
23
+ RunArrowEntry,
24
+ SchemeDescriptor,
25
+ SectionDescriptor,
26
+ StepDescriptor,
27
+ StepRunArrows,
28
+ StructureRef,
29
+ )
30
+
31
# Keywords accepted for the scheme-level ``layout`` / ``wrap`` fields and the
# step-level ``arrow_style`` field.
VALID_LAYOUTS = {
    "linear",
    "sequential",
    "divergent",
    "stacked-rows",
    "numbered-parallel",
    "convergent",
}
VALID_WRAPS = {"repeat", "serpentine", "none"}
VALID_ARROW_STYLES = {"solid", "dashed", "failed"}

# Characters treated as unambiguous SMILES syntax: bond symbols, branches,
# bracket atoms, chirality marks and cis/trans slashes.
_SMILES_SYNTAX_RE = re.compile(r'[=()\[\]#@\\\/]')

# Letters taken as evidence that a string is an English word or abbreviation
# rather than SMILES.  Every letter NOT in this class is tolerated:
#   uppercase  B C F H I K L M N O P R S V
#   lowercase  b c f l n o p r s
# Anything else (A D E G J Q T U W X Y Z and a d e g h i j k m q t u v w x y z)
# marks the string as a plain identifier.
_WORD_ONLY_LETTER_RE = re.compile(r'[ADEGJQTUWXYZadeghijkmqtuvwxyz]')

# A whole string made of 3+ tolerated uppercase letters with no digits,
# underscores or hyphens (e.g. "CCO", "CCCC", "COC", "CCOCC").
_ORGANIC_CHAIN_RE = re.compile(r'^[BCFIKLMNOPRSV]{3,}$')
55
+
56
+
57
class SchemeParseError(Exception):
    """Signal that scheme YAML is malformed or breaks a schema rule."""
60
+
61
+
62
def parse_yaml(source: Union[str, Path]) -> SchemeDescriptor:
    """
    Turn a YAML scheme (file or text) into a SchemeDescriptor.

    Parameters
    ----------
    source : str or Path
        Passed to ``_load_yaml_text``: a Path, or a string ending in
        .yaml/.yml, is read from disk; any other string is used as the
        YAML text itself.

    Returns
    -------
    SchemeDescriptor

    Raises
    ------
    SchemeParseError
        When the text is not valid YAML, when the top level is not a
        mapping, or when ``_parse_scheme`` rejects the content.
    """
    raw_text = _load_yaml_text(source)
    try:
        loaded = yaml.safe_load(raw_text)
    except yaml.YAMLError as exc:
        raise SchemeParseError(f"Invalid YAML syntax: {exc}") from exc

    if not isinstance(loaded, dict):
        raise SchemeParseError("Top-level YAML must be a mapping (dict)")

    # The scheme may sit at the top level or be wrapped in a 'scheme:' key.
    wrapped = loaded.get("scheme")
    if isinstance(wrapped, dict):
        loaded = wrapped

    # Rewrite LLM-friendly shorthand into canonical form, then parse.
    return _parse_scheme(_normalize_scheme_data(loaded))
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Normalization helpers
102
+ # ---------------------------------------------------------------------------
103
+
104
def _smiles_id(smiles: str, existing: Dict[str, Any], counter: List[int]) -> str:
    """
    Return a deterministic, collision-safe structure ID for a SMILES string.

    The preferred ID is ``"s_"`` plus the first 8 hex characters of the SHA-1
    of *smiles*.  If that ID is already a key of *existing*:

    * and the registered entry carries the same SMILES, the same ID is
      returned (the structure is simply being referenced again);
    * otherwise (a genuine hash collision) a sequential ``struct_N`` name is
      returned, where N is the first index at or after ``counter[0]`` that is
      not already a key of *existing*.

    Parameters
    ----------
    smiles : str
        SMILES text to derive the ID from.
    existing : dict
        Current structures mapping; values are either SMILES strings or
        dicts that may hold a ``"smiles"`` key.  Not modified here.
    counter : list of int
        One-element mutable counter; advanced past every ``struct_N`` index
        that is consumed or skipped.

    Returns
    -------
    str
        The structure ID to use for *smiles*.
    """
    token = "s_" + hashlib.sha1(smiles.encode()).hexdigest()[:8]
    if token not in existing:
        return token

    entry = existing[token]
    registered = entry.get("smiles") if isinstance(entry, dict) else entry
    if registered == smiles:
        return token  # already registered with this SMILES

    # True collision: hand out a sequential name, skipping any index that is
    # already taken so an unrelated structure is never aliased.
    while f"struct_{counter[0]}" in existing:
        counter[0] += 1
    fallback = f"struct_{counter[0]}"
    counter[0] += 1
    return fallback
124
+
125
+
126
def _looks_like_smiles(s: str) -> bool:
    """
    Heuristically decide whether *s* is a SMILES string or a structure ID.

    Rejections (checked first, in this order):

    * the string contains ``_`` or a space;
    * the string contains ``-`` but no character matched by
      ``_SMILES_SYNTAX_RE``;
    * the string contains any letter matched by ``_WORD_ONLY_LETTER_RE``.

    A string that survives is accepted when at least one of these holds:

    1. it contains a character matched by ``_SMILES_SYNTAX_RE``;
    2. it is a single character from ``BCFIKLMNOPRSV``;
    3. it contains any lowercase ASCII letter;
    4. it matches ``_ORGANIC_CHAIN_RE`` and contains ``C``.

    Returns
    -------
    bool
        True when the string is judged to be SMILES, False otherwise.
    """
    has_syntax = _SMILES_SYNTAX_RE.search(s) is not None

    # Identifier punctuation rules the string out immediately.
    if "_" in s or " " in s:
        return False
    if "-" in s and not has_syntax:
        return False
    # Any word-only letter marks a plain name or abbreviation.
    if _WORD_ONLY_LETTER_RE.search(s):
        return False

    # Only SMILES-compatible characters remain; look for positive evidence.
    return (
        has_syntax
        or (len(s) == 1 and s in "BCFIKLMNOPRSV")
        or any("a" <= ch <= "z" for ch in s)
        or (_ORGANIC_CHAIN_RE.match(s) is not None and "C" in s)
    )
173
+
174
+
175
def _normalize_entry_list(
    entries: Any,
    structures: Dict[str, Any],
    counter: List[int],
) -> List[str]:
    """
    Normalise a substrate/product/above-arrow structures list.

    Each entry may be:
      - A non-dict value: converted with ``str`` and kept as-is (it may be an
        existing ID or a bare SMILES; nothing is registered for it here).
      - A dict describing a structure inline.  Its ID is, in order of
        preference: the ``id`` field, an ID derived from ``smiles`` via
        ``_smiles_id``, the ``name`` field, or a generated ``struct_N``.

    For dict entries whose ID is not yet a key of *structures*, the
    recognised fields (``smiles``, ``name``, ``label``, ``file``,
    ``cdxml_id``) are registered under that ID.  An entry that carries none
    of those fields is a pure reference and is NOT registered: storing the ID
    as a string value would make it be read back as a SMILES string.

    Parameters
    ----------
    entries : Any
        A list of entries, or a single entry (wrapped into a list).
    structures : dict
        Structures mapping; new inline definitions are added in place.
    counter : list of int
        One-element mutable counter used for generated ``struct_N`` IDs.

    Returns
    -------
    list of str
        One structure ID per entry, in input order.
    """
    if not isinstance(entries, list):
        entries = [entries]

    result: List[str] = []
    for entry in entries:
        if not isinstance(entry, dict):
            result.append(str(entry))
            continue

        smiles = entry.get("smiles")
        name = entry.get("name")
        label = entry.get("label")
        sid = entry.get("id")
        if not sid:
            if smiles:
                sid = _smiles_id(smiles, structures, counter)
            elif name:
                sid = name
            else:
                # Skip indices already in use so a generated name never
                # silently points at an unrelated existing structure.
                while f"struct_{counter[0]}" in structures:
                    counter[0] += 1
                sid = f"struct_{counter[0]}"
                counter[0] += 1

        # Check membership before coercing so a non-string key that already
        # exists in the mapping is still recognised.
        already_known = sid in structures
        sid = str(sid)
        if not already_known and sid not in structures:
            struct_def: Dict[str, Any] = {}
            if smiles:
                struct_def["smiles"] = smiles
            if name:
                struct_def["name"] = name
            if label:
                struct_def["label"] = label
            # Keep file-backed definitions intact; _parse_structure reads
            # these two fields as well.
            for extra in ("file", "cdxml_id"):
                if entry.get(extra) is not None:
                    struct_def[extra] = entry[extra]
            if struct_def:
                structures[sid] = struct_def
        result.append(sid)

    return result
223
+
224
+
225
def _normalize_scheme_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalise LLM-friendly YAML patterns into the canonical form expected by
    ``_parse_scheme``.

    The input is deep-copied first, so the caller's mapping is never
    modified; the normalised copy is returned.

    Changes applied
    ---------------
    1. A top-level ``species`` (or ``substrates``) key is renamed to
       ``structures`` when no ``structures`` key exists.
    2. ``structures`` given as a list is converted to a keyed mapping
       (``id`` field, else ``struct_<position>``); a null ``structures``
       becomes an empty mapping.
    3. Inline structure dicts inside ``substrates``/``products``/
       arrow ``structures`` are auto-registered and replaced with IDs.
    4. Bare SMILES strings in those lists are auto-registered.
    5. ``reactants`` is accepted as an alias for ``substrates`` inside steps.
    6. ``text`` scalar is wrapped in a list inside ``above_arrow``/``below_arrow``.
    7. ``reagents`` inside a step are distributed into ``above_arrow``
       structures (dict reagents with a truthy ``above_arrow`` flag) or
       ``below_arrow`` text (everything else).  Arrow content that is null,
       a string, or a list is extended rather than causing the reagent to be
       dropped.
    8. Steps inside ``sections`` receive the same treatment, sharing one
       ID counter with the top-level steps.

    Raises
    ------
    SchemeParseError
        If ``structures`` is neither a mapping, a list, nor null.
    """
    import copy
    data = copy.deepcopy(data)

    # 1. Accept ``species`` / ``substrates`` as alias for ``structures``
    for alias in ("species", "substrates"):
        if alias in data and "structures" not in data:
            data["structures"] = data.pop(alias)
            break

    # 2. Accept ``structures`` as a list (convert to keyed mapping)
    raw_structs = data.get("structures")
    if isinstance(raw_structs, list):
        converted: Dict[str, Any] = {}
        for idx, item in enumerate(raw_structs):
            if isinstance(item, dict):
                key = str(item.get("id", f"struct_{idx}"))
                # An item with nothing but an id becomes an empty definition
                # (a bare reference) instead of a bogus SMILES string.
                converted[key] = {k: v for k, v in item.items() if k != "id"}
            else:
                converted[f"struct_{idx}"] = str(item)
        data["structures"] = converted
    elif raw_structs is None:
        # Covers both a missing key and an explicit ``structures: null``.
        data["structures"] = {}
    elif not isinstance(raw_structs, dict):
        raise SchemeParseError("'structures' must be a mapping")

    structures: Dict[str, Any] = data["structures"]
    # Start generated ``struct_N`` names after any that already exist so they
    # cannot alias list-derived or user-declared keys.
    counter = [_next_struct_index(structures)]

    raw_steps = data.get("steps")
    if raw_steps is None:
        data["steps"] = []
    elif isinstance(raw_steps, list):
        data["steps"] = [
            _normalize_step(step, structures, counter) for step in raw_steps
        ]
    # Any other type is left untouched so _parse_scheme can report it.

    raw_sections = data.get("sections", [])
    if isinstance(raw_sections, list):
        for sec in raw_sections:
            if isinstance(sec, dict) and isinstance(sec.get("steps"), list):
                sec["steps"] = [
                    _normalize_step(step, structures, counter)
                    for step in sec["steps"]
                ]

    return data


def _next_struct_index(structures: Dict[str, Any]) -> int:
    """Return one more than the highest N among existing ``struct_N`` keys."""
    highest = -1
    for key in structures:
        match = re.fullmatch(r"struct_(\d+)", str(key))
        if match:
            highest = max(highest, int(match.group(1)))
    return highest + 1


def _append_arrow_values(
    step: Dict[str, Any], arrow_key: str, field: str, values: List[Any]
) -> None:
    """Add *values* to ``step[arrow_key][field]``, tolerating shorthand forms."""
    arrow = step.get(arrow_key)
    if arrow is None:
        arrow = step[arrow_key] = {}
    elif isinstance(arrow, str):
        # A bare string arrow is text; promote it so it can hold more.
        arrow = step[arrow_key] = {"text": [arrow]}

    if isinstance(arrow, list):
        # List-form arrows are merged by _parse_arrow_content; add one item.
        arrow.append({field: list(values)})
        return
    if not isinstance(arrow, dict):
        return  # malformed arrow content; _parse_arrow_content reports it

    current = arrow.get(field)
    if current is None:
        arrow[field] = list(values)
    elif isinstance(current, list):
        current.extend(values)
    # Any other type is left for _parse_arrow_content to reject.


def _normalize_step(
    step: Any, structures: Dict[str, Any], counter: List[int]
) -> Any:
    """Return a normalised shallow copy of one step (non-dicts pass through)."""
    if not isinstance(step, dict):
        return step
    step = dict(step)

    # 5. ``reactants`` alias for ``substrates``
    if "reactants" in step and "substrates" not in step:
        step["substrates"] = step.pop("reactants")

    # 3 & 4. Inline / bare-SMILES substrates and products.  A null list is
    # left alone so it is reported as missing, not turned into a "None" ref.
    for key in ("substrates", "products"):
        if step.get(key) is not None:
            step[key] = _register_bare_smiles(
                _normalize_entry_list(step[key], structures, counter),
                structures,
                counter,
            )

    # 6 (+3/4). Arrow content given as a mapping
    for arrow_key in ("above_arrow", "below_arrow"):
        arrow = step.get(arrow_key)
        if not isinstance(arrow, dict):
            continue
        arrow = dict(arrow)
        if isinstance(arrow.get("text"), str):
            arrow["text"] = [arrow["text"]]
        if arrow.get("structures") is not None:
            arrow["structures"] = _register_bare_smiles(
                _normalize_entry_list(arrow["structures"], structures, counter),
                structures,
                counter,
            )
        step[arrow_key] = arrow

    # 7. ``reagents`` key: distribute into above_arrow / below_arrow
    if "reagents" in step:
        reagents = step.pop("reagents")
        if reagents is None:
            reagents = []
        elif not isinstance(reagents, list):
            reagents = [reagents]
        for reagent in reagents:
            if reagent is None:
                continue
            if not isinstance(reagent, dict):
                # Plain reagent: shown as text below the arrow.
                _append_arrow_values(step, "below_arrow", "text", [str(reagent)])
                continue

            reg_ids = _register_bare_smiles(
                _normalize_entry_list([reagent], structures, counter),
                structures,
                counter,
            )
            if reagent.get("above_arrow", False):
                _append_arrow_values(step, "above_arrow", "structures", reg_ids)
                continue

            # Below the arrow the reagent is rendered as text: registered
            # name first, then what the reagent itself supplies (name, id,
            # SMILES), and only as a last resort the internal ID.
            labels = []
            for rid in reg_ids:
                entry = structures.get(rid, {})
                registered_name = (
                    entry.get("name") if isinstance(entry, dict) else None
                )
                labels.append(
                    registered_name
                    or reagent.get("name")
                    or reagent.get("id")
                    or reagent.get("smiles")
                    or rid
                )
            _append_arrow_values(step, "below_arrow", "text", labels)

    return step
383
+
384
+
385
def _register_bare_smiles(
    ids: List[str],
    structures: Dict[str, Any],
    counter: List[int],
) -> List[str]:
    """
    Replace bare SMILES strings in *ids* with registered structure IDs.

    A string is treated as bare SMILES when it is not already a key of
    *structures* and ``_looks_like_smiles`` accepts it.  Such a string gets
    an ID from ``_smiles_id`` and is registered as ``{"smiles": <string>}``.
    If that ID is already present, the existing entry is kept, so a name or
    label registered earlier is not lost when the bare string appears later.

    Parameters
    ----------
    ids : list of str
        Structure references, possibly mixed with bare SMILES.
    structures : dict
        Structures mapping; new entries are added in place.
    counter : list of int
        One-element mutable counter forwarded to ``_smiles_id``.

    Returns
    -------
    list of str
        The references in input order, with bare SMILES replaced by IDs.
    """
    result: List[str] = []
    for sid in ids:
        if sid in structures or not _looks_like_smiles(sid):
            result.append(sid)
            continue
        new_id = _smiles_id(sid, structures, counter)
        # setdefault: never overwrite a richer definition already stored
        # under this ID.
        structures.setdefault(new_id, {"smiles": sid})
        result.append(new_id)
    return result
403
+
404
+
405
def _load_yaml_text(source: Union[str, Path]) -> str:
    """
    Load YAML text from a file path or return the raw string.

    A ``Path`` is always read from disk.  A string is read from disk only
    when it is a single line ending in ``.yaml`` or ``.yml``; multi-line
    text is always treated as YAML content, even if its last line happens to
    end with one of those suffixes (e.g. ``source: rxn.yaml``).

    Raises
    ------
    OSError
        Propagated from ``Path.read_text`` when a path cannot be read.
    """
    if isinstance(source, Path):
        return source.read_text(encoding="utf-8")
    if (
        isinstance(source, str)
        and "\n" not in source
        and "\r" not in source
        and source.endswith((".yaml", ".yml"))
    ):
        return Path(source).read_text(encoding="utf-8")
    return source
414
+
415
+
416
def _parse_scheme(data: Dict[str, Any]) -> SchemeDescriptor:
    """
    Parse the scheme-level dict into a SchemeDescriptor.

    Reads ``source``, ``structures``, ``sections``, ``steps``, ``layout``,
    ``wrap``, ``steps_per_row``, ``title``, ``run_arrows`` and
    ``condition_key`` from *data*.

    Raises
    ------
    SchemeParseError
        If a field has the wrong type, if neither steps nor sections are
        given, if ``layout``/``wrap`` is not a known keyword, or if
        ``steps_per_row`` is not an integer >= 1.
    """
    # --- Source (reaction_parser JSON) ---
    source = data.get("source")
    if source is not None:
        source = str(source)

    # --- Structures ---
    raw_structs = data.get("structures", {})
    if raw_structs is None:
        raw_structs = {}
    if not isinstance(raw_structs, dict):
        raise SchemeParseError("'structures' must be a mapping")
    structures = {}
    for key, val in raw_structs.items():
        key = str(key)
        structures[key] = _parse_structure(key, val)

    # --- Sections (for stacked-rows layout) ---
    raw_sections = data.get("sections", [])
    if not isinstance(raw_sections, list):
        raise SchemeParseError("'sections' must be a list")
    sections = [_parse_section(i, s) for i, s in enumerate(raw_sections)]

    # --- Steps ---
    raw_steps = data.get("steps", [])
    if not isinstance(raw_steps, list):
        raise SchemeParseError("'steps' must be a list")
    if not raw_steps and not sections:
        raise SchemeParseError("At least one step is required (or use 'sections' for stacked-rows)")
    steps = [_parse_step(i, s) for i, s in enumerate(raw_steps)]

    # --- Structure refs in steps and sections ---
    # _validate_refs auto-registers undeclared refs as bare StructureRefs.
    # This is skipped when a source is given; the original comments state
    # that source-backed refs are resolved from the JSON at render time.
    def _validate_step_refs(step_list, context_prefix=""):
        for i, step in enumerate(step_list):
            prefix = f"{context_prefix}step {i+1}"
            _validate_refs(f"{prefix} substrates", step.substrates, structures)
            _validate_refs(f"{prefix} products", step.products, structures)
            if step.above_arrow:
                _validate_refs(
                    f"{prefix} above_arrow.structures",
                    step.above_arrow.structures,
                    structures,
                )
            if step.below_arrow:
                _validate_refs(
                    f"{prefix} below_arrow.structures",
                    step.below_arrow.structures,
                    structures,
                )

    if not source:
        _validate_step_refs(steps)
        for sec_idx, sec in enumerate(sections):
            _validate_step_refs(sec.steps, f"section {sec_idx+1} ")

    # --- Layout ---
    layout = str(data.get("layout", "linear"))
    if layout not in VALID_LAYOUTS:
        raise SchemeParseError(
            f"Invalid layout '{layout}'. Must be one of: {sorted(VALID_LAYOUTS)}"
        )

    # --- Wrap ---
    wrap = str(data.get("wrap", "repeat"))
    if wrap not in VALID_WRAPS:
        raise SchemeParseError(
            f"Invalid wrap '{wrap}'. Must be one of: {sorted(VALID_WRAPS)}"
        )

    # --- Steps per row ---
    steps_per_row = data.get("steps_per_row")
    if steps_per_row is not None:
        # Report a non-numeric value as a schema error rather than letting a
        # raw ValueError/TypeError escape the parser.
        try:
            steps_per_row = int(steps_per_row)
        except (TypeError, ValueError) as exc:
            raise SchemeParseError(
                f"steps_per_row must be an integer, got {steps_per_row!r}"
            ) from exc
        if steps_per_row < 1:
            raise SchemeParseError("steps_per_row must be >= 1")

    # --- Title ---
    title = data.get("title")
    if title is not None:
        title = str(title)

    # --- Run arrows ---
    raw_runs = data.get("run_arrows", [])
    if not isinstance(raw_runs, list):
        raise SchemeParseError("'run_arrows' must be a list")
    run_arrows = [_parse_run_arrows(r) for r in raw_runs]

    # --- Condition key ---
    condition_key = data.get("condition_key")
    if condition_key is not None and not isinstance(condition_key, dict):
        raise SchemeParseError("'condition_key' must be a mapping")

    return SchemeDescriptor(
        source=source,
        structures=structures,
        steps=steps,
        layout=layout,
        wrap=wrap,
        steps_per_row=steps_per_row,
        title=title,
        run_arrows=run_arrows,
        condition_key=condition_key,
        sections=sections,
    )
524
+
525
+
526
def _parse_section(index: int, data: Any) -> SectionDescriptor:
    """
    Build a SectionDescriptor from one ``sections`` entry.

    *index* is the zero-based position of the entry and is used only in
    error messages (reported one-based).

    Raises
    ------
    SchemeParseError
        If the entry is not a mapping, or its ``steps`` is not a non-empty
        list.
    """
    if not isinstance(data, dict):
        raise SchemeParseError(
            f"Section {index+1} must be a mapping, got {type(data).__name__}"
        )

    raw_label = data.get("label")
    label = None if raw_label is None else str(raw_label)

    step_entries = data.get("steps", [])
    if not isinstance(step_entries, list):
        raise SchemeParseError(f"Section {index+1} 'steps' must be a list")
    if len(step_entries) == 0:
        raise SchemeParseError(f"Section {index+1} must have at least one step")

    parsed_steps = []
    for position, entry in enumerate(step_entries):
        parsed_steps.append(_parse_step(position, entry))

    return SectionDescriptor(
        label=label,
        steps=parsed_steps,
        layout=str(data.get("layout", "linear")),
    )
546
+
547
+
548
def _parse_structure(key: str, val: Any) -> StructureRef:
    """
    Build a StructureRef for the structure registered under *key*.

    A string value is shorthand for a SMILES; a mapping supplies the
    ``smiles``, ``name``, ``file``, ``cdxml_id`` and ``label`` fields
    (missing ones become None).

    Raises
    ------
    SchemeParseError
        If *val* is neither a string nor a mapping.
    """
    if isinstance(val, dict):
        fields = {
            field: val.get(field)
            for field in ("smiles", "name", "file", "cdxml_id", "label")
        }
        return StructureRef(id=key, **fields)

    if isinstance(val, str):
        # Shorthand form: the value is the SMILES itself.
        return StructureRef(id=key, smiles=val)

    raise SchemeParseError(
        f"Structure '{key}' must be a mapping or SMILES string, got {type(val).__name__}"
    )
565
+
566
+
567
def _parse_step(index: int, data: Any) -> StepDescriptor:
    """
    Build a StepDescriptor from one ``steps`` entry.

    *index* is the zero-based position of the step and is used only in
    error messages (reported one-based).

    Raises
    ------
    SchemeParseError
        If the entry is not a mapping, has no substrates or no products, or
        if arrow content / ``arrow_style`` fail their own validation.
    """
    if not isinstance(data, dict):
        raise SchemeParseError(f"Step {index+1} must be a mapping, got {type(data).__name__}")

    substrates = _as_str_list(data.get("substrates", []), f"step {index+1} substrates")
    products = _as_str_list(data.get("products", []), f"step {index+1} products")
    if len(substrates) == 0:
        raise SchemeParseError(f"Step {index+1} must have at least one substrate")
    if len(products) == 0:
        raise SchemeParseError(f"Step {index+1} must have at least one product")

    above = _parse_arrow_content(data.get("above_arrow"), f"step {index+1} above_arrow")
    below = _parse_arrow_content(data.get("below_arrow"), f"step {index+1} below_arrow")

    # Optional fields are read in the same order as the constructor keywords.
    step_yield = data.get("yield")
    number = data.get("number")
    step_id = data.get("id")
    arrow_style = _validate_arrow_style(data.get("arrow_style", "solid"), index)

    return StepDescriptor(
        substrates=substrates,
        products=products,
        above_arrow=above,
        below_arrow=below,
        yield_=step_yield,
        number=number,
        id=step_id,
        arrow_style=arrow_style,
    )
593
+
594
+
595
def _parse_arrow_content(data: Any, context: str) -> Optional[ArrowContent]:
    """
    Parse above_arrow or below_arrow content.

    Accepted shapes:

    * ``None`` - no content; returns None.
    * a mapping with ``structures`` and/or ``text`` (each a string or list).
    * a bare string - treated as a single text line.
    * a list - merged into one mapping.  Bare strings in the list are taken
      as structure references.  Inside dict items, ``structures`` and the
      singular ``structure`` feed the structure list, ``text`` and any other
      key feed the text list; list values are spread element-wise, scalar
      values are added as one item, and null values are skipped.

    Parameters
    ----------
    data : Any
        Raw arrow content from the step.
    context : str
        Label used in error messages (e.g. ``"step 1 above_arrow"``).

    Returns
    -------
    ArrowContent or None
        None when there is neither a structure nor a text entry.

    Raises
    ------
    SchemeParseError
        If *data* is not one of the accepted shapes, or a field is rejected
        by ``_as_str_list``.
    """
    if data is None:
        return None

    # LLMs commonly write: above_arrow: [{structures: [...]}, "text"]
    # or:                  above_arrow: [acid_id]
    if isinstance(data, list):
        merged: Dict[str, List[Any]] = {"structures": [], "text": []}
        for item in data:
            if isinstance(item, str):
                # Bare string in a list is taken as a structure reference.
                merged["structures"].append(item)
                continue
            if not isinstance(item, dict):
                continue
            for key, value in item.items():
                if value is None:
                    continue  # a null must not become the text "None"
                if key in ("structures", "structure"):
                    target = merged["structures"]
                else:
                    target = merged["text"]
                if isinstance(value, list):
                    target.extend(value)
                else:
                    target.append(value if isinstance(value, str) else str(value))
        data = merged

    # A bare string is treated as text.
    if isinstance(data, str):
        data = {"text": [data]}

    if not isinstance(data, dict):
        raise SchemeParseError(
            f"'{context}' must be a mapping (e.g. {{structures: [ID], text: ['conditions']}}), "
            f"got {type(data).__name__}. "
            f"Correct format: above_arrow: {{structures: [ReagentID], text: ['HATU']}}"
        )
    structures = _as_str_list(data.get("structures", []), f"{context}.structures")
    text = _as_str_list(data.get("text", []), f"{context}.text")
    if not structures and not text:
        return None
    return ArrowContent(structures=structures, text=text)
640
+
641
+
642
def _parse_run_arrows(data: Any) -> StepRunArrows:
    """
    Parse one ``run_arrows`` entry into a StepRunArrows.

    The entry must be a mapping with an integer-convertible ``step`` and an
    optional ``runs`` list.  Each run is a mapping with a required ``input``
    and optional ``output`` and ``note``; a missing or null ``output``
    becomes the empty string.

    Raises
    ------
    SchemeParseError
        If the entry or a run is not a mapping, ``step`` is missing or not
        an integer, ``runs`` is not a list, or a run has no ``input``.
    """
    if not isinstance(data, dict):
        raise SchemeParseError("run_arrows entry must be a mapping")
    step = data.get("step")
    if step is None:
        raise SchemeParseError("run_arrows entry must have a 'step' field")
    # Report a non-numeric step as a schema error instead of leaking the raw
    # ValueError/TypeError from int().
    try:
        step = int(step)
    except (TypeError, ValueError) as exc:
        raise SchemeParseError(
            f"run_arrows 'step' must be an integer, got {step!r}"
        ) from exc

    raw_runs = data.get("runs", [])
    if not isinstance(raw_runs, list):
        raise SchemeParseError(f"run_arrows step {step} 'runs' must be a list")

    runs = []
    for r in raw_runs:
        if not isinstance(r, dict):
            raise SchemeParseError("run entry must be a mapping")
        inp = r.get("input", "")
        if not inp:
            raise SchemeParseError("run entry must have an 'input' field")
        out = r.get("output")
        note = r.get("note")
        runs.append(
            RunArrowEntry(
                input_label=str(inp),
                # An explicit ``output: null`` must not become "None".
                output_label="" if out is None else str(out),
                note=None if note is None else str(note),
            )
        )
    return StepRunArrows(step=step, runs=runs)
669
+
670
+
671
def _validate_arrow_style(style: str, step_idx: int) -> str:
    """
    Return *style* as a string if it is a known arrow style.

    *step_idx* is the zero-based step position, reported one-based in the
    error message.

    Raises
    ------
    SchemeParseError
        If the style is not a member of ``VALID_ARROW_STYLES``.
    """
    normalized = str(style)
    if normalized in VALID_ARROW_STYLES:
        return normalized
    raise SchemeParseError(
        f"Step {step_idx+1}: invalid arrow_style '{normalized}'. "
        f"Must be one of: {sorted(VALID_ARROW_STYLES)}"
    )
680
+
681
+
682
def _validate_refs(
    context: str,
    refs: List[str],
    structures: Dict[str, StructureRef],
) -> None:
    """Make sure every ref in *refs* has an entry in *structures*.

    Nothing is rejected: a ref with no entry is registered in place as a
    bare ``StructureRef(id=ref)``.  The original comments state that the
    renderer then attempts a reagent_db name lookup for such refs.
    *context* is accepted for call-site symmetry and is not used.
    """
    for ref in refs:
        if ref in structures:
            continue
        # Undeclared: register a bare placeholder.
        structures[ref] = StructureRef(id=ref)
696
+
697
+
698
def _as_str_list(val: Any, context: str) -> List[str]:
    """
    Coerce *val* to a list of strings.

    ``None`` gives an empty list, a string gives a one-element list, and a
    list has every element converted with ``str``.

    Raises
    ------
    SchemeParseError
        For any other type; *context* names the offending field.
    """
    if val is None:
        return []
    if isinstance(val, str):
        return [val]
    if not isinstance(val, list):
        raise SchemeParseError(f"'{context}' must be a list or string, got {type(val).__name__}")
    return list(map(str, val))