cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,632 @@
1
+ """compact_parser.py — Parse compact reaction scheme syntax into SchemeDescriptor.
2
+
3
+ Transforms a concise text notation (see COMPACT_SYNTAX.md) into the same
4
+ SchemeDescriptor dataclasses produced by the YAML parser.
5
+
6
+ Grammar highlights:
7
+ ArBr{BrC1=CC=CC=C1} --> product{c1ccc(N2CCCCC2)cc1} (94%)
8
+ above: piperidine{C1CCNCC1}, "Pd-RuPhos (0.5 mol%)"
9
+ below: "NaOtBu, THF", "85 deg C, 6 hrs"
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from typing import Optional
16
+
17
+ from .schema import (
18
+ StructureRef,
19
+ ArrowContent,
20
+ StepDescriptor,
21
+ RunArrowEntry,
22
+ StepRunArrows,
23
+ SchemeDescriptor,
24
+ VALID_LAYOUTS,
25
+ VALID_WRAPS,
26
+ VALID_ARROW_STYLES,
27
+ )
28
+
29
+
30
+ # ── Error type ────────────────────────────────────────────────────────
31
+
32
class ParseError(Exception):
    """Raised when compact scheme text is syntactically or semantically invalid.

    The line number, when supplied, is stored on ``line`` and prefixed to the
    exception message as ``"Line N: <message>"``; otherwise the message is
    used unchanged and ``line`` is ``None``.
    """

    def __init__(self, message: str, line: int | None = None):
        self.line = line
        rendered = message if line is None else f"Line {line}: {message}"
        super().__init__(rendered)
41
+
42
+
43
+ # ── Low-level helpers ─────────────────────────────────────────────────
44
+
45
+ def _split_respecting(text: str, delimiter: str) -> list[str]:
46
+ """Split *text* on *delimiter*, skipping occurrences inside ``{}`` or ``\"\"``."""
47
+ parts: list[str] = []
48
+ current: list[str] = []
49
+ depth = 0
50
+ in_quote = False
51
+ i = 0
52
+ dlen = len(delimiter)
53
+ while i < len(text):
54
+ ch = text[i]
55
+ if ch == '"' and depth == 0:
56
+ in_quote = not in_quote
57
+ current.append(ch)
58
+ elif ch == '{' and not in_quote:
59
+ depth += 1
60
+ current.append(ch)
61
+ elif ch == '}' and not in_quote:
62
+ depth = max(depth - 1, 0)
63
+ current.append(ch)
64
+ elif (
65
+ not in_quote
66
+ and depth == 0
67
+ and text[i : i + dlen] == delimiter
68
+ ):
69
+ parts.append("".join(current))
70
+ current = []
71
+ i += dlen
72
+ continue
73
+ else:
74
+ current.append(ch)
75
+ i += 1
76
+ parts.append("".join(current))
77
+ return parts
78
+
79
+
80
+ # Arrow regex fragments (compiled once)
81
+ _ARROW_PATTERNS = [
82
+ # order matters: longer/more-specific first
83
+ (re.compile(r"==>"), "solid"), # parallel (arrow style is solid; layout implies parallel)
84
+ (re.compile(r"\.\.>"), "dashed"),
85
+ (re.compile(r"-->"), "solid"),
86
+ (re.compile(r"[Xx]>"), "failed"),
87
+ ]
88
+
89
+
90
+ def _find_arrows(line: str) -> list[tuple[int, int, str, str | None]]:
91
+ """Return ``[(start, end, arrow_style, label_or_None), ...]`` for every
92
+ arrow token in *line* that is outside ``{}`` and ``\"\"``."""
93
+ arrows: list[tuple[int, int, str, str | None]] = []
94
+ depth = 0
95
+ in_quote = False
96
+ i = 0
97
+ while i < len(line):
98
+ ch = line[i]
99
+ if ch == '"' and depth == 0:
100
+ in_quote = not in_quote
101
+ i += 1
102
+ continue
103
+ if ch == '{' and not in_quote:
104
+ depth += 1
105
+ i += 1
106
+ continue
107
+ if ch == '}' and not in_quote:
108
+ depth = max(depth - 1, 0)
109
+ i += 1
110
+ continue
111
+ if depth > 0 or in_quote:
112
+ i += 1
113
+ continue
114
+ # Try each arrow pattern at position i
115
+ for pat, style in _ARROW_PATTERNS:
116
+ m = pat.match(line, i)
117
+ if m:
118
+ end = m.end()
119
+ # Check for |label| immediately after
120
+ label: str | None = None
121
+ if end < len(line) and line[end] == '|':
122
+ pipe_close = line.find('|', end + 1)
123
+ if pipe_close > end:
124
+ label = line[end + 1 : pipe_close]
125
+ end = pipe_close + 1
126
+ arrows.append((m.start(), end, style, label))
127
+ i = end
128
+ break
129
+ else:
130
+ i += 1
131
+ return arrows
132
+
133
+
134
+ def _strip_trailing_yield(line: str) -> tuple[str, str | None]:
135
+ """Remove a trailing ``(N%)`` yield annotation that is outside ``{}``.
136
+
137
+ Returns ``(cleaned_line, yield_string_or_None)``.
138
+ """
139
+ depth = 0
140
+ in_quote = False
141
+ last_close = -1
142
+ for i, ch in enumerate(line):
143
+ if ch == '"' and depth == 0:
144
+ in_quote = not in_quote
145
+ elif ch == '{' and not in_quote:
146
+ depth += 1
147
+ elif ch == '}' and not in_quote:
148
+ depth = max(depth - 1, 0)
149
+ elif ch == ')' and depth == 0 and not in_quote:
150
+ last_close = i
151
+ if last_close < 1:
152
+ return line, None
153
+ # Walk backwards from last_close to find matching '('
154
+ depth = 0
155
+ in_quote = False
156
+ open_pos = -1
157
+ for i in range(last_close, -1, -1):
158
+ ch = line[i]
159
+ if ch == ')':
160
+ depth += 1
161
+ elif ch == '(':
162
+ depth -= 1
163
+ if depth == 0:
164
+ open_pos = i
165
+ break
166
+ if open_pos < 0:
167
+ return line, None
168
+ content = line[open_pos + 1 : last_close]
169
+ if '%' in content:
170
+ return line[:open_pos].rstrip(), content
171
+ return line, None
172
+
173
+
174
+ # ── Species token parsing ─────────────────────────────────────────────
175
+
176
+ _ID_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_-]*$')
177
+
178
+
179
+ def _parse_species_token(
180
+ token: str, lineno: int | None = None
181
+ ) -> tuple[str, str | None, str | None]:
182
+ """Parse a single species token into ``(id, smiles_or_None, label_or_None)``.
183
+
184
+ Accepted forms::
185
+
186
+ ArBr → ("ArBr", None, None)
187
+ ArBr{BrC1=CC=CC=C1} → ("ArBr", "BrC1…", None)
188
+ "1"{BrC1=CC=CC=C1} → ("1", "BrC1…", "1")
189
+ "1" → ("1", None, "1")
190
+ """
191
+ token = token.strip()
192
+ if not token:
193
+ raise ParseError("Empty species token", lineno)
194
+
195
+ smiles: str | None = None
196
+ label: str | None = None
197
+
198
+ # Extract {SMILES}
199
+ brace_open = -1
200
+ depth = 0
201
+ in_q = False
202
+ for i, ch in enumerate(token):
203
+ if ch == '"':
204
+ in_q = not in_q
205
+ elif ch == '{' and not in_q and depth == 0:
206
+ brace_open = i
207
+ depth += 1
208
+ elif ch == '{' and not in_q:
209
+ depth += 1
210
+ elif ch == '}' and not in_q:
211
+ depth -= 1
212
+
213
+ if brace_open >= 0:
214
+ brace_close = token.rfind('}')
215
+ if brace_close <= brace_open:
216
+ raise ParseError(f"Unclosed '{{' in species: {token}", lineno)
217
+ smiles = token[brace_open + 1 : brace_close]
218
+ id_part = token[:brace_open].strip()
219
+ else:
220
+ id_part = token
221
+
222
+ # Quoted ID → also becomes label
223
+ if id_part.startswith('"') and id_part.endswith('"') and len(id_part) >= 2:
224
+ id_str = id_part[1:-1]
225
+ label = id_str
226
+ elif id_part.startswith('"'):
227
+ raise ParseError(f"Unclosed quote in species ID: {token}", lineno)
228
+ else:
229
+ id_str = id_part
230
+
231
+ if not id_str:
232
+ raise ParseError(f"Empty species ID: {token}", lineno)
233
+
234
+ return id_str, smiles, label
235
+
236
+
237
+ # ── Condition items parsing ───────────────────────────────────────────
238
+
239
def _parse_condition_items(
    text: str,
    structures: dict[str, StructureRef],
    lineno: int | None = None,
) -> ArrowContent:
    """Turn a comma-separated condition list into an :class:`ArrowContent`.

    Each non-empty item is classified as follows:

    - ``"quoted text"``   -> appended (without the quotes) to ``text``
    - ``bare_id{SMILES}`` -> registered in *structures* if the ID is new, or
      used to fill in a missing SMILES on an existing entry; the ID is then
      appended to ``structures``
    - anything else       -> appended verbatim to ``structures`` as a reference

    Commas inside braces or double quotes do not separate items.  *structures*
    is mutated in place; *lineno* is forwarded for error reporting.
    """
    content = ArrowContent()
    for piece in _split_respecting(text.strip(), ","):
        entry = piece.strip()
        if not entry:
            continue

        is_text = len(entry) >= 2 and entry.startswith('"') and entry.endswith('"')
        if is_text:
            content.text.append(entry[1:-1])
            continue

        if '{' not in entry:
            # Bare structure reference
            content.structures.append(entry)
            continue

        # Inline structure definition
        sid, smi, lbl = _parse_species_token(entry, lineno)
        if sid not in structures:
            structures[sid] = StructureRef(id=sid, smiles=smi, label=lbl)
        elif smi and structures[sid].smiles is None:
            structures[sid].smiles = smi
        content.structures.append(sid)
    return content
272
+
273
+
274
+ # ── Run arrow parsing ─────────────────────────────────────────────────
275
+
276
def _parse_run_arrow(text: str, lineno: int | None = None) -> RunArrowEntry:
    """Build a :class:`RunArrowEntry` from ``"input" -> "output"`` text.

    The text must contain exactly one ``->``.  Both sides are stripped of
    surrounding whitespace and then of surrounding double quotes.

    Raises:
        ParseError: If there is not exactly one ``->`` or either side is
            empty after stripping.
    """
    if text.count('->') != 1:
        raise ParseError(
            f"Run arrow must have exactly one '->': {text!r}", lineno
        )
    left, _, right = text.partition('->')
    source_label = left.strip().strip('"')
    result_label = right.strip().strip('"')
    if not (source_label and result_label):
        raise ParseError(f"Empty input or output in run arrow: {text!r}", lineno)
    return RunArrowEntry(input_label=source_label, output_label=result_label)
288
+
289
+
290
+ # ── Main parser ───────────────────────────────────────────────────────
291
+
292
def parse_compact(text: str) -> SchemeDescriptor:
    """Parse compact syntax text into a :class:`SchemeDescriptor`.

    The text is processed line by line.  Recognised line kinds, in the order
    they are tested:

    - blank lines and ``#`` comments (skipped)
    - ``---`` row separators (rejected)
    - ``@key value`` directives; ``@conditions`` opens a block of
      ``(a) "text"`` condition-key entries
    - ``step N:`` headers, which set the step for following condition lines
    - ``id: {SMILES} [label "..."]`` / ``id: name "..."`` / ``id: file "..."``
      structure definitions
    - indented ``above:`` / ``below:`` / ``[N] above:`` / ``[N] below:`` /
      ``run:`` / ``run[N]:`` lines
    - exactly one reaction-chain line containing at least one arrow

    Layout is taken from ``@layout`` when given; otherwise it is
    ``numbered-parallel`` if the chain uses a ``==>`` arrow, ``sequential``
    for a multi-step chain, and ``linear`` for a single step.

    Args:
        text: The full compact-syntax source.

    Returns:
        The populated :class:`SchemeDescriptor`.

    Raises:
        ParseError: On any unrecognised line, malformed definition, duplicate
            structure, missing or repeated reaction chain, empty chain
            segment, unknown layout/wrap, non-integer ``steps_per_row``, or
            reference to an undefined structure.
    """
    lines = text.split('\n')

    # Collected state
    directives: dict[str, str] = {}
    structures: dict[str, StructureRef] = {}
    reaction_chain_line: str | None = None
    reaction_chain_lineno: int | None = None
    # Conditions collected per step-number (1-indexed); 0 = implicit single-step
    step_above: dict[int, list[tuple[str, int]]] = {}  # step -> [(raw_text, lineno)]
    step_below: dict[int, list[tuple[str, int]]] = {}
    step_runs: dict[int, list[tuple[str, int]]] = {}
    condition_key_entries: dict[str, str] = {}
    is_parallel = False

    current_step: int = 0  # 0 = implicit single-step context
    in_conditions_block = False

    for raw_lineno, raw_line in enumerate(lines, start=1):
        line = raw_line.rstrip()

        # Blank line
        if not line or line.isspace():
            continue

        # Comment
        if line.lstrip().startswith('#'):
            continue

        # Row separator — not supported in compact syntax
        if line.strip() == '---':
            raise ParseError(
                "Row separators (---) are not supported in compact syntax. "
                "Use YAML format for stacked-rows layout.",
                raw_lineno,
            )

        # Directive
        if line.startswith('@'):
            in_conditions_block = False
            rest = line[1:].strip()
            if rest.startswith('conditions'):
                in_conditions_block = True
                continue
            parts = rest.split(None, 1)
            if not parts:
                raise ParseError("Empty directive", raw_lineno)
            key = parts[0]
            val = parts[1].strip('"') if len(parts) > 1 else ""
            directives[key] = val
            continue

        # Condition key entry: (a) "..."
        if in_conditions_block:
            m = re.match(r'^\(([a-zA-Z0-9,]+)\)\s*"(.*)"', line.strip())
            if m:
                condition_key_entries[m.group(1)] = m.group(2)
                continue
            # First non-entry line ends the block; fall through to other parsing
            in_conditions_block = False

        # Step block header: "step N:"
        m_step = re.match(r'^step\s+(\d+)\s*:', line)
        if m_step:
            current_step = int(m_step.group(1))
            continue

        # Definition line: "id: {SMILES}" or "id: name ..." or "id: file ..."
        m_def = re.match(
            r'^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*)', line
        )
        # Only the exact keywords are reserved.  A prefix test would wrongly
        # reject legitimate IDs such as "runner" or "below_1".
        if m_def and m_def.group(1) not in ('above', 'below', 'run'):
            sid = m_def.group(1)
            spec = m_def.group(2).strip()
            sref = StructureRef(id=sid)

            if spec.startswith('{'):
                close = spec.rfind('}')
                if close < 0:
                    raise ParseError(f"Unclosed '{{' in definition of {sid}", raw_lineno)
                sref.smiles = spec[1:close]
                remainder = spec[close + 1:].strip()
                # Check for label "..."
                lm = re.match(r'label\s+"([^"]*)"', remainder)
                if lm:
                    sref.label = lm.group(1)
            elif spec.startswith('name'):
                nm = re.match(r'name\s+"([^"]*)"', spec)
                if nm:
                    sref.name = nm.group(1)
                else:
                    raise ParseError(f"Invalid name spec for {sid}", raw_lineno)
            elif spec.startswith('file'):
                fm = re.match(r'file\s+"([^"]*)"', spec)
                if fm:
                    sref.file = fm.group(1)
                else:
                    raise ParseError(f"Invalid file spec for {sid}", raw_lineno)
            else:
                raise ParseError(f"Unknown definition spec for {sid}: {spec}", raw_lineno)

            if sid in structures:
                raise ParseError(f"Duplicate structure definition: {sid}", raw_lineno)
            structures[sid] = sref
            continue

        # Indented condition/run line
        if line.startswith((' ', '\t')):
            stripped = line.strip()

            # Step-indexed: [N] above: ... or [N] below: ...
            m_idx = re.match(r'^\[(\d+)\]\s*(above|below)\s*:\s*(.*)', stripped)
            if m_idx:
                step_num = int(m_idx.group(1))
                position = m_idx.group(2)
                content = m_idx.group(3)
                target = step_above if position == 'above' else step_below
                target.setdefault(step_num, []).append((content, raw_lineno))
                continue

            # above: / below:
            if stripped.startswith(('above:', 'below:')):
                position = 'above' if stripped.startswith('above') else 'below'
                content = stripped.split(':', 1)[1].strip()
                target = step_above if position == 'above' else step_below
                target.setdefault(current_step, []).append((content, raw_lineno))
                continue

            # run: "in" -> "out"  or  run[N]: "in" -> "out"
            m_run = re.match(r'^run(?:\[(\d+)\])?\s*:\s*(.*)', stripped)
            if m_run:
                step_num = int(m_run.group(1)) if m_run.group(1) else current_step
                step_runs.setdefault(step_num, []).append(
                    (m_run.group(2), raw_lineno)
                )
                continue

        # If none of the above matched, try reaction chain
        # A reaction chain must contain at least one arrow
        arrows = _find_arrows(line)
        if arrows:
            if reaction_chain_line is not None:
                raise ParseError(
                    "Multiple reaction chains found (only one per scheme)",
                    raw_lineno,
                )
            reaction_chain_line = line
            reaction_chain_lineno = raw_lineno
            # Detect a parallel arrow from the arrow tokens themselves, so a
            # "==>" inside a quoted label or braces does not count.
            if any(line.startswith('==>', start) for start, _, _, _ in arrows):
                is_parallel = True
            continue

        # Before giving up, check for common syntax errors
        if '{' in line and line.count('{') != line.count('}'):
            raise ParseError(
                "Unclosed '{' in line (mismatched braces)", raw_lineno
            )
        if line.count('"') % 2 != 0:
            raise ParseError(
                "Unclosed quote in line (odd number of '\"')", raw_lineno
            )
        raise ParseError(f"Unrecognized line: {line!r}", raw_lineno)

    # ── Require a reaction chain ──────────────────────────────────
    if reaction_chain_line is None:
        raise ParseError("No reaction chain found (missing --> arrow)")

    # ── Parse reaction chain ──────────────────────────────────────
    chain = reaction_chain_line
    chain, yield_str = _strip_trailing_yield(chain)
    arrows = _find_arrows(chain)
    if not arrows:
        raise ParseError(
            "No arrow found in reaction chain", reaction_chain_lineno
        )

    # Extract segments between arrows
    segments: list[str] = []
    arrow_styles: list[str] = []
    arrow_labels: list[str | None] = []
    prev_end = 0
    for start, end, style, label in arrows:
        segments.append(chain[prev_end:start])
        arrow_styles.append(style)
        arrow_labels.append(label)
        prev_end = end
    segments.append(chain[prev_end:])  # last segment (products of last step)

    # Parse each segment into species
    segment_species: list[list[tuple[str, str | None, str | None]]] = []
    for seg in segments:
        species_tokens = _split_respecting(seg.strip(), '+')
        species_list: list[tuple[str, str | None, str | None]] = []
        for tok in species_tokens:
            tok = tok.strip()
            if not tok:
                continue
            species_list.append(
                _parse_species_token(tok, reaction_chain_lineno)
            )
        segment_species.append(species_list)

    # Validate: no empty segments
    for idx, seg in enumerate(segment_species):
        if not seg:
            if idx == 0:
                raise ParseError(
                    "Missing substrates before first arrow",
                    reaction_chain_lineno,
                )
            raise ParseError(
                f"Missing species after arrow {idx}",
                reaction_chain_lineno,
            )

    # Register all species as StructureRefs
    for seg in segment_species:
        for sid, smi, lbl in seg:
            if sid not in structures:
                structures[sid] = StructureRef(id=sid, smiles=smi, label=lbl)
            else:
                # Update SMILES / label if provided inline and not yet set
                if smi and structures[sid].smiles is None:
                    structures[sid].smiles = smi
                if lbl and structures[sid].label is None:
                    structures[sid].label = lbl

    # ── Build steps ───────────────────────────────────────────────
    num_steps = len(arrows)
    steps: list[StepDescriptor] = []

    for i in range(num_steps):
        substrates = [sid for sid, _, _ in segment_species[i]]
        products = [sid for sid, _, _ in segment_species[i + 1]]
        step_num = i + 1  # 1-indexed

        # Determine above/below content
        # Try step-indexed first, then fall back to implicit (key=0) for single-step
        # NOTE(review): for multi-step chains, conditions stored under the
        # implicit key 0 are never attached to any step — confirm intended.
        above_key = step_num if step_num in step_above else (0 if num_steps == 1 else step_num)
        below_key = step_num if step_num in step_below else (0 if num_steps == 1 else step_num)

        above = ArrowContent()
        for raw_text, ln in step_above.get(above_key, []):
            ac = _parse_condition_items(raw_text, structures, ln)
            above.structures.extend(ac.structures)
            above.text.extend(ac.text)

        below = ArrowContent()
        for raw_text, ln in step_below.get(below_key, []):
            ac = _parse_condition_items(raw_text, structures, ln)
            below.structures.extend(ac.structures)
            below.text.extend(ac.text)

        # Arrow label → populate conditions from letter key if available
        lbl = arrow_labels[i]
        if lbl and condition_key_entries:
            # Letter conditions mode — put the full text below the arrow
            for letter in re.split(r'[,\s]+', lbl):
                letter = letter.strip()
                if letter in condition_key_entries:
                    below.text.append(condition_key_entries[letter])

        step = StepDescriptor(
            substrates=substrates,
            products=products,
            above_arrow=above if (above.structures or above.text) else None,
            below_arrow=below if (below.structures or below.text) else None,
            # The trailing yield on the chain belongs to the final step only.
            yield_=(yield_str if i == num_steps - 1 else None),
            number=step_num if num_steps > 1 else None,
            arrow_style=arrow_styles[i],
        )
        steps.append(step)

    # ── Build run arrows ──────────────────────────────────────────
    run_arrows: list[StepRunArrows] = []
    for step_key, run_lines in sorted(step_runs.items()):
        # Map key 0 → step 1
        step_num = step_key if step_key > 0 else 1
        entries: list[RunArrowEntry] = [
            _parse_run_arrow(raw_text, ln) for raw_text, ln in run_lines
        ]
        run_arrows.append(StepRunArrows(step=step_num, runs=entries))

    # ── Determine layout ──────────────────────────────────────────
    explicit_layout = 'layout' in directives
    layout = directives.get('layout', 'linear')
    if not explicit_layout:
        if is_parallel:
            layout = 'numbered-parallel'
        elif num_steps > 1:
            layout = 'sequential'
    if layout not in VALID_LAYOUTS:
        raise ParseError(f"Unknown layout: {layout!r}")

    wrap = directives.get('wrap', 'repeat')
    if wrap not in VALID_WRAPS:
        raise ParseError(f"Unknown wrap: {wrap!r}")

    steps_per_row: int | None = None
    if 'steps_per_row' in directives:
        raw_steps_per_row = directives['steps_per_row']
        try:
            steps_per_row = int(raw_steps_per_row)
        except ValueError as err:
            # Chain the cause so the original conversion failure stays visible.
            raise ParseError(
                f"steps_per_row must be integer: {raw_steps_per_row!r}"
            ) from err

    title = directives.get('title')

    cond_key = condition_key_entries if condition_key_entries else None

    # ── Validate references ───────────────────────────────────────
    for step in steps:
        for sid in step.substrates + step.products:
            if sid not in structures:
                raise ParseError(f"Undefined structure reference: {sid!r}")
        for ac in (step.above_arrow, step.below_arrow):
            if ac:
                for sid in ac.structures:
                    if sid not in structures:
                        raise ParseError(f"Undefined structure reference in conditions: {sid!r}")

    return SchemeDescriptor(
        structures=structures,
        steps=steps,
        layout=layout,
        wrap=wrap,
        steps_per_row=steps_per_row,
        title=title,
        run_arrows=run_arrows,
        condition_key=cond_key,
    )
627
+
628
+
629
def parse_compact_file(path: str) -> SchemeDescriptor:
    """Load the UTF-8 text file at *path* and parse it as compact syntax."""
    with open(path, mode='r', encoding='utf-8') as handle:
        source_text = handle.read()
    return parse_compact(source_text)