cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
"""compact_parser.py — Parse compact reaction scheme syntax into SchemeDescriptor.
|
|
2
|
+
|
|
3
|
+
Transforms a concise text notation (see COMPACT_SYNTAX.md) into the same
|
|
4
|
+
SchemeDescriptor dataclasses produced by the YAML parser.
|
|
5
|
+
|
|
6
|
+
Grammar highlights:
|
|
7
|
+
ArBr{BrC1=CC=CC=C1} --> product{c1ccc(N2CCCCC2)cc1} (94%)
|
|
8
|
+
above: piperidine{C1CCNCC1}, "Pd-RuPhos (0.5 mol%)"
|
|
9
|
+
below: "NaOtBu, THF", "85 deg C, 6 hrs"
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from .schema import (
|
|
18
|
+
StructureRef,
|
|
19
|
+
ArrowContent,
|
|
20
|
+
StepDescriptor,
|
|
21
|
+
RunArrowEntry,
|
|
22
|
+
StepRunArrows,
|
|
23
|
+
SchemeDescriptor,
|
|
24
|
+
VALID_LAYOUTS,
|
|
25
|
+
VALID_WRAPS,
|
|
26
|
+
VALID_ARROW_STYLES,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ── Error type ────────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
class ParseError(Exception):
|
|
33
|
+
"""Syntax or semantic error with optional line number."""
|
|
34
|
+
|
|
35
|
+
def __init__(self, message: str, line: int | None = None):
|
|
36
|
+
self.line = line
|
|
37
|
+
if line is not None:
|
|
38
|
+
super().__init__(f"Line {line}: {message}")
|
|
39
|
+
else:
|
|
40
|
+
super().__init__(message)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── Low-level helpers ─────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
def _split_respecting(text: str, delimiter: str) -> list[str]:
|
|
46
|
+
"""Split *text* on *delimiter*, skipping occurrences inside ``{}`` or ``\"\"``."""
|
|
47
|
+
parts: list[str] = []
|
|
48
|
+
current: list[str] = []
|
|
49
|
+
depth = 0
|
|
50
|
+
in_quote = False
|
|
51
|
+
i = 0
|
|
52
|
+
dlen = len(delimiter)
|
|
53
|
+
while i < len(text):
|
|
54
|
+
ch = text[i]
|
|
55
|
+
if ch == '"' and depth == 0:
|
|
56
|
+
in_quote = not in_quote
|
|
57
|
+
current.append(ch)
|
|
58
|
+
elif ch == '{' and not in_quote:
|
|
59
|
+
depth += 1
|
|
60
|
+
current.append(ch)
|
|
61
|
+
elif ch == '}' and not in_quote:
|
|
62
|
+
depth = max(depth - 1, 0)
|
|
63
|
+
current.append(ch)
|
|
64
|
+
elif (
|
|
65
|
+
not in_quote
|
|
66
|
+
and depth == 0
|
|
67
|
+
and text[i : i + dlen] == delimiter
|
|
68
|
+
):
|
|
69
|
+
parts.append("".join(current))
|
|
70
|
+
current = []
|
|
71
|
+
i += dlen
|
|
72
|
+
continue
|
|
73
|
+
else:
|
|
74
|
+
current.append(ch)
|
|
75
|
+
i += 1
|
|
76
|
+
parts.append("".join(current))
|
|
77
|
+
return parts
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Arrow regex fragments (compiled once)
|
|
81
|
+
_ARROW_PATTERNS = [
|
|
82
|
+
# order matters: longer/more-specific first
|
|
83
|
+
(re.compile(r"==>"), "solid"), # parallel (arrow style is solid; layout implies parallel)
|
|
84
|
+
(re.compile(r"\.\.>"), "dashed"),
|
|
85
|
+
(re.compile(r"-->"), "solid"),
|
|
86
|
+
(re.compile(r"[Xx]>"), "failed"),
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _find_arrows(line: str) -> list[tuple[int, int, str, str | None]]:
|
|
91
|
+
"""Return ``[(start, end, arrow_style, label_or_None), ...]`` for every
|
|
92
|
+
arrow token in *line* that is outside ``{}`` and ``\"\"``."""
|
|
93
|
+
arrows: list[tuple[int, int, str, str | None]] = []
|
|
94
|
+
depth = 0
|
|
95
|
+
in_quote = False
|
|
96
|
+
i = 0
|
|
97
|
+
while i < len(line):
|
|
98
|
+
ch = line[i]
|
|
99
|
+
if ch == '"' and depth == 0:
|
|
100
|
+
in_quote = not in_quote
|
|
101
|
+
i += 1
|
|
102
|
+
continue
|
|
103
|
+
if ch == '{' and not in_quote:
|
|
104
|
+
depth += 1
|
|
105
|
+
i += 1
|
|
106
|
+
continue
|
|
107
|
+
if ch == '}' and not in_quote:
|
|
108
|
+
depth = max(depth - 1, 0)
|
|
109
|
+
i += 1
|
|
110
|
+
continue
|
|
111
|
+
if depth > 0 or in_quote:
|
|
112
|
+
i += 1
|
|
113
|
+
continue
|
|
114
|
+
# Try each arrow pattern at position i
|
|
115
|
+
for pat, style in _ARROW_PATTERNS:
|
|
116
|
+
m = pat.match(line, i)
|
|
117
|
+
if m:
|
|
118
|
+
end = m.end()
|
|
119
|
+
# Check for |label| immediately after
|
|
120
|
+
label: str | None = None
|
|
121
|
+
if end < len(line) and line[end] == '|':
|
|
122
|
+
pipe_close = line.find('|', end + 1)
|
|
123
|
+
if pipe_close > end:
|
|
124
|
+
label = line[end + 1 : pipe_close]
|
|
125
|
+
end = pipe_close + 1
|
|
126
|
+
arrows.append((m.start(), end, style, label))
|
|
127
|
+
i = end
|
|
128
|
+
break
|
|
129
|
+
else:
|
|
130
|
+
i += 1
|
|
131
|
+
return arrows
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _strip_trailing_yield(line: str) -> tuple[str, str | None]:
|
|
135
|
+
"""Remove a trailing ``(N%)`` yield annotation that is outside ``{}``.
|
|
136
|
+
|
|
137
|
+
Returns ``(cleaned_line, yield_string_or_None)``.
|
|
138
|
+
"""
|
|
139
|
+
depth = 0
|
|
140
|
+
in_quote = False
|
|
141
|
+
last_close = -1
|
|
142
|
+
for i, ch in enumerate(line):
|
|
143
|
+
if ch == '"' and depth == 0:
|
|
144
|
+
in_quote = not in_quote
|
|
145
|
+
elif ch == '{' and not in_quote:
|
|
146
|
+
depth += 1
|
|
147
|
+
elif ch == '}' and not in_quote:
|
|
148
|
+
depth = max(depth - 1, 0)
|
|
149
|
+
elif ch == ')' and depth == 0 and not in_quote:
|
|
150
|
+
last_close = i
|
|
151
|
+
if last_close < 1:
|
|
152
|
+
return line, None
|
|
153
|
+
# Walk backwards from last_close to find matching '('
|
|
154
|
+
depth = 0
|
|
155
|
+
in_quote = False
|
|
156
|
+
open_pos = -1
|
|
157
|
+
for i in range(last_close, -1, -1):
|
|
158
|
+
ch = line[i]
|
|
159
|
+
if ch == ')':
|
|
160
|
+
depth += 1
|
|
161
|
+
elif ch == '(':
|
|
162
|
+
depth -= 1
|
|
163
|
+
if depth == 0:
|
|
164
|
+
open_pos = i
|
|
165
|
+
break
|
|
166
|
+
if open_pos < 0:
|
|
167
|
+
return line, None
|
|
168
|
+
content = line[open_pos + 1 : last_close]
|
|
169
|
+
if '%' in content:
|
|
170
|
+
return line[:open_pos].rstrip(), content
|
|
171
|
+
return line, None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# ── Species token parsing ─────────────────────────────────────────────
|
|
175
|
+
|
|
176
|
+
# Pattern for a bare identifier: a letter or underscore followed by any run
# of letters, digits, underscores, or hyphens (anchored to the whole string).
_ID_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_-]*$')
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _parse_species_token(
|
|
180
|
+
token: str, lineno: int | None = None
|
|
181
|
+
) -> tuple[str, str | None, str | None]:
|
|
182
|
+
"""Parse a single species token into ``(id, smiles_or_None, label_or_None)``.
|
|
183
|
+
|
|
184
|
+
Accepted forms::
|
|
185
|
+
|
|
186
|
+
ArBr → ("ArBr", None, None)
|
|
187
|
+
ArBr{BrC1=CC=CC=C1} → ("ArBr", "BrC1…", None)
|
|
188
|
+
"1"{BrC1=CC=CC=C1} → ("1", "BrC1…", "1")
|
|
189
|
+
"1" → ("1", None, "1")
|
|
190
|
+
"""
|
|
191
|
+
token = token.strip()
|
|
192
|
+
if not token:
|
|
193
|
+
raise ParseError("Empty species token", lineno)
|
|
194
|
+
|
|
195
|
+
smiles: str | None = None
|
|
196
|
+
label: str | None = None
|
|
197
|
+
|
|
198
|
+
# Extract {SMILES}
|
|
199
|
+
brace_open = -1
|
|
200
|
+
depth = 0
|
|
201
|
+
in_q = False
|
|
202
|
+
for i, ch in enumerate(token):
|
|
203
|
+
if ch == '"':
|
|
204
|
+
in_q = not in_q
|
|
205
|
+
elif ch == '{' and not in_q and depth == 0:
|
|
206
|
+
brace_open = i
|
|
207
|
+
depth += 1
|
|
208
|
+
elif ch == '{' and not in_q:
|
|
209
|
+
depth += 1
|
|
210
|
+
elif ch == '}' and not in_q:
|
|
211
|
+
depth -= 1
|
|
212
|
+
|
|
213
|
+
if brace_open >= 0:
|
|
214
|
+
brace_close = token.rfind('}')
|
|
215
|
+
if brace_close <= brace_open:
|
|
216
|
+
raise ParseError(f"Unclosed '{{' in species: {token}", lineno)
|
|
217
|
+
smiles = token[brace_open + 1 : brace_close]
|
|
218
|
+
id_part = token[:brace_open].strip()
|
|
219
|
+
else:
|
|
220
|
+
id_part = token
|
|
221
|
+
|
|
222
|
+
# Quoted ID → also becomes label
|
|
223
|
+
if id_part.startswith('"') and id_part.endswith('"') and len(id_part) >= 2:
|
|
224
|
+
id_str = id_part[1:-1]
|
|
225
|
+
label = id_str
|
|
226
|
+
elif id_part.startswith('"'):
|
|
227
|
+
raise ParseError(f"Unclosed quote in species ID: {token}", lineno)
|
|
228
|
+
else:
|
|
229
|
+
id_str = id_part
|
|
230
|
+
|
|
231
|
+
if not id_str:
|
|
232
|
+
raise ParseError(f"Empty species ID: {token}", lineno)
|
|
233
|
+
|
|
234
|
+
return id_str, smiles, label
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# ── Condition items parsing ───────────────────────────────────────────
|
|
238
|
+
|
|
239
|
+
def _parse_condition_items(
    text: str,
    structures: dict[str, StructureRef],
    lineno: int | None = None,
) -> ArrowContent:
    """Parse a comma-separated list of condition items into :class:`ArrowContent`.

    Rules:
      - ``"quoted text"``  → text label
      - ``bare_id``        → structure reference
      - ``bare_id{SMILES}`` → define structure inline + reference it

    Commas inside ``{}`` or double quotes do not separate items, and empty
    items are skipped.

    *structures* is updated in place: an inline definition registers a new
    :class:`StructureRef` when the ID is unknown; for an ID that is already
    registered, the inline SMILES and label only fill fields that are still
    ``None`` (existing values are never overwritten).

    Raises:
        ParseError: propagated from the species-token parser for a malformed
            inline definition; *lineno* is attached to the error.
    """
    ac = ArrowContent()
    for raw in _split_respecting(text.strip(), ","):
        item = raw.strip()
        if not item:
            continue
        if item.startswith('"') and item.endswith('"') and len(item) >= 2:
            # Text label
            ac.text.append(item[1:-1])
        elif '{' in item:
            # Inline structure definition
            sid, smi, lbl = _parse_species_token(item, lineno)
            if sid not in structures:
                structures[sid] = StructureRef(id=sid, smiles=smi, label=lbl)
            else:
                # Back-fill SMILES / label the same way the reaction-chain
                # registration does, so a quoted ID given here is not lost.
                existing = structures[sid]
                if smi and existing.smiles is None:
                    existing.smiles = smi
                if lbl and existing.label is None:
                    existing.label = lbl
            ac.structures.append(sid)
        else:
            # Bare structure reference
            ac.structures.append(item)
    return ac
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# ── Run arrow parsing ─────────────────────────────────────────────────
|
|
275
|
+
|
|
276
|
+
def _parse_run_arrow(text: str, lineno: int | None = None) -> RunArrowEntry:
    """Turn ``"input" -> "output"`` into a :class:`RunArrowEntry`.

    The text is split on the literal ``->``; each side is trimmed of
    surrounding whitespace and then of surrounding double quotes.

    Raises:
        ParseError: if the text does not contain exactly one ``->`` or if
            either side is empty after trimming.
    """
    pieces = text.split('->')
    if len(pieces) != 2:
        raise ParseError(
            f"Run arrow must have exactly one '->': {text!r}", lineno
        )
    source_label, target_label = (piece.strip().strip('"') for piece in pieces)
    if not (source_label and target_label):
        raise ParseError(f"Empty input or output in run arrow: {text!r}", lineno)
    return RunArrowEntry(input_label=source_label, output_label=target_label)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# ── Main parser ───────────────────────────────────────────────────────
|
|
291
|
+
|
|
292
|
+
def parse_compact(text: str) -> SchemeDescriptor:
    """Parse compact syntax text into a :class:`SchemeDescriptor`.

    The text is processed line by line.  Recognised line kinds, in the order
    they are tested:

    * blank lines and ``#`` comments (skipped);
    * ``---`` row separators (rejected);
    * ``@key value`` directives; ``@conditions`` opens a block of
      ``(a) "text"`` condition-key entries;
    * ``step N:`` headers, which set the step that later un-indexed
      ``above:`` / ``below:`` / ``run:`` lines attach to;
    * ``id: {SMILES} [label "..."]``, ``id: name "..."`` and
      ``id: file "..."`` structure definitions;
    * indented ``above:`` / ``below:`` / ``run:`` lines, optionally
      step-indexed (``[N] above:``, ``run[N]:``);
    * exactly one reaction chain line containing at least one arrow.

    After the scan the reaction chain is split at its arrows into steps, the
    collected conditions and run arrows are attached, and the layout, wrap,
    ``steps_per_row`` and title directives are resolved.

    Raises:
        ParseError: on any unrecognised or malformed line, a missing or
            duplicated reaction chain, an empty chain segment, an unknown
            layout/wrap value, a non-integer ``steps_per_row``, or a
            reference to an undefined structure.
    """
    lines = text.split('\n')

    # Collected state
    directives: dict[str, str] = {}
    structures: dict[str, StructureRef] = {}
    reaction_chain_line: str | None = None
    reaction_chain_lineno: int | None = None
    # Conditions collected per step-number (1-indexed); 0 = implicit single-step
    step_above: dict[int, list[tuple[str, int]]] = {}  # step → [(raw_text, lineno)]
    step_below: dict[int, list[tuple[str, int]]] = {}
    step_runs: dict[int, list[tuple[str, int]]] = {}
    condition_key_entries: dict[str, str] = {}
    is_parallel = False

    current_step: int = 0  # 0 = implicit single-step context
    in_conditions_block = False

    for raw_lineno, raw_line in enumerate(lines, start=1):
        line = raw_line.rstrip()

        # Blank line (rstrip() already reduced whitespace-only lines to "")
        if not line:
            continue

        # Comment
        if line.lstrip().startswith('#'):
            continue

        # Row separator — not supported in compact syntax
        if line.strip() == '---':
            raise ParseError(
                "Row separators (---) are not supported in compact syntax. "
                "Use YAML format for stacked-rows layout.",
                raw_lineno,
            )

        # Directive
        if line.startswith('@'):
            in_conditions_block = False
            rest = line[1:].strip()
            if rest.startswith('conditions'):
                in_conditions_block = True
                continue
            parts = rest.split(None, 1)
            if not parts:
                raise ParseError("Empty directive", raw_lineno)
            key = parts[0]
            val = parts[1].strip('"') if len(parts) > 1 else ""
            directives[key] = val
            continue

        # Condition key entry: (a) "..."
        if in_conditions_block:
            m = re.match(r'^\(([a-zA-Z0-9,]+)\)\s*"(.*)"', line.strip())
            if m:
                condition_key_entries[m.group(1)] = m.group(2)
                continue
            # First non-entry line ends the block; fall through to the
            # other line kinds below.
            in_conditions_block = False

        # Step block header: "step N:"
        m_step = re.match(r'^step\s+(\d+)\s*:', line)
        if m_step:
            current_step = int(m_step.group(1))
            continue

        # Definition line: "id: {SMILES}" or "id: name ..." or "id: file ..."
        m_def = re.match(
            r'^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*)', line
        )
        # Only the exact keywords are excluded, so structure IDs that merely
        # begin with "above", "below" or "run" can still be defined.
        if m_def and m_def.group(1) not in ('above', 'below', 'run'):
            sid = m_def.group(1)
            spec = m_def.group(2).strip()
            sref = StructureRef(id=sid)

            if spec.startswith('{'):
                close = spec.rfind('}')
                if close < 0:
                    raise ParseError(f"Unclosed '{{' in definition of {sid}", raw_lineno)
                sref.smiles = spec[1:close]
                remainder = spec[close + 1:].strip()
                # Check for label "..."
                lm = re.match(r'label\s+"([^"]*)"', remainder)
                if lm:
                    sref.label = lm.group(1)
            elif spec.startswith('name'):
                nm = re.match(r'name\s+"([^"]*)"', spec)
                if nm:
                    sref.name = nm.group(1)
                else:
                    raise ParseError(f"Invalid name spec for {sid}", raw_lineno)
            elif spec.startswith('file'):
                fm = re.match(r'file\s+"([^"]*)"', spec)
                if fm:
                    sref.file = fm.group(1)
                else:
                    raise ParseError(f"Invalid file spec for {sid}", raw_lineno)
            else:
                raise ParseError(f"Unknown definition spec for {sid}: {spec}", raw_lineno)

            if sid in structures:
                raise ParseError(f"Duplicate structure definition: {sid}", raw_lineno)
            structures[sid] = sref
            continue

        # Indented condition/run line
        if line.startswith((' ', '\t')):
            stripped = line.strip()

            # Step-indexed: [N] above: ... or [N] below: ...
            m_idx = re.match(r'^\[(\d+)\]\s*(above|below)\s*:\s*(.*)', stripped)
            if m_idx:
                step_num = int(m_idx.group(1))
                position = m_idx.group(2)
                content = m_idx.group(3)
                target = step_above if position == 'above' else step_below
                target.setdefault(step_num, []).append((content, raw_lineno))
                continue

            # above: / below:
            if stripped.startswith('above:') or stripped.startswith('below:'):
                position = 'above' if stripped.startswith('above') else 'below'
                content = stripped.split(':', 1)[1].strip()
                target = step_above if position == 'above' else step_below
                target.setdefault(current_step, []).append((content, raw_lineno))
                continue

            # run: "in" -> "out"  or  run[N]: "in" -> "out"
            m_run = re.match(r'^run(?:\[(\d+)\])?\s*:\s*(.*)', stripped)
            if m_run:
                step_num = int(m_run.group(1)) if m_run.group(1) else current_step
                step_runs.setdefault(step_num, []).append(
                    (m_run.group(2), raw_lineno)
                )
                continue

        # If none of the above matched, try reaction chain
        # A reaction chain must contain at least one arrow
        arrows = _find_arrows(line)
        if arrows:
            if reaction_chain_line is not None:
                raise ParseError(
                    "Multiple reaction chains found (only one per scheme)",
                    raw_lineno,
                )
            reaction_chain_line = line
            reaction_chain_lineno = raw_lineno
            # Detect a parallel arrow from the matched arrow tokens
            # themselves, so a literal "==>" inside quotes or braces does
            # not switch the layout.
            if any(line.startswith('==>', start) for start, _, _, _ in arrows):
                is_parallel = True
            continue

        # Before giving up, check for common syntax errors
        if '{' in line and line.count('{') != line.count('}'):
            raise ParseError(
                "Unclosed '{' in line (mismatched braces)", raw_lineno
            )
        if line.count('"') % 2 != 0:
            raise ParseError(
                "Unclosed quote in line (odd number of '\"')", raw_lineno
            )
        raise ParseError(f"Unrecognized line: {line!r}", raw_lineno)

    # ── Require a reaction chain ──────────────────────────────────
    if reaction_chain_line is None:
        raise ParseError("No reaction chain found (missing --> arrow)")

    # ── Parse reaction chain ──────────────────────────────────────
    chain = reaction_chain_line
    chain, yield_str = _strip_trailing_yield(chain)
    # Arrow offsets must be recomputed against the yield-stripped chain.
    arrows = _find_arrows(chain)
    if not arrows:
        raise ParseError(
            "No arrow found in reaction chain", reaction_chain_lineno
        )

    # Extract segments between arrows
    segments: list[str] = []
    arrow_styles: list[str] = []
    arrow_labels: list[str | None] = []
    prev_end = 0
    for start, end, style, label in arrows:
        segments.append(chain[prev_end:start])
        arrow_styles.append(style)
        arrow_labels.append(label)
        prev_end = end
    segments.append(chain[prev_end:])  # last segment (products of last step)

    # Parse each segment into species
    segment_species: list[list[tuple[str, str | None, str | None]]] = []
    for seg in segments:
        species_tokens = _split_respecting(seg.strip(), '+')
        species_list: list[tuple[str, str | None, str | None]] = []
        for tok in species_tokens:
            tok = tok.strip()
            if not tok:
                continue
            species_list.append(
                _parse_species_token(tok, reaction_chain_lineno)
            )
        segment_species.append(species_list)

    # Validate: no empty segments
    for idx, seg in enumerate(segment_species):
        if not seg:
            if idx == 0:
                raise ParseError(
                    "Missing substrates before first arrow",
                    reaction_chain_lineno,
                )
            else:
                raise ParseError(
                    f"Missing species after arrow {idx}",
                    reaction_chain_lineno,
                )

    # Register all species as StructureRefs
    for seg in segment_species:
        for sid, smi, lbl in seg:
            if sid not in structures:
                structures[sid] = StructureRef(id=sid, smiles=smi, label=lbl)
            else:
                # Update SMILES / label if provided inline and not yet set
                if smi and structures[sid].smiles is None:
                    structures[sid].smiles = smi
                if lbl and structures[sid].label is None:
                    structures[sid].label = lbl

    # ── Build steps ───────────────────────────────────────────────
    num_steps = len(arrows)
    steps: list[StepDescriptor] = []

    for i in range(num_steps):
        substrates = [sid for sid, _, _ in segment_species[i]]
        products = [sid for sid, _, _ in segment_species[i + 1]]
        step_num = i + 1  # 1-indexed

        # Determine above/below content
        # Try step-indexed first, then fall back to implicit (key=0) for single-step
        # NOTE(review): for multi-step chains the key-0 bucket (conditions
        # written without a step header or [N] index) is never looked up
        # here, so such lines are ignored — confirm this is intended.
        above_key = step_num if step_num in step_above else (0 if num_steps == 1 else step_num)
        below_key = step_num if step_num in step_below else (0 if num_steps == 1 else step_num)

        above = ArrowContent()
        for raw_text, ln in step_above.get(above_key, []):
            ac = _parse_condition_items(raw_text, structures, ln)
            above.structures.extend(ac.structures)
            above.text.extend(ac.text)

        below = ArrowContent()
        for raw_text, ln in step_below.get(below_key, []):
            ac = _parse_condition_items(raw_text, structures, ln)
            below.structures.extend(ac.structures)
            below.text.extend(ac.text)

        # Arrow label → populate conditions from letter key if available
        lbl = arrow_labels[i]
        if lbl and condition_key_entries:
            # Letter conditions mode — put the full text below the arrow
            for letter in re.split(r'[,\s]+', lbl):
                letter = letter.strip()
                if letter in condition_key_entries:
                    below.text.append(condition_key_entries[letter])

        step = StepDescriptor(
            substrates=substrates,
            products=products,
            above_arrow=above if (above.structures or above.text) else None,
            below_arrow=below if (below.structures or below.text) else None,
            # Only the final step can carry the trailing yield annotation.
            yield_=(yield_str if i == num_steps - 1 else None),
            number=step_num if num_steps > 1 else None,
            arrow_style=arrow_styles[i],
        )
        steps.append(step)

    # ── Build run arrows ──────────────────────────────────────────
    run_arrows: list[StepRunArrows] = []
    for step_key, run_lines in sorted(step_runs.items()):
        # Map key 0 → step 1
        step_num = step_key if step_key > 0 else 1
        entries: list[RunArrowEntry] = []
        for raw_text, ln in run_lines:
            entries.append(_parse_run_arrow(raw_text, ln))
        run_arrows.append(StepRunArrows(step=step_num, runs=entries))

    # ── Determine layout ──────────────────────────────────────────
    # An explicit @layout directive always wins; otherwise the layout is
    # inferred from the arrow kind and the number of steps.
    explicit_layout = 'layout' in directives
    layout = directives.get('layout', 'linear')
    if not explicit_layout:
        if is_parallel:
            layout = 'numbered-parallel'
        elif num_steps > 1:
            layout = 'sequential'
    if layout not in VALID_LAYOUTS:
        raise ParseError(f"Unknown layout: {layout!r}")

    wrap = directives.get('wrap', 'repeat')
    if wrap not in VALID_WRAPS:
        raise ParseError(f"Unknown wrap: {wrap!r}")

    steps_per_row: int | None = None
    if 'steps_per_row' in directives:
        try:
            steps_per_row = int(directives['steps_per_row'])
        except ValueError as exc:
            raise ParseError(
                f"steps_per_row must be integer: {directives['steps_per_row']!r}"
            ) from exc

    title = directives.get('title')

    cond_key = condition_key_entries if condition_key_entries else None

    # ── Validate references ───────────────────────────────────────
    for step in steps:
        for sid in step.substrates + step.products:
            if sid not in structures:
                raise ParseError(f"Undefined structure reference: {sid!r}")
        for ac in (step.above_arrow, step.below_arrow):
            if ac:
                for sid in ac.structures:
                    if sid not in structures:
                        raise ParseError(f"Undefined structure reference in conditions: {sid!r}")

    return SchemeDescriptor(
        structures=structures,
        steps=steps,
        layout=layout,
        wrap=wrap,
        steps_per_row=steps_per_row,
        title=title,
        run_arrows=run_arrows,
        condition_key=cond_key,
    )
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def parse_compact_file(path: str) -> SchemeDescriptor:
    """Load the UTF-8 text file at *path* and parse it as compact syntax.

    Any :class:`ParseError` raised by the parser, and any ``OSError`` raised
    while opening or reading the file, propagates to the caller.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        source_text = handle.read()
    return parse_compact(source_text)
|