cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2948 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scheme_reader.py — Read CDXML reaction schemes into structured descriptions.
|
|
4
|
+
|
|
5
|
+
The semantic inverse of the DSL renderer: takes a CDXML file containing a
|
|
6
|
+
reaction scheme (single or multi-step) and produces a structured JSON with
|
|
7
|
+
a species registry, reaction graph, topology classification, and a natural
|
|
8
|
+
language narrative suitable for LLM consumption.
|
|
9
|
+
|
|
10
|
+
Two parsing strategies (tried in order):
|
|
11
|
+
1. Step-attribute path — reads <scheme><step> attributes
|
|
12
|
+
(ReactionStepReactants/Products/Above/Below).
|
|
13
|
+
2. Geometry-based fallback — assigns roles by spatial position relative
|
|
14
|
+
to arrows.
|
|
15
|
+
|
|
16
|
+
CLI:
|
|
17
|
+
python -m cdxml_toolkit.perception.scheme_reader scheme.cdxml -o description.json
|
|
18
|
+
python -m cdxml_toolkit.perception.scheme_reader scheme.cdxml --narrative-only
|
|
19
|
+
|
|
20
|
+
Python API:
|
|
21
|
+
from cdxml_toolkit.perception.scheme_reader import read_scheme
|
|
22
|
+
desc = read_scheme("scheme.cdxml")
|
|
23
|
+
print(desc.narrative)
|
|
24
|
+
desc.to_json("description.json")
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import argparse
|
|
30
|
+
import json
|
|
31
|
+
import os
|
|
32
|
+
import re
|
|
33
|
+
import sys
|
|
34
|
+
from collections import defaultdict
|
|
35
|
+
from dataclasses import dataclass, field, asdict
|
|
36
|
+
from typing import Any, Dict, List, Optional, Tuple, Set
|
|
37
|
+
from xml.etree import ElementTree as ET
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Logging
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
_verbose = False


def _log(msg: str) -> None:
    """Emit a tagged diagnostic line on stderr when verbose mode is enabled.

    Does nothing while the module-level ``_verbose`` flag is falsy.
    """
    if not _verbose:
        return
    print(f" [scheme_reader] {msg}", file=sys.stderr)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# Data model
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
@dataclass
class SpeciesRecord:
    """A single chemical entity found in the scheme.

    Every field has a default, so a record can be built incrementally.
    """
    # Registry key, e.g. "species_0".
    id: str = ""
    # ``id`` attribute of the originating CDXML element.
    cdxml_element_id: str = ""
    # Either "fragment" or "text".
    element_type: str = ""
    # Canonical SMILES with abbreviations resolved.
    smiles: Optional[str] = None
    # SMILES without abbreviation expansion.
    smiles_raw: Optional[str] = None
    # Display name / content of the text label.
    name: Optional[str] = None
    # Molecular formula.
    formula: Optional[str] = None
    # Average molecular weight.
    mw: Optional[float] = None
    # Compound number such as "1" or "2a".
    label: Optional[str] = None
    # IUPAC name (from ChemScript or PubChem).
    iupac_name: Optional[str] = None
    # Aligned IUPAC name (from aligned_namer).
    aligned_iupac: Optional[str] = None
    # For text species only; one of "chemical", "condition_ref",
    # "footnote", "yield", "compound_label", "citation", "bioactivity".
    text_category: Optional[str] = None
    # True if the reagent_db role is "solvent".
    is_solvent: bool = False
    # Stoichiometry text, e.g. "1.2 eq" or "5 mol%".
    equiv_text: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a dict, leaving out ``None`` and ``False``.

        Empty strings and numeric zeros are kept; only the two singletons
        are filtered (identity comparison).
        """
        compact = {}
        for key, value in asdict(self).items():
            if value is None or value is False:
                continue
            compact[key] = value
        return compact
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class StepRecord:
    """A single reaction step pulled out of the scheme."""
    # Zero-based position of the step.
    step_index: int = 0
    reactant_ids: List[str] = field(default_factory=list)
    product_ids: List[str] = field(default_factory=list)
    reagent_ids: List[str] = field(default_factory=list)
    conditions: List[str] = field(default_factory=list)
    condition_text_raw: List[str] = field(default_factory=list)
    yield_text: Optional[str] = None
    # One of "solid", "dashed", "failed".
    arrow_style: str = "solid"
    arrow_cdxml_id: Optional[str] = None
    # Human-readable change summary, e.g. "bromo → phenyl".
    molecular_diff_text: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a dict without empty entries.

        Values that are ``None``, an empty list, or an empty string are
        omitted; everything else (including ``step_index == 0``) is kept.
        """
        def _is_blank(value) -> bool:
            return value is None or value == [] or value == ""

        return {key: value
                for key, value in asdict(self).items()
                if not _is_blank(value)}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class ScopeEntry:
    """A single row/cell of a substrate scope table."""
    # Entry key: "scope_0", "scope_1", ...
    entry_id: str = ""
    # ``SpeciesRecord.id`` of the structure shown for this entry.
    species_id: str = ""
    # Compound number, e.g. "5.70a".
    label: Optional[str] = None
    # Variant of the conditions, e.g. "X = I" or "X = Br".
    conditions_variant: Optional[str] = None
    # Yield text, e.g. "39%".
    yield_text: Optional[str] = None
    # Isolated mass text, e.g. "22 mg".
    mass_text: Optional[str] = None
    # Free-form remarks, e.g. "Scale-up: 130 mg, 16%".
    notes: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a dict, omitting those that are ``None``."""
        populated = {}
        for key, value in asdict(self).items():
            if value is not None:
                populated[key] = value
        return populated
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class SchemeDescription:
    """Complete structured description of a reaction scheme.

    Holds the species registry, the ordered reaction steps, optional
    substrate-scope entries and nested sub-schemes, plus the narrative
    text and any warnings collected while reading the scheme.
    """
    version: str = "1.0"
    source_file: str = ""
    topology: str = "linear"
    # One of "synthesis", "sar_design", "biological_pathway",
    # "target_array", "literature_comparison", "composite",
    # "investigation", "unknown", "substrate_scope".
    content_type: str = ""
    num_steps: int = 0
    species: Dict[str, SpeciesRecord] = field(default_factory=dict)
    steps: List[StepRecord] = field(default_factory=list)
    scope_entries: List[ScopeEntry] = field(default_factory=list)
    sub_schemes: List["SchemeDescription"] = field(default_factory=list)
    narrative: str = ""
    warnings: List[str] = field(default_factory=list)
    # --- Spatial assignment metadata (v1.1) ---
    # Layout detected by the spatial engine.
    layout_pattern: Optional[str] = None
    # "geometry" or "step_attribute".
    parse_method: str = ""
    assignment_confidences: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialise to a plain dict suitable for ``json.dump``.

        The core keys are always present; ``content_type``,
        ``scope_entries``, ``sub_schemes``, ``layout_pattern``,
        ``parse_method`` and ``assignment_confidences`` are only written
        when they hold a truthy value.
        """
        d = {
            "version": self.version,
            "source_file": self.source_file,
            "topology": self.topology,
            "num_steps": self.num_steps,
            "species": {k: v.to_dict() for k, v in self.species.items()},
            "steps": [s.to_dict() for s in self.steps],
            "narrative": self.narrative,
            "warnings": self.warnings,
        }
        if self.content_type:
            d["content_type"] = self.content_type
        if self.scope_entries:
            d["scope_entries"] = [e.to_dict() for e in self.scope_entries]
        if self.sub_schemes:
            d["sub_schemes"] = [s.to_dict() for s in self.sub_schemes]
        if self.layout_pattern:
            d["layout_pattern"] = self.layout_pattern
        if self.parse_method:
            d["parse_method"] = self.parse_method
        if self.assignment_confidences:
            d["assignment_confidences"] = self.assignment_confidences
        return d

    def to_json(self, path: str, pretty: bool = True) -> None:
        """Write ``to_dict()`` to *path* as UTF-8 JSON.

        With ``pretty`` the output is indented by two spaces; non-ASCII
        characters are written verbatim rather than escaped.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2 if pretty else None,
                      ensure_ascii=False)

    @classmethod
    def from_json(cls, path: str) -> "SchemeDescription":
        """Load a description from a UTF-8 JSON file at *path*."""
        with open(path, "r", encoding="utf-8") as f:
            return cls.from_dict(json.load(f))

    @classmethod
    def from_dict(cls, d: dict) -> "SchemeDescription":
        """Rebuild a description from a dict produced by ``to_dict()``.

        Keys that the record dataclasses do not declare are ignored, and
        missing keys fall back to the field defaults.  The spatial
        assignment metadata (``layout_pattern``, ``parse_method``,
        ``assignment_confidences``) is restored as well, so that
        ``from_dict(x.to_dict())`` round-trips everything ``to_dict()``
        writes.
        """
        def _build(record_cls, payload):
            # Pass through only the keys the dataclass actually declares.
            declared = record_cls.__dataclass_fields__
            return record_cls(
                **{name: payload[name] for name in declared if name in payload})

        species = {sp_id: _build(SpeciesRecord, sp)
                   for sp_id, sp in d.get("species", {}).items()}
        steps = [_build(StepRecord, s) for s in d.get("steps", [])]
        scope_entries = [_build(ScopeEntry, se)
                         for se in d.get("scope_entries", [])]
        sub_schemes = [cls.from_dict(sd)
                       for sd in d.get("sub_schemes", [])]
        return cls(
            version=d.get("version", "1.0"),
            source_file=d.get("source_file", ""),
            topology=d.get("topology", "linear"),
            content_type=d.get("content_type", ""),
            num_steps=d.get("num_steps", 0),
            species=species,
            steps=steps,
            scope_entries=scope_entries,
            sub_schemes=sub_schemes,
            narrative=d.get("narrative", ""),
            warnings=d.get("warnings", []),
            # Previously dropped on load even though to_dict() emits them.
            layout_pattern=d.get("layout_pattern"),
            parse_method=d.get("parse_method", ""),
            assignment_confidences=dict(
                d.get("assignment_confidences") or {}),
        )

    def to_scheme_descriptor(self) -> "SchemeDescriptor":
        """Convert to a DSL SchemeDescriptor for round-trip rendering.

        Species with a SMILES or a name become ``StructureRef`` entries
        (the name is only passed when there is no SMILES).  For each step,
        fragment reagents that have a SMILES are placed above the arrow;
        other reagents with a name, followed by the step's condition
        strings, become text below the arrow.
        """
        from ..render.schema import (SchemeDescriptor, StepDescriptor,
                                     ArrowContent, StructureRef)

        structures = {}
        for sp_id, sp in self.species.items():
            if sp.smiles or sp.name:
                structures[sp_id] = StructureRef(
                    id=sp_id,
                    smiles=sp.smiles,
                    name=sp.name if not sp.smiles else None,
                    label=sp.label,
                )

        dsl_steps = []
        for step in self.steps:
            above = ArrowContent()
            below = ArrowContent()

            for rid in step.reagent_ids:
                sp = self.species.get(rid)
                if sp and sp.element_type == "fragment" and sp.smiles:
                    above.structures.append(rid)
                elif sp and sp.name:
                    below.text.append(sp.name)

            below.text.extend(step.conditions)

            sd = StepDescriptor(
                substrates=list(step.reactant_ids),
                products=list(step.product_ids),
                above_arrow=above if (above.structures or above.text) else None,
                below_arrow=below if below.text else None,
                yield_=step.yield_text,
                arrow_style=step.arrow_style,
            )
            dsl_steps.append(sd)

        # Map reader topology names onto renderer layout names; a
        # "linear" topology with more than one step renders as
        # "sequential", and unknown topologies fall back to "sequential".
        layout_map = {
            "linear": "linear" if len(dsl_steps) <= 1 else "sequential",
            "divergent": "divergent",
            "convergent": "convergent",
            "parallel": "stacked-rows",
            "mixed": "sequential",
        }

        return SchemeDescriptor(
            structures=structures,
            steps=dsl_steps,
            layout=layout_map.get(self.topology, "sequential"),
        )
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
# ---------------------------------------------------------------------------
|
|
258
|
+
# Internal intermediate structure
|
|
259
|
+
# ---------------------------------------------------------------------------
|
|
260
|
+
|
|
261
|
+
@dataclass
class _RawStep:
    """A parsed step in raw form, before the species registry exists.

    All references are CDXML element ids rather than species ids.
    """
    # Id of the originating step element (or arrow, for synthetic steps).
    step_elem_id: str = ""
    # Element ids on the reactant side.
    reactant_elem_ids: List[str] = field(default_factory=list)
    # Element ids on the product side.
    product_elem_ids: List[str] = field(default_factory=list)
    # Element ids placed above the arrow.
    above_arrow_ids: List[str] = field(default_factory=list)
    # Element ids placed below the arrow.
    below_arrow_ids: List[str] = field(default_factory=list)
    # Id of the arrow element for this step, if one is known.
    arrow_elem_id: Optional[str] = None
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
# Text extraction helpers
|
|
274
|
+
# ---------------------------------------------------------------------------
|
|
275
|
+
|
|
276
|
+
def _get_text_content(t_elem: ET.Element) -> str:
    """Return the concatenated text of every ``<s>`` run under *t_elem*.

    Runs without text are skipped; the joined result is stripped of
    leading and trailing whitespace.
    """
    runs = [run.text for run in t_elem.iter("s") if run.text]
    return "".join(runs).strip()
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# A number (optionally decimal) followed by a percent sign.
_YIELD_RE = re.compile(r"(\d+(?:\.\d+)?\s*%)")
# The abbreviation "quant" / "quant." as a standalone word.
_QUANT_RE = re.compile(r"\bquant\.?\b", re.IGNORECASE)
# Compound-label shapes: "1", "12a", "(iv)", or a single letter.
_LABEL_RE = re.compile(r"^[1-9]\d{0,2}[a-z]?$|^\([ivx]+\)$|^[a-z]$",
                       re.IGNORECASE)


def _extract_yield_from_text(text: str) -> Optional[str]:
    """Pull a yield out of *text*.

    Returns the first percentage found (as written, including any space
    before ``%``), otherwise ``"quant."`` when the word "quant" occurs,
    otherwise ``None``.
    """
    percent = _YIELD_RE.search(text)
    if percent is not None:
        return percent.group(1)
    return "quant." if _QUANT_RE.search(text) else None
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ---------------------------------------------------------------------------
|
|
302
|
+
# Arrow helpers
|
|
303
|
+
# ---------------------------------------------------------------------------
|
|
304
|
+
|
|
305
|
+
def _arrow_endpoints(arrow: ET.Element) -> Tuple[float, float, float, float]:
    """Give the ``(tail_x, tail_y, head_x, head_y)`` tuple for *arrow*.

    Thin local wrapper: the work is delegated to
    ``cdxml_utils.arrow_endpoints``, imported lazily at call time.
    """
    from ..cdxml_utils import arrow_endpoints as _impl

    endpoints = _impl(arrow)
    return endpoints
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _resolve_arrow(page: ET.Element, arrow_id: str,
                   id_map: Dict[str, ET.Element]) -> Optional[ET.Element]:
    """Look up the arrow element for *arrow_id*.

    Resolution order:

    1. ``id_map[arrow_id]`` when that element is an ``<arrow>``.
    2. When it is a ``<graphic>`` with a ``SupersededBy`` attribute, the
       element that attribute points to in *id_map*.
    3. A scan of the direct children of *page* for a ``<graphic>`` with
       this id whose ``SupersededBy`` target is also a direct child.

    Only a single ``SupersededBy`` hop is followed.  Returns ``None`` if
    nothing matches.
    """
    mapped = id_map.get(arrow_id)
    if mapped is not None:
        if mapped.tag == "arrow":
            return mapped
        if mapped.tag == "graphic":
            successor_id = mapped.get("SupersededBy", "")
            if successor_id:
                successor = id_map.get(successor_id)
                if successor is not None:
                    return successor

    # Fall back to walking the page for a graphic → arrow link.
    for graphic in page:
        if graphic.tag != "graphic" or graphic.get("id") != arrow_id:
            continue
        successor_id = graphic.get("SupersededBy", "")
        if not successor_id:
            continue
        successor = next(
            (node for node in page if node.get("id") == successor_id), None)
        if successor is not None:
            return successor
    return None
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _detect_arrow_style(arrow: Optional[ET.Element]) -> str:
    """Classify *arrow* as ``"solid"``, ``"dashed"`` or ``"failed"``.

    ``NoGo="Cross"`` (an X drawn on the arrow) wins and yields
    ``"failed"``.  Otherwise a ``LineType`` of dash/dashed/dot, or an
    ``ArrowheadType`` of dashed (both case-insensitive), yields
    ``"dashed"``.  A missing arrow and everything else are ``"solid"``.
    """
    if arrow is None:
        return "solid"
    if arrow.get("NoGo") == "Cross":
        return "failed"

    dashed_line = arrow.get("LineType", "").lower() in ("dash", "dashed", "dot")
    dashed_head = arrow.get("ArrowheadType", "").lower() == "dashed"
    return "dashed" if (dashed_line or dashed_head) else "solid"
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _find_all_arrows(page: ET.Element) -> List[ET.Element]:
    """Collect the reaction arrows among the direct children of *page*.

    ``<arrow>`` elements come first, in document order, followed by
    ``<graphic>`` elements that have ``GraphicType="Line"`` and a
    non-empty ``ArrowType``.  An element whose id was already collected
    is skipped.
    """
    collected: List[ET.Element] = []
    taken_ids: Set[str] = set()

    def _keep(node: ET.Element) -> None:
        node_id = node.get("id", "")
        if node_id in taken_ids:
            return
        taken_ids.add(node_id)
        collected.append(node)

    for node in page:
        if node.tag == "arrow":
            _keep(node)

    # Second pass: line graphics that carry an arrow head.
    for node in page:
        is_line_arrow = (node.tag == "graphic"
                         and node.get("GraphicType") == "Line"
                         and node.get("ArrowType"))
        if is_line_arrow:
            _keep(node)

    return collected
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ---------------------------------------------------------------------------
|
|
374
|
+
# Step-attribute parsing (primary path)
|
|
375
|
+
# ---------------------------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
def _parse_from_step_attributes(page: ET.Element,
                                id_map: Dict[str, ET.Element],
                                scheme_filter: Optional[Set[str]] = None,
                                ) -> List[_RawStep]:
    """Build raw steps from the attributes of ``<scheme><step>`` elements.

    Every ``<scheme>`` on the page is visited (stacked-rows layouts may
    contain several).  Direct children of *page* are preferred; nested
    ``<scheme>`` elements are only searched when there are none.

    Parameters
    ----------
    page : ET.Element
        Element whose ``<scheme>`` descendants are read.
    id_map : dict
        Element-id → element mapping; only used to log ids that a step
        references but that are absent from the map.
    scheme_filter : set of str, optional
        When given, ``<scheme>`` elements whose ``id`` is not in the set
        are skipped.  Used by the segmenter to parse a single sub-scheme
        from a multi-panel file.

    Returns
    -------
    list of _RawStep
        One entry per ``<step>``, in document order.  When a step lists
        several arrows only the first id is kept.
    """
    def _id_list(step_node: ET.Element, attribute: str) -> List[str]:
        # Whitespace-separated id list; empty tokens are discarded.
        return [token for token in step_node.get(attribute, "").split()
                if token]

    # Top-level schemes first; fall back to deeper nesting if none.
    scheme_nodes = page.findall("scheme") or page.findall(".//scheme")

    collected: List[_RawStep] = []
    for scheme_node in scheme_nodes:
        filtered_out = (scheme_filter is not None
                        and scheme_node.get("id", "") not in scheme_filter)
        if filtered_out:
            continue

        for step_node in scheme_node.findall("step"):
            step_id = step_node.get("id", "")

            reactants = _id_list(step_node, "ReactionStepReactants")
            products = _id_list(step_node, "ReactionStepProducts")
            above = _id_list(step_node, "ReactionStepObjectsAboveArrow")
            below = _id_list(step_node, "ReactionStepObjectsBelowArrow")
            arrows = _id_list(step_node, "ReactionStepArrows")

            # Report dangling references; the ids are kept regardless.
            for ref in (*reactants, *products, *above, *below):
                if ref not in id_map:
                    _log(f"Warning: element id {ref} in step {step_id} "
                         f"not found in page")

            collected.append(_RawStep(
                step_elem_id=step_id,
                reactant_elem_ids=reactants,
                product_elem_ids=products,
                above_arrow_ids=above,
                below_arrow_ids=below,
                arrow_elem_id=arrows[0] if arrows else None,
            ))

    return collected
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
# ---------------------------------------------------------------------------
|
|
443
|
+
# Orphan transition-arrow recovery (serpentine layouts)
|
|
444
|
+
# ---------------------------------------------------------------------------
|
|
445
|
+
|
|
446
|
+
def _recover_orphan_transition_steps(
    page: ET.Element,
    raw_steps: List[_RawStep],
    id_map: Dict[str, ET.Element],
) -> List[_RawStep]:
    """Recover reaction steps from orphan vertical arrows.

    In serpentine layouts the DSL renderer emits vertical transition arrows
    outside any ``<scheme><step>`` element.  The step-attribute parser
    therefore misses them, leaving disconnected row-groups that the topology
    detector wrongly classifies as "parallel".

    This function detects those orphan vertical arrows, spatially resolves
    their nearest reactant/product fragments, collects nearby condition text,
    and inserts synthetic ``_RawStep`` entries at the correct position in
    *raw_steps* so that the downstream species-registry and topology
    detector see a fully-connected chain.

    An arrow counts as already claimed when a step references its id
    directly, or references a ``<graphic>`` whose ``SupersededBy``
    attribute points at it.  Candidates whose nearest reactant and product
    resolve to the same fragment are discarded.

    Parameters
    ----------
    page : ET.Element
        The ``<page>`` element of the parsed CDXML.
    raw_steps : list of _RawStep
        The steps already found by the step-attribute parser (mutated
        in-place via insertion).
    id_map : dict
        Element-id → element mapping for the page.

    Returns
    -------
    list of _RawStep
        The *raw_steps* list, possibly with additional entries inserted.
    """
    # Nothing to bridge when no step was parsed.
    if not raw_steps:
        return raw_steps

    import math

    from ..cdxml_utils import arrow_endpoints as _ae, fragment_centroid

    # One pass over the existing steps collects everything they claim.
    claimed_arrow_ids: Set[str] = set()
    claimed_elem_ids: Set[str] = set()
    # The orphan arrow's reactant must be a product of an existing step.
    existing_product_eids: Set[str] = set()
    step_arrow_lengths: List[float] = []
    for rs in raw_steps:
        claimed_elem_ids.update(rs.reactant_elem_ids)
        claimed_elem_ids.update(rs.product_elem_ids)
        claimed_elem_ids.update(rs.above_arrow_ids)
        claimed_elem_ids.update(rs.below_arrow_ids)
        existing_product_eids.update(rs.product_elem_ids)

        if not rs.arrow_elem_id:
            continue
        claimed_arrow_ids.add(rs.arrow_elem_id)
        a_el = id_map.get(rs.arrow_elem_id)
        if a_el is None:
            continue
        if a_el.tag == "graphic":
            # A step may reference the legacy <graphic>; the <arrow> that
            # supersedes it is the same arrow and must not look orphaned.
            sup_id = a_el.get("SupersededBy", "")
            if sup_id:
                claimed_arrow_ids.add(sup_id)
        atx, aty, ahx, ahy = _ae(a_el)
        step_arrow_lengths.append(math.hypot(ahx - atx, ahy - aty))

    # Length threshold: serpentine transition arrows are comparable in
    # size to the reaction arrows; tiny annotation arrows (15-20 pt)
    # should be ignored.
    min_arrow_len = 30.0  # absolute floor
    if step_arrow_lengths:
        median_len = sorted(step_arrow_lengths)[len(step_arrow_lengths) // 2]
        # Require at least 40% of the median step-arrow length
        min_arrow_len = max(min_arrow_len, 0.4 * median_len)

    # Find orphan arrows on the page
    orphan_arrows = []
    for el in page:
        if el.tag != "arrow":
            continue
        eid = el.get("id", "")
        if eid in claimed_arrow_ids:
            continue
        tx, ty, hx, hy = _ae(el)
        dx, dy = hx - tx, hy - ty
        # Only consider substantially vertical arrows (|dy| > |dx|)
        if abs(dy) <= abs(dx):
            continue
        # Must be long enough to be a real reaction arrow
        if math.hypot(dx, dy) < min_arrow_len:
            continue
        orphan_arrows.append({
            "element": el,
            "id": eid,
            "tail_x": tx, "tail_y": ty,
            "head_x": hx, "head_y": hy,
            "mid_x": (tx + hx) / 2, "mid_y": (ty + hy) / 2,
        })

    if not orphan_arrows:
        return raw_steps

    # Fragment centroids (fragments without a centroid are left out)
    frag_data = []
    for el in page:
        if el.tag != "fragment":
            continue
        c = fragment_centroid(el)
        if c:
            frag_data.append({
                "id": el.get("id", ""),
                "cx": c[0], "cy": c[1],
            })

    def _text_anchor(t_el: ET.Element) -> Optional[Tuple[float, float]]:
        # Position of a <t>: its "p" attribute, else the BoundingBox
        # centre, else None when neither is present.
        p = t_el.get("p")
        if p:
            parts = p.split()
            return float(parts[0]), float(parts[1])
        bb = t_el.get("BoundingBox", "")
        if bb:
            vals = [float(v) for v in bb.split()]
            return (vals[0] + vals[2]) / 2, (vals[1] + vals[3]) / 2
        return None

    # Text element positions
    text_data = []
    for el in page:
        if el.tag != "t":
            continue
        anchor = _text_anchor(el)
        if anchor is None:
            continue
        text_data.append({"id": el.get("id", ""),
                          "cx": anchor[0], "cy": anchor[1]})

    # product → step-index map, used to find the insertion point
    product_to_step_idx: Dict[str, int] = {}
    for i, rs in enumerate(raw_steps):
        for pid in rs.product_elem_ids:
            product_to_step_idx[pid] = i

    def _nearest_fragment(x: float, y: float, accept):
        # Closest accepted fragment to (x, y) and its distance.
        best, best_dist = None, float("inf")
        for fd in frag_data:
            if not accept(fd):
                continue
            dist = math.hypot(fd["cx"] - x, fd["cy"] - y)
            if dist < best_dist:
                best, best_dist = fd, dist
        return best, best_dist

    new_entries: List[Tuple[int, _RawStep]] = []  # (insert_after_idx, step)

    for oa in orphan_arrows:
        mid_y = oa["mid_y"]
        # CDXML y increases downward; a vertical transition arrow goes
        # from tail (upper) to head (lower).  Fragments at or above the
        # midpoint are reactant candidates, those at or below it are
        # product candidates.
        best_reactant, best_r_dist = _nearest_fragment(
            oa["tail_x"], oa["tail_y"], lambda fd: fd["cy"] <= mid_y)
        best_product, best_p_dist = _nearest_fragment(
            oa["head_x"], oa["head_y"], lambda fd: fd["cy"] >= mid_y)

        if best_reactant is None or best_product is None:
            continue  # can't resolve both ends

        # Sanity check: distances should be reasonable (< 5× arrow length)
        arrow_len = abs(oa["head_y"] - oa["tail_y"])
        if best_r_dist > 5 * arrow_len or best_p_dist > 5 * arrow_len:
            continue

        reactant_id = best_reactant["id"]
        product_id = best_product["id"]

        # A fragment sitting exactly on the midpoint can win both
        # searches; a step from a fragment to itself is meaningless.
        if reactant_id == product_id:
            continue

        # The reactant fragment must be a product of an existing step —
        # this ensures we are bridging two rows of a serpentine layout
        # rather than picking up unrelated annotation arrows.
        if reactant_id not in existing_product_eids:
            continue

        # Condition text: unclaimed text near the arrow midpoint that is
        # not hugging the reactant or product (those are compound labels).
        condition_ids = []
        max_text_dist = 2.0 * arrow_len
        label_radius = arrow_len * 0.6
        for td in text_data:
            if td["id"] in claimed_elem_ids:
                continue
            if math.hypot(td["cx"] - oa["mid_x"],
                          td["cy"] - mid_y) > max_text_dist:
                continue
            if math.hypot(td["cx"] - best_reactant["cx"],
                          td["cy"] - best_reactant["cy"]) < label_radius:
                continue
            if math.hypot(td["cx"] - best_product["cx"],
                          td["cy"] - best_product["cy"]) < label_radius:
                continue
            condition_ids.append(td["id"])

        # Build the synthetic _RawStep
        step = _RawStep(
            step_elem_id=oa["id"],
            reactant_elem_ids=[reactant_id],
            product_elem_ids=[product_id],
            above_arrow_ids=[],
            below_arrow_ids=condition_ids,
            arrow_elem_id=oa["id"],
        )

        # Insert after the step whose product is our reactant fragment
        insert_after = product_to_step_idx.get(reactant_id, len(raw_steps) - 1)
        new_entries.append((insert_after, step))

        _log(f"Recovered orphan transition step from arrow {oa['id']}: "
             f"reactant={reactant_id} -> product={product_id} "
             f"(conditions: {len(condition_ids)} text element(s))")

    # Insert from the back so earlier insertion indices stay valid
    new_entries.sort(key=lambda x: x[0], reverse=True)
    for insert_after, step in new_entries:
        raw_steps.insert(insert_after + 1, step)

    return raw_steps
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
# ---------------------------------------------------------------------------
|
|
686
|
+
# Geometry-based fallback
|
|
687
|
+
# ---------------------------------------------------------------------------
|
|
688
|
+
|
|
689
|
+
def _parse_from_geometry(page: ET.Element,
                         id_map: Dict[str, ET.Element],
                         ) -> List[_RawStep]:
    """Parse steps using spatial position relative to arrows.

    Fallback for CDXML files without <scheme><step> attributes.

    Arrows are ordered left to right by tail x (endpoints are swapped first
    when the tail lies to the right of the head).  One ``_RawStep`` is built
    per arrow and direct children of *page* are assigned to it by position:

    * ``<fragment>`` centroid left of the tail (but not left of the previous
      arrow's head) -> reactant;
    * centroid right of the head (but not right of the next arrow's tail)
      -> product;
    * centroid within the arrow's x-span -> above/below, by comparing y with
      the arrow midpoint (smaller y counts as "above");
    * ``<t>`` elements whose x lies within the arrow's x-span widened by 20
      units on each side -> above/below by the same y comparison.

    Finally, a step that ended up with no reactants inherits the products
    of the step before it.

    Args:
        page: The CDXML ``<page>`` element to scan.
        id_map: Element-id lookup; not used by this function.

    Returns:
        One ``_RawStep`` per arrow in left-to-right order, or ``[]`` when
        the page has no arrows.
    """
    from ..cdxml_utils import fragment_centroid

    def _text_anchor(text_el):
        """Return (x, y) for a <t> element, or None if nothing is usable.

        Prefers the ``p`` attribute, then the centre of ``BoundingBox``.
        Malformed values are treated as missing instead of raising, so one
        bad annotation cannot abort the whole fallback parse.
        """
        p = text_el.get("p")
        if p:
            try:
                x_str, y_str = p.split()[:2]
                return float(x_str), float(y_str)
            except ValueError:
                pass  # fall through to BoundingBox
        bb = text_el.get("BoundingBox", "")
        if bb:
            try:
                left, top, right, bottom = [float(v) for v in bb.split()][:4]
            except ValueError:
                return None
            return (left + right) / 2, (top + bottom) / 2
        return None

    arrows = _find_all_arrows(page)
    if not arrows:
        return []

    # Get arrow data sorted by tail x-position
    arrow_data = []
    for arrow in arrows:
        tx, ty, hx, hy = _arrow_endpoints(arrow)
        # Ensure tail is left of head for horizontal arrows
        if tx > hx:
            tx, ty, hx, hy = hx, hy, tx, ty
        arrow_data.append({
            "element": arrow,
            "id": arrow.get("id", ""),
            "tail_x": tx, "tail_y": ty,
            "head_x": hx, "head_y": hy,
            "mid_x": (tx + hx) / 2,
            "mid_y": (ty + hy) / 2,
        })
    arrow_data.sort(key=lambda a: a["tail_x"])

    # Collect all fragments and text elements on the page
    fragments = []
    texts = []
    for el in page:
        if el.tag == "fragment":
            centroid = fragment_centroid(el)
            # A fragment without a centroid is kept and placed at the origin.
            cx, cy = centroid if centroid else (0.0, 0.0)
            fragments.append({
                "element": el,
                "id": el.get("id", ""),
                "cx": cx, "cy": cy,
            })
        elif el.tag == "t":
            anchor = _text_anchor(el)
            if anchor is None:
                continue
            texts.append({
                "element": el,
                "id": el.get("id", ""),
                "cx": anchor[0], "cy": anchor[1],
            })

    # Build raw steps by assigning elements to their nearest arrow
    raw_steps: List[_RawStep] = []
    last_idx = len(arrow_data) - 1

    for arrow_idx, ad in enumerate(arrow_data):
        step = _RawStep(
            step_elem_id=ad["id"],
            arrow_elem_id=ad["id"],
        )

        # Left boundary: the previous arrow's head, or "unbounded" for the
        # first arrow.  Right boundary: the next arrow's tail, or
        # "unbounded" for the last arrow.
        left_bound = arrow_data[arrow_idx - 1]["head_x"] if arrow_idx > 0 else -1e9
        right_bound = (arrow_data[arrow_idx + 1]["tail_x"]
                       if arrow_idx < last_idx else 1e9)

        for frag in fragments:
            cx = frag["cx"]
            fid = frag["id"]

            if cx < ad["tail_x"] and cx >= left_bound:
                # Left of tail -> reactant
                step.reactant_elem_ids.append(fid)
            elif cx > ad["head_x"] and cx <= right_bound:
                # Right of head -> product
                step.product_elem_ids.append(fid)
            elif ad["tail_x"] <= cx <= ad["head_x"]:
                # Between tail and head -> above/below based on y
                if frag["cy"] < ad["mid_y"]:
                    step.above_arrow_ids.append(fid)
                else:
                    step.below_arrow_ids.append(fid)

        for txt in texts:
            # Only assign text within the arrow's x-span (20-unit margin)
            if ad["tail_x"] - 20 <= txt["cx"] <= ad["head_x"] + 20:
                if txt["cy"] < ad["mid_y"]:
                    step.above_arrow_ids.append(txt["id"])
                else:
                    step.below_arrow_ids.append(txt["id"])

        raw_steps.append(step)

    # Shared intermediates: when a step found no reactants of its own, the
    # products of the preceding step are reused as its reactants.
    for prev_step, next_step in zip(raw_steps, raw_steps[1:]):
        if not next_step.reactant_elem_ids:
            next_step.reactant_elem_ids.extend(prev_step.product_elem_ids)

    return raw_steps
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
# ---------------------------------------------------------------------------
|
|
817
|
+
# Spatial-engine bridge (geometry-first primary path)
|
|
818
|
+
# ---------------------------------------------------------------------------
|
|
819
|
+
|
|
820
|
+
def _parse_from_spatial_engine(
    page: ET.Element,
    id_map: Dict[str, ET.Element],
) -> Optional[List[_RawStep]]:
    """Derive raw steps from the ``spatial_assignment`` engine.

    The engine builds arrow vectors for *page*, classifies the layout and
    assigns page elements to arrows; each engine step is then copied into
    a ``_RawStep`` keyed by its arrow id.

    Side effect: metadata for the caller is stored on the function object
    as ``_parse_from_spatial_engine._last_meta``.  It is an empty dict when
    no arrows were found, otherwise it holds ``"layout_pattern"`` (the
    layout's ``.value``) and ``"confidences"`` (element id -> confidence).

    Returns:
        The converted steps, or ``None`` when the page has no arrows.
    """
    from .spatial_assignment import (
        build_arrow_vectors, classify_layout, assign_elements,
    )

    arrow_vectors = build_arrow_vectors(page)
    if not arrow_vectors:
        _parse_from_spatial_engine._last_meta = {}  # type: ignore[attr-defined]
        return None

    layout_kind = classify_layout(arrow_vectors)
    engine_steps, assignments = assign_elements(arrow_vectors, page, layout_kind)

    # spatial_assignment.RawStep -> scheme_reader._RawStep (lists are copied)
    converted: List[_RawStep] = [
        _RawStep(
            step_elem_id=engine_step.arrow_id,
            arrow_elem_id=engine_step.arrow_id,
            reactant_elem_ids=list(engine_step.reactant_ids),
            product_elem_ids=list(engine_step.product_ids),
            above_arrow_ids=list(engine_step.above_arrow_ids),
            below_arrow_ids=list(engine_step.below_arrow_ids),
        )
        for engine_step in engine_steps
    ]

    # Publish metadata for the caller
    _parse_from_spatial_engine._last_meta = {  # type: ignore[attr-defined]
        "layout_pattern": layout_kind.value,
        "confidences": {
            assignment.element_id: assignment.confidence
            for assignment in assignments
        },
    }

    return converted
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
# ---------------------------------------------------------------------------
|
|
866
|
+
# Name resolution helpers
|
|
867
|
+
# ---------------------------------------------------------------------------
|
|
868
|
+
|
|
869
|
+
def _name_from_smiles(smiles: str) -> Optional[str]:
    """Return a display name for *smiles* from the reagent database.

    The entry's ``"display"`` value is preferred, with ``"name"`` as the
    fallback.  The lookup is best-effort: a missing entry, or any exception
    raised while importing or querying the database, yields ``None``.
    """
    try:
        from ..resolve.reagent_db import get_reagent_db
        hit = get_reagent_db().entry_for_smiles(smiles)
        return (hit.get("display") or hit.get("name")) if hit else None
    except Exception:
        # Deliberately swallowed: name lookup must never break parsing.
        return None
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
# ---------------------------------------------------------------------------
|
|
883
|
+
# Text classification patterns
|
|
884
|
+
# ---------------------------------------------------------------------------
|
|
885
|
+
|
|
886
|
+
# Condition reference letters: "a", "b, c", "d,e", "a, b, c, d"
# Whole-string match of single lowercase letters separated by "," or "/".
# No IGNORECASE here, so uppercase letters do not qualify.
_CONDITION_REF_RE = re.compile(
    r"^[a-z](\s*[,/]\s*[a-z])*$"
)

# Condition ref with "or": "a or b"
# Exactly two single letters joined by "or"; case-insensitive.
_CONDITION_REF_OR_RE = re.compile(
    r"^[a-z]\s+or\s+[a-z]$", re.IGNORECASE
)

# Footnote text: "(a) morpholine (1.2 eq), Pd2(dba)3 (5 mol%), ..."
# Requires letter enclosed in parens — the standard format for condition footnotes
# in reaction scheme literature.
# Group 1 captures the footnote letter; whitespace and at least one
# non-space character must follow the closing paren.
_FOOTNOTE_RE = re.compile(
    r"^\(([a-z])\)\s+\S",
    re.IGNORECASE
)

# Pure yield text: "72%", "(85%)", "92% yield", "quant.", "(quant.)"
# Anchored at both ends, so text that merely contains a percentage
# (e.g. a reagent line with "5 mol%") does not match.
_YIELD_ONLY_RE = re.compile(
    r"^\(?\d+(?:\.\d+)?\s*%\s*(yield)?\)?$|"
    r"^\(?quant\.?\)?$",
    re.IGNORECASE
)

# Compound labels: "1", "2a", "15", "SM-1", "DP-2", "(iii)"
# Extends _LABEL_RE with prefix patterns (SM-, DP-, etc.)
# Numeric labels are 1-3 digits with no leading zero plus an optional
# letter suffix.  The whole pattern is case-insensitive.
_COMPOUND_LABEL_RE = re.compile(
    r"^[1-9]\d{0,2}[a-z]?$|" # numeric: "1", "2a", "15b"
    r"^\([ivx]+\)$|" # roman: "(i)", "(iii)"
    r"^(SM|DP|P|CP|Int)-?\d+[a-z]?$", # prefixed: "SM-1", "DP-2", "P1"
    re.IGNORECASE
)

# Literature citations: "Author et al. J. Org. Chem. 1994, 59, 1937"
# Unanchored: a word followed by "et al." anywhere in the text.
_CITATION_RE = re.compile(
    r"[A-Z][a-z]+\s+et\s+al\.", re.IGNORECASE
)
# Journal-name fragments that mark a text as a citation even without
# "et al.".  Unanchored and case-insensitive.
_JOURNAL_RE = re.compile(
    r"(J\.\s*(Org|Med|Am)\.\s*Chem|Angew\.\s*Chem|Org\.\s*Lett|"
    r"Tetrahedron|Bioorg\.\s*Med|Chem\.\s*Commun|ChemMedChem|"
    r"Proc\.\s*Natl|Biochem\.\s*Biophys|Chem\.\s*Ber|"
    r"Org\.\s*Process\.\s*Res|Digital\s*Discovery|RSC|"
    r"JACS|ACS\s*Catal|Nat\.\s*Chem)",
    re.IGNORECASE
)
|
|
932
|
+
|
|
933
|
+
# Bioactivity data: "IC50 = 23nM", "EC50 (RPMI-8226) = 190nM", "Ki = 5 µM"
# The metric name must start at a word boundary (an optional "p" prefix
# covers pIC50 / pKi) and be followed by "=" or "(".  Without the boundary
# the pattern also fired inside ordinary words such as "Suzuki (".
# Ki / Kd are matched case-sensitively via the scoped flag (?-i:...), so
# the reagent text "KI (2 eq)" (potassium iodide) is not mistaken for an
# inhibition constant; the remaining metrics stay case-insensitive.
# Group 1 still captures the metric name.
_BIOACTIVITY_RE = re.compile(
    r"\bp?((?-i:K[id])|IC50|EC50|MIC|ED50|GI50|CC50)\s*[=(]",
    re.IGNORECASE
)
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def _classify_text_species(text: str) -> str:
    """Assign a text element to one category.

    The text is stripped and then tested against an ordered list of rules;
    the first rule that matches wins.  Possible results are
    "condition_ref", "yield", "compound_label", "footnote", "citation",
    "bioactivity", and "chemical" when nothing matches.
    """
    candidate = text.strip()

    # Order matters: yield is tested ahead of footnote (footnotes may end
    # with %), and a footnote needs more than 5 characters so that a bare
    # "(a)" is not treated as one.
    ordered_rules = (
        (lambda s: _CONDITION_REF_RE.match(s) or _CONDITION_REF_OR_RE.match(s),
         "condition_ref"),
        (_YIELD_ONLY_RE.match, "yield"),
        (_COMPOUND_LABEL_RE.match, "compound_label"),
        (lambda s: len(s) > 5 and _FOOTNOTE_RE.match(s), "footnote"),
        (lambda s: _CITATION_RE.search(s) or _JOURNAL_RE.search(s),
         "citation"),
        (_BIOACTIVITY_RE.search, "bioactivity"),
    )

    for applies, category in ordered_rules:
        if applies(candidate):
            return category

    return "chemical"
|
|
976
|
+
|
|
977
|
+
|
|
978
|
+
# Single-letter names that PubChem falsely resolves (d → deuterium, etc.)
# Contains the 26 lowercase ASCII letters; callers lowercase a token
# before testing membership.
_LETTER_SMILES_BLACKLIST = frozenset("abcdefghijklmnopqrstuvwxyz")
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
# ---------------------------------------------------------------------------
|
|
983
|
+
# Species registry building
|
|
984
|
+
# ---------------------------------------------------------------------------
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def _extract_variable_labels(frag_el: ET.Element) -> List[str]:
    """Extract variable position labels from a fragment's child nodes.

    Looks for GenericNickname and Unspecified node types that carry
    text labels (R3, R4, Linker, etc.).

    For each such ``<n>`` node the stripped content of its ``<t>`` child is
    used.  When that yields nothing (no ``<t>`` child, or one that is empty
    or whitespace-only), a GenericNickname node falls back to its
    ``GenericNickname`` attribute.  Nodes that yield no label are skipped.

    Args:
        frag_el: The ``<fragment>`` element; all descendant ``<n>`` nodes
            are inspected.

    Returns:
        The labels in document order (duplicates are kept).
    """
    labels: List[str] = []
    for node in frag_el.iter("n"):
        node_type = node.get("NodeType", "")
        if node_type not in ("GenericNickname", "Unspecified"):
            continue

        label = ""
        t_el = node.find("t")
        if t_el is not None:
            label = (_get_text_content(t_el) or "").strip()

        # Fallback to the GenericNickname attribute whenever the text child
        # did not provide a usable label.
        if not label and node_type == "GenericNickname":
            label = node.get("GenericNickname", "")

        if label:
            labels.append(label)
    return labels
|
|
1009
|
+
|
|
1010
|
+
|
|
1011
|
+
def _build_static_species_registry(
    page: ET.Element,
    id_map: Dict[str, ET.Element],
    use_network: bool = True,
    use_chemscript: bool = False,
) -> Dict[str, SpeciesRecord]:
    """Register every fragment and text element on a page as a species.

    Intended for CDXMLs without reaction arrows (target arrays, standalone
    structures): no step information is needed.  Fragments (direct children
    of *page*) are registered first, in page order, followed by text
    elements whose stripped content has at least two characters.

    Args:
        page: The CDXML ``<page>`` element.
        id_map: Element-id lookup, forwarded to the nearby-label search.
        use_network: Accepted for signature parity; not used here.
        use_chemscript: When true, try ChemScript for SMILES and IUPAC
            names; failure to load it is silently tolerated.

    Returns:
        Species records keyed by species ID (``species_0``, ``species_1``,
        ...), like the first return value of ``_build_species_registry``.
    """
    from ..rdkit_utils import frag_to_smiles_resolved, frag_to_smiles, frag_to_mw

    def _quiet(func, element):
        """Call *func* on *element*; any exception yields None."""
        try:
            return func(element)
        except Exception:
            return None

    # Optional ChemScript helpers
    cs_smiles_fn = None
    cs_bridge = None
    if use_chemscript:
        try:
            from ..rdkit_utils import frag_to_smiles_chemscript
        except ImportError:
            pass
        else:
            cs_smiles_fn = frag_to_smiles_chemscript
        try:
            from ..chemdraw.chemscript_bridge import ChemScriptBridge
            cs_bridge = ChemScriptBridge()
        except Exception:
            pass

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
    except ImportError:
        rdkit_ok = False
    else:
        rdkit_ok = True

    registry: Dict[str, SpeciesRecord] = {}

    # --- Pass 1: structure fragments --------------------------------------
    fragment_nodes = [node for node in page if node.tag == "fragment"]
    for serial, frag in enumerate(fragment_nodes):
        frag_id = frag.get("id", "")
        species_id = f"species_{serial}"

        # SMILES cascade: ChemScript, then resolved, then raw
        via_chemscript = (_quiet(cs_smiles_fn, frag)
                          if cs_smiles_fn is not None else None)
        via_resolution = _quiet(frag_to_smiles_resolved, frag)
        via_raw = _quiet(frag_to_smiles, frag)
        best_smiles = via_chemscript or via_resolution or via_raw

        mol_weight = _quiet(frag_to_mw, frag)

        # Molecular formula, only when RDKit can parse the SMILES
        mol_formula = None
        if best_smiles and rdkit_ok:
            parsed = Chem.MolFromSmiles(best_smiles)
            if parsed:
                mol_formula = rdMolDescriptors.CalcMolFormula(parsed)

        nearby_label = _find_nearby_label(frag, page, id_map)

        # Name from reagent_db
        display_name = _name_from_smiles(best_smiles) if best_smiles else None

        # IUPAC name via ChemScript, when a bridge is available
        systematic_name = None
        if cs_bridge and best_smiles:
            try:
                systematic_name = cs_bridge.get_name(best_smiles)
            except Exception:
                pass

        # Generic/variable group metadata is folded into the name
        variable_labels = _extract_variable_labels(frag)
        if variable_labels:
            var_str = ", ".join(variable_labels)
            base = display_name if display_name else "scaffold"
            display_name = f"{base} (variable: {var_str})"

        registry[species_id] = SpeciesRecord(
            id=species_id,
            cdxml_element_id=frag_id,
            element_type="fragment",
            smiles=best_smiles,
            # the raw SMILES is only recorded when it differs from the chosen one
            smiles_raw=via_raw if via_raw != best_smiles else None,
            name=display_name,
            iupac_name=systematic_name,
            formula=mol_formula,
            mw=round(mol_weight, 2) if mol_weight else None,
            label=nearby_label,
        )

    # --- Pass 2: standalone text elements ---------------------------------
    next_serial = len(fragment_nodes)
    for node in page:
        if node.tag != "t":
            continue
        raw_text = _get_text_content(node)
        if not raw_text or not raw_text.strip():
            continue
        trimmed = raw_text.strip()
        # Skip trivially short text
        if len(trimmed) < 2:
            continue

        species_id = f"species_{next_serial}"
        next_serial += 1

        registry[species_id] = SpeciesRecord(
            id=species_id,
            cdxml_element_id=node.get("id", ""),
            element_type="text",
            name=raw_text,
            text_category=_classify_text_species(trimmed),
        )

    return registry
|
|
1157
|
+
|
|
1158
|
+
|
|
1159
|
+
def _build_species_registry(
    raw_steps: List[_RawStep],
    id_map: Dict[str, ET.Element],
    page: ET.Element,
    use_network: bool = True,
    use_chemscript: bool = False,
) -> Tuple[Dict[str, SpeciesRecord], Dict[str, List[str]]]:
    """Build species records for all referenced elements.

    Every element ID referenced by a step (reactants, products, above and
    below the arrow) is visited once, in sorted ID order:

    * ``<fragment>`` elements become one record with SMILES, MW, formula,
      nearby label, reagent-db name and (optionally) IUPAC name.
    * ``<t>`` elements are classified.  Text classified as "chemical" is
      split per line, then on ";" / ", " and "/", and each remaining
      chemical token becomes its own record; non-chemical text (or
      chemical text that yields no token) becomes a single record holding
      the raw text.  Pure equivalents annotations such as "(1.2 eq)" are
      skipped.
    * Elements with any other tag are skipped with a log message.

    A species number is reserved for each element before its type is
    inspected.  Elements that are skipped, or that split into several
    records, can leave that number unused, so IDs are unique but not
    necessarily consecutive.

    Args:
        raw_steps: Steps whose element IDs should be registered.
        id_map: CDXML element ID -> element.
        page: The CDXML ``<page>`` element (used for the label search).
        use_network: Forwarded to ``_resolve_text_label`` for text tokens.
        use_chemscript: When true, try ChemScript for SMILES and IUPAC
            names before the RDKit-based resolution.

    Returns:
        (species_dict, elem_to_species_ids) where elem_to_species_ids maps
        CDXML element IDs to lists of species IDs (one-to-many for split
        text blocks).
    """
    from ..rdkit_utils import frag_to_smiles_resolved, frag_to_smiles, frag_to_mw

    # Loop-invariant patterns, compiled once instead of once per line/token.
    # Whole text is just an equivalents annotation: "(1.2 eq)"
    pure_equiv_re = re.compile(r"^\(?\d+\.?\d*\s*eq\.?\)?$", re.IGNORECASE)
    # Trailing "(1.2 eq)" / "(2 equiv)" / "(5 mol%)" on a line
    trailing_equiv_re = re.compile(
        r'\s*\((\d+\.?\d*\s*(?:eq\.?|equiv\.?|mol\s*%))\)'
        r'\s*$', re.IGNORECASE)
    # ", " (comma+space) or ";" — protects "1,4-dioxane" (no space after comma)
    list_sep_re = re.compile(r'\s*;\s*|,\s+')
    # Trailing solvent ratio such as "(3:1)"
    trailing_ratio_re = re.compile(r'\s*\(\d+:\d+\)\s*$')
    # Qualifier suffixes: "(cat.)", "(xs)", "(excess)", "(aq.)", "(anhyd.)" etc.
    qualifier_re = re.compile(
        r'\s*\((cat\.?|xs|excess|anhyd\.?|'
        r'aq\.?|anhydrous|catalytic|sat\.?|'
        r'conc\.?|dil\.?)\)\s*$',
        re.IGNORECASE)
    # Tokens containing any of these substrings are reaction names or
    # workup text rather than chemicals.
    non_chem_keywords = (
        "formylation", "coupling", "reaction",
        "addition", "reduction", "oxidation",
        "cyclization", "rearrangement", "workup",
        "work-up", "quench", "extraction",
    )

    # Optional ChemScript-based SMILES (best abbreviation resolution)
    _frag_to_smiles_cs = None
    _cs_bridge = None
    if use_chemscript:
        try:
            from ..rdkit_utils import frag_to_smiles_chemscript
            _frag_to_smiles_cs = frag_to_smiles_chemscript
            _log("ChemScript SMILES resolution enabled")
        except ImportError:
            _log("ChemScript not available, using RDKit resolution")
        # Also get ChemScript bridge for IUPAC name generation
        try:
            from ..chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_bridge = ChemScriptBridge()
            _log("ChemScript IUPAC naming enabled")
        except Exception:
            pass

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
        _has_rdkit = True
    except ImportError:
        _has_rdkit = False

    # Collect all unique element IDs
    all_elem_ids: Set[str] = set()
    for step in raw_steps:
        all_elem_ids.update(step.reactant_elem_ids)
        all_elem_ids.update(step.product_elem_ids)
        all_elem_ids.update(step.above_arrow_ids)
        all_elem_ids.update(step.below_arrow_ids)

    species_dict: Dict[str, SpeciesRecord] = {}
    elem_to_species: Dict[str, List[str]] = {}
    species_counter = 0

    for elem_id in sorted(all_elem_ids):
        if elem_id in elem_to_species:
            continue  # already registered (shared intermediate)

        el = id_map.get(elem_id)
        if el is None:
            _log(f"Element {elem_id} not found in id_map, skipping")
            continue

        # Reserve a species number up front; see the docstring for when it
        # ends up unused.
        sp_id = f"species_{species_counter}"
        species_counter += 1

        if el.tag == "fragment":
            # Extract SMILES — try ChemScript first (best abbreviation
            # expansion), then superatom-table resolution, then raw.
            smiles_cs = None
            smiles_resolved = None
            smiles_raw = None
            if _frag_to_smiles_cs is not None:
                try:
                    smiles_cs = _frag_to_smiles_cs(el)
                except Exception as e:
                    _log(f"frag_to_smiles_chemscript failed for {elem_id}: {e}")
            try:
                smiles_resolved = frag_to_smiles_resolved(el)
            except Exception as e:
                _log(f"frag_to_smiles_resolved failed for {elem_id}: {e}")
            try:
                smiles_raw = frag_to_smiles(el)
            except Exception as e:
                _log(f"frag_to_smiles failed for {elem_id}: {e}")

            smiles = smiles_cs or smiles_resolved or smiles_raw

            # Compute MW
            mw = None
            try:
                mw = frag_to_mw(el)
            except Exception:
                pass

            # Compute formula from SMILES
            formula = None
            if smiles and _has_rdkit:
                mol = Chem.MolFromSmiles(smiles)
                if mol:
                    formula = rdMolDescriptors.CalcMolFormula(mol)

            # Detect compound label from nearby text
            label = _find_nearby_label(el, page, id_map)

            # Try to get a name from reagent_db by SMILES
            name = None
            if smiles:
                name = _name_from_smiles(smiles)

            # IUPAC name via ChemScript (when available)
            iupac_name = None
            if _cs_bridge and smiles:
                try:
                    iupac_name = _cs_bridge.get_name(smiles)
                except Exception:
                    pass  # ChemScript fails on some structures (charges, etc.)

            record = SpeciesRecord(
                id=sp_id,
                cdxml_element_id=elem_id,
                element_type="fragment",
                smiles=smiles,
                # raw SMILES is recorded only when it differs from the chosen one
                smiles_raw=smiles_raw if smiles_raw != smiles else None,
                name=name,
                iupac_name=iupac_name,
                formula=formula,
                mw=round(mw, 2) if mw else None,
                label=label,
            )

        elif el.tag == "t":
            text_content = _get_text_content(el)
            if not text_content:
                continue

            # Skip pure annotation text that isn't a chemical name:
            #   - equiv annotations: "(1.2 eq)"
            # These are captured as step metadata, not species.
            stripped = text_content.strip()
            if pure_equiv_re.match(stripped):
                _log(f"Skipping equiv annotation: {stripped!r}")
                species_counter -= 1  # reclaim ID
                continue

            # Classify text species
            text_cat = _classify_text_species(stripped)
            _log(f"Text species {elem_id} classified as {text_cat}: "
                 f"{stripped[:60]!r}")

            if text_cat == "chemical":
                # Split multi-line text blocks into individual species.
                # Each chemical entity becomes its own SpeciesRecord;
                # condition tokens (temp, time, atmosphere) are skipped.
                from .reaction_parser import (
                    _resolve_text_label, _is_condition_token)
                from ..resolve.reagent_db import get_reagent_db
                _reagent_db = get_reagent_db()

                split_records: List[SpeciesRecord] = []

                for raw_line in text_content.split("\n"):
                    line = raw_line.strip()
                    if not line:
                        continue

                    # Extract equiv/mol% annotation
                    eq_match = trailing_equiv_re.search(line)
                    line_equiv = eq_match.group(1) if eq_match else None
                    clean_line = trailing_equiv_re.sub("", line).strip()
                    if not clean_line:
                        continue

                    # Sub-split on ", " / ";", then split "/" mixtures into
                    # separate entities, e.g. "dioxane/H2O (3:1)" ->
                    # ["dioxane", "H2O"].  Trailing ratio annotations like
                    # "(3:1)" are stripped first.
                    tokens: List[str] = []
                    for chunk in list_sep_re.split(clean_line):
                        chunk = trailing_ratio_re.sub("", chunk).strip()
                        if "/" in chunk:
                            tokens.extend(
                                s.strip() for s in chunk.split("/")
                                if s.strip())
                        else:
                            tokens.append(chunk)

                    for pi, part in enumerate(tokens):
                        part = part.strip()
                        if not part:
                            continue
                        # Skip condition tokens
                        if _is_condition_token(part):
                            continue
                        # Skip yield annotations ("72%", "quant.")
                        if _YIELD_ONLY_RE.match(part):
                            continue
                        # Skip compound labels ("3a", "SM-1")
                        if _COMPOUND_LABEL_RE.match(part):
                            continue
                        # Skip single letters (false resolutions)
                        if part.lower() in _LETTER_SMILES_BLACKLIST:
                            continue

                        # Strip qualifier suffixes
                        part = qualifier_re.sub('', part).strip()
                        if not part:
                            continue

                        # Skip reaction names / workup text that got
                        # through as "chemical": any token containing one
                        # of the non-chemical keywords is dropped.
                        _lower = part.lower()
                        if any(kw in _lower for kw in non_chem_keywords):
                            continue
                        # Skip "then ..." workup prefixes
                        if _lower.startswith("then "):
                            continue

                        # Resolve SMILES
                        smi = None
                        try:
                            smi = _resolve_text_label(
                                part, use_network=use_network)
                        except Exception:
                            pass

                        # Compute MW / formula
                        mw_val = None
                        formula_val = None
                        if smi and _has_rdkit:
                            mol = Chem.MolFromSmiles(smi)
                            if mol:
                                formula_val = (
                                    rdMolDescriptors.CalcMolFormula(mol))
                                mw_val = round(Descriptors.MolWt(mol), 2)

                        # Detect solvent via reagent_db role
                        is_solvent = (
                            _reagent_db.role_for_name(part) == "solvent")

                        cur_id = f"species_{species_counter}"
                        species_counter += 1
                        split_records.append(SpeciesRecord(
                            id=cur_id,
                            cdxml_element_id=elem_id,
                            element_type="text",
                            smiles=smi,
                            name=part,
                            formula=formula_val,
                            mw=mw_val,
                            text_category="chemical",
                            is_solvent=is_solvent,
                            # Attach equiv only to first part of a line
                            equiv_text=line_equiv if pi == 0 else None,
                        ))

                if split_records:
                    # The split records carry their own IDs and the counter
                    # already points past the last one; the number reserved
                    # as sp_id for this element stays unused.
                    for rec in split_records:
                        species_dict[rec.id] = rec
                        elem_to_species.setdefault(elem_id, []).append(
                            rec.id)
                    continue  # skip the generic record/assignment below
                else:
                    # No chemical tokens extracted — fall back to a single
                    # record with the raw text (e.g. pure condition block)
                    record = SpeciesRecord(
                        id=sp_id,
                        cdxml_element_id=elem_id,
                        element_type="text",
                        name=text_content,
                        text_category=text_cat,
                    )
            else:
                # Non-chemical text (condition_ref, citation, bioactivity)
                record = SpeciesRecord(
                    id=sp_id,
                    cdxml_element_id=elem_id,
                    element_type="text",
                    name=text_content,
                    text_category=text_cat,
                )

        else:
            # Unknown element type — skip but warn
            _log(f"Element {elem_id} has unexpected tag '{el.tag}', skipping")
            continue

        species_dict[sp_id] = record
        elem_to_species.setdefault(elem_id, []).append(sp_id)

    return species_dict, elem_to_species
|
|
1482
|
+
|
|
1483
|
+
|
|
1484
|
+
def _find_nearby_label(frag: ET.Element, page: ET.Element,
                       id_map: Dict[str, ET.Element]) -> Optional[str]:
    """Return the compound label sitting just below a fragment, if any.

    Candidates are ``<t>`` children of *page* that have a ``p`` attribute,
    lie at most 25 units below the bottom edge of the fragment's bounding
    box, lie horizontally within half the fragment width plus 15 units of
    its centre, and whose text matches ``_LABEL_RE``.  Among candidates
    the one with the smallest horizontal-plus-vertical offset wins.

    Args:
        frag: The fragment to label.
        page: The page whose text elements are searched.
        id_map: Not used by this function.

    Returns:
        The label text as stored in the element, or ``None`` when the
        fragment has no bounding box or no candidate qualifies.
    """
    from ..cdxml_utils import fragment_bbox

    bounds = fragment_bbox(frag)
    if bounds is None:
        return None

    left, _top, right, bottom = bounds
    center_x = (left + right) / 2
    half_reach = (right - left) / 2 + 15

    # (offset, text) of the best candidate seen so far
    closest = (float("inf"), None)

    for node in page:
        if node.tag != "t":
            continue
        anchor = node.get("p")
        if not anchor:
            continue
        coords = anchor.split()
        x, y = float(coords[0]), float(coords[1])

        # Must sit below the fragment, no more than ~25pt away
        if y < bottom or y > bottom + 25:
            continue
        # Must be horizontally close to the fragment centre
        if abs(x - center_x) > half_reach:
            continue

        content = _get_text_content(node)
        if not (content and _LABEL_RE.match(content)):
            continue

        offset = abs(x - center_x) + abs(y - bottom)
        if offset < closest[0]:
            closest = (offset, content)

    return closest[1]
|
|
1528
|
+
|
|
1529
|
+
|
|
1530
|
+
# ---------------------------------------------------------------------------
|
|
1531
|
+
# Step record building
|
|
1532
|
+
# ---------------------------------------------------------------------------
|
|
1533
|
+
|
|
1534
|
+
def _build_step_records(
    raw_steps: List[_RawStep],
    elem_to_species: Dict[str, List[str]],
    species_dict: Dict[str, "SpeciesRecord"],
    id_map: Dict[str, ET.Element],
    page: ET.Element,
) -> List[StepRecord]:
    """Turn raw steps into ``StepRecord`` objects keyed by species IDs.

    For every raw step, in order:

    * reactant / product element IDs are translated to species IDs through
      *elem_to_species*;
    * fragments above or below the arrow contribute all of their species
      as reagents;
    * text above the arrow contributes its reagent-eligible species; when
      the element has no species at all the text is stored as raw
      condition text, and a yield is extracted when the stripped text
      matches ``_YIELD_ONLY_RE``;
    * text below the arrow is always stored as raw condition text, scanned
      for a yield and for parsed conditions, and contributes its
      reagent-eligible species;
    * the arrow element (when the raw step names one) is resolved to
      record its style and CDXML id.

    The first yield found for a step is kept; later ones are ignored.

    Args:
        raw_steps: Raw step objects exposing ``reactant_elem_ids``,
            ``product_elem_ids``, ``above_arrow_ids``, ``below_arrow_ids``
            and ``arrow_elem_id``.
        elem_to_species: Element ID -> list of species IDs.
        species_dict: Species ID -> species record.
        id_map: Element ID -> element.
        page: Page element handed to ``_resolve_arrow``.

    Returns:
        One ``StepRecord`` per raw step, in the same order.
    """
    from .reaction_parser import (split_condition_text,
                                  extract_conditions_from_text)

    # Text categories that must never end up in reagent_ids.
    excluded_categories = frozenset({
        "condition_ref", "yield", "compound_label",
        "footnote", "citation", "bioactivity",
    })

    def _is_reagent_species(sp_id: str) -> bool:
        """Tell whether a species ID belongs in a step's reagent list."""
        record = species_dict.get(sp_id)
        # Unknown species (not expected) and non-text species are kept.
        if record is None or record.element_type != "text":
            return True
        return record.text_category not in excluded_categories

    def _species_for(elem_id: str) -> List[str]:
        """Species IDs registered for an element (empty when none)."""
        return elem_to_species.get(elem_id, [])

    def _keep_first_yield(step: StepRecord, text: str) -> None:
        """Store the yield parsed from *text* unless one is already set."""
        found = _extract_yield_from_text(text)
        if found and step.yield_text is None:
            step.yield_text = found

    records: List[StepRecord] = []

    for position, raw in enumerate(raw_steps):
        step = StepRecord(step_index=position)

        for elem_id in raw.reactant_elem_ids:
            step.reactant_ids.extend(_species_for(elem_id))

        for elem_id in raw.product_elem_ids:
            step.product_ids.extend(_species_for(elem_id))

        # --- elements drawn above the arrow ---
        for elem_id in raw.above_arrow_ids:
            node = id_map.get(elem_id)
            if node is None:
                continue

            if node.tag == "fragment":
                step.reagent_ids.extend(_species_for(elem_id))
                continue
            if node.tag != "t":
                continue

            text = _get_text_content(node)
            if not text:
                continue

            linked = _species_for(elem_id)
            reagents = [s for s in linked if _is_reagent_species(s)]
            if reagents:
                step.reagent_ids.extend(reagents)
            elif not linked:
                # No species at all -> condition metadata
                step.condition_text_raw.append(text)

            # A bare yield annotation above the arrow still sets the yield.
            if _YIELD_ONLY_RE.match(text.strip()):
                _keep_first_yield(step, text)

        # --- elements drawn below the arrow ---
        for elem_id in raw.below_arrow_ids:
            node = id_map.get(elem_id)
            if node is None:
                continue

            if node.tag == "fragment":
                step.reagent_ids.extend(_species_for(elem_id))
                continue
            if node.tag != "t":
                continue

            text = _get_text_content(node)
            if not text:
                continue

            step.condition_text_raw.append(text)
            _keep_first_yield(step, text)
            step.conditions.extend(extract_conditions_from_text(text))

            # Only chemical text species count as reagents.
            step.reagent_ids.extend(
                s for s in _species_for(elem_id) if _is_reagent_species(s))

        # --- arrow style ---
        if raw.arrow_elem_id:
            arrow_el = _resolve_arrow(page, raw.arrow_elem_id, id_map)
            step.arrow_style = _detect_arrow_style(arrow_el)
            step.arrow_cdxml_id = raw.arrow_elem_id

        records.append(step)

    return records
|
|
1640
|
+
|
|
1641
|
+
|
|
1642
|
+
# ---------------------------------------------------------------------------
|
|
1643
|
+
# Footnote resolution
|
|
1644
|
+
# ---------------------------------------------------------------------------
|
|
1645
|
+
|
|
1646
|
+
def _collect_footnotes(
|
|
1647
|
+
page: ET.Element,
|
|
1648
|
+
registered_elem_ids: Set[str],
|
|
1649
|
+
) -> Dict[str, str]:
|
|
1650
|
+
"""Scan page for footnote text elements and return {letter: conditions_text}.
|
|
1651
|
+
|
|
1652
|
+
Footnotes are standalone text blocks like:
|
|
1653
|
+
"(a) morpholine (1.2 eq), Pd2(dba)3 (5 mol%), ..."
|
|
1654
|
+
"(b) NBS (1.1 eq), DMF, 0 C, 2 h, 95%"
|
|
1655
|
+
|
|
1656
|
+
Only text elements NOT already registered as species are considered.
|
|
1657
|
+
"""
|
|
1658
|
+
footnotes: Dict[str, str] = {}
|
|
1659
|
+
for el in page:
|
|
1660
|
+
if el.tag != "t":
|
|
1661
|
+
continue
|
|
1662
|
+
eid = el.get("id", "")
|
|
1663
|
+
if eid in registered_elem_ids:
|
|
1664
|
+
continue
|
|
1665
|
+
text = _get_text_content(el)
|
|
1666
|
+
if not text or len(text.strip()) <= 5:
|
|
1667
|
+
continue
|
|
1668
|
+
stripped = text.strip()
|
|
1669
|
+
m = _FOOTNOTE_RE.match(stripped)
|
|
1670
|
+
if m:
|
|
1671
|
+
letter = m.group(1).lower()
|
|
1672
|
+
# Extract the conditions part (everything after "(letter) ")
|
|
1673
|
+
cond_text = re.sub(r"^\([a-z]\)\s+", "", stripped,
|
|
1674
|
+
count=1, flags=re.IGNORECASE)
|
|
1675
|
+
if cond_text:
|
|
1676
|
+
footnotes[letter] = cond_text
|
|
1677
|
+
_log(f"Footnote '{letter}': {cond_text[:60]!r}")
|
|
1678
|
+
return footnotes
|
|
1679
|
+
|
|
1680
|
+
|
|
1681
|
+
def _resolve_footnote_conditions(
|
|
1682
|
+
steps: List[StepRecord],
|
|
1683
|
+
species_dict: Dict[str, "SpeciesRecord"],
|
|
1684
|
+
footnotes: Dict[str, str],
|
|
1685
|
+
) -> None:
|
|
1686
|
+
"""Enrich steps that use condition_ref letters with their footnote text.
|
|
1687
|
+
|
|
1688
|
+
For each step, if its above/below arrow text includes condition_ref
|
|
1689
|
+
species (letters like "a", "b"), look up the corresponding footnote
|
|
1690
|
+
and populate the step's condition_text_raw, conditions, and yield_text.
|
|
1691
|
+
"""
|
|
1692
|
+
if not footnotes:
|
|
1693
|
+
return
|
|
1694
|
+
|
|
1695
|
+
from .reaction_parser import extract_conditions_from_text
|
|
1696
|
+
|
|
1697
|
+
for step in steps:
|
|
1698
|
+
# Find condition_ref letters used by this step
|
|
1699
|
+
# (they were NOT added to reagent_ids, but we can find them
|
|
1700
|
+
# by checking species that share the step's arrow elements)
|
|
1701
|
+
ref_letters: List[str] = []
|
|
1702
|
+
for sp in species_dict.values():
|
|
1703
|
+
if sp.text_category != "condition_ref":
|
|
1704
|
+
continue
|
|
1705
|
+
# Check if this condition_ref letter is associated with
|
|
1706
|
+
# any element that belongs to this step's raw data.
|
|
1707
|
+
# Since we can't easily access raw step data here, instead
|
|
1708
|
+
# we look at all condition_ref species and match by
|
|
1709
|
+
# checking if their letter has a footnote.
|
|
1710
|
+
letters = [c.strip().lower() for c in sp.name.split(",")
|
|
1711
|
+
if c.strip()]
|
|
1712
|
+
ref_letters.extend(letters)
|
|
1713
|
+
|
|
1714
|
+
# For simplicity, resolve ALL footnotes for ALL steps that have
|
|
1715
|
+
# condition_ref species. The proper approach would track which
|
|
1716
|
+
# condition_ref belongs to which step, but that requires the
|
|
1717
|
+
# raw step data. Instead, we map letters to steps by position.
|
|
1718
|
+
# This works because steps and condition_refs are ordered.
|
|
1719
|
+
|
|
1720
|
+
# Better approach: pair condition_ref species to steps via
|
|
1721
|
+
# elem_to_species mapping. Since we've already built steps,
|
|
1722
|
+
# we iterate steps and check for condition_ref species by
|
|
1723
|
+
# looking at which species are condition_ref and near which arrow.
|
|
1724
|
+
# For now, use a simpler heuristic: steps with no chemical reagents
|
|
1725
|
+
# and condition_ref species nearby get the footnote conditions.
|
|
1726
|
+
|
|
1727
|
+
# Collect all condition_ref letters per step
|
|
1728
|
+
# We need to re-derive this from the species dict.
|
|
1729
|
+
# Strategy: condition_ref species have names like "a", "b, c".
|
|
1730
|
+
# Steps are ordered; condition_refs are ordered by position.
|
|
1731
|
+
# Match them by step index.
|
|
1732
|
+
all_cond_refs = sorted(
|
|
1733
|
+
[(sp.cdxml_element_id, sp.name.strip().lower())
|
|
1734
|
+
for sp in species_dict.values()
|
|
1735
|
+
if sp.text_category == "condition_ref"],
|
|
1736
|
+
key=lambda x: x[0] # sort by element ID (roughly positional)
|
|
1737
|
+
)
|
|
1738
|
+
|
|
1739
|
+
if not all_cond_refs:
|
|
1740
|
+
return
|
|
1741
|
+
|
|
1742
|
+
# Map each step to its condition_ref letters
|
|
1743
|
+
# For schemes with N steps and N condition_ref letters, assign 1:1
|
|
1744
|
+
# For multi-letter refs like "a, b", split into individual letters
|
|
1745
|
+
ref_idx = 0
|
|
1746
|
+
for step in steps:
|
|
1747
|
+
if ref_idx >= len(all_cond_refs):
|
|
1748
|
+
break
|
|
1749
|
+
elem_id, ref_text = all_cond_refs[ref_idx]
|
|
1750
|
+
letters = [c.strip() for c in re.split(r"[,/\s]+", ref_text)
|
|
1751
|
+
if c.strip() and len(c.strip()) == 1]
|
|
1752
|
+
ref_idx += 1
|
|
1753
|
+
|
|
1754
|
+
for letter in letters:
|
|
1755
|
+
fn_text = footnotes.get(letter)
|
|
1756
|
+
if not fn_text:
|
|
1757
|
+
continue
|
|
1758
|
+
|
|
1759
|
+
_log(f"Step {step.step_index}: resolving footnote "
|
|
1760
|
+
f"'{letter}' → {fn_text[:60]!r}")
|
|
1761
|
+
|
|
1762
|
+
step.condition_text_raw.append(f"({letter}) {fn_text}")
|
|
1763
|
+
|
|
1764
|
+
# Extract yield
|
|
1765
|
+
y = _extract_yield_from_text(fn_text)
|
|
1766
|
+
if y and step.yield_text is None:
|
|
1767
|
+
step.yield_text = y
|
|
1768
|
+
|
|
1769
|
+
# Extract conditions
|
|
1770
|
+
conds = extract_conditions_from_text(fn_text)
|
|
1771
|
+
step.conditions.extend(conds)
|
|
1772
|
+
|
|
1773
|
+
|
|
1774
|
+
# ---------------------------------------------------------------------------
|
|
1775
|
+
# Cross-scheme linkage (for wrap-repeat layouts)
|
|
1776
|
+
# ---------------------------------------------------------------------------
|
|
1777
|
+
|
|
1778
|
+
def _smiles_to_inchi(smiles: str) -> Optional[str]:
|
|
1779
|
+
"""Convert SMILES to InChI for stereo-invariant comparison.
|
|
1780
|
+
|
|
1781
|
+
InChI normalises stereochemistry representation, so two SMILES
|
|
1782
|
+
that differ only in stereo assignment (common when ChemScript
|
|
1783
|
+
re-processes redrawn copies of the same intermediate) will still
|
|
1784
|
+
match by InChI. Falls back to canonical SMILES if RDKit or InChI
|
|
1785
|
+
generation fails.
|
|
1786
|
+
"""
|
|
1787
|
+
try:
|
|
1788
|
+
from rdkit import Chem
|
|
1789
|
+
from rdkit.Chem.inchi import MolToInchi
|
|
1790
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
1791
|
+
if mol is None:
|
|
1792
|
+
return None
|
|
1793
|
+
inchi = MolToInchi(mol)
|
|
1794
|
+
return inchi if inchi else None
|
|
1795
|
+
except Exception:
|
|
1796
|
+
return None
|
|
1797
|
+
|
|
1798
|
+
|
|
1799
|
+
def _link_repeated_species(steps: List[StepRecord],
|
|
1800
|
+
species: Dict[str, SpeciesRecord]) -> None:
|
|
1801
|
+
"""Link repeated structures across separate <scheme> elements.
|
|
1802
|
+
|
|
1803
|
+
Wrap-repeat layouts re-draw intermediates with new element IDs.
|
|
1804
|
+
E.g., the product of step 2 (species_X, SMILES=AAA) appears as the
|
|
1805
|
+
reactant of step 3 (species_Y, SMILES=AAA) with a different ID.
|
|
1806
|
+
|
|
1807
|
+
Uses InChI comparison (stereo-invariant) as the primary matcher,
|
|
1808
|
+
falling back to exact SMILES match. This handles the case where
|
|
1809
|
+
ChemScript produces different stereo-specific SMILES for two
|
|
1810
|
+
drawings of the same intermediate.
|
|
1811
|
+
"""
|
|
1812
|
+
# Build product species lookup keyed by InChI (primary) and SMILES (fallback)
|
|
1813
|
+
product_by_inchi: Dict[str, str] = {} # InChI → species_id
|
|
1814
|
+
product_by_smiles: Dict[str, str] = {} # SMILES → species_id
|
|
1815
|
+
for step in steps:
|
|
1816
|
+
for pid in step.product_ids:
|
|
1817
|
+
sp = species.get(pid)
|
|
1818
|
+
if sp and sp.smiles:
|
|
1819
|
+
product_by_smiles[sp.smiles] = pid
|
|
1820
|
+
inchi = _smiles_to_inchi(sp.smiles)
|
|
1821
|
+
if inchi:
|
|
1822
|
+
product_by_inchi[inchi] = pid
|
|
1823
|
+
|
|
1824
|
+
# Check each step's reactants for matches
|
|
1825
|
+
for step in steps:
|
|
1826
|
+
new_reactants = []
|
|
1827
|
+
for rid in step.reactant_ids:
|
|
1828
|
+
sp = species.get(rid)
|
|
1829
|
+
if sp and sp.smiles:
|
|
1830
|
+
# Try InChI match first (handles stereo differences)
|
|
1831
|
+
matched_id = None
|
|
1832
|
+
inchi = _smiles_to_inchi(sp.smiles)
|
|
1833
|
+
if inchi and inchi in product_by_inchi:
|
|
1834
|
+
candidate = product_by_inchi[inchi]
|
|
1835
|
+
if candidate != rid:
|
|
1836
|
+
matched_id = candidate
|
|
1837
|
+
# Fallback to exact SMILES match
|
|
1838
|
+
if matched_id is None and sp.smiles in product_by_smiles:
|
|
1839
|
+
candidate = product_by_smiles[sp.smiles]
|
|
1840
|
+
if candidate != rid:
|
|
1841
|
+
matched_id = candidate
|
|
1842
|
+
|
|
1843
|
+
if matched_id:
|
|
1844
|
+
_log(f"Linking repeated species: {rid} -> {matched_id} "
|
|
1845
|
+
f"(SMILES: {sp.smiles[:40]})")
|
|
1846
|
+
new_reactants.append(matched_id)
|
|
1847
|
+
continue
|
|
1848
|
+
new_reactants.append(rid)
|
|
1849
|
+
step.reactant_ids = new_reactants
|
|
1850
|
+
|
|
1851
|
+
|
|
1852
|
+
# ---------------------------------------------------------------------------
|
|
1853
|
+
# Topology detection
|
|
1854
|
+
# ---------------------------------------------------------------------------
|
|
1855
|
+
|
|
1856
|
+
def _detect_topology(steps: List[StepRecord]) -> str:
|
|
1857
|
+
"""Classify scheme topology from the reaction graph.
|
|
1858
|
+
|
|
1859
|
+
Returns one of: "linear", "divergent", "convergent", "parallel",
|
|
1860
|
+
"cycle", "mixed"
|
|
1861
|
+
"""
|
|
1862
|
+
if len(steps) == 0:
|
|
1863
|
+
return "linear"
|
|
1864
|
+
if len(steps) == 1:
|
|
1865
|
+
return "linear"
|
|
1866
|
+
|
|
1867
|
+
# Build graph: which species are reactants/products in which steps
|
|
1868
|
+
reactant_of: Dict[str, Set[int]] = defaultdict(set)
|
|
1869
|
+
product_of: Dict[str, Set[int]] = defaultdict(set)
|
|
1870
|
+
|
|
1871
|
+
for i, step in enumerate(steps):
|
|
1872
|
+
for rid in step.reactant_ids:
|
|
1873
|
+
reactant_of[rid].add(i)
|
|
1874
|
+
for pid in step.product_ids:
|
|
1875
|
+
product_of[pid].add(i)
|
|
1876
|
+
|
|
1877
|
+
# Check for sequential links: product of step i = reactant of step j
|
|
1878
|
+
sequential_links = 0
|
|
1879
|
+
for i in range(len(steps)):
|
|
1880
|
+
for j in range(i + 1, len(steps)):
|
|
1881
|
+
shared = set(steps[i].product_ids) & set(steps[j].reactant_ids)
|
|
1882
|
+
if shared:
|
|
1883
|
+
sequential_links += 1
|
|
1884
|
+
|
|
1885
|
+
# Check divergent: same reactant in multiple steps with different products
|
|
1886
|
+
divergent = False
|
|
1887
|
+
for sp_id, step_indices in reactant_of.items():
|
|
1888
|
+
if len(step_indices) > 1:
|
|
1889
|
+
# Check that they produce different things
|
|
1890
|
+
product_sets = [frozenset(steps[i].product_ids)
|
|
1891
|
+
for i in step_indices]
|
|
1892
|
+
if len(set(product_sets)) > 1:
|
|
1893
|
+
divergent = True
|
|
1894
|
+
break
|
|
1895
|
+
|
|
1896
|
+
# Check convergent: one product step consumes species from multiple
|
|
1897
|
+
# different source steps
|
|
1898
|
+
convergent = False
|
|
1899
|
+
for sp_id, step_indices in product_of.items():
|
|
1900
|
+
if len(step_indices) > 1:
|
|
1901
|
+
convergent = True
|
|
1902
|
+
break
|
|
1903
|
+
|
|
1904
|
+
# Check for disconnected components (parallel reactions)
|
|
1905
|
+
# Build adjacency: two steps are connected if they share any species
|
|
1906
|
+
adj: Dict[int, Set[int]] = defaultdict(set)
|
|
1907
|
+
for sp_id in set(list(reactant_of.keys()) + list(product_of.keys())):
|
|
1908
|
+
involved = reactant_of[sp_id] | product_of[sp_id]
|
|
1909
|
+
for si in involved:
|
|
1910
|
+
for sj in involved:
|
|
1911
|
+
if si != sj:
|
|
1912
|
+
adj[si].add(sj)
|
|
1913
|
+
|
|
1914
|
+
# Count connected components via BFS
|
|
1915
|
+
visited: Set[int] = set()
|
|
1916
|
+
components = 0
|
|
1917
|
+
for i in range(len(steps)):
|
|
1918
|
+
if i in visited:
|
|
1919
|
+
continue
|
|
1920
|
+
components += 1
|
|
1921
|
+
queue = [i]
|
|
1922
|
+
while queue:
|
|
1923
|
+
node = queue.pop(0)
|
|
1924
|
+
if node in visited:
|
|
1925
|
+
continue
|
|
1926
|
+
visited.add(node)
|
|
1927
|
+
for neighbor in adj.get(node, set()):
|
|
1928
|
+
if neighbor not in visited:
|
|
1929
|
+
queue.append(neighbor)
|
|
1930
|
+
|
|
1931
|
+
# Check for cycles: product of step i = reactant of step j AND path
|
|
1932
|
+
# from j eventually leads back to i
|
|
1933
|
+
# Build directed graph: step i -> step j if product of i = reactant of j
|
|
1934
|
+
directed_adj: Dict[int, Set[int]] = defaultdict(set)
|
|
1935
|
+
for i in range(len(steps)):
|
|
1936
|
+
for j in range(len(steps)):
|
|
1937
|
+
if i == j:
|
|
1938
|
+
continue
|
|
1939
|
+
if set(steps[i].product_ids) & set(steps[j].reactant_ids):
|
|
1940
|
+
directed_adj[i].add(j)
|
|
1941
|
+
|
|
1942
|
+
# DFS cycle detection
|
|
1943
|
+
WHITE, GRAY, BLACK = 0, 1, 2
|
|
1944
|
+
color = [WHITE] * len(steps)
|
|
1945
|
+
has_cycle = False
|
|
1946
|
+
|
|
1947
|
+
def _dfs_cycle(u: int) -> bool:
|
|
1948
|
+
nonlocal has_cycle
|
|
1949
|
+
color[u] = GRAY
|
|
1950
|
+
for v in directed_adj.get(u, set()):
|
|
1951
|
+
if color[v] == GRAY:
|
|
1952
|
+
return True
|
|
1953
|
+
if color[v] == WHITE and _dfs_cycle(v):
|
|
1954
|
+
return True
|
|
1955
|
+
color[u] = BLACK
|
|
1956
|
+
return False
|
|
1957
|
+
|
|
1958
|
+
for i in range(len(steps)):
|
|
1959
|
+
if color[i] == WHITE and _dfs_cycle(i):
|
|
1960
|
+
has_cycle = True
|
|
1961
|
+
break
|
|
1962
|
+
|
|
1963
|
+
if components > 1:
|
|
1964
|
+
if divergent or convergent or has_cycle:
|
|
1965
|
+
return "mixed"
|
|
1966
|
+
return "parallel"
|
|
1967
|
+
if has_cycle:
|
|
1968
|
+
if divergent or convergent:
|
|
1969
|
+
return "mixed"
|
|
1970
|
+
return "cycle"
|
|
1971
|
+
if divergent and convergent:
|
|
1972
|
+
return "mixed"
|
|
1973
|
+
if divergent:
|
|
1974
|
+
return "divergent"
|
|
1975
|
+
if convergent:
|
|
1976
|
+
return "convergent"
|
|
1977
|
+
if sequential_links > 0:
|
|
1978
|
+
return "linear" # sequential chain
|
|
1979
|
+
return "parallel" # no links found between steps
|
|
1980
|
+
|
|
1981
|
+
|
|
1982
|
+
# ---------------------------------------------------------------------------
|
|
1983
|
+
# Content type heuristic detection
|
|
1984
|
+
# ---------------------------------------------------------------------------
|
|
1985
|
+
|
|
1986
|
+
def _detect_content_type(steps: List[StepRecord],
|
|
1987
|
+
species: Dict[str, SpeciesRecord]) -> str:
|
|
1988
|
+
"""Classify the scheme content type using heuristics.
|
|
1989
|
+
|
|
1990
|
+
Returns one of: "synthesis", "sar_design", "biological_pathway",
|
|
1991
|
+
"target_array", "literature_comparison", "investigation", "unknown".
|
|
1992
|
+
"""
|
|
1993
|
+
# No steps → static figure (target_array or standalone structure)
|
|
1994
|
+
if not steps:
|
|
1995
|
+
return "target_array"
|
|
1996
|
+
|
|
1997
|
+
# Count text species by category
|
|
1998
|
+
cats = defaultdict(int)
|
|
1999
|
+
for sp in species.values():
|
|
2000
|
+
if sp.text_category:
|
|
2001
|
+
cats[sp.text_category] += 1
|
|
2002
|
+
|
|
2003
|
+
n_citation = cats.get("citation", 0)
|
|
2004
|
+
n_bioactivity = cats.get("bioactivity", 0)
|
|
2005
|
+
n_condition_ref = cats.get("condition_ref", 0)
|
|
2006
|
+
n_chemical = cats.get("chemical", 0)
|
|
2007
|
+
n_text = sum(1 for sp in species.values() if sp.element_type == "text")
|
|
2008
|
+
n_frag = sum(1 for sp in species.values() if sp.element_type == "fragment")
|
|
2009
|
+
|
|
2010
|
+
# Bioactivity-heavy → literature comparison (SAR data display)
|
|
2011
|
+
if n_bioactivity >= 3:
|
|
2012
|
+
return "literature_comparison"
|
|
2013
|
+
|
|
2014
|
+
# Citation-heavy with few actual steps → literature comparison
|
|
2015
|
+
if n_citation >= 3 and len(steps) <= 2:
|
|
2016
|
+
return "literature_comparison"
|
|
2017
|
+
|
|
2018
|
+
# Check for biological pathway markers (enzyme names in text)
|
|
2019
|
+
enzyme_pattern = re.compile(
|
|
2020
|
+
r"(ase\b|synthase|transferase|reductase|oxidase|kinase|"
|
|
2021
|
+
r"isomerase|mutase|ligase|lyase|dehydrogenase)",
|
|
2022
|
+
re.IGNORECASE
|
|
2023
|
+
)
|
|
2024
|
+
enzyme_count = sum(
|
|
2025
|
+
1 for sp in species.values()
|
|
2026
|
+
if sp.element_type == "text" and sp.name
|
|
2027
|
+
and enzyme_pattern.search(sp.name)
|
|
2028
|
+
)
|
|
2029
|
+
if enzyme_count >= 2:
|
|
2030
|
+
return "biological_pathway"
|
|
2031
|
+
|
|
2032
|
+
# Many condition refs → likely synthetic scheme with footnoted conditions
|
|
2033
|
+
# (typical of thesis schemes)
|
|
2034
|
+
|
|
2035
|
+
# Default: synthesis (the most common case)
|
|
2036
|
+
if len(steps) >= 1 and n_frag >= 2:
|
|
2037
|
+
return "synthesis"
|
|
2038
|
+
|
|
2039
|
+
return "unknown"
|
|
2040
|
+
|
|
2041
|
+
|
|
2042
|
+
# ---------------------------------------------------------------------------
|
|
2043
|
+
# Narrative generation
|
|
2044
|
+
# ---------------------------------------------------------------------------
|
|
2045
|
+
|
|
2046
|
+
def _species_display(sp: SpeciesRecord, include_smiles: bool = True) -> str:
|
|
2047
|
+
"""Best available display string for a species.
|
|
2048
|
+
|
|
2049
|
+
Priority: label > aligned_iupac > name > formula > SMILES.
|
|
2050
|
+
"""
|
|
2051
|
+
parts = []
|
|
2052
|
+
if sp.label:
|
|
2053
|
+
parts.append(sp.label)
|
|
2054
|
+
elif sp.aligned_iupac:
|
|
2055
|
+
parts.append(sp.aligned_iupac)
|
|
2056
|
+
elif sp.name:
|
|
2057
|
+
# Use first line of name only (multi-line condition blocks)
|
|
2058
|
+
first_line = sp.name.split("\n")[0].strip()
|
|
2059
|
+
parts.append(first_line)
|
|
2060
|
+
elif sp.formula:
|
|
2061
|
+
parts.append(sp.formula)
|
|
2062
|
+
elif sp.smiles:
|
|
2063
|
+
parts.append(sp.smiles[:40])
|
|
2064
|
+
else:
|
|
2065
|
+
parts.append(sp.id)
|
|
2066
|
+
|
|
2067
|
+
# When a label is used as primary, add the aligned name as qualifier
|
|
2068
|
+
if sp.label and sp.aligned_iupac:
|
|
2069
|
+
parts.append(f"({sp.aligned_iupac})")
|
|
2070
|
+
elif include_smiles and sp.smiles:
|
|
2071
|
+
# Fallback: add SMILES only if not already used as the main display
|
|
2072
|
+
display = parts[0]
|
|
2073
|
+
if display != sp.smiles and display != sp.smiles[:40]:
|
|
2074
|
+
parts.append(f"(SMILES: {sp.smiles})")
|
|
2075
|
+
|
|
2076
|
+
return " ".join(parts)
|
|
2077
|
+
|
|
2078
|
+
|
|
2079
|
+
def _generate_composite_narrative(
|
|
2080
|
+
sub_schemes: List["SchemeDescription"]) -> str:
|
|
2081
|
+
"""Generate narrative for a composite (multi-panel) scheme."""
|
|
2082
|
+
parts = [f"Composite scheme with {len(sub_schemes)} independent "
|
|
2083
|
+
f"sub-schemes:"]
|
|
2084
|
+
parts.append("")
|
|
2085
|
+
for i, sub in enumerate(sub_schemes, 1):
|
|
2086
|
+
# Summarize each sub-scheme
|
|
2087
|
+
header = f"--- Sub-scheme {i} ---"
|
|
2088
|
+
parts.append(header)
|
|
2089
|
+
if sub.narrative:
|
|
2090
|
+
# Indent sub-narrative
|
|
2091
|
+
for line in sub.narrative.split("\n"):
|
|
2092
|
+
parts.append(f" {line}")
|
|
2093
|
+
else:
|
|
2094
|
+
parts.append(f" {sub.num_steps} step(s), "
|
|
2095
|
+
f"{len(sub.species)} species, "
|
|
2096
|
+
f"topology: {sub.topology}")
|
|
2097
|
+
parts.append("")
|
|
2098
|
+
return "\n".join(parts)
|
|
2099
|
+
|
|
2100
|
+
|
|
2101
|
+
def _generate_narrative(desc: SchemeDescription) -> str:
|
|
2102
|
+
"""Generate LLM-consumable natural language description."""
|
|
2103
|
+
parts = []
|
|
2104
|
+
|
|
2105
|
+
# Opening line
|
|
2106
|
+
topo_label = {
|
|
2107
|
+
"linear": "linear",
|
|
2108
|
+
"divergent": "divergent",
|
|
2109
|
+
"convergent": "convergent",
|
|
2110
|
+
"parallel": "parallel (unrelated)",
|
|
2111
|
+
"mixed": "mixed-topology",
|
|
2112
|
+
}.get(desc.topology, desc.topology)
|
|
2113
|
+
|
|
2114
|
+
# Content type label
|
|
2115
|
+
ct_label = {
|
|
2116
|
+
"synthesis": "reaction scheme",
|
|
2117
|
+
"sar_design": "SAR design diagram",
|
|
2118
|
+
"biological_pathway": "biological pathway",
|
|
2119
|
+
"target_array": "target structure",
|
|
2120
|
+
"literature_comparison": "literature comparison",
|
|
2121
|
+
"investigation": "mechanistic investigation",
|
|
2122
|
+
}.get(desc.content_type, "reaction scheme")
|
|
2123
|
+
|
|
2124
|
+
if desc.num_steps == 1:
|
|
2125
|
+
parts.append(f"Single-step {ct_label}.")
|
|
2126
|
+
elif desc.num_steps == 0:
|
|
2127
|
+
parts.append(f"Static {ct_label} (no reaction steps).")
|
|
2128
|
+
else:
|
|
2129
|
+
parts.append(f"{desc.num_steps}-step {topo_label} {ct_label}.")
|
|
2130
|
+
|
|
2131
|
+
# Per-step descriptions
|
|
2132
|
+
for step in desc.steps:
|
|
2133
|
+
step_num = step.step_index + 1
|
|
2134
|
+
line_parts = [f"\nStep {step_num}:"]
|
|
2135
|
+
|
|
2136
|
+
# Reactants
|
|
2137
|
+
reactant_names = []
|
|
2138
|
+
for rid in step.reactant_ids:
|
|
2139
|
+
sp = desc.species.get(rid)
|
|
2140
|
+
if sp:
|
|
2141
|
+
reactant_names.append(_species_display(sp))
|
|
2142
|
+
if reactant_names:
|
|
2143
|
+
line_parts.append(" + ".join(reactant_names))
|
|
2144
|
+
|
|
2145
|
+
# Reagents
|
|
2146
|
+
reagent_names = []
|
|
2147
|
+
for rid in step.reagent_ids:
|
|
2148
|
+
sp = desc.species.get(rid)
|
|
2149
|
+
if sp:
|
|
2150
|
+
reagent_names.append(
|
|
2151
|
+
_species_display(sp, include_smiles=False))
|
|
2152
|
+
if reagent_names:
|
|
2153
|
+
line_parts.append(f"with {', '.join(reagent_names)}")
|
|
2154
|
+
|
|
2155
|
+
# Arrow
|
|
2156
|
+
line_parts.append("->")
|
|
2157
|
+
|
|
2158
|
+
# Products
|
|
2159
|
+
product_names = []
|
|
2160
|
+
for pid in step.product_ids:
|
|
2161
|
+
sp = desc.species.get(pid)
|
|
2162
|
+
if sp:
|
|
2163
|
+
product_names.append(_species_display(sp))
|
|
2164
|
+
if product_names:
|
|
2165
|
+
line_parts.append(" + ".join(product_names))
|
|
2166
|
+
|
|
2167
|
+
# Conditions — combine parsed conditions with raw text fallback
|
|
2168
|
+
if step.conditions:
|
|
2169
|
+
line_parts.append(f"({', '.join(step.conditions)})")
|
|
2170
|
+
elif step.condition_text_raw:
|
|
2171
|
+
# No parsed conditions — use raw text, cleaned up
|
|
2172
|
+
cleaned = []
|
|
2173
|
+
for raw in step.condition_text_raw:
|
|
2174
|
+
for line in raw.split("\n"):
|
|
2175
|
+
line = line.strip()
|
|
2176
|
+
if line:
|
|
2177
|
+
cleaned.append(line)
|
|
2178
|
+
if cleaned:
|
|
2179
|
+
line_parts.append(f"({'; '.join(cleaned)})")
|
|
2180
|
+
|
|
2181
|
+
# Yield
|
|
2182
|
+
if step.yield_text:
|
|
2183
|
+
line_parts.append(f"[{step.yield_text}]")
|
|
2184
|
+
|
|
2185
|
+
# Arrow style annotations
|
|
2186
|
+
if step.arrow_style == "failed":
|
|
2187
|
+
line_parts.append("[FAILED]")
|
|
2188
|
+
elif step.arrow_style == "dashed":
|
|
2189
|
+
line_parts.append("[tentative/planned]")
|
|
2190
|
+
|
|
2191
|
+
# Molecular diff
|
|
2192
|
+
if step.molecular_diff_text:
|
|
2193
|
+
line_parts.append(f"[{step.molecular_diff_text}]")
|
|
2194
|
+
|
|
2195
|
+
parts.append(" ".join(line_parts))
|
|
2196
|
+
|
|
2197
|
+
return "\n".join(parts)
|
|
2198
|
+
|
|
2199
|
+
|
|
2200
|
+
# ---------------------------------------------------------------------------
|
|
2201
|
+
# Substrate scope table detection
|
|
2202
|
+
# ---------------------------------------------------------------------------
|
|
2203
|
+
|
|
2204
|
+
# Regex for scope table yield/result annotations
|
|
2205
|
+
_SCOPE_YIELD_RE = re.compile(r'(\d+(?:\.\d+)?)\s*%')
|
|
2206
|
+
_SCOPE_MASS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*mg')
|
|
2207
|
+
_SCOPE_X_RE = re.compile(r'(?:X|R\d*)\s*=\s*(\w+)', re.IGNORECASE)
|
|
2208
|
+
_SCOPE_LABEL_RE = re.compile(
|
|
2209
|
+
r'(\d+\.\d+[a-z](?:-[a-z])?(?:\')?)' # e.g. "5.70a", "5.70k'", "4.1a-f"
|
|
2210
|
+
r'|'
|
|
2211
|
+
r'(\d+[a-z](?:\')?)' # e.g. "3a", "4b'"
|
|
2212
|
+
)
|
|
2213
|
+
|
|
2214
|
+
|
|
2215
|
+
def _parse_scope_annotation(text: str) -> Optional[dict]:
|
|
2216
|
+
"""Parse a scope table text annotation into structured fields.
|
|
2217
|
+
|
|
2218
|
+
Returns dict with keys: label, conditions_variant, yield_text,
|
|
2219
|
+
mass_text, notes. Returns None if text doesn't look like a scope entry.
|
|
2220
|
+
"""
|
|
2221
|
+
if not text or len(text) < 3:
|
|
2222
|
+
return None
|
|
2223
|
+
|
|
2224
|
+
# Must contain at least one of: yield %, mass mg, X = halide
|
|
2225
|
+
has_yield = _SCOPE_YIELD_RE.search(text) is not None
|
|
2226
|
+
has_mass = _SCOPE_MASS_RE.search(text) is not None
|
|
2227
|
+
has_x = _SCOPE_X_RE.search(text) is not None
|
|
2228
|
+
has_label = _SCOPE_LABEL_RE.search(text) is not None
|
|
2229
|
+
|
|
2230
|
+
if not (has_yield or has_mass or has_x or has_label):
|
|
2231
|
+
return None
|
|
2232
|
+
|
|
2233
|
+
result = {}
|
|
2234
|
+
|
|
2235
|
+
# Extract compound label
|
|
2236
|
+
m = _SCOPE_LABEL_RE.search(text)
|
|
2237
|
+
if m:
|
|
2238
|
+
result["label"] = m.group(1) or m.group(2)
|
|
2239
|
+
elif has_x:
|
|
2240
|
+
# Try numeric-only label (e.g. "4.22") when followed by X=/R= variant
|
|
2241
|
+
m_num = re.match(r'(\d+\.\d+)\s+', text)
|
|
2242
|
+
if m_num:
|
|
2243
|
+
result["label"] = m_num.group(1)
|
|
2244
|
+
|
|
2245
|
+
# Extract conditions variant (X = I, R3 = F, etc.)
|
|
2246
|
+
# Capture all variable assignments in the line
|
|
2247
|
+
var_matches = _SCOPE_X_RE.findall(text)
|
|
2248
|
+
if var_matches:
|
|
2249
|
+
# Rebuild the full conditions string from all matches
|
|
2250
|
+
all_matches = list(_SCOPE_X_RE.finditer(text))
|
|
2251
|
+
result["conditions_variant"] = ", ".join(m.group(0) for m in all_matches)
|
|
2252
|
+
|
|
2253
|
+
# Extract yield
|
|
2254
|
+
yields = _SCOPE_YIELD_RE.findall(text)
|
|
2255
|
+
if yields:
|
|
2256
|
+
result["yield_text"] = yields[0] + "%"
|
|
2257
|
+
|
|
2258
|
+
# Extract mass
|
|
2259
|
+
masses = _SCOPE_MASS_RE.findall(text)
|
|
2260
|
+
if masses:
|
|
2261
|
+
result["mass_text"] = masses[0] + " mg"
|
|
2262
|
+
|
|
2263
|
+
# Notes: special annotations like "Reaction failed", "Scale-up:", etc.
|
|
2264
|
+
notes_parts = []
|
|
2265
|
+
if re.search(r'\bfailed\b', text, re.IGNORECASE):
|
|
2266
|
+
notes_parts.append("Reaction failed")
|
|
2267
|
+
m = re.search(r'[Ss]cale-up[:\s]*(\d+\s*mg[,\s]*\d+\s*%)', text)
|
|
2268
|
+
if m:
|
|
2269
|
+
notes_parts.append(f"Scale-up: {m.group(1).strip()}")
|
|
2270
|
+
result["notes"] = "; ".join(notes_parts) if notes_parts else None
|
|
2271
|
+
|
|
2272
|
+
return result
|
|
2273
|
+
|
|
2274
|
+
|
|
2275
|
+
def _scope_text_bbox(
    t_el: ET.Element,
    t_parent: Optional[ET.Element],
) -> Optional[List[float]]:
    """Return a bounding box for a ``<t>`` element, or ``None`` if unusable.

    The ``<t>`` element is inspected first, then its parent.  The first
    element that carries either a ``BoundingBox`` or a ``p`` attribute
    decides the outcome: if that attribute cannot be parsed, no further
    element is consulted and ``None`` is returned.

    A ``BoundingBox`` must contain at least four numbers; shorter values
    are rejected here because the caller indexes positions 0..3.
    A ``p`` position is widened into a nominal 50 x 10 box.
    """
    candidates = [t_el] if t_parent is None else [t_el, t_parent]
    for el in candidates:
        bb_str = el.get("BoundingBox", "")
        if bb_str:
            try:
                vals = [float(v) for v in bb_str.split()]
            except ValueError:
                return None
            return vals if len(vals) >= 4 else None

        pos = el.get("p", "")
        if pos:
            try:
                parts = pos.split()
                x, y = float(parts[0]), float(parts[1])
            except (ValueError, IndexError):
                return None
            return [x, y, x + 50, y + 10]
    return None


def _tag_scope_annotation(parsed: dict, text_el: dict) -> dict:
    """Copy the source text-box ID and centre onto a parsed annotation."""
    parsed["_text_id"] = text_el["id"]
    parsed["_cx"] = text_el["cx"]
    parsed["_cy"] = text_el["cy"]
    return parsed


def _detect_scope_table(
    page: ET.Element,
    id_map: Dict[str, ET.Element],
    raw_steps: List,
    species_dict: Dict[str, SpeciesRecord],
    elem_to_species: Dict[str, List[str]],
    use_network: bool = True,
    use_chemscript: bool = False,
) -> Tuple[List[ScopeEntry], Dict[str, SpeciesRecord]]:
    """Detect substrate scope table entries from unclaimed text annotations.

    The page is only examined when it contains at least one
    ``<bracketedgroup>`` element (the primary scope signal).  In that case
    every ``<t>`` element that is

    * not nested inside a ``<fragment>`` (i.e. not an atom label),
    * non-empty,
    * positioned (``BoundingBox`` or ``p`` on itself or its parent), and
    * not already claimed by a step or by a registered species

    is run through ``_parse_scope_annotation``.  A multi-line text box is
    split into one entry per line when at least two lines carry a label,
    or at least two lines carry a conditions variant; otherwise the whole
    box becomes a single entry.

    ``id_map``, ``elem_to_species``, ``use_network`` and ``use_chemscript``
    are accepted for interface compatibility but are not used here.

    Returns ``(scope_entries, new_species)``.  ``new_species`` is always an
    empty dict in the current implementation.
    """
    # Element IDs already accounted for by steps or by known species.
    claimed: Set[str] = set()
    for step in raw_steps:
        claimed.update(step.reactant_elem_ids)
        claimed.update(step.product_elem_ids)
        claimed.update(step.above_arrow_ids)
        claimed.update(step.below_arrow_ids)
    for sp in species_dict.values():
        claimed.add(sp.cdxml_element_id)

    # Without a bracketed group there is no scope table to look for.
    bracketed_groups = list(page.iter("bracketedgroup"))
    if not bracketed_groups:
        return [], {}

    _log(f"Found {len(bracketed_groups)} bracketedgroup element(s)")

    # ElementTree keeps no parent pointers, so build a child -> parent map.
    parent_map: Dict[ET.Element, ET.Element] = {}
    for parent in page.iter():
        for child in parent:
            parent_map[child] = parent

    # Collect positioned, unclaimed text boxes outside of fragments.
    text_elements = []
    for t_el in page.iter("t"):
        in_fragment = False
        node = t_el
        while node in parent_map:
            node = parent_map[node]
            if node.tag == "fragment":
                in_fragment = True
                break
        if in_fragment:
            continue

        text_content = _get_text_content(t_el)
        if not text_content:
            continue

        t_parent = parent_map.get(t_el)
        bb = _scope_text_bbox(t_el, t_parent)
        if bb is None:
            continue

        t_el_id = t_el.get("id", "")
        t_parent_id = t_parent.get("id", "") if t_parent is not None else ""
        # Conditions text belongs to a step; match on either the <t> ID
        # or the ID of its parent.
        if (t_el_id and t_el_id in claimed) or \
                (t_parent_id and t_parent_id in claimed):
            continue

        text_elements.append({
            "text": text_content,
            # Prefer the parent ID for display, else the element's own ID.
            "id": t_parent_id if t_parent_id else t_el_id,
            "cx": (bb[0] + bb[2]) / 2,
            "cy": (bb[1] + bb[3]) / 2,
            "bb": bb,
        })

    # Parse scope annotations.  A box such as "4.22 X = H\n4.26 X = Me"
    # yields two entries; "5.70a\nX = I\n22 mg, 39%" yields one.
    scope_annotations = []
    for te in text_elements:
        full_text = te["text"]
        lines = [ln.strip() for ln in full_text.split("\n") if ln.strip()]

        line_parseds = []
        split_by_line = False
        if len(lines) > 1:
            line_parseds = [_parse_scope_annotation(ln) for ln in lines]
            n_labels = sum(
                1 for lp in line_parseds if lp and lp.get("label"))
            n_x_variants = sum(
                1 for lp in line_parseds
                if lp and lp.get("conditions_variant"))
            split_by_line = (n_labels >= 2 or n_x_variants >= 2)

        if split_by_line:
            found = [lp for lp in line_parseds if lp]
        else:
            whole = _parse_scope_annotation(full_text)
            found = [whole] if whole else []

        for parsed in found:
            scope_annotations.append(_tag_scope_annotation(parsed, te))

    if not scope_annotations:
        return [], {}

    _log(f"Found {len(scope_annotations)} scope annotation(s)")

    # One ScopeEntry per annotation; no spatial clustering is performed.
    scope_entries: List[ScopeEntry] = [
        ScopeEntry(
            entry_id=f"scope_{i}",
            label=ann.get("label"),
            conditions_variant=ann.get("conditions_variant"),
            yield_text=ann.get("yield_text"),
            mass_text=ann.get("mass_text"),
            notes=ann.get("notes"),
        )
        for i, ann in enumerate(scope_annotations)
    ]
    new_species: Dict[str, SpeciesRecord] = {}

    return scope_entries, new_species
|
|
2474
|
+
|
|
2475
|
+
|
|
2476
|
+
# ---------------------------------------------------------------------------
|
|
2477
|
+
# Aligned IUPAC naming enrichment
|
|
2478
|
+
# ---------------------------------------------------------------------------
|
|
2479
|
+
|
|
2480
|
+
# Common heterocyclic ring names for parent normalization, ordered
|
|
2481
|
+
# largest-first so that "benzimidazole" matches before "imidazole".
|
|
2482
|
+
_KNOWN_RING_NAMES = [
|
|
2483
|
+
'benzimidazole', 'isoquinoline', 'quinazoline', 'naphthalene',
|
|
2484
|
+
'quinoline', 'carbazole', 'acridine',
|
|
2485
|
+
'morpholine', 'piperidine', 'piperazine', 'pyrimidine',
|
|
2486
|
+
'pyridine', 'thiophene', 'imidazole', 'thiazole', 'oxazole',
|
|
2487
|
+
'indole', 'furan', 'benzene',
|
|
2488
|
+
]
|
|
2489
|
+
|
|
2490
|
+
|
|
2491
|
+
def _find_preferred_parent(desc: "SchemeDescription") -> str:
|
|
2492
|
+
"""Pre-scan all principal species to find the dominant naming parent.
|
|
2493
|
+
|
|
2494
|
+
Decomposes each unique principal species (highest-MW per step) into
|
|
2495
|
+
its available naming parents, normalises each parent to a root ring
|
|
2496
|
+
name (e.g. "3-bromoquinoline" → "quinoline"), then picks the root
|
|
2497
|
+
ring that appears in the most compounds.
|
|
2498
|
+
|
|
2499
|
+
Tiebreaker: prefer the ring present in the **final product**
|
|
2500
|
+
(last step's product). In drug-discovery synthesis, the final product
|
|
2501
|
+
defines the target scaffold — transformations build *toward* that ring,
|
|
2502
|
+
while other ring substituents (morpholine, piperidine) are passengers.
|
|
2503
|
+
|
|
2504
|
+
Returns the root ring name (e.g. "quinoline") or "" if none found.
|
|
2505
|
+
"""
|
|
2506
|
+
try:
|
|
2507
|
+
from ..naming.name_decomposer import decompose_name
|
|
2508
|
+
except ImportError:
|
|
2509
|
+
return ""
|
|
2510
|
+
|
|
2511
|
+
# Collect unique principal SMILES across all steps, preserving order
|
|
2512
|
+
principal_smiles: Dict[str, None] = {} # ordered set
|
|
2513
|
+
for step in desc.steps:
|
|
2514
|
+
for role_ids in [step.reactant_ids, step.product_ids]:
|
|
2515
|
+
sps = [desc.species[sid] for sid in role_ids
|
|
2516
|
+
if sid in desc.species and desc.species[sid].smiles]
|
|
2517
|
+
if sps:
|
|
2518
|
+
best = max(sps, key=lambda s: s.mw or 0)
|
|
2519
|
+
if best.smiles:
|
|
2520
|
+
principal_smiles[best.smiles] = None
|
|
2521
|
+
|
|
2522
|
+
if not principal_smiles:
|
|
2523
|
+
return ""
|
|
2524
|
+
|
|
2525
|
+
# Find the final product SMILES (last step's principal product)
|
|
2526
|
+
final_product_smiles = ""
|
|
2527
|
+
if desc.steps:
|
|
2528
|
+
last_step = desc.steps[-1]
|
|
2529
|
+
prod_sps = [desc.species[sid] for sid in last_step.product_ids
|
|
2530
|
+
if sid in desc.species and desc.species[sid].smiles]
|
|
2531
|
+
if prod_sps:
|
|
2532
|
+
final_product_smiles = max(prod_sps,
|
|
2533
|
+
key=lambda s: s.mw or 0).smiles or ""
|
|
2534
|
+
|
|
2535
|
+
# Decompose each and collect all available parent names
|
|
2536
|
+
from collections import Counter
|
|
2537
|
+
ring_counts: Counter = Counter()
|
|
2538
|
+
final_prod_rings: set = set() # rings in the final product
|
|
2539
|
+
|
|
2540
|
+
for smi in principal_smiles:
|
|
2541
|
+
try:
|
|
2542
|
+
r = decompose_name(smi)
|
|
2543
|
+
except Exception:
|
|
2544
|
+
continue
|
|
2545
|
+
|
|
2546
|
+
# Gather all parent strings for this compound
|
|
2547
|
+
parents: set = set()
|
|
2548
|
+
if r.canonical_parent:
|
|
2549
|
+
parents.add(r.canonical_parent.lower())
|
|
2550
|
+
for alt in r.alternatives:
|
|
2551
|
+
if alt.valid and alt.parent_name:
|
|
2552
|
+
parents.add(alt.parent_name.lower())
|
|
2553
|
+
# Also check the name itself for ring stems (handles cases
|
|
2554
|
+
# where the ring appears in complex parent names like
|
|
2555
|
+
# "4-(4-phenylquinolin-2-yl)morpholine")
|
|
2556
|
+
all_names = [r.canonical_name.lower()]
|
|
2557
|
+
all_names.extend(a.name.lower() for a in r.alternatives if a.valid)
|
|
2558
|
+
|
|
2559
|
+
# Normalise: for each parent/name, find which root ring it contains
|
|
2560
|
+
compound_rings: set = set()
|
|
2561
|
+
for text in list(parents) + all_names:
|
|
2562
|
+
for ring in _KNOWN_RING_NAMES:
|
|
2563
|
+
# Match both full form ("quinoline") and stem ("quinolin")
|
|
2564
|
+
ring_stem = ring.rstrip('e')
|
|
2565
|
+
if ring in text or ring_stem in text:
|
|
2566
|
+
compound_rings.add(ring)
|
|
2567
|
+
ring_counts.update(compound_rings)
|
|
2568
|
+
|
|
2569
|
+
# Remember which rings the final product has
|
|
2570
|
+
if smi == final_product_smiles:
|
|
2571
|
+
final_prod_rings = compound_rings.copy()
|
|
2572
|
+
|
|
2573
|
+
if not ring_counts:
|
|
2574
|
+
return ""
|
|
2575
|
+
|
|
2576
|
+
# Pick the ring present in the most compounds.
|
|
2577
|
+
# Tiebreaker: prefer rings from the final product (the target scaffold),
|
|
2578
|
+
# then larger ring systems.
|
|
2579
|
+
best_ring = max(
|
|
2580
|
+
ring_counts,
|
|
2581
|
+
key=lambda r: (ring_counts[r],
|
|
2582
|
+
1 if r in final_prod_rings else 0,
|
|
2583
|
+
len(r)))
|
|
2584
|
+
_log(f"Preferred naming parent: {best_ring} "
|
|
2585
|
+
f"(in {ring_counts[best_ring]}/{len(principal_smiles)} compounds"
|
|
2586
|
+
f"{', final-product' if best_ring in final_prod_rings else ''})")
|
|
2587
|
+
return best_ring
|
|
2588
|
+
|
|
2589
|
+
|
|
2590
|
+
def _enrich_aligned_names(desc: "SchemeDescription") -> None:
|
|
2591
|
+
"""Populate aligned_iupac on species and molecular_diff_text on steps.
|
|
2592
|
+
|
|
2593
|
+
For each step, finds the principal SM/product pair (largest MW),
|
|
2594
|
+
runs ``find_aligned_names`` to get MCS-based aligned IUPAC names,
|
|
2595
|
+
and fills ``format_molecular_diff`` text on the step.
|
|
2596
|
+
|
|
2597
|
+
Uses a global "preferred parent" strategy: pre-scans all principal
|
|
2598
|
+
species to find the dominant ring system, then passes it as a hint
|
|
2599
|
+
to every step so the entire scheme uses a consistent naming backbone.
|
|
2600
|
+
|
|
2601
|
+
Gracefully degrades if aligned_namer is unavailable.
|
|
2602
|
+
"""
|
|
2603
|
+
try:
|
|
2604
|
+
from ..naming.aligned_namer import (
|
|
2605
|
+
find_aligned_names,
|
|
2606
|
+
format_molecular_diff,
|
|
2607
|
+
)
|
|
2608
|
+
except ImportError:
|
|
2609
|
+
return
|
|
2610
|
+
|
|
2611
|
+
# Find the globally preferred naming parent
|
|
2612
|
+
preferred_parent = _find_preferred_parent(desc)
|
|
2613
|
+
|
|
2614
|
+
for step in desc.steps:
|
|
2615
|
+
sm_list = [desc.species[sid] for sid in step.reactant_ids
|
|
2616
|
+
if sid in desc.species and desc.species[sid].smiles]
|
|
2617
|
+
prod_list = [desc.species[sid] for sid in step.product_ids
|
|
2618
|
+
if sid in desc.species and desc.species[sid].smiles]
|
|
2619
|
+
|
|
2620
|
+
if not sm_list or not prod_list:
|
|
2621
|
+
continue
|
|
2622
|
+
|
|
2623
|
+
# Principal pair: largest MW (the "core" substrate, not additives)
|
|
2624
|
+
sm_sp = max(sm_list, key=lambda s: s.mw or 0)
|
|
2625
|
+
prod_sp = max(prod_list, key=lambda s: s.mw or 0)
|
|
2626
|
+
|
|
2627
|
+
if not sm_sp.smiles or not prod_sp.smiles:
|
|
2628
|
+
continue
|
|
2629
|
+
|
|
2630
|
+
try:
|
|
2631
|
+
ar = find_aligned_names(sm_sp.smiles, prod_sp.smiles,
|
|
2632
|
+
preferred_parent=preferred_parent or None)
|
|
2633
|
+
|
|
2634
|
+
# Only set aligned_iupac if not already assigned by a previous
|
|
2635
|
+
# step. This preserves naming consistency for intermediates.
|
|
2636
|
+
if ar.best_sm_name and not sm_sp.aligned_iupac:
|
|
2637
|
+
sm_sp.aligned_iupac = ar.best_sm_name
|
|
2638
|
+
if ar.best_prod_name and not prod_sp.aligned_iupac:
|
|
2639
|
+
prod_sp.aligned_iupac = ar.best_prod_name
|
|
2640
|
+
|
|
2641
|
+
diff_text = format_molecular_diff(
|
|
2642
|
+
sm_sp.smiles, prod_sp.smiles, ar)
|
|
2643
|
+
if diff_text:
|
|
2644
|
+
step.molecular_diff_text = diff_text
|
|
2645
|
+
except Exception:
|
|
2646
|
+
# Non-critical enrichment — don't break scheme reading
|
|
2647
|
+
pass
|
|
2648
|
+
|
|
2649
|
+
|
|
2650
|
+
# ---------------------------------------------------------------------------
|
|
2651
|
+
# Main API
|
|
2652
|
+
# ---------------------------------------------------------------------------
|
|
2653
|
+
|
|
2654
|
+
def read_scheme(
    cdxml_path: str,
    use_network: bool = True,
    use_chemscript: bool = False,
    verbose: bool = False,
    segment: bool = False,
    _scheme_filter: Optional[Set[str]] = None,
) -> SchemeDescription:
    """Parse a CDXML reaction scheme into a structured description.

    Steps are taken from ``<scheme><step>`` attributes when any are found;
    otherwise the geometry-based spatial engine result is used.  The
    spatial engine is always run, because its layout pattern and
    confidence metadata are reported regardless of the chosen strategy.

    Parameters
    ----------
    cdxml_path : str
        Path to CDXML file.
    use_network : bool
        Allow PubChem network lookups for text label resolution.
    use_chemscript : bool
        Use ChemScript for SMILES extraction (best abbreviation resolution,
        requires ChemDraw 16+ on Windows).  Falls back to RDKit-based
        resolution if ChemScript is unavailable.
    verbose : bool
        Print debug info to stderr.
    segment : bool
        Auto-segment multi-panel CDXML files into independent sub-schemes.
        When True, the returned SchemeDescription may have a non-empty
        ``sub_schemes`` list with independent sub-scheme descriptions.
    _scheme_filter : set of str, optional
        Internal parameter used by the segmenter.  If provided, only
        process ``<scheme>`` elements whose ``id`` is in this set.

    Returns
    -------
    SchemeDescription
        Complete structured description with species, steps, topology,
        and narrative.
    """
    global _verbose
    _verbose = verbose

    from ..cdxml_utils import parse_cdxml, build_id_map

    page = parse_cdxml(cdxml_path).getroot().find(".//page")
    source_file = os.path.abspath(cdxml_path)
    if page is None:
        return SchemeDescription(
            source_file=source_file,
            warnings=["No <page> element found in CDXML"],
        )

    id_map = build_id_map(page)

    # ---- Auto-segmentation: split independent panels -------------------
    # Skipped on the recursive per-segment calls (they pass a filter).
    if segment and _scheme_filter is None:
        from .scheme_segmenter import segment_scheme as _segment_scheme
        seg_result = _segment_scheme(cdxml_path, verbose=verbose)
        if seg_result.is_multi_panel and seg_result.num_segments > 1:
            _log(f"Multi-panel detected: {seg_result.num_segments} segments")
            sub_schemes = []
            merged_species: Dict[str, SpeciesRecord] = {}
            merged_steps: List[StepRecord] = []
            for seg in seg_result.segments:
                panel = read_scheme(
                    cdxml_path,
                    use_network=use_network,
                    use_chemscript=use_chemscript,
                    verbose=verbose,
                    segment=False,
                    _scheme_filter=set(seg.scheme_element_ids),
                )
                panel.source_file = source_file
                sub_schemes.append(panel)
                merged_species.update(panel.species)
                merged_steps.extend(panel.steps)

            # Composite description wrapping all panels.
            return SchemeDescription(
                source_file=source_file,
                topology="parallel",
                content_type="composite",
                num_steps=sum(panel.num_steps for panel in sub_schemes),
                species=merged_species,
                steps=merged_steps,
                sub_schemes=sub_schemes,
                narrative=_generate_composite_narrative(sub_schemes),
            )

    # ---- Dual-strategy parsing ------------------------------------------
    # The spatial engine works on every file and also supplies metadata;
    # step attributes, when present, encode the author's explicit grouping
    # and therefore take precedence.
    geo_steps = _parse_from_spatial_engine(page, id_map)
    spatial_meta = getattr(_parse_from_spatial_engine, "_last_meta", {})
    layout_pattern = spatial_meta.get("layout_pattern")
    confidence_map = spatial_meta.get("confidences", {})
    if geo_steps:
        _log(f"Spatial engine: {len(geo_steps)} step(s), "
             f"layout={layout_pattern}")

    attr_steps = _parse_from_step_attributes(page, id_map,
                                             scheme_filter=_scheme_filter)
    if attr_steps:
        _log(f"Step-attribute path: {len(attr_steps)} step(s)")

    if attr_steps:
        raw_steps, parse_method = attr_steps, "step_attribute"
    elif geo_steps:
        raw_steps, parse_method = geo_steps, "geometry"
    else:
        raw_steps, parse_method = [], ""

    if not raw_steps:
        # No reaction steps: still enumerate every structure on the page.
        static_species = _build_static_species_registry(
            page, id_map, use_network=use_network,
            use_chemscript=use_chemscript)
        desc = SchemeDescription(
            source_file=source_file,
            content_type="target_array" if static_species else "unknown",
            species=static_species,
            warnings=["No reaction steps found "
                      "(no <step> attributes, no arrows)"],
        )
        desc.narrative = _generate_narrative(desc)
        return desc

    # Orphan transition arrows (serpentine vertical connectors placed
    # outside <scheme><step>) are only recovered on the attribute path.
    if parse_method == "step_attribute":
        n_before = len(raw_steps)
        raw_steps = _recover_orphan_transition_steps(page, raw_steps, id_map)
        if len(raw_steps) > n_before:
            _log(f"Recovered {len(raw_steps) - n_before} orphan "
                 f"transition step(s)")

    _log(f"Found {len(raw_steps)} step(s)")

    # ---- Species registry and step records ------------------------------
    species_dict, elem_to_species = _build_species_registry(
        raw_steps, id_map, page,
        use_network=use_network,
        use_chemscript=use_chemscript,
    )
    _log(f"Built registry with {len(species_dict)} species")

    steps = _build_step_records(raw_steps, elem_to_species, species_dict,
                                id_map, page)

    # Footnote conditions (e.g. "(a) Pd2(dba)3, BINAP, ...") referenced by
    # condition_ref letters on steps.
    footnotes = _collect_footnotes(page, set(elem_to_species.keys()))
    if footnotes:
        _resolve_footnote_conditions(steps, species_dict, footnotes)
        _log(f"Resolved {len(footnotes)} footnote(s)")

    # Wrap-repeat layouts re-draw intermediates with new element IDs;
    # link those repeats across separate <scheme> elements.
    _link_repeated_species(steps, species_dict)

    topology = _detect_topology(steps)
    _log(f"Detected topology: {topology}")

    content_type = _detect_content_type(steps, species_dict)
    _log(f"Detected content type: {content_type}")

    # ---- Substrate scope table ------------------------------------------
    scope_entries, scope_species = _detect_scope_table(
        page, id_map, raw_steps, species_dict, elem_to_species,
        use_network=use_network, use_chemscript=use_chemscript,
    )
    if scope_species:
        species_dict.update(scope_species)
    if scope_entries:
        _log(f"Detected {len(scope_entries)} scope table entries")
        if content_type == "synthesis":
            content_type = "substrate_scope"

    desc = SchemeDescription(
        source_file=source_file,
        topology=topology,
        content_type=content_type,
        num_steps=len(steps),
        species=species_dict,
        steps=steps,
        scope_entries=scope_entries,
        layout_pattern=layout_pattern,
        parse_method=parse_method,
        assignment_confidences=confidence_map,
    )

    # Surface low-confidence spatial assignments as warnings.
    for elem_id, conf in confidence_map.items():
        if conf < 0.5:
            desc.warnings.append(
                f"Low confidence ({conf:.2f}) assigning element {elem_id}")

    # Aligned IUPAC names + molecular diffs, then the narrative that
    # is generated from the enriched description.
    _enrich_aligned_names(desc)
    desc.narrative = _generate_narrative(desc)

    return desc
|
|
2884
|
+
|
|
2885
|
+
|
|
2886
|
+
# ---------------------------------------------------------------------------
|
|
2887
|
+
# CLI
|
|
2888
|
+
# ---------------------------------------------------------------------------
|
|
2889
|
+
|
|
2890
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the scheme reader.

    Parses *argv* (``sys.argv[1:]`` when ``None``), reads the CDXML file
    with ``read_scheme`` and emits either the narrative text or the JSON
    description.

    All text sent to stdout is written as UTF-8 bytes when the stream
    exposes a binary ``buffer``, so output containing non-ASCII characters
    does not depend on the console encoding.  Streams without ``buffer``
    fall back to a normal text write.

    Returns the process exit status: 1 if the input file does not exist,
    0 otherwise.
    """
    parser = argparse.ArgumentParser(
        prog="scheme_reader",
        description="Read a CDXML reaction scheme and produce structured JSON.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""\
examples:
  python -m cdxml_toolkit.scheme_reader scheme.cdxml
  python -m cdxml_toolkit.scheme_reader scheme.cdxml -o description.json
  python -m cdxml_toolkit.scheme_reader scheme.cdxml --narrative-only
""",
    )
    parser.add_argument("input", help="Input CDXML file with reaction scheme")
    parser.add_argument("-o", "--output",
                        help="Output JSON path (default: stdout)")
    parser.add_argument("--pretty", action="store_true", default=True,
                        help="Pretty-print JSON (default: yes)")
    parser.add_argument("--no-pretty", dest="pretty", action="store_false")
    parser.add_argument("--no-network", action="store_true",
                        help="Disable network lookups (PubChem, OPSIN)")
    parser.add_argument("--chemscript", action="store_true",
                        help="Use ChemScript for SMILES (best abbreviation "
                             "resolution, requires ChemDraw 16+ on Windows)")
    parser.add_argument("--narrative-only", action="store_true",
                        help="Print only the narrative text to stdout")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug info to stderr")

    args = parser.parse_args(argv)

    if not os.path.isfile(args.input):
        print(f"Error: file not found: {args.input}", file=sys.stderr)
        return 1

    def _emit_line(text: str) -> None:
        """Write *text* plus a newline to stdout, as UTF-8 where possible."""
        raw = getattr(sys.stdout, "buffer", None)
        if raw is None:
            # Replaced/captured stdout without a binary layer.
            sys.stdout.write(text + "\n")
            return
        # Flush pending text first so byte and text output stay ordered.
        sys.stdout.flush()
        raw.write(text.encode("utf-8"))
        raw.write(b"\n")

    desc = read_scheme(
        args.input,
        use_network=not args.no_network,
        use_chemscript=args.chemscript,
        verbose=args.verbose,
    )

    if args.narrative_only:
        # str() mirrors what print() would have rendered.
        _emit_line(str(desc.narrative))
        return 0

    if args.output:
        desc.to_json(args.output, pretty=args.pretty)
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        _emit_line(json.dumps(desc.to_dict(),
                              indent=2 if args.pretty else None,
                              ensure_ascii=False))

    return 0
|
|
2945
|
+
|
|
2946
|
+
|
|
2947
|
+
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|