cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2150 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
reaction_parser.py — Unified reaction semantic layer.
|
|
4
|
+
|
|
5
|
+
Parses ELN export files (any combination of CDX, CDXML, RXN, CSV) into a
|
|
6
|
+
single persisted JSON descriptor listing every chemical species with:
|
|
7
|
+
- Canonical SMILES (full + neutral/salt-split)
|
|
8
|
+
- Role classification (atom_contributing / non_contributing / product)
|
|
9
|
+
- Display name (SM, DP, curated abbreviation, CSV name, or formula)
|
|
10
|
+
- Mass data (exact mass, MW, ESI adducts)
|
|
11
|
+
|
|
12
|
+
The JSON output serves as the single source of truth for downstream tools
|
|
13
|
+
(procedure_writer, scheme_merger, flower_predictor, etc.).
|
|
14
|
+
|
|
15
|
+
CLI:
|
|
16
|
+
python reaction_parser.py experiment.cdxml -o reaction.json
|
|
17
|
+
python reaction_parser.py experiment.cdxml --csv exp.csv --pretty
|
|
18
|
+
python reaction_parser.py --input-dir path/ --experiment KL-7001-004
|
|
19
|
+
|
|
20
|
+
Python API:
|
|
21
|
+
from cdxml_toolkit.perception.reaction_parser import parse_reaction, ReactionDescriptor
|
|
22
|
+
desc = parse_reaction(cdxml="scheme.cdxml", csv="exp.csv")
|
|
23
|
+
desc.to_json("reaction.json")
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import json
|
|
28
|
+
import os
|
|
29
|
+
import re
|
|
30
|
+
import sys
|
|
31
|
+
import datetime
|
|
32
|
+
from dataclasses import dataclass, field, asdict
|
|
33
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
34
|
+
from xml.etree import ElementTree as ET
|
|
35
|
+
|
|
36
|
+
from ..constants import MW_MATCH_TOLERANCE, MASS_TOLERANCE
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Logging helper
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
_verbose = False


def _log(msg: str) -> None:
    """Emit *msg* on stderr, but only while the module-level ``_verbose`` flag is truthy."""
    if not _verbose:
        return
    print(msg, file=sys.stderr)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# Data classes
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
@dataclass
class SpeciesDescriptor:
    """One chemical species taking part in the reaction.

    Every field has a default, so an instance can be built from any subset
    of keyword arguments.
    """
    # --- identity and structure ---
    id: str = ""                               # "sp_0", "sp_1", ...
    smiles: Optional[str] = None               # canonical, full (salts together)
    smiles_neutral: Optional[str] = None       # largest fragment (for LCMS)
    name: str = ""                             # display name

    # --- role classification ---
    role: str = ""                             # atom_contributing | non_contributing | product
    role_detail: Optional[str] = None          # from reagent_db: base, catalyst, ...
    rxn_insight_role: Optional[str] = None     # from RXN Insight
    classification_method: str = ""            # role_lookup, rxnmapper, mcs, csv_type, ...
    is_sm: bool = False
    is_dp: bool = False

    # --- mass data ---
    exact_mass: float = 0.0                    # monoisotopic, neutral
    exact_mass_full: float = 0.0               # monoisotopic, full salt
    mw: float = 0.0                            # average MW
    formula: Optional[str] = None
    adducts: Dict[str, float] = field(default_factory=dict)

    # --- provenance ---
    source: str = ""                           # fragment, text_label, csv_only, rxn
    source_id: Optional[str] = None            # CDXML element id

    # --- values carried over from the ELN CSV ---
    csv_equiv: Optional[str] = None
    csv_mass: Optional[str] = None
    csv_name: Optional[str] = None
    csv_volume: Optional[str] = None
    csv_supplier: Optional[str] = None

    # --- v1.1 fields: ELN enrichment ---
    is_substrate: bool = False                 # True = equiv 1.0 in CSV (for scheme layout)
    is_solvent: bool = False                   # From CSV SOLVENT section or reagent_db role
    display_text: Optional[str] = None         # Formatted text for scheme: "Cs2CO3 (2 eq.)"

    # --- v1.2 fields: original CDXML geometry preservation ---
    # Layout of original_geometry:
    #   {
    #     "atoms": [
    #       {"id": 42, "x": 100.0, "y": 200.0, "symbol": "C"},
    #       {"id": 43, "x": 114.4, "y": 207.2, "symbol": "N", "num_hydrogens": 1},
    #       {"id": 44, "x": 128.8, "y": 200.0, "is_abbreviation": True,
    #        "label": "OTs", "label_smiles": "O[S](=O)(C1=CC=C(C)C=C1)=O",
    #        "is_generic": False},
    #       {"id": 45, "x": 140.0, "y": 210.0, "is_generic": True,
    #        "label": "R", "node_type": "GenericNickname"},
    #     ],
    #     "bonds": [
    #       {"begin": 42, "end": 43, "order": 1},
    #       {"begin": 43, "end": 44, "order": 1, "double_position": "Left"},
    #     ],
    #     "bond_length": 14.4,   # average bond length in CDXML points
    #   }
    original_geometry: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, leaving out every field whose value is None.

        Only top-level ``None`` values are dropped; falsy values such as
        ``""``, ``0.0``, ``False`` and ``{}`` are kept.
        """
        compact = {}
        for key, value in asdict(self).items():
            if value is None:
                continue
            compact[key] = value
        return compact
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class ReactionDescriptor:
    """Complete parsed description of one reaction.

    Holds the reaction-level fields plus the list of
    :class:`SpeciesDescriptor` entries, and knows how to convert itself
    to and from plain dicts / JSON files.
    """
    version: str = "1.3"
    experiment: str = ""
    input_files: Dict[str, Optional[str]] = field(default_factory=dict)
    reaction_smiles: Optional[str] = None
    reaction_class: Optional[str] = None
    reaction_name: Optional[str] = None
    classification_confidence: Optional[float] = None
    species: List[SpeciesDescriptor] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    # v1.1 fields — scheme layout and ELN enrichment
    conditions: List[str] = field(default_factory=list)   # ["80 °C", "24 h", "N2"]
    eln_data: Optional[Dict[str, Any]] = None             # run arrow data + procedure

    def to_dict(self) -> dict:
        """Return a plain-dict view; species are converted with their own ``to_dict``.

        The list/dict values (``warnings``, ``metadata``, ...) are the
        descriptor's own objects, not copies.
        """
        return {
            "version": self.version,
            "experiment": self.experiment,
            "input_files": self.input_files,
            "reaction_smiles": self.reaction_smiles,
            "reaction_class": self.reaction_class,
            "reaction_name": self.reaction_name,
            "classification_confidence": self.classification_confidence,
            "species": [entry.to_dict() for entry in self.species],
            "warnings": self.warnings,
            "metadata": self.metadata,
            "conditions": self.conditions,
            "eln_data": self.eln_data,
        }

    @classmethod
    def from_dict(cls, d: dict) -> "ReactionDescriptor":
        """Build a descriptor from a dict such as the one produced by :meth:`to_dict`.

        Species keys that are not ``SpeciesDescriptor`` fields are ignored.
        A missing ``version`` is read as ``"1.0"``.
        """
        known_fields = SpeciesDescriptor.__dataclass_fields__
        parsed_species = [
            SpeciesDescriptor(**{key: value for key, value in raw.items()
                                 if key in known_fields})
            for raw in d.get("species", [])
        ]
        return cls(
            version=d.get("version", "1.0"),
            experiment=d.get("experiment", ""),
            input_files=d.get("input_files", {}),
            reaction_smiles=d.get("reaction_smiles"),
            reaction_class=d.get("reaction_class"),
            reaction_name=d.get("reaction_name"),
            classification_confidence=d.get("classification_confidence"),
            species=parsed_species,
            warnings=d.get("warnings", []),
            metadata=d.get("metadata", {}),
            conditions=d.get("conditions", []),
            eln_data=d.get("eln_data"),
        )

    def to_json(self, path: str, pretty: bool = True) -> None:
        """Write the descriptor to *path* as UTF-8 JSON (2-space indent when *pretty*)."""
        indent = 2 if pretty else None
        with open(path, "w", encoding="utf-8") as handle:
            json.dump(self.to_dict(), handle, indent=indent, ensure_ascii=False)

    @classmethod
    def from_json(cls, path: str) -> "ReactionDescriptor":
        """Read a UTF-8 JSON file from *path* and build a descriptor from it."""
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return cls.from_dict(payload)

    def get_sm(self) -> Optional[SpeciesDescriptor]:
        """Return the first species flagged ``is_sm``, or None."""
        return next((entry for entry in self.species if entry.is_sm), None)

    def get_dp(self) -> Optional[SpeciesDescriptor]:
        """Return the first species flagged ``is_dp``, or None."""
        return next((entry for entry in self.species if entry.is_dp), None)

    def get_expected_species(self) -> List[dict]:
        """Return ExpectedSpecies-compatible dicts for LCMS matching.

        Only species with a positive ``exact_mass`` and a non-empty
        ``smiles`` are included; the neutral SMILES is preferred when set.
        """
        return [
            {
                "name": entry.name,
                "role": _lcms_role(entry),
                "exact_mass": entry.exact_mass,
                "smiles": entry.smiles_neutral or entry.smiles,
                "adducts": dict(entry.adducts),
            }
            for entry in self.species
            if entry.exact_mass > 0 and entry.smiles
        ]

    # -- Default field sets for summary() ------------------------------------
    DEFAULT_SPECIES_FIELDS = [
        "id", "name", "role", "role_detail", "smiles",
        "display_text", "formula", "mw",
    ]
    DEFAULT_TOP_FIELDS = [
        "experiment", "conditions",
    ]
    DEFAULT_ELN_FIELDS = [
        "product_yield", "reaction_type",
    ]
    ALL_SPECIES_FIELDS = list(SpeciesDescriptor.__dataclass_fields__)
    ALL_TOP_FIELDS = [
        "version", "experiment", "input_files", "reaction_smiles",
        "reaction_class", "reaction_name", "classification_confidence",
        "warnings", "metadata", "conditions",
    ]
    ALL_ELN_FIELDS = [
        "sm_mass", "product_obtained", "product_yield", "procedure_text",
        "procedure_plain", "reaction_type", "start_date", "labbook_name",
        "solvents", "solvent_details",
    ]

    def summary(
        self,
        species_fields: Optional[List[str]] = None,
        top_fields: Optional[List[str]] = None,
        eln_fields: Optional[List[str]] = None,
    ) -> dict:
        """Return a slim summary dict for LLM context.

        Parameters
        ----------
        species_fields : list of str, optional
            Per-species fields to include.  ``None`` (or an empty list)
            selects DEFAULT_SPECIES_FIELDS.  Pass ``["*"]`` for all fields.
        top_fields : list of str, optional
            Top-level reaction fields to include.  ``None`` (or an empty
            list) selects DEFAULT_TOP_FIELDS.  Pass ``["*"]`` for all fields.
        eln_fields : list of str, optional
            ``eln_data`` sub-fields to include.  ``None`` selects
            DEFAULT_ELN_FIELDS.  Pass ``["*"]`` for all eln_data fields.
            Pass ``[]`` to omit eln_data.

        Returns
        -------
        dict
            The selected top-level fields, a ``"species"`` list of filtered
            per-species dicts, and ``"eln_data"`` when at least one
            requested sub-field is present.  Fields that are absent (or
            None on a species) are simply left out.
        """
        if species_fields == ["*"]:
            wanted_species = self.ALL_SPECIES_FIELDS
        else:
            wanted_species = species_fields or self.DEFAULT_SPECIES_FIELDS

        if top_fields == ["*"]:
            wanted_top = self.ALL_TOP_FIELDS
        else:
            wanted_top = top_fields or self.DEFAULT_TOP_FIELDS

        # Unlike the two selections above, an explicit empty list is honoured
        # here: it means "leave eln_data out".
        if eln_fields == ["*"]:
            wanted_eln = self.ALL_ELN_FIELDS
        elif eln_fields is None:
            wanted_eln = self.DEFAULT_ELN_FIELDS
        else:
            wanted_eln = eln_fields

        # Top-level fields
        everything = self.to_dict()
        out: Dict[str, Any] = {
            key: everything[key] for key in wanted_top if key in everything
        }

        # Species
        slim_species = []
        for entry in self.species:
            entry_dict = entry.to_dict()
            slim_species.append(
                {key: entry_dict[key] for key in wanted_species if key in entry_dict}
            )
        out["species"] = slim_species

        # ELN data
        if wanted_eln and self.eln_data:
            slim_eln = {
                key: self.eln_data[key] for key in wanted_eln if key in self.eln_data
            }
            if slim_eln:
                out["eln_data"] = slim_eln

        return out
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def reaction_summary(
    json_path: str,
    species_fields: Optional[List[str]] = None,
    top_fields: Optional[List[str]] = None,
    eln_fields: Optional[List[str]] = None,
) -> dict:
    """Load a reaction JSON file and return its slim summary for LLM context.

    Shorthand for ``ReactionDescriptor.from_json(json_path).summary(...)``;
    the three field selections are forwarded unchanged.
    See :meth:`ReactionDescriptor.summary` for parameter docs.

    Available species fields
    ------------------------
    id, name, role, role_detail, rxn_insight_role, smiles, smiles_neutral,
    classification_method, is_sm, is_dp, is_substrate, is_solvent,
    exact_mass, exact_mass_full, mw, formula, adducts, source, source_id,
    csv_equiv, csv_mass, csv_name, csv_volume, csv_supplier, display_text,
    original_geometry

    Available top-level fields
    --------------------------
    version, experiment, input_files, reaction_smiles, reaction_class,
    reaction_name, classification_confidence, warnings, metadata, conditions

    Available eln_data fields
    -------------------------
    sm_mass, product_obtained, product_yield, procedure_text, procedure_plain,
    reaction_type, start_date, labbook_name, solvents, solvent_details
    """
    selection = {
        "species_fields": species_fields,
        "top_fields": top_fields,
        "eln_fields": eln_fields,
    }
    descriptor = ReactionDescriptor.from_json(json_path)
    return descriptor.summary(**selection)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _lcms_role(sp: SpeciesDescriptor) -> str:
    """Translate a species' flags into an ExpectedSpecies role string.

    ``is_sm`` wins and gives ``"substrate"``; otherwise ``is_dp`` or a
    ``role`` of ``"product"`` gives ``"product"``; anything else is
    ``"reactant"``.
    """
    if sp.is_sm:
        return "substrate"
    counts_as_product = sp.is_dp or sp.role == "product"
    return "product" if counts_as_product else "reactant"
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# ---------------------------------------------------------------------------
|
|
333
|
+
# Condition text splitting
|
|
334
|
+
# ---------------------------------------------------------------------------
|
|
335
|
+
|
|
336
|
+
# Patterns that identify non-chemical condition tokens.
# Each pattern is applied with ``match`` (anchored at the token start); a
# token that matches any of them is treated as a reaction condition rather
# than a chemical name.
_CONDITION_PATTERNS = [
    re.compile(r"^-?\d+\.?\d*\s+.{0,2}C.*$"),            # temperature: "80 °C", "105 C", encoding issues
    re.compile(r"^-?\d+\.?\d*\s*[°\u00b0\ufffd].*$"),     # temperature: "80°C", degree prefix
    re.compile(r"^r\.?t\.?$", re.IGNORECASE),             # room temperature
    re.compile(r"^reflux$", re.IGNORECASE),
    re.compile(r"^refl\.?$", re.IGNORECASE),
    re.compile(r"^\d+\.?\d*\s*(h|hr|hrs|min|d|days?)$", re.IGNORECASE),  # time
    re.compile(r"^\d+\.?\d*\s*mol\s*%$", re.IGNORECASE),  # catalyst loading
    re.compile(r"^overnight$", re.IGNORECASE),
    re.compile(r"^o\.?n\.?$", re.IGNORECASE),
    re.compile(r"^\d+\.?\d*\s*bar$", re.IGNORECASE),      # pressure
    re.compile(r"^N[2\u2082]\s*(atm)?$"),                 # N2 atmosphere
    re.compile(r"^Ar\s*(atm)?$"),                         # argon atmosphere
    re.compile(r"^MW$", re.IGNORECASE),                   # microwave
    re.compile(r"^sealed\s+tube$", re.IGNORECASE),
    re.compile(r"^-?\d+\s+to\s+-?\d+", re.IGNORECASE),    # temp range: "-78 to RT", "0 to 25"
    re.compile(r"^-?\d+\s*(?:°|[\u00b0\ufffd])?\s*(?:C\s+)?to\s+(?:RT|r\.?t\.?|-?\d+)", re.IGNORECASE),
    re.compile(r"^\d+\.?\d*\s*equiv?\.?$", re.IGNORECASE),  # equivalents
    re.compile(r"^\d+\.?\d*\s*eq\.?$", re.IGNORECASE),
    # Molarity: "2 M", "0.5 M".  Accepts a decimal part like every other
    # numeric pattern in this table; deliberately case-sensitive so that a
    # lowercase "m" is not read as molar.
    re.compile(r"^\d+\.?\d*\s*M$"),
    re.compile(r"^\d+\.?\d*\s*mL$", re.IGNORECASE),       # volume
    re.compile(r"^then$", re.IGNORECASE),
]
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _is_condition_token(token: str) -> bool:
    """Tell whether *token* matches any entry of ``_CONDITION_PATTERNS``.

    A match means the token is treated as a reaction condition rather than
    a chemical name.
    """
    for pattern in _CONDITION_PATTERNS:
        if pattern.match(token):
            return True
    return False
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def split_condition_text(text: str) -> List[str]:
    """Break a merged condition text block into its chemical-name tokens.

    The text is processed line by line (merged ``<t>`` blocks separate
    reagents with newlines and/or commas).  For each non-empty line:

    * a trailing equivalents annotation such as ``"(2 eq.)"`` is removed;
    * if the whole line is a name known to the reagent database it is kept
      as one token (this is what protects names like "1,4-dioxane");
    * otherwise the line is split on commas/semicolons and every non-empty
      piece that is not a condition token (temperature, time, "rt",
      "reflux", etc.) is kept.

    Returns the chemical name strings in order of appearance.
    """
    from ..resolve.reagent_db import get_reagent_db

    reagent_db = get_reagent_db()
    chemicals: List[str] = []

    # Lines are newline-separated (scheme_polisher merges with \n)
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Drop a trailing equiv annotation: "Cs2CO3 (2 eq.)" -> "Cs2CO3"
        stripped = re.sub(r"\s*\(\d+\.?\d*\s*eq\.?\)\s*$", "", stripped,
                          flags=re.IGNORECASE)

        # Whole-line lookup first, before any comma splitting.
        candidate = stripped.strip()
        if reagent_db.entry_for_name(candidate.lower()):
            chemicals.append(candidate)
            continue

        # Not a known reagent as a whole: split on comma/semicolon and keep
        # each piece that is not a condition.  A line without a delimiter
        # yields a single piece and goes through the same filter.
        for piece in re.split(r"[;,]\s*", stripped):
            piece = piece.strip()
            if piece and not _is_condition_token(piece):
                chemicals.append(piece)

    return chemicals
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def extract_conditions_from_text(text: str) -> List[str]:
    """Collect the condition tokens (temperature, time, atmosphere) found in *text*.

    Counterpart of ``split_condition_text``: each non-empty line has a
    trailing equivalents annotation removed, is split on commas and
    semicolons, and ONLY the pieces recognised as condition tokens are
    returned, in order of appearance.
    """
    found: List[str] = []
    for raw_line in text.split("\n"):
        entry = raw_line.strip()
        if not entry:
            continue
        # Remove a trailing "(N eq.)" annotation before splitting
        entry = re.sub(r"\s*\(\d+\.?\d*\s*eq\.?\)\s*$", "", entry,
                       flags=re.IGNORECASE)
        pieces = (piece.strip() for piece in re.split(r"[;,]\s*", entry))
        found.extend(
            piece for piece in pieces if piece and _is_condition_token(piece)
        )
    return found
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
# ---------------------------------------------------------------------------
|
|
442
|
+
# Text label → SMILES resolution
|
|
443
|
+
# ---------------------------------------------------------------------------
|
|
444
|
+
|
|
445
|
+
def _resolve_text_label(text: str,
                        use_network: bool = True) -> Optional[str]:
    """Resolve a text label to SMILES.

    Resolution chain (first success wins):
      1. reagent_db name → SMILES (curated dictionary; canonicalised with
         RDKit when RDKit is importable and parses the string)
      2. condensed formula parser (generative, offline)
      3. OPSIN (offline, IUPAC/systematic names)
      4. PubChem (online, only if *use_network*)

    A trailing equivalents annotation such as ``"(2 eq.)"`` is removed from
    *text* before lookup.  Steps 2-4 are best-effort: any exception raised
    by a resolver (including a failed import) is reported through ``_log``
    and the next resolver is tried.

    Returns the SMILES string, or None when no resolver succeeds.
    """
    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    # Normalize: strip equiv annotations, whitespace
    clean = re.sub(r"\s*\(\d+\.?\d*\s*eq\.?\)\s*$", "", text,
                   flags=re.IGNORECASE).strip()

    # 1. Reagent DB name → SMILES
    entry = db.entry_for_name(clean.lower())
    if entry:
        smi = entry.get("smiles")
        if smi:
            # May be a list of SMILES variants — take the first
            if isinstance(smi, list):
                smi = smi[0]
            # Try to canonicalize
            try:
                from rdkit import Chem
                mol = Chem.MolFromSmiles(smi)
                if mol:
                    return Chem.MolToSmiles(mol)
            except ImportError:
                # RDKit is optional here; fall through to the curated string.
                pass
            return smi

    # 2. Condensed formula parser (generative, offline)
    try:
        from ..resolve.condensed_formula import resolve_condensed_formula
        smi = resolve_condensed_formula(clean)
        if smi:
            return smi
    except Exception as exc:  # best-effort: move on to the next resolver
        _log(f"  condensed-formula resolution failed for {clean!r}: {exc}")

    # 3. OPSIN (offline)
    try:
        from .reactant_heuristic import _opsin_name_to_smiles
        smi = _opsin_name_to_smiles(clean)
        if smi:
            return smi
    except Exception as exc:  # best-effort: move on to the next resolver
        _log(f"  OPSIN resolution failed for {clean!r}: {exc}")

    # 4. PubChem (online)
    if use_network:
        try:
            from ..resolve.cas_resolver import resolve_name_to_smiles
            smi = resolve_name_to_smiles(clean)
            if smi:
                return smi
        except Exception as exc:  # best-effort: give up quietly
            _log(f"  PubChem resolution failed for {clean!r}: {exc}")

    return None
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# ---------------------------------------------------------------------------
|
|
514
|
+
# Arrow detection and side assignment (does NOT use <step> attributes)
|
|
515
|
+
# ---------------------------------------------------------------------------
|
|
516
|
+
|
|
517
|
+
def _find_arrow(page: ET.Element) -> Optional[ET.Element]:
    """Find the main reaction arrow among the direct children of *page*.

    Direct ``<arrow>`` children take priority.  If there is none, the first
    ``<graphic>`` child with ``GraphicType="Line"`` and a non-empty
    ``ArrowType`` attribute is returned.  A ``SupersededBy`` attribute on a
    graphic does not affect the choice.  Returns None when nothing matches.
    """
    # Direct <arrow> elements
    for el in page:
        if el.tag == "arrow":
            return el

    # <graphic> with arrow attributes (ChemDraw CDXML variant)
    for el in page:
        if (el.tag == "graphic"
                and el.get("GraphicType") == "Line"
                and el.get("ArrowType")):
            return el

    return None
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def _arrow_endpoints(arrow: ET.Element) -> Tuple[float, float, float, float]:
    """Return (tail_x, tail_y, head_x, head_y) for *arrow*.

    Delegates to ``cdxml_utils.arrow_endpoints``; the import is done at
    call time.
    """
    from ..cdxml_utils import arrow_endpoints as shared_arrow_endpoints

    endpoints = shared_arrow_endpoints(arrow)
    return endpoints
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def _fragment_centroid(frag: ET.Element) -> Tuple[float, float]:
    """Average the positions of the fragment's direct ``<n>`` children.

    Only ``<n>`` elements with a non-empty ``p`` attribute contribute; the
    first two whitespace-separated values of ``p`` are read as x and y.
    Returns ``(0.0, 0.0)`` when no such node exists.
    """
    coords = []
    for node in frag.findall("n"):
        pos = node.get("p")
        if not pos:
            continue
        fields = pos.split()
        coords.append((float(fields[0]), float(fields[1])))

    if not coords:
        return 0.0, 0.0

    count = len(coords)
    mean_x = sum(x for x, _ in coords) / count
    mean_y = sum(y for _, y in coords) / count
    return mean_x, mean_y
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
def _text_anchor(t_elem: ET.Element) -> Tuple[float, float]:
    """Return an approximate (x, y) position for a text element.

    Uses the first two values of the ``p`` attribute when it is present
    and non-empty; otherwise the midpoint of ``BoundingBox`` (values 0/2
    averaged for x, values 1/3 for y); otherwise ``(0.0, 0.0)``.
    """
    pos = t_elem.get("p")
    if pos:
        fields = pos.split()
        return float(fields[0]), float(fields[1])

    bbox = t_elem.get("BoundingBox")
    if not bbox:
        return 0.0, 0.0

    box = list(map(float, bbox.split()))
    mid_x = (box[0] + box[2]) / 2
    mid_y = (box[1] + box[3]) / 2
    return mid_x, mid_y
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def _extract_geometry(frag_elem) -> Optional[Dict[str, Any]]:
|
|
574
|
+
"""Extract original CDXML geometry from a <fragment> element.
|
|
575
|
+
|
|
576
|
+
Returns a dict with atoms, bonds, and average bond length that can be
|
|
577
|
+
stored in ``SpeciesDescriptor.original_geometry``. Abbreviation groups
|
|
578
|
+
(``NodeType="Fragment"``) and generic groups (``GenericNickname``, etc.)
|
|
579
|
+
are flagged with their label text so downstream tools can re-abbreviate.
|
|
580
|
+
"""
|
|
581
|
+
_GENERIC_NODETYPES = {"GenericNickname", "Nickname", "Unspecified"}
|
|
582
|
+
|
|
583
|
+
atoms = []
|
|
584
|
+
id_set = set()
|
|
585
|
+
for n in frag_elem.findall("n"):
|
|
586
|
+
nid_str = n.get("id")
|
|
587
|
+
if nid_str is None:
|
|
588
|
+
continue
|
|
589
|
+
nid = int(nid_str)
|
|
590
|
+
node_type = n.get("NodeType")
|
|
591
|
+
if node_type == "ExternalConnectionPoint":
|
|
592
|
+
continue
|
|
593
|
+
|
|
594
|
+
p = n.get("p", "0 0").split()
|
|
595
|
+
x, y = float(p[0]), float(p[1])
|
|
596
|
+
elem = int(n.get("Element", "6"))
|
|
597
|
+
sym = _ELEM_SYMBOLS.get(elem, "C")
|
|
598
|
+
num_h_attr = n.get("NumHydrogens")
|
|
599
|
+
|
|
600
|
+
atom_d: Dict[str, Any] = {"id": nid, "x": x, "y": y, "symbol": sym}
|
|
601
|
+
if num_h_attr is not None:
|
|
602
|
+
atom_d["num_hydrogens"] = int(num_h_attr)
|
|
603
|
+
|
|
604
|
+
# Abbreviation groups (real superatom abbreviations)
|
|
605
|
+
if node_type == "Fragment":
|
|
606
|
+
# Get label text
|
|
607
|
+
label = None
|
|
608
|
+
for t in n.findall("t"):
|
|
609
|
+
parts = []
|
|
610
|
+
for s in t.findall("s"):
|
|
611
|
+
if s.text:
|
|
612
|
+
parts.append(s.text)
|
|
613
|
+
if parts:
|
|
614
|
+
label = "".join(parts)
|
|
615
|
+
break
|
|
616
|
+
atom_d["is_abbreviation"] = True
|
|
617
|
+
atom_d["is_generic"] = False
|
|
618
|
+
if label:
|
|
619
|
+
atom_d["label"] = label
|
|
620
|
+
# Look up the SMILES for this abbreviation
|
|
621
|
+
try:
|
|
622
|
+
from ..resolve.superatom_table import lookup_smiles
|
|
623
|
+
lsmi = lookup_smiles(label)
|
|
624
|
+
if lsmi:
|
|
625
|
+
atom_d["label_smiles"] = lsmi
|
|
626
|
+
except ImportError:
|
|
627
|
+
pass
|
|
628
|
+
|
|
629
|
+
# Generic variable groups (R, X, Ar, R1, etc.)
|
|
630
|
+
elif node_type in _GENERIC_NODETYPES:
|
|
631
|
+
label = None
|
|
632
|
+
# Try GenericNickname attribute first
|
|
633
|
+
label = n.get("GenericNickname")
|
|
634
|
+
if not label:
|
|
635
|
+
for t in n.findall("t"):
|
|
636
|
+
parts = []
|
|
637
|
+
for s in t.findall("s"):
|
|
638
|
+
if s.text:
|
|
639
|
+
parts.append(s.text)
|
|
640
|
+
if parts:
|
|
641
|
+
label = "".join(parts)
|
|
642
|
+
break
|
|
643
|
+
atom_d["is_abbreviation"] = False
|
|
644
|
+
atom_d["is_generic"] = True
|
|
645
|
+
atom_d["node_type"] = node_type
|
|
646
|
+
if label:
|
|
647
|
+
atom_d["label"] = label
|
|
648
|
+
|
|
649
|
+
atoms.append(atom_d)
|
|
650
|
+
id_set.add(nid)
|
|
651
|
+
|
|
652
|
+
bonds = []
|
|
653
|
+
bond_lengths = []
|
|
654
|
+
atom_pos = {a["id"]: (a["x"], a["y"]) for a in atoms}
|
|
655
|
+
for b in frag_elem.findall("b"):
|
|
656
|
+
bi, ei = int(b.get("B", "0")), int(b.get("E", "0"))
|
|
657
|
+
if bi not in id_set or ei not in id_set:
|
|
658
|
+
continue
|
|
659
|
+
order = int(b.get("Order", "1"))
|
|
660
|
+
bond_d: Dict[str, Any] = {"begin": bi, "end": ei, "order": order}
|
|
661
|
+
dp = b.get("DoublePosition")
|
|
662
|
+
if dp:
|
|
663
|
+
bond_d["double_position"] = dp
|
|
664
|
+
bonds.append(bond_d)
|
|
665
|
+
# Compute bond length for average
|
|
666
|
+
if bi in atom_pos and ei in atom_pos:
|
|
667
|
+
dx = atom_pos[bi][0] - atom_pos[ei][0]
|
|
668
|
+
dy = atom_pos[bi][1] - atom_pos[ei][1]
|
|
669
|
+
bl = (dx * dx + dy * dy) ** 0.5
|
|
670
|
+
if bl > 0:
|
|
671
|
+
bond_lengths.append(bl)
|
|
672
|
+
|
|
673
|
+
if not atoms:
|
|
674
|
+
return None
|
|
675
|
+
|
|
676
|
+
result: Dict[str, Any] = {"atoms": atoms, "bonds": bonds}
|
|
677
|
+
if bond_lengths:
|
|
678
|
+
result["bond_length"] = round(sum(bond_lengths) / len(bond_lengths), 2)
|
|
679
|
+
return result
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
# Element number → symbol mapping for _extract_geometry
|
|
683
|
+
_ELEM_SYMBOLS = {
|
|
684
|
+
1: "H", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F",
|
|
685
|
+
14: "Si", 15: "P", 16: "S", 17: "Cl", 35: "Br", 53: "I",
|
|
686
|
+
3: "Li", 11: "Na", 12: "Mg", 19: "K", 20: "Ca", 26: "Fe",
|
|
687
|
+
29: "Cu", 30: "Zn", 46: "Pd", 55: "Cs", 78: "Pt",
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _get_text_content(t_elem: ET.Element) -> str:
|
|
692
|
+
"""Extract plain text content from a <t> element."""
|
|
693
|
+
parts = []
|
|
694
|
+
for s in t_elem.iter("s"):
|
|
695
|
+
if s.text:
|
|
696
|
+
parts.append(s.text)
|
|
697
|
+
return "".join(parts).strip()
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
# ---------------------------------------------------------------------------
|
|
701
|
+
# CDXML extraction (fragments + text labels)
|
|
702
|
+
# ---------------------------------------------------------------------------
|
|
703
|
+
|
|
704
|
+
def _smiles_for_fragment_with_fallbacks(frag_elem, cdxml_path,
                                        resolved_fn, plain_fn):
    """Return the best available SMILES for *frag_elem*, or ``None``.

    Order of preference: abbreviation-resolved RDKit SMILES, plain RDKit
    SMILES (may contain ``*`` for abbreviations), then ChemScript.  ChemScript
    is consulted when RDKit produced nothing, or when the RDKit result still
    has ``*`` placeholders and ChemScript returns a placeholder-free string.
    Either RDKit callable may be ``None`` when the import failed.
    """
    smi = resolved_fn(frag_elem) if resolved_fn is not None else None
    if smi is None and plain_fn is not None:
        smi = plain_fn(frag_elem)

    if smi is None:
        # Total failure on the RDKit side: ChemScript is the last resort.
        return _try_chemscript_smiles(frag_elem, cdxml_path)

    if '*' in smi:
        cs_smi = _try_chemscript_smiles(frag_elem, cdxml_path)
        if cs_smi and '*' not in cs_smi:
            return cs_smi
    return smi


def _extract_from_cdxml(cdxml_path: str,
                        use_network: bool = True,
                        ) -> Tuple[List[SpeciesDescriptor], List[str], List[str]]:
    """Extract species from a CDXML scheme.

    Product vs. reactant is decided by position relative to the arrow, NOT
    from ``<step>`` attributes: a fragment whose centroid lies to the right
    of the arrow head is a ``"product"``, everything else is a
    ``"candidate"`` (reactant or reagent, classified later).  Text labels at
    or left of the arrow head are split into tokens, each of which becomes a
    ``"candidate"`` species; labels right of the head are ignored.

    Args:
        cdxml_path: Path of the CDXML file to parse.
        use_network: Forwarded to ``_resolve_text_label`` for text tokens.

    Returns:
        ``(species_list, warnings, conditions)``.  ``conditions`` holds the
        condition tokens found in the text labels.  When the file has no
        ``<page>`` or no reaction arrow, the species and conditions lists are
        empty and ``warnings`` contains a single explanatory message.
    """
    from ..cdxml_utils import parse_cdxml

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()
    page = root.find(".//page")
    if page is None:
        return [], ["No <page> element found in CDXML"], []

    # Find the arrow
    arrow = _find_arrow(page)
    if arrow is None:
        return [], ["No reaction arrow found in CDXML"], []

    tail_x, tail_y, head_x, head_y = _arrow_endpoints(arrow)
    # Ensure tail is left of head
    if tail_x > head_x:
        tail_x, head_x = head_x, tail_x
        tail_y, head_y = head_y, tail_y

    _log(f" Arrow: tail=({tail_x:.1f}, {tail_y:.1f}), "
         f"head=({head_x:.1f}, {head_y:.1f})")

    # Collect the arrow element id and any graphic superseded by it.  The
    # scan only runs when the arrow has an id: comparing against a missing
    # id would match every graphic that lacks a SupersededBy attribute.
    arrow_ids = set()
    aid = arrow.get("id")
    if aid:
        arrow_ids.add(aid)
        for el in page:
            if el.tag == "graphic" and el.get("SupersededBy") == aid:
                gid = el.get("id")
                if gid:
                    arrow_ids.add(gid)

    species = []
    warnings = []

    # RDKit helpers are optional; anything that failed to import stays None.
    _frag_to_smiles_resolved = None
    _frag_to_smiles_plain = None
    try:
        from ..rdkit_utils import frag_to_smiles_resolved as _frag_to_smiles_resolved
        from ..rdkit_utils import frag_to_smiles as _frag_to_smiles_plain
        from ..rdkit_utils import frag_to_mw as _frag_to_mw
    except ImportError:
        _frag_to_mw = None

    # Process all fragments
    for el in page:
        if el.tag != "fragment":
            continue

        eid = el.get("id", "")
        if eid in arrow_ids:
            continue

        cx, cy = _fragment_centroid(el)

        # Determine role by position relative to arrow
        pos_role = "product" if cx > head_x else "candidate"

        smi = _smiles_for_fragment_with_fallbacks(
            el, cdxml_path, _frag_to_smiles_resolved, _frag_to_smiles_plain)

        mw = 0.0
        if _frag_to_mw is not None:
            mw_val = _frag_to_mw(el)
            if mw_val is not None:
                mw = mw_val

        species.append(SpeciesDescriptor(
            id=f"sp_{len(species)}",
            smiles=smi,
            name="",
            role=pos_role,
            source="fragment",
            source_id=eid,
            mw=mw,
            # Original coordinates + abbreviation data
            original_geometry=_extract_geometry(el),
        ))
        _log(f" Fragment id={eid}: smiles={smi}, pos_role={pos_role}, mw={mw:.1f}")

    # Process text labels (may contain reagent names and condition tokens)
    cdxml_conditions: List[str] = []

    for el in page:
        if el.tag != "t":
            continue

        eid = el.get("id", "")
        if eid in arrow_ids:
            continue

        text = _get_text_content(el)
        if not text:
            continue

        tx, ty = _text_anchor(el)

        # Skip text to the right of the arrow (product labels)
        if tx > head_x:
            continue

        # Extract condition tokens from this text block
        cdxml_conditions.extend(extract_conditions_from_text(text))

        # Split merged condition text into individual chemical tokens
        for token in split_condition_text(text) or ():
            smi = _resolve_text_label(token, use_network=use_network)
            species.append(SpeciesDescriptor(
                id=f"sp_{len(species)}",
                smiles=smi,
                name=token,  # provisional — may be overwritten by display names
                role="candidate",
                source="text_label",
                source_id=eid,
            ))
            _log(f" Text id={eid}: token='{token}', smiles={smi}")

    return species, warnings, cdxml_conditions
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
def _try_chemscript_smiles(frag_elem: ET.Element,
|
|
871
|
+
cdxml_path: str) -> Optional[str]:
|
|
872
|
+
"""Try to extract SMILES from a fragment via ChemScript.
|
|
873
|
+
|
|
874
|
+
Wraps the fragment in a minimal CDXML, writes to temp file,
|
|
875
|
+
and calls ChemScript to export SMILES.
|
|
876
|
+
"""
|
|
877
|
+
try:
|
|
878
|
+
from ..chemdraw.chemscript_bridge import ChemScriptBridge
|
|
879
|
+
from ..constants import CDXML_MINIMAL_HEADER, CDXML_FOOTER
|
|
880
|
+
except ImportError:
|
|
881
|
+
return None
|
|
882
|
+
|
|
883
|
+
import tempfile
|
|
884
|
+
|
|
885
|
+
# Build minimal CDXML containing just this fragment
|
|
886
|
+
frag_xml = ET.tostring(frag_elem, encoding="unicode")
|
|
887
|
+
cdxml_str = f"{CDXML_MINIMAL_HEADER}<page>{frag_xml}</page>{CDXML_FOOTER}"
|
|
888
|
+
|
|
889
|
+
try:
|
|
890
|
+
with tempfile.NamedTemporaryFile(suffix=".cdxml", delete=False,
|
|
891
|
+
mode="w", encoding="utf-8") as f:
|
|
892
|
+
f.write(cdxml_str)
|
|
893
|
+
tmp_path = f.name
|
|
894
|
+
try:
|
|
895
|
+
cs = ChemScriptBridge()
|
|
896
|
+
smi = cs.write_data(tmp_path, "smiles")
|
|
897
|
+
return smi.strip() if smi else None
|
|
898
|
+
finally:
|
|
899
|
+
try:
|
|
900
|
+
os.unlink(tmp_path)
|
|
901
|
+
except OSError:
|
|
902
|
+
pass
|
|
903
|
+
except Exception:
|
|
904
|
+
return None
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
# ---------------------------------------------------------------------------
|
|
908
|
+
# RXN file extraction
|
|
909
|
+
# ---------------------------------------------------------------------------
|
|
910
|
+
|
|
911
|
+
def _extract_from_rxn(rxn_path: str) -> Tuple[List[SpeciesDescriptor], List[str]]:
    """Extract species from an RXN file.

    Tier 1: ChemScript ``load_reaction()`` → SMILES for each component.
    Tier 2: RDKit ``ReactionFromRxnFile()`` → MOL templates → SMILES.

    Tier 2 runs whenever tier 1 raises or does not report ``ok``; it always
    starts from an empty species list, so a tier-1 failure part-way through
    cannot leave duplicate entries or clashing ``sp_N`` ids behind.

    .. warning::
        Neither tier handles V2000 S-group superatom abbreviations
        (``M STY ... SUP`` / ``M SMT ... label``).  Findmolecule RXN
        exports commonly use these for groups like COOH, COOtBu, etc.
        The placeholder atom is read as bare C, producing an incorrect
        SMILES.  **Best practice:** parse CDX (via ChemDraw COM) + CSV
        together; RXN is a supplementary source only.

    Returns:
        ``(species_list, warnings)``.  Reactant-side components get the role
        ``"candidate"``, product-side components ``"product"``.
    """
    warnings: List[str] = []

    # Tier 1: ChemScript
    try:
        from ..chemdraw.chemscript_bridge import ChemScriptBridge
        cs = ChemScriptBridge()
        result = cs.load_reaction(rxn_path)
        if result and result.get("ok"):
            species = []
            for key, role in (("reactants", "candidate"),
                              ("products", "product")):
                for comp in result.get(key, []):
                    species.append(SpeciesDescriptor(
                        id=f"sp_{len(species)}",
                        smiles=comp.get("smiles"),
                        name=comp.get("name", ""),
                        role=role,
                        source="rxn",
                        formula=comp.get("formula"),
                    ))
            _log(f" RXN via ChemScript: {len(species)} species")
            return species, warnings
    except Exception as e:
        _log(f" ChemScript RXN load failed: {e}")

    # Tier 2: RDKit (fresh list: nothing from tier 1 is carried over)
    species = []
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem

        rxn = AllChem.ReactionFromRxnFile(rxn_path)
        if rxn is None:
            warnings.append(f"RDKit could not parse RXN file: {rxn_path}")
            return [], warnings

        template_sides = (
            (rxn.GetNumReactantTemplates, rxn.GetReactantTemplate, "candidate"),
            (rxn.GetNumProductTemplates, rxn.GetProductTemplate, "product"),
        )
        for count_templates, get_template, role in template_sides:
            for i in range(count_templates()):
                mol = get_template(i)
                if mol is None or mol.GetNumAtoms() == 0:
                    continue
                # Sanitization is best-effort; an unsanitized template is
                # still converted to SMILES below.
                try:
                    Chem.SanitizeMol(mol)
                except Exception:
                    pass
                species.append(SpeciesDescriptor(
                    id=f"sp_{len(species)}",
                    smiles=Chem.MolToSmiles(mol),
                    role=role,
                    source="rxn",
                ))

        _log(f" RXN via RDKit: {len(species)} species")
    except ImportError:
        warnings.append("Neither ChemScript nor RDKit available for RXN parsing")
    except Exception as e:
        warnings.append(f"RXN parsing failed: {e}")

    return species, warnings
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
# ---------------------------------------------------------------------------
|
|
1021
|
+
# CSV matching
|
|
1022
|
+
# ---------------------------------------------------------------------------
|
|
1023
|
+
|
|
1024
|
+
def _match_csv_data(species: List[SpeciesDescriptor],
                    csv_path: str) -> Tuple[List[SpeciesDescriptor], List[str], Any]:
    """Match CSV reagent data to species by MW or name.

    Non-product species are matched to CSV reactant rows in three passes;
    every species and every row is used at most once:

    1. name match (raw name or reagent-DB display name, case-insensitive);
    2. closest molecular weight within ``MW_MATCH_TOLERANCE``;
    3. for species without SMILES or MW, the MW of the reagent-DB SMILES for
       the species name (requires RDKit).

    A matched species receives the row's metadata via ``_apply_csv_match``.
    Rows left unmatched are appended as ``source="csv_only"`` species and get
    the same metadata through the same helper.  Finally the first product
    species whose MW is within tolerance of the CSV product is tagged as the
    desired product.

    Args:
        species: Species list; modified in place and also returned.
        csv_path: Path of the ELN CSV export.

    Returns:
        ``(updated_species, warnings, exp_data)``; ``exp_data`` is ``None``
        when the CSV parser is unavailable or the file could not be parsed.
    """
    warnings = []

    try:
        from .eln_csv_parser import parse_eln_csv
    except ImportError:
        warnings.append("eln_csv_parser not available for CSV parsing")
        return species, warnings, None

    exp_data = parse_eln_csv(csv_path)
    if exp_data is None:
        warnings.append(f"Could not parse CSV: {csv_path}")
        return species, warnings, None

    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    # Build match tracking
    matched_species = set()   # species indices already matched
    matched_csv = set()       # CSV reagent indices already matched

    # --- Pass 1: Name match ---
    # Species names do not change during this pass, so their normalized
    # forms are computed once (on first use) rather than once per CSV row.
    species_name_keys = {}    # species index -> (name_lower, display_lower)
    for ci, rgt in enumerate(exp_data.reactants):
        csv_name_lower = rgt.name.strip().lower()
        csv_display = db.resolve_display(rgt.name)

        for si, sp in enumerate(species):
            if si in matched_species or sp.role == "product":
                continue

            if si not in species_name_keys:
                species_name_keys[si] = (
                    (sp.name or "").strip().lower(),
                    db.resolve_display(sp.name or "").lower(),
                )
            sp_name_lower, sp_display_lower = species_name_keys[si]

            if (sp_name_lower and (sp_name_lower == csv_name_lower
                                   or sp_display_lower == csv_display.lower())):
                _apply_csv_match(sp, rgt)
                matched_species.add(si)
                matched_csv.add(ci)
                _log(f" CSV name match: '{rgt.name}' → sp_{si}")
                break

    # --- Pass 2: MW match (species with known MW) ---
    for ci, rgt in enumerate(exp_data.reactants):
        if ci in matched_csv or rgt.mw <= 0:
            continue

        best_si = None
        best_delta = MW_MATCH_TOLERANCE

        for si, sp in enumerate(species):
            if si in matched_species or sp.role == "product" or sp.mw <= 0:
                continue

            delta = abs(sp.mw - rgt.mw)
            if delta < best_delta:
                best_delta = delta
                best_si = si

        if best_si is not None:
            _apply_csv_match(species[best_si], rgt)
            matched_species.add(best_si)
            matched_csv.add(ci)
            _log(f" CSV MW match: '{rgt.name}' (MW={rgt.mw:.1f}) "
                 f"→ sp_{best_si} (MW={species[best_si].mw:.1f})")

    # --- Pass 3: MW match via SMILES from reagent_db ---
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors
        has_rdkit = True
    except ImportError:
        has_rdkit = False

    if has_rdkit:
        for ci, rgt in enumerate(exp_data.reactants):
            if ci in matched_csv or rgt.mw <= 0:
                continue

            for si, sp in enumerate(species):
                if si in matched_species or sp.role == "product":
                    continue
                if sp.smiles or sp.mw > 0:
                    continue  # already has structural data

                # Try to get SMILES from reagent_db for this text label
                sp_name = (sp.name or "").strip()
                if not sp_name:
                    continue
                entry = db.entry_for_name(sp_name.lower())
                if not entry:
                    continue
                smi = entry.get("smiles")
                if not smi:
                    continue
                if isinstance(smi, list):
                    smi = smi[0]

                mol = Chem.MolFromSmiles(smi)
                if mol is None:
                    continue
                text_mw = Descriptors.MolWt(mol)
                delta = abs(text_mw - rgt.mw)
                if delta < MW_MATCH_TOLERANCE:
                    sp.smiles = Chem.MolToSmiles(mol)
                    sp.mw = text_mw
                    _apply_csv_match(sp, rgt)
                    matched_species.add(si)
                    matched_csv.add(ci)
                    _log(f" CSV MW→DB match: '{rgt.name}' → sp_{si} "
                         f"via DB SMILES '{sp_name}'")
                    break

    # --- Add unmatched CSV reagents as csv_only species ---
    sp_idx = max((int(sp.id.split("_")[1]) for sp in species), default=-1) + 1
    for ci, rgt in enumerate(exp_data.reactants):
        if ci in matched_csv:
            continue
        sp = SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            name=rgt.name,
            role="candidate",
            source="csv_only",
            mw=rgt.mw,
        )
        # Same metadata transfer as for matched species (name, equiv, mass,
        # volume, supplier, substrate flags) so both paths stay in agreement.
        _apply_csv_match(sp, rgt)
        # Try to resolve SMILES from name
        smi = _resolve_text_label(rgt.name, use_network=False)
        if smi:
            sp.smiles = smi
        species.append(sp)
        sp_idx += 1
        _log(f" CSV-only species: '{rgt.name}' (MW={rgt.mw:.1f})")

    # --- Match product to CSV ---
    if exp_data.product and exp_data.product.mw > 0:
        for sp in species:
            if sp.role != "product" or sp.mw <= 0:
                continue
            if abs(sp.mw - exp_data.product.mw) < MW_MATCH_TOLERANCE:
                sp.csv_name = exp_data.product.name
                sp.is_dp = True
                _log(f" Product CSV match: '{exp_data.product.name}'")
                break

    return species, warnings, exp_data
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
def _apply_csv_match(sp: SpeciesDescriptor, rgt) -> None:
|
|
1201
|
+
"""Apply CSV reagent data to a species descriptor."""
|
|
1202
|
+
sp.csv_name = rgt.name
|
|
1203
|
+
sp.csv_equiv = rgt.equiv
|
|
1204
|
+
sp.csv_mass = rgt.mass
|
|
1205
|
+
if hasattr(rgt, "volume") and rgt.volume:
|
|
1206
|
+
sp.csv_volume = rgt.volume
|
|
1207
|
+
if hasattr(rgt, "supplier") and rgt.supplier:
|
|
1208
|
+
sp.csv_supplier = rgt.supplier
|
|
1209
|
+
if hasattr(rgt, "is_substrate") and rgt.is_substrate:
|
|
1210
|
+
sp.is_sm = True # Mark from CSV substrate flag
|
|
1211
|
+
sp.is_substrate = True
|
|
1212
|
+
|
|
1213
|
+
|
|
1214
|
+
# ---------------------------------------------------------------------------
|
|
1215
|
+
# Species classification
|
|
1216
|
+
# ---------------------------------------------------------------------------
|
|
1217
|
+
|
|
1218
|
+
def _classify_species(species: List[SpeciesDescriptor],
                      use_rxnmapper: bool = True,
                      use_rxn_insight: bool = True,
                      ) -> Optional[float]:
    """Classify non-product species using the tiered pipeline.

    Every species whose role is ``"candidate"`` is passed to
    ``classify_reagents`` together with the SMILES of the first product that
    has one; the resulting classification, method and role detail are written
    back onto the species (an empty classification becomes
    ``"unclassified"``).

    Args:
        species: Species list; candidates are updated in place.
        use_rxnmapper: Deprecated and ignored (kept for API compat).
        use_rxn_insight: When true, ``_try_rxn_insight`` is run afterwards
            for its per-species annotations; its return value is not used.

    Returns:
        The first non-``None`` Schneider FP score reported for a candidate,
        or ``None`` when there is no product SMILES, no candidate, or no
        score.
    """
    from .reactant_heuristic import ReagentInfo, classify_reagents

    # Find product SMILES (needed for classification)
    product_smiles = next(
        (sp.smiles for sp in species if sp.role == "product" and sp.smiles),
        None,
    )
    if not product_smiles:
        _log(" WARNING: No product SMILES found, cannot classify reagents")
        return None

    candidates = [sp for sp in species if sp.role == "candidate"]
    if not candidates:
        return None

    # Build ReagentInfo list for the classification pipeline; reagents[i]
    # describes candidates[i].
    reagents = [
        ReagentInfo(
            source_id=sp.source_id or sp.id,
            source_type=sp.source,
            name=sp.name or None,
            smiles=sp.smiles,
            position="reactant",
            classification="",
            classification_method="",
        )
        for sp in candidates
    ]

    # Run 2-tier classification (Schneider FP → DB enrichment)
    classify_reagents(reagents, product_smiles)

    # Apply results back to species
    schneider_score = None
    for sp, ri in zip(candidates, reagents):
        sp.role = ri.classification or "unclassified"
        sp.classification_method = ri.classification_method
        sp.role_detail = ri.role
        if schneider_score is None and ri.schneider_score is not None:
            schneider_score = ri.schneider_score

    # --- Optional RXN Insight enrichment ---
    if use_rxn_insight:
        _try_rxn_insight(species, product_smiles)

    return schneider_score
|
|
1285
|
+
|
|
1286
|
+
|
|
1287
|
+
def _try_rxn_insight(species: List[SpeciesDescriptor],
|
|
1288
|
+
product_smiles: str,
|
|
1289
|
+
) -> Tuple[Optional[str], Optional[str]]:
|
|
1290
|
+
"""Try RXN Insight enrichment for reaction class and per-species roles.
|
|
1291
|
+
|
|
1292
|
+
Returns (reaction_class, reaction_name) or (None, None).
|
|
1293
|
+
"""
|
|
1294
|
+
try:
|
|
1295
|
+
from experiments.role_classification.rxn_role_classifier import (
|
|
1296
|
+
classify_roles_enriched,
|
|
1297
|
+
)
|
|
1298
|
+
except ImportError:
|
|
1299
|
+
return None, None
|
|
1300
|
+
|
|
1301
|
+
# Build full reaction SMILES: all reactant/reagent SMILES >> product
|
|
1302
|
+
lhs_parts = []
|
|
1303
|
+
for sp in species:
|
|
1304
|
+
if sp.role != "product" and sp.smiles:
|
|
1305
|
+
lhs_parts.append(sp.smiles)
|
|
1306
|
+
if not lhs_parts:
|
|
1307
|
+
return None, None
|
|
1308
|
+
|
|
1309
|
+
rxn_smi = ".".join(lhs_parts) + ">>" + product_smiles
|
|
1310
|
+
|
|
1311
|
+
try:
|
|
1312
|
+
result = classify_roles_enriched(rxn_smi)
|
|
1313
|
+
except Exception as e:
|
|
1314
|
+
_log(f" RXN Insight failed: {e}")
|
|
1315
|
+
return None, None
|
|
1316
|
+
|
|
1317
|
+
if not result:
|
|
1318
|
+
return None, None
|
|
1319
|
+
|
|
1320
|
+
rxn_class = result.get("reaction_class")
|
|
1321
|
+
rxn_name = result.get("reaction_name")
|
|
1322
|
+
|
|
1323
|
+
# Map per-component roles back to species
|
|
1324
|
+
try:
|
|
1325
|
+
from rdkit import Chem
|
|
1326
|
+
def _canon(smi):
|
|
1327
|
+
mol = Chem.MolFromSmiles(smi)
|
|
1328
|
+
return Chem.MolToSmiles(mol) if mol else smi
|
|
1329
|
+
except ImportError:
|
|
1330
|
+
def _canon(smi):
|
|
1331
|
+
return smi
|
|
1332
|
+
|
|
1333
|
+
comp_map = {}
|
|
1334
|
+
for comp in result.get("components", []):
|
|
1335
|
+
canon = _canon(comp.get("smiles", ""))
|
|
1336
|
+
comp_map[canon] = comp.get("role")
|
|
1337
|
+
|
|
1338
|
+
for sp in species:
|
|
1339
|
+
if sp.smiles and sp.role != "product":
|
|
1340
|
+
canon = _canon(sp.smiles)
|
|
1341
|
+
insight_role = comp_map.get(canon)
|
|
1342
|
+
if insight_role:
|
|
1343
|
+
sp.rxn_insight_role = insight_role
|
|
1344
|
+
|
|
1345
|
+
_log(f" RXN Insight: class={rxn_class}, name={rxn_name}")
|
|
1346
|
+
return rxn_class, rxn_name
|
|
1347
|
+
|
|
1348
|
+
|
|
1349
|
+
# ---------------------------------------------------------------------------
|
|
1350
|
+
# SM / DP identification and display names
|
|
1351
|
+
# ---------------------------------------------------------------------------
|
|
1352
|
+
|
|
1353
|
+
def _identify_sm_dp(species: List[SpeciesDescriptor]) -> None:
|
|
1354
|
+
"""Identify SM and DP, then apply display name precedence rules."""
|
|
1355
|
+
|
|
1356
|
+
# --- DP: single product or largest product ---
|
|
1357
|
+
products = [sp for sp in species if sp.role == "product"]
|
|
1358
|
+
if len(products) == 1:
|
|
1359
|
+
products[0].is_dp = True
|
|
1360
|
+
elif len(products) > 1:
|
|
1361
|
+
# If one already matched CSV product, it stays DP
|
|
1362
|
+
dp_found = any(sp.is_dp for sp in products)
|
|
1363
|
+
if not dp_found:
|
|
1364
|
+
# Pick largest by MW
|
|
1365
|
+
best = max(products, key=lambda sp: sp.mw)
|
|
1366
|
+
best.is_dp = True
|
|
1367
|
+
|
|
1368
|
+
# --- SM: CSV substrate flag → most contributing → largest ---
|
|
1369
|
+
# Priority 0: Check if CSV already marked a substrate
|
|
1370
|
+
csv_substrates = [sp for sp in species
|
|
1371
|
+
if sp.is_sm and sp.role != "product"]
|
|
1372
|
+
if csv_substrates:
|
|
1373
|
+
# Pick largest MW among CSV substrates (handles multi-substrate)
|
|
1374
|
+
sm = max(csv_substrates, key=lambda sp: sp.mw)
|
|
1375
|
+
# Clear other substrate flags — only keep the primary SM
|
|
1376
|
+
for sp in csv_substrates:
|
|
1377
|
+
if sp is not sm:
|
|
1378
|
+
sp.is_sm = False
|
|
1379
|
+
else:
|
|
1380
|
+
# Priority 1: Largest atom_contributing non-solvent by MW
|
|
1381
|
+
atom_contributing = [sp for sp in species
|
|
1382
|
+
if sp.role == "atom_contributing"
|
|
1383
|
+
and not sp.is_solvent and sp.mw > 50]
|
|
1384
|
+
if atom_contributing:
|
|
1385
|
+
sm = max(atom_contributing, key=lambda sp: sp.mw)
|
|
1386
|
+
sm.is_sm = True
|
|
1387
|
+
else:
|
|
1388
|
+
# Priority 2: Largest non-product, non-solvent species by MW
|
|
1389
|
+
# Exclude counterions (MW < 50: HCl=36, HBr=81 — use 50 cutoff)
|
|
1390
|
+
fallback = [sp for sp in species
|
|
1391
|
+
if sp.role != "product"
|
|
1392
|
+
and not sp.is_solvent
|
|
1393
|
+
and sp.mw > 50]
|
|
1394
|
+
if fallback:
|
|
1395
|
+
sm = max(fallback, key=lambda sp: sp.mw)
|
|
1396
|
+
sm.is_sm = True
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
def _apply_display_names(species: List[SpeciesDescriptor]) -> None:
    """Apply display name precedence rules to all species.

    For each species the first rule that yields a value sets ``sp.name``:

    1. reagent-DB display name looked up from the SMILES;
    2. reagent-DB ``resolve_display`` of the current name;
    3. reagent-DB display name for the lower-cased CSV name;
    4. the CSV name itself;
    5. the molecular formula;
    6. the SMILES string.

    A species for which no rule applies keeps its current name.  SM / DP are
    identified by the ``is_sm`` / ``is_dp`` flags and follow the same
    precedence as every other species (no special "SM"/"DP" override;
    compound labels are a layout-layer decision).
    """
    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    for sp in species:
        # 1. Reagent DB display name from SMILES
        if sp.smiles:
            display = db.display_for_smiles(sp.smiles)
            if display:
                sp.name = display
                continue

        # 2. Reagent DB display name from name.  Any non-empty result is
        #    used, whether or not it differs from the current name.
        if sp.name:
            display = db.resolve_display(sp.name)
            if display:
                sp.name = display
                continue

        if sp.csv_name:
            # 3. Reagent DB display name from csv_name (abbreviation > full
            #    name), else 4. the CSV name as given.
            sp.name = db.display_for_name(sp.csv_name.lower()) or sp.csv_name
            continue

        # 5. Molecular formula
        if sp.formula:
            sp.name = sp.formula
            continue

        # 6. SMILES as last resort
        if sp.smiles:
            sp.name = sp.smiles
|
|
1447
|
+
|
|
1448
|
+
|
|
1449
|
+
|
|
1450
|
+
|
|
1451
|
+
def _detect_solvents(species: List[SpeciesDescriptor],
                     exp_data: Optional[Any] = None) -> None:
    """Mark solvent species from CSV SOLVENT section and reagent_db role.

    Works in three passes, all mutating *species* in place:

    1. Every species whose ``role_detail`` is ``"solvent"`` gets
       ``is_solvent = True``.
    2. Each named entry of ``exp_data.solvents`` is matched
       (case-insensitively) against the name, CSV name, resolved display
       name and display text of the species; the first matching species is
       flagged as a solvent.
    3. CSV solvents that matched nothing are appended as new ``csv_only``
       species, at most once per distinct lower-cased name.

    Args:
        species: Species list; flagged and possibly extended in place.
        exp_data: Parsed CSV experiment data, or None to run pass 1 only.
    """
    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    # From reagent_db role_detail
    for sp in species:
        if sp.role_detail == "solvent":
            sp.is_solvent = True

    if exp_data is None:
        return

    # From CSV SOLVENT section — match by name to existing species
    csv_solvents = getattr(exp_data, "solvents", None) or []
    matched_solvent_names = set()

    for solv in csv_solvents:
        solv_name = solv.name.strip()
        if not solv_name:
            continue
        solv_lower = solv_name.lower()
        # Guard against a falsy lookup result before lower-casing it.
        solv_keys = {solv_lower,
                     (db.resolve_display(solv_name) or "").lower()}

        for sp in species:
            # Empty strings are removed so a blank field never "matches".
            candidates = {
                (sp.name or "").strip().lower(),
                (sp.csv_name or "").strip().lower(),
                (db.resolve_display(sp.name or "") or "").lower(),
                (sp.display_text or "").strip().lower(),
            } - {""}
            if candidates & solv_keys:
                sp.is_solvent = True
                matched_solvent_names.add(solv_lower)
                break

    # Add unmatched solvents as csv_only species
    sp_idx = max((int(sp.id.split("_")[1]) for sp in species), default=-1) + 1
    for solv in csv_solvents:
        solv_name = solv.name.strip()
        # Recomputed per entry: reusing the variable left over from the
        # matching loop above would record the wrong solvent as handled.
        solv_lower = solv_name.lower()
        if not solv_name or solv_lower in matched_solvent_names:
            continue
        # Check if this is a known reagent
        smi = None
        entry = db.entry_for_name(solv_lower)
        if entry:
            smi_val = entry.get("smiles")
            if isinstance(smi_val, list):
                smi_val = smi_val[0] if smi_val else None
            smi = smi_val
        sp = SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            name=solv_name,
            role="non_contributing",
            role_detail="solvent",
            source="csv_only",
            smiles=smi,
            is_solvent=True,
        )
        species.append(sp)
        sp_idx += 1
        matched_solvent_names.add(solv_lower)  # prevent duplicate csv_only entries
        _log(f" CSV solvent added: '{solv_name}'")
|
|
1515
|
+
|
|
1516
|
+
|
|
1517
|
+
def _format_equiv(equiv_str: str) -> str:
    """Format equivalents for display: '2.0' → '2', '0.05' → '0.05'.

    Args:
        equiv_str: Raw equivalents text (may be empty or None).

    Returns:
        ``""`` for falsy input; the integer form when the value parses as
        a whole number; otherwise the input with surrounding whitespace
        removed (this includes unparseable text and 'inf' / 'nan').
    """
    if not equiv_str:
        return ""
    try:
        val = float(equiv_str)
    except (ValueError, TypeError):
        return str(equiv_str).strip()
    # is_integer() is False for inf and nan, so int() is only reached for
    # finite whole numbers and cannot raise OverflowError.
    if val.is_integer():
        return str(int(val))
    return str(equiv_str).strip()
|
|
1528
|
+
|
|
1529
|
+
|
|
1530
|
+
def _build_display_texts(species: List[SpeciesDescriptor]) -> None:
    """Set ``display_text`` on every species (name plus optional equiv note).

    Rules, applied per species:

    - no name: ``display_text`` is None
    - substrate / SM / DP, or solvent: the bare name
    - anything else with a CSV equivalents value: ``"Name (X eq.)"``,
      unless the formatted value is empty or exactly ``"1"``, in which
      case the bare name is used (e.g. ``"Cs2CO3 (2 eq.)"``)
    - otherwise: the bare name
    """
    for sp in species:
        label = sp.name or ""
        if not label:
            sp.display_text = None
            continue

        # Substrates, products and solvents never carry an equiv suffix.
        plain_only = (sp.is_substrate or sp.is_sm or sp.is_dp
                      or sp.is_solvent)
        if not plain_only and sp.csv_equiv:
            shown = _format_equiv(sp.csv_equiv)
            if shown and shown != "1":
                label = f"{label} ({shown} eq.)"

        sp.display_text = label
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
def _populate_eln_data(desc: "ReactionDescriptor",
                       exp_data: Optional[Any]) -> None:
    """Populate desc.eln_data from parsed CSV ExperimentData.

    Only non-empty values are recorded. ``desc.eln_data`` is assigned only
    when at least one value was collected, and is left untouched when
    *exp_data* is None.

    Keys written (when available): ``sm_mass``, ``product_obtained``,
    ``product_yield``, ``procedure_text`` (from ``procedure_html``),
    ``procedure_plain`` (from ``procedure_text``), ``reaction_type``,
    ``start_date``, ``labbook_name``, ``solvents`` and ``solvent_details``.

    Args:
        desc: Descriptor to enrich; must provide ``get_sm()``.
        exp_data: Parsed experiment data, or None.
    """
    if exp_data is None:
        return

    def _clean(value: Any) -> str:
        # None-safe strip: a missing volume/concentration becomes "".
        return str(value).strip() if value else ""

    eln = {}

    # SM mass from substrate species
    sm = desc.get_sm()
    if sm and sm.csv_mass:
        eln["sm_mass"] = sm.csv_mass.strip()

    # Product yield data
    product = getattr(exp_data, "product", None)
    if product:
        obtained = getattr(product, "obtained_mass", None)
        if obtained:
            eln["product_obtained"] = obtained.strip()
        yield_pct = getattr(product, "yield_pct", None)
        if yield_pct:
            eln["product_yield"] = yield_pct.strip()

    # Procedure text and experiment metadata: (output key, source attribute).
    # Note "procedure_text" carries the HTML form; the plain text goes to
    # "procedure_plain".
    simple_fields = (
        ("procedure_text", "procedure_html"),
        ("procedure_plain", "procedure_text"),
        ("reaction_type", "reaction_type"),
        ("start_date", "start_date"),
        ("labbook_name", "labbook_name"),
    )
    for key, attr in simple_fields:
        value = getattr(exp_data, attr, "")
        if value:
            eln[key] = value

    solvents = getattr(exp_data, "solvents", [])
    if solvents:
        named = [(_clean(s.name), s) for s in solvents]
        named = [(name, s) for name, s in named if name]
        # Solvents list (names only, backward compat)
        eln["solvents"] = [name for name, _ in named]
        # Full solvent details with volume/concentration
        eln["solvent_details"] = [
            {
                "name": name,
                "volume": _clean(getattr(s, "volume", "")),
                "concentration": _clean(getattr(s, "concentration", "")),
            }
            for name, s in named
        ]

    if eln:
        desc.eln_data = eln
|
|
1616
|
+
|
|
1617
|
+
|
|
1618
|
+
# ---------------------------------------------------------------------------
|
|
1619
|
+
# Mass computation
|
|
1620
|
+
# ---------------------------------------------------------------------------
|
|
1621
|
+
|
|
1622
|
+
def _compute_all_masses(species: List[SpeciesDescriptor]) -> None:
    """Fill in mass-related fields for every species that has a SMILES.

    For each parseable SMILES this sets ``exact_mass_full`` (whole
    structure), ``exact_mass`` / ``smiles_neutral`` (largest fragment by
    heavy-atom count when the structure has several fragments, otherwise
    the whole structure) and ``adducts``. ``mw`` and ``formula`` are only
    filled when not already set. Does nothing if RDKit cannot be imported.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
    except ImportError:
        # No RDKit: none of the fields below can be computed.
        return

    for sp in species:
        if not sp.smiles:
            continue

        parsed = Chem.MolFromSmiles(sp.smiles)
        if parsed is None:
            continue

        # Full mass (including counterions)
        sp.exact_mass_full = Descriptors.ExactMolWt(parsed)

        # Average MW (for CSV matching) — keep a value that is already set
        if sp.mw <= 0:
            sp.mw = Descriptors.MolWt(parsed)

        if not sp.formula:
            sp.formula = rdMolDescriptors.CalcMolFormula(parsed)

        # Salt splitting: the largest fragment stands in for the neutral form
        pieces = Chem.GetMolFrags(parsed, asMols=True)
        if len(pieces) > 1:
            biggest = max(pieces, key=lambda frag: frag.GetNumHeavyAtoms())
            sp.exact_mass = Descriptors.ExactMolWt(biggest)
            sp.smiles_neutral = Chem.MolToSmiles(biggest)
        else:
            sp.exact_mass = sp.exact_mass_full
            sp.smiles_neutral = sp.smiles

        # Adduct m/z values derived from the neutral mass (for LCMS matching)
        sp.adducts = {
            "[M+H]+": sp.exact_mass + 1.00728,
            "[M-H]-": sp.exact_mass - 1.00728,
            "[M+Na]+": sp.exact_mass + 22.98922,
            "[M+formate]-": sp.exact_mass + 44.99820,
        }
|
|
1668
|
+
|
|
1669
|
+
|
|
1670
|
+
# ---------------------------------------------------------------------------
|
|
1671
|
+
# Deduplication
|
|
1672
|
+
# ---------------------------------------------------------------------------
|
|
1673
|
+
|
|
1674
|
+
def _deduplicate_species(species: List[SpeciesDescriptor]) -> List[SpeciesDescriptor]:
    """Collapse duplicate species and renumber the survivors.

    Pass 1 — SMILES: each SMILES is canonicalized with RDKit (when
    available) and written back to the species. A later species with the
    same canonical SMILES is merged into the first one via ``_merge_into``.

    Pass 2 — MW (RDKit only): a species without SMILES but with an ``mw``
    is merged into the SMILES-bearing species whose computed MW is closest,
    provided the difference is below ``MW_MATCH_TOLERANCE``. The whole pass
    is skipped when any two SMILES-bearing species have MWs within the
    tolerance of each other, because the match would be ambiguous.

    Returns:
        The deduplicated list, with ids reassigned as ``sp_0``, ``sp_1``, …
        An empty input is returned unchanged.
    """
    if not species:
        return species

    # --- Build canonicalizer ---
    try:
        from rdkit import Chem

        def _canon(smi: str) -> str:
            parsed = Chem.MolFromSmiles(smi)
            if parsed:
                return Chem.MolToSmiles(parsed)
            return smi  # unparseable: fall back to the raw text
    except ImportError:
        def _canon(smi: str) -> str:
            return smi

    _ROLE_PRIO = {"product": 0, "atom_contributing": 1,
                  "non_contributing": 2, "candidate": 3,
                  "unclassified": 4}

    first_seen: Dict[str, int] = {}  # canonical SMILES → index in result
    result = []

    for sp in species:
        if sp.smiles:
            canonical = _canon(sp.smiles)
            # Also update the stored SMILES to canonical form
            sp.smiles = canonical

            kept_at = first_seen.get(canonical)
            if kept_at is not None:
                _merge_into(result[kept_at], sp, _ROLE_PRIO)
                continue
            first_seen[canonical] = len(result)
        result.append(sp)

    # --- MW-based merge for no-SMILES entries ---
    try:
        from rdkit.Chem import Descriptors as _Desc
        from rdkit import Chem as _Chem
    except ImportError:
        _has_rdkit = False
    else:
        _has_rdkit = True

    merged_indices: set = set()
    if _has_rdkit:
        # (index in result, computed MW) for every parseable SMILES entry
        smiles_mws = []
        for i, sp in enumerate(result):
            if not sp.smiles:
                continue
            mol = _Chem.MolFromSmiles(sp.smiles)
            if mol is not None:
                smiles_mws.append((i, _Desc.MolWt(mol)))

        # Ambiguous when any pair of SMILES-bearing entries is within tolerance
        mw_ambiguous = any(
            abs(mw_a - mw_b) < MW_MATCH_TOLERANCE
            for pos, (_, mw_a) in enumerate(smiles_mws)
            for _, mw_b in smiles_mws[pos + 1:]
        )

        if mw_ambiguous:
            if any(not sp.smiles and sp.mw for sp in result):
                _log(" Dedup: skipping MW-merge (ambiguous MW among species)")
        else:
            for i, sp in enumerate(result):
                if sp.smiles:
                    continue
                sp_mw = sp.mw
                if not sp_mw:
                    continue

                # NOTE(review): merged_indices only ever receives indices
                # of no-SMILES entries, so this filter looks like it can
                # never exclude anything — kept as-is.
                close = [
                    (abs(mw_val - sp_mw), j)
                    for j, mw_val in smiles_mws
                    if j not in merged_indices
                ]
                close = [pair for pair in close
                         if pair[0] < MW_MATCH_TOLERANCE]
                if not close:
                    continue

                # Smallest delta wins; ties go to the earliest entry.
                best_delta, best_idx = min(close)
                _merge_into(result[best_idx], sp, _ROLE_PRIO)
                merged_indices.add(i)
                _log(f" Dedup MW-merge: {sp.name or sp.csv_name} → "
                     f"{result[best_idx].name or result[best_idx].csv_name} "
                     f"(delta={best_delta:.1f} Da)")

    if merged_indices:
        result = [sp for i, sp in enumerate(result) if i not in merged_indices]

    # Re-index
    for i, sp in enumerate(result):
        sp.id = f"sp_{i}"

    return result
|
|
1788
|
+
|
|
1789
|
+
|
|
1790
|
+
def _merge_into(existing: "SpeciesDescriptor", incoming: "SpeciesDescriptor",
                role_prio: Dict[str, int]) -> None:
    """Fold the metadata of *incoming* into *existing* (mutated in place).

    - CSV name / equiv / mass are copied together when *existing* has no
      CSV name.
    - ``name``, ``role_detail``, ``smiles`` and ``mw`` are filled only when
      empty on *existing*.
    - Boolean flags are OR-ed in.
    - ``source`` and ``role`` are replaced when *incoming* ranks better
      (lower number); unknown values rank last.

    Args:
        existing: Species that survives the merge.
        incoming: Duplicate whose data is absorbed.
        role_prio: Role → priority mapping, lower is preferred.
    """
    # The three CSV fields travel as a unit.
    if incoming.csv_name and not existing.csv_name:
        existing.csv_name = incoming.csv_name
        existing.csv_equiv = incoming.csv_equiv
        existing.csv_mass = incoming.csv_mass

    # Fill-if-empty fields.
    for field in ("name", "role_detail", "smiles", "mw"):
        if not getattr(existing, field) and getattr(incoming, field):
            setattr(existing, field, getattr(incoming, field))

    # Flags: once set on either side they stay set.
    for flag in ("is_sm", "is_dp"):
        if getattr(incoming, flag):
            setattr(existing, flag, True)
    for flag in ("is_substrate", "is_solvent"):
        if getattr(incoming, flag) and not getattr(existing, flag):
            setattr(existing, flag, True)

    # Prefer source with more info: fragment > rxn > text_label > csv_only
    source_rank = {"fragment": 0, "rxn": 1, "text_label": 2, "csv_only": 3}
    if source_rank.get(incoming.source, 9) < source_rank.get(existing.source, 9):
        existing.source = incoming.source

    # Keep higher role (product > atom_contributing > non_contributing)
    if role_prio.get(incoming.role, 5) < role_prio.get(existing.role, 5):
        existing.role = incoming.role
|
|
1823
|
+
|
|
1824
|
+
|
|
1825
|
+
# ---------------------------------------------------------------------------
|
|
1826
|
+
# Build reaction SMILES
|
|
1827
|
+
# ---------------------------------------------------------------------------
|
|
1828
|
+
|
|
1829
|
+
def _build_reaction_smiles(species: List[SpeciesDescriptor]) -> Optional[str]:
    """Assemble a ``lhs>>rhs`` reaction SMILES from the species list.

    Species with role ``"product"`` go on the right-hand side; every other
    species with a SMILES goes on the left. Species without SMILES are
    ignored, and list order is preserved on each side.

    Returns:
        The reaction SMILES, or None when either side would be empty.
    """
    with_structure = [sp for sp in species if sp.smiles]
    products = [sp.smiles for sp in with_structure if sp.role == "product"]
    others = [sp.smiles for sp in with_structure if sp.role != "product"]

    if not (products and others):
        return None

    return ">>".join((".".join(others), ".".join(products)))
|
|
1846
|
+
|
|
1847
|
+
|
|
1848
|
+
# ---------------------------------------------------------------------------
|
|
1849
|
+
# Main public API
|
|
1850
|
+
# ---------------------------------------------------------------------------
|
|
1851
|
+
|
|
1852
|
+
def parse_reaction(
    cdxml: Optional[str] = None,
    cdx: Optional[str] = None,
    csv: Optional[str] = None,
    rxn: Optional[str] = None,
    input_dir: Optional[str] = None,
    experiment: Optional[str] = None,
    use_rxnmapper: bool = False,
    use_rxn_insight: bool = True,
    use_network: bool = True,
    verbose: bool = False,
) -> ReactionDescriptor:
    """Parse reaction from ELN files and return a ReactionDescriptor.

    Accepts any combination of input files. Species are extracted from the
    CDXML when one is available (given directly or converted from CDX),
    otherwise from the RXN file; CSV data is matched onto those species
    afterwards. File auto-discovery only runs when both *input_dir* and
    *experiment* are given, and never overrides an explicitly passed path.

    Sets the module-level ``_verbose`` flag as a side effect.

    Args:
        cdxml: Path to CDXML file (polished or raw)
        cdx: Path to CDX file (converted to CDXML internally)
        csv: Path to Findmolecule ELN CSV
        rxn: Path to RXN file
        input_dir: Directory to auto-discover files from
        experiment: Experiment name (with input_dir)
        use_rxnmapper: Deprecated, ignored. Classification uses Schneider FP.
        use_rxn_insight: Enable RXN Insight enrichment
        use_network: Enable PubChem name resolution
        verbose: Print diagnostic messages to stderr

    Returns:
        ReactionDescriptor with all species and metadata.
    """
    global _verbose
    _verbose = verbose

    # --- Step 0: Auto-discover files if input_dir given ---
    if input_dir and experiment:
        try:
            try:
                # Packaged layout: the helper lives in analysis.deterministic.
                from ..analysis.deterministic.discover_experiment_files import (
                    discover_experiment_files)
            except ImportError:
                # Flat layout: helper importable as a top-level module.
                from discover_experiment_files import discover_experiment_files
            disc = discover_experiment_files(input_dir, experiment)
            if not cdxml and disc.cdx_files:
                cdx = cdx or disc.cdx_files[0]
            if not csv and disc.csv_files:
                csv = disc.csv_files[0]
            if not rxn and disc.rxn_files:
                rxn = disc.rxn_files[0]
        except Exception as e:
            # Discovery is best-effort; explicit paths still work.
            _log(f" File discovery failed: {e}")

    # --- Step 0b: CDX → CDXML conversion ---
    if cdx and not cdxml:
        cdxml = _convert_cdx_to_cdxml(cdx)

    desc = ReactionDescriptor(
        experiment=experiment or _stem(cdxml or cdx or rxn or csv or "unknown"),
        input_files={
            "cdxml": cdxml,
            "csv": csv,
            "rxn": rxn,
            "cdx": cdx,
        },
    )

    # Metadata
    desc.metadata["parser_version"] = "1.3"
    desc.metadata["timestamp"] = datetime.datetime.now().isoformat(
        timespec="seconds")
    desc.metadata["rdkit_available"] = _check_rdkit()
    desc.metadata["chemscript_available"] = _check_chemscript()

    # --- Step 1: Extract species from structural source ---
    species: List[SpeciesDescriptor] = []
    warnings: List[str] = []
    cdxml_conditions: List[str] = []

    if cdxml:
        _log(f"Extracting from CDXML: {os.path.basename(cdxml)}")
        sp, w, conds = _extract_from_cdxml(cdxml, use_network=use_network)
        species.extend(sp)
        warnings.extend(w)
        cdxml_conditions.extend(conds)
    elif rxn:
        _log(f"Extracting from RXN: {os.path.basename(rxn)}")
        sp, w = _extract_from_rxn(rxn)
        species.extend(sp)
        warnings.extend(w)

    # --- Step 2: Match CSV data (also returns exp_data for ELN enrichment) ---
    exp_data = None
    if csv:
        _log(f"Matching CSV: {os.path.basename(csv)}")
        species, w, exp_data = _match_csv_data(species, csv)
        warnings.extend(w)

    # --- Step 3: Deduplicate ---
    species = _deduplicate_species(species)

    # --- Step 4: Compute masses (needed before classification MW checks) ---
    _compute_all_masses(species)

    # --- Step 5: Classify roles ---
    _log("Classifying species roles...")
    confidence = _classify_species(
        species,
        use_rxnmapper=use_rxnmapper,
        use_rxn_insight=use_rxn_insight,
    )
    desc.classification_confidence = confidence

    # --- Step 6: Identify SM and DP ---
    _identify_sm_dp(species)

    # --- Step 6.5: Detect solvents (from CSV + reagent_db) ---
    _detect_solvents(species, exp_data=exp_data)

    # --- Step 7: Apply display names ---
    _apply_display_names(species)

    # --- Step 8.5: Build display_text ---
    _build_display_texts(species)

    # --- Step 9: Build reaction SMILES ---
    desc.reaction_smiles = _build_reaction_smiles(species)

    # (The former "Step 10" loop over rxn_insight_role had no effect and
    # was dropped; no reaction class is read back from the species here.)

    desc.species = species
    desc.warnings = warnings

    # --- Step 11: Populate ELN data (procedure, solvents, yields) ---
    _populate_eln_data(desc, exp_data)

    # --- Step 12: Populate conditions (from CDXML text extraction) ---
    desc.conditions = cdxml_conditions

    _log(f"Parsed {len(species)} species, "
         f"{sum(1 for s in species if s.is_sm)} SM, "
         f"{sum(1 for s in species if s.is_dp)} DP")

    return desc
|
|
1996
|
+
|
|
1997
|
+
|
|
1998
|
+
# ---------------------------------------------------------------------------
|
|
1999
|
+
# Helper utilities
|
|
2000
|
+
# ---------------------------------------------------------------------------
|
|
2001
|
+
|
|
2002
|
+
def _stem(path: str) -> str:
    """Return the final path component with its last extension removed."""
    filename = os.path.basename(path)
    root, _extension = os.path.splitext(filename)
    return root
|
|
2005
|
+
|
|
2006
|
+
|
|
2007
|
+
def _check_rdkit() -> bool:
    """Report whether ``rdkit.Chem`` can be imported."""
    try:
        from rdkit import Chem  # noqa: F401
    except ImportError:
        return False
    else:
        return True
|
|
2013
|
+
|
|
2014
|
+
|
|
2015
|
+
def _check_chemscript() -> bool:
    """Report whether the package's ChemScriptBridge can be imported."""
    try:
        from ..chemdraw.chemscript_bridge import ChemScriptBridge  # noqa: F401
    except ImportError:
        return False
    else:
        return True
|
|
2021
|
+
|
|
2022
|
+
|
|
2023
|
+
def _convert_cdx_to_cdxml(cdx_path: str) -> Optional[str]:
    """Convert CDX to CDXML via cdx_converter.py subprocess.

    The output is written next to the input with a ``.cdxml`` extension.
    If that file already exists it is returned as-is without converting
    again (no freshness check is made).

    Args:
        cdx_path: Path to the CDX file.

    Returns:
        Path to the CDXML file, or None when the converter script cannot
        be found or the conversion fails. Failures are logged, not raised.
    """
    import subprocess

    out_path = os.path.splitext(cdx_path)[0] + ".cdxml"
    if os.path.isfile(out_path):
        return out_path

    # Look beside this module first, then in the sibling ``chemdraw``
    # package (the same package _check_chemscript imports from).
    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.path.join(script_dir, "cdx_converter.py"),
        os.path.join(os.path.dirname(script_dir), "chemdraw",
                     "cdx_converter.py"),
    ]
    converter = next((c for c in candidates if os.path.isfile(c)), None)

    if converter is None:
        _log(f" cdx_converter.py not found at {' or '.join(candidates)}")
        return None

    try:
        # NOTE(review): the converter is run as a plain script; confirm it
        # does not depend on package-relative imports.
        result = subprocess.run(
            [sys.executable, converter, cdx_path, "-o", out_path],
            capture_output=True, text=True, timeout=60)
        if result.returncode == 0 and os.path.isfile(out_path):
            _log(f" Converted CDX → CDXML: {out_path}")
            return out_path
        else:
            _log(f" CDX conversion failed: {result.stderr}")
            return None
    except Exception as e:
        # Best-effort: timeouts and OS errors degrade to "no CDXML".
        _log(f" CDX conversion error: {e}")
        return None
|
|
2052
|
+
|
|
2053
|
+
|
|
2054
|
+
# ---------------------------------------------------------------------------
|
|
2055
|
+
# CLI
|
|
2056
|
+
# ---------------------------------------------------------------------------
|
|
2057
|
+
|
|
2058
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the reaction parser CLI.

    The CDXML input can be given positionally (``cdxml_positional``) or via
    ``--cdxml`` (``cdxml_named``); the caller decides which one wins.
    """
    parser = argparse.ArgumentParser(
        description="Parse reaction from ELN files into a persisted JSON descriptor.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python reaction_parser.py experiment.cdxml -o reaction.json
python reaction_parser.py experiment.cdxml --csv exp.csv --pretty
python reaction_parser.py --input-dir path/ --experiment KL-7001-004
""",
    )

    # Input files
    parser.add_argument("cdxml_positional", nargs="?", default=None,
                        help="Input CDXML file (positional)")
    parser.add_argument("--cdxml", dest="cdxml_named", default=None,
                        help="Input CDXML file (named)")
    value_options = (
        ("--cdx", "Input CDX file (converted to CDXML)"),
        ("--csv", "Findmolecule ELN CSV file"),
        ("--rxn", "RXN file"),
        ("--input-dir", "Experiment directory (auto-discover files)"),
        ("--experiment", "Experiment name (with --input-dir)"),
    )
    for option, help_text in value_options:
        parser.add_argument(option, default=None, help=help_text)

    # Output
    parser.add_argument("-o", "--output", default=None,
                        help="Output JSON file (default: stdout)")

    # Boolean switches (output formatting first, then options)
    switches = (
        (("--pretty",), "Pretty-print JSON output"),
        (("--no-rxnmapper",),
         "Deprecated (RXNMapper no longer used for classification)"),
        (("--no-rxn-insight",), "Skip RXN Insight enrichment"),
        (("--no-network",), "Skip PubChem name resolution (offline only)"),
        (("--json-errors",), "Output structured JSON errors to stderr"),
        (("-v", "--verbose"), "Print diagnostic messages to stderr"),
    )
    for names, help_text in switches:
        parser.add_argument(*names, action="store_true", help=help_text)

    return parser
|
|
2101
|
+
|
|
2102
|
+
|
|
2103
|
+
def main(argv=None) -> int:
    """Command-line entry point.

    Parses *argv*, runs ``parse_reaction`` and writes the descriptor as
    JSON to ``--output`` or to stdout.

    Args:
        argv: Argument list; None lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 when parsing the reaction raised an exception.
        Usage errors exit through ``parser.error`` instead of returning.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Resolve CDXML from positional or named argument
    cdxml = args.cdxml_positional or args.cdxml_named
    explicit_files = any([cdxml, args.cdx, args.csv, args.rxn])

    if not explicit_files and not args.input_dir:
        parser.error("No input files specified")
    # Auto-discovery needs both the directory and the experiment name; with
    # only the directory nothing would be found and an empty descriptor
    # would be written without any complaint.
    if not explicit_files and not args.experiment:
        parser.error("--input-dir requires --experiment")

    try:
        desc = parse_reaction(
            cdxml=cdxml,
            cdx=args.cdx,
            csv=args.csv,
            rxn=args.rxn,
            input_dir=args.input_dir,
            experiment=args.experiment,
            use_rxnmapper=not args.no_rxnmapper,
            use_rxn_insight=not args.no_rxn_insight,
            use_network=not args.no_network,
            verbose=args.verbose,
        )

        if args.output:
            desc.to_json(args.output, pretty=args.pretty)
            print(f"Wrote {args.output} ({len(desc.species)} species)",
                  file=sys.stderr)
        else:
            output = json.dumps(desc.to_dict(),
                                indent=2 if args.pretty else None,
                                ensure_ascii=False)
            print(output)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        if args.json_errors:
            err = {"error": "parse_failed", "detail": str(e)}
            print(json.dumps(err), file=sys.stderr)
        else:
            print(f"ERROR: {e}", file=sys.stderr)
        return 1
|
|
2147
|
+
|
|
2148
|
+
|
|
2149
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|