cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1045 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
reactant_heuristic.py — Classify reaction reagents as atom-contributing
|
|
4
|
+
or non-contributing using role lookup + RDKit MCS.
|
|
5
|
+
|
|
6
|
+
Two input modes:
|
|
7
|
+
cdxml Parse a CDXML reaction file; extract fragments + text from <step>
|
|
8
|
+
smiles Accept reagent SMILES + product SMILES directly on the CLI
|
|
9
|
+
|
|
10
|
+
Examples:
|
|
11
|
+
python reactant_heuristic.py cdxml -i reaction.cdxml --pretty
|
|
12
|
+
python reactant_heuristic.py smiles --reagents "C1COCCN1" "c1cc2scnc2Br" \\
|
|
13
|
+
--product "c1cc2scnc2N1CCOCC1" --pretty
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import sys
|
|
20
|
+
import tempfile
|
|
21
|
+
from dataclasses import dataclass, field, asdict
|
|
22
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
23
|
+
from xml.etree import ElementTree as ET
|
|
24
|
+
|
|
25
|
+
from ..constants import CDXML_FOOTER, CDXML_MINIMAL_HEADER
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Data classes
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
@dataclass
class ReagentInfo:
    """Information about a single reagent being classified.

    Every field has a default, so an instance can be created empty and
    filled in step by step as the reagent is resolved and classified.
    """
    # Identifier of the object this reagent was read from
    # (presumably the CDXML element id — confirm against callers).
    source_id: str = ""
    source_type: str = ""  # "fragment", "text", "smiles_input"
    name: Optional[str] = None  # reagent name, when one is known
    smiles: Optional[str] = None  # SMILES string, when one could be obtained
    position: str = ""  # "reactant", "above_arrow", "below_arrow"
    # An empty string means "not classified yet"; the Schneider step in this
    # module skips reagents whose classification is already non-empty.
    classification: str = ""  # "atom_contributing", "non_contributing", "unclassified"
    classification_method: str = ""  # "schneider_fp", "role_lookup", "fm_type", etc.
    mcs_ratio: Optional[float] = None  # presumably the value returned by mcs_ratio() — confirm
    rxnmapper_confidence: Optional[float] = None  # deprecated — kept for compat
    schneider_score: Optional[float] = None  # Schneider FP combo score
    role: Optional[str] = None  # "catalyst", "ligand", "base", "solvent", etc.
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Role Lookup (Tier 1) — via shared reagent database
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
from ..resolve.reagent_db import get_reagent_db
|
|
53
|
+
|
|
54
|
+
# Transition metals commonly used as catalysts (by atomic number)
|
|
55
|
+
CATALYST_METALS = {46, 28, 29, 77, 45, 44, 78, 76, 79}
|
|
56
|
+
# Pd=46, Ni=28, Cu=29, Ir=77, Rh=45, Ru=44, Pt=78, Os=76, Au=79
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# CDXML Parsing Helpers (adapted from cdxml_combiner.py)
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
def _get_page(root: ET.Element) -> ET.Element:
|
|
64
|
+
"""Find the <page> element in a CDXML root."""
|
|
65
|
+
page = root.find("page")
|
|
66
|
+
if page is None:
|
|
67
|
+
raise SystemExit("ERROR: no <page> element in CDXML")
|
|
68
|
+
return page
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _count_heavy_atoms(frag: ET.Element) -> int:
|
|
72
|
+
"""Count non-hydrogen atoms in a fragment."""
|
|
73
|
+
count = 0
|
|
74
|
+
for n in frag.iter("n"):
|
|
75
|
+
if n.get("NodeType") in ("ExternalConnectionPoint", "Fragment",
|
|
76
|
+
"Unspecified"):
|
|
77
|
+
continue
|
|
78
|
+
count += 1
|
|
79
|
+
return count
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _get_text_content(el: ET.Element) -> str:
|
|
83
|
+
"""Extract concatenated text from all <s> children of a <t> element."""
|
|
84
|
+
parts = []
|
|
85
|
+
for s in el.iter("s"):
|
|
86
|
+
if s.text:
|
|
87
|
+
parts.append(s.text.strip())
|
|
88
|
+
return " ".join(parts).strip()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _get_fm_molecule_type(el: ET.Element) -> Optional[int]:
|
|
92
|
+
"""Read the Findmolecule MOLECULE TYPE objecttag.
|
|
93
|
+
Values: 0=molecule, 1=solvent, 2=condition text, 3=product."""
|
|
94
|
+
for ot in el.iter("objecttag"):
|
|
95
|
+
if ot.get("Name") == "FM MOLECULE TYPE":
|
|
96
|
+
try:
|
|
97
|
+
return int(ot.get("Value", ""))
|
|
98
|
+
except ValueError:
|
|
99
|
+
return None
|
|
100
|
+
return None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _attrs_to_str(el: ET.Element) -> str:
|
|
104
|
+
parts = []
|
|
105
|
+
for k, v in el.attrib.items():
|
|
106
|
+
v = v.replace("&", "&").replace('"', """).replace("<", "<")
|
|
107
|
+
parts.append(f'{k}="{v}"')
|
|
108
|
+
return " ".join(parts)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _element_to_string(el: ET.Element) -> str:
    """Serialize *el* and its descendants to an XML string.

    An element with no children and no non-blank text is written as a
    self-closing tag.  Otherwise the element's text (only when it is not
    blank) is written with ``&``, ``<`` and ``>`` escaped as ``&amp;``,
    ``&lt;`` and ``&gt;``, followed by each child serialized recursively.

    Note: tail text (``el.tail``) is not written.
    """
    tag = el.tag
    attrs = _attrs_to_str(el)
    children = list(el)
    text = el.text or ""

    open_tag = f"<{tag} {attrs}" if attrs else f"<{tag}"
    has_text = bool(text.strip())
    if not children and not has_text:
        return f"{open_tag}/>"

    pieces = [f"{open_tag}>"]
    if has_text:
        # '&' first, so the entities produced below are left intact.
        pieces.append(
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )
    pieces.extend(_element_to_string(child) for child in children)
    pieces.append(f"</{tag}>")
    return "".join(pieces)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _fragment_to_cdxml(frag: ET.Element) -> str:
    """Build a minimal CDXML document that contains only *frag*.

    The serialized fragment is placed inside a single ``<page id="1">``
    between the shared minimal header and footer constants.
    """
    sections = [
        CDXML_MINIMAL_HEADER,
        '\n<page id="1">\n',
        _element_to_string(frag),
        "\n</page>\n",
        CDXML_FOOTER,
    ]
    return "".join(sections)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# ---------------------------------------------------------------------------
|
|
142
|
+
# SMILES Extraction
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
# Lazy ChemScript singleton
|
|
146
|
+
_cs_instance = None
|
|
147
|
+
_cs_tried = False
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _get_chemscript():
    """Return the shared ChemScriptBridge instance, or None if unavailable.

    The bridge is constructed on the first call only.  Whatever that attempt
    produced (an instance, or None after a failure) is cached in module
    globals and returned by every later call without retrying.
    """
    global _cs_instance, _cs_tried
    if not _cs_tried:
        # Set the flag before trying so a failure is never retried.
        _cs_tried = True
        try:
            from ..chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_instance = ChemScriptBridge()
        except Exception as exc:
            print(f" [warn] ChemScript not available: {exc}", file=sys.stderr)
            _cs_instance = None
    return _cs_instance
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _fragment_to_smiles(frag: ET.Element) -> Optional[str]:
    """Convert a CDXML ``<fragment>`` to SMILES via ChemScript.

    The fragment is wrapped in a minimal CDXML document and written to a
    temporary ``.cdxml`` file, which is passed to the bridge's
    ``write_data(path, "smiles")``.  The temporary file is removed again
    afterwards.

    Returns the stripped SMILES string, or None when ChemScript is not
    available, the conversion raises, or the result is empty.
    """
    bridge = _get_chemscript()
    if bridge is None:
        return None

    document = _fragment_to_cdxml(frag)
    scratch_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".cdxml", mode="w",
                                         delete=False,
                                         encoding="utf-8") as handle:
            handle.write(document)
            scratch_path = handle.name
        # The file is closed (and therefore flushed) before it is handed on.
        converted = bridge.write_data(scratch_path, "smiles")
        if not converted:
            return None
        return converted.strip()
    except Exception as exc:
        print(f" [warn] fragment→SMILES failed: {exc}", file=sys.stderr)
        return None
    finally:
        # delete=False above means cleanup is our responsibility.
        if scratch_path and os.path.exists(scratch_path):
            os.unlink(scratch_path)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _text_to_smiles(text_content: str) -> Optional[str]:
    """Resolve a reagent name to SMILES.

    Resolution chain (first success wins):
      1. OPSIN name->SMILES (offline)
      2. ``resolve_name_to_smiles`` from the cas_resolver module (fallback)

    Returns None when neither step yields a result; an exception raised by
    the fallback is reported on stderr and also yields None.
    """
    offline_hit = _opsin_name_to_smiles(text_content)
    if not offline_hit:
        # OPSIN produced nothing — hand the name to the fallback resolver.
        try:
            from ..resolve.cas_resolver import resolve_name_to_smiles
            return resolve_name_to_smiles(text_content)
        except Exception as exc:
            print(f" [warn] name->SMILES failed for '{text_content}': {exc}",
                  file=sys.stderr)
            return None
    return offline_hit
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# ---------------------------------------------------------------------------
|
|
210
|
+
# OPSIN name resolution (offline)
|
|
211
|
+
# ---------------------------------------------------------------------------
|
|
212
|
+
|
|
213
|
+
_opsin_available: Optional[bool] = None
|
|
214
|
+
_java_exe: Optional[str] = None
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _find_java() -> Optional[str]:
|
|
218
|
+
"""Find the Java executable for OPSIN.
|
|
219
|
+
|
|
220
|
+
Discovery order:
|
|
221
|
+
1. ``java`` on PATH (system-installed)
|
|
222
|
+
2. ``JAVA_HOME`` environment variable
|
|
223
|
+
3. Bundled JRE alongside test data (``CHEM_TEST_DATA`` env var)
|
|
224
|
+
4. Known default location for the project JRE
|
|
225
|
+
|
|
226
|
+
Returns the full path to the ``java`` (or ``java.exe``) binary,
|
|
227
|
+
or None if no JRE is found.
|
|
228
|
+
"""
|
|
229
|
+
import shutil
|
|
230
|
+
|
|
231
|
+
# 1. Already on PATH?
|
|
232
|
+
java = shutil.which("java")
|
|
233
|
+
if java:
|
|
234
|
+
return java
|
|
235
|
+
|
|
236
|
+
# 2. JAVA_HOME env var
|
|
237
|
+
java_home = os.environ.get("JAVA_HOME")
|
|
238
|
+
if java_home:
|
|
239
|
+
candidate = os.path.join(java_home, "bin", "java.exe")
|
|
240
|
+
if os.path.isfile(candidate):
|
|
241
|
+
return candidate
|
|
242
|
+
candidate = os.path.join(java_home, "bin", "java")
|
|
243
|
+
if os.path.isfile(candidate):
|
|
244
|
+
return candidate
|
|
245
|
+
|
|
246
|
+
# 3. Bundled JRE relative to CHEM_TEST_DATA
|
|
247
|
+
test_data = os.environ.get("CHEM_TEST_DATA")
|
|
248
|
+
if test_data:
|
|
249
|
+
# Look for any JRE directory inside CHEM_TEST_DATA
|
|
250
|
+
_jre = _scan_for_jre(test_data)
|
|
251
|
+
if _jre:
|
|
252
|
+
return _jre
|
|
253
|
+
|
|
254
|
+
# 4. Known default location (project-specific)
|
|
255
|
+
_known = os.path.expanduser(
|
|
256
|
+
os.path.join("~", "chem-test-data",
|
|
257
|
+
"OpenJDK21U-jre_x64_windows_hotspot_21.0.10_7"))
|
|
258
|
+
if os.path.isdir(_known):
|
|
259
|
+
_jre = _scan_for_jre(_known)
|
|
260
|
+
if _jre:
|
|
261
|
+
return _jre
|
|
262
|
+
|
|
263
|
+
return None
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _scan_for_jre(base_dir: str) -> Optional[str]:
|
|
267
|
+
"""Scan a directory tree (1 level deep) for a JRE bin/java."""
|
|
268
|
+
for name in ("bin",):
|
|
269
|
+
candidate = os.path.join(base_dir, name, "java.exe")
|
|
270
|
+
if os.path.isfile(candidate):
|
|
271
|
+
return candidate
|
|
272
|
+
candidate = os.path.join(base_dir, name, "java")
|
|
273
|
+
if os.path.isfile(candidate):
|
|
274
|
+
return candidate
|
|
275
|
+
|
|
276
|
+
# Check one level of subdirectories (e.g. jdk-21.0.10+7-jre/bin/)
|
|
277
|
+
try:
|
|
278
|
+
for entry in os.listdir(base_dir):
|
|
279
|
+
subdir = os.path.join(base_dir, entry)
|
|
280
|
+
if os.path.isdir(subdir):
|
|
281
|
+
candidate = os.path.join(subdir, "bin", "java.exe")
|
|
282
|
+
if os.path.isfile(candidate):
|
|
283
|
+
return candidate
|
|
284
|
+
candidate = os.path.join(subdir, "bin", "java")
|
|
285
|
+
if os.path.isfile(candidate):
|
|
286
|
+
return candidate
|
|
287
|
+
except OSError:
|
|
288
|
+
pass
|
|
289
|
+
return None
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _opsin_name_to_smiles(name: str) -> Optional[str]:
    """Try to resolve a chemical name to SMILES via OPSIN (offline).

    OPSIN handles systematic/IUPAC names and many common names well
    (e.g. "cesium carbonate", "triethylamine", "sodium tert-butoxide").
    Fails on abbreviations (BINAP, Pd2dba3) and some organometallics.

    Requires the py2opsin package.  Java is set up through
    ``cdxml_toolkit.resolve.jre_manager.ensure_java_on_path``; when that
    reports failure, the legacy ``_find_java`` search is used instead and
    the directory of the executable it finds is prepended to ``PATH``
    (once) and ``JAVA_HOME`` is pointed at its parent directory.

    Returns the py2opsin result, or None when it is empty or anything
    fails.  After a ``FileNotFoundError`` or ``ImportError`` OPSIN is marked
    unavailable for the rest of the process and later calls return None
    immediately.
    """
    global _opsin_available, _java_exe
    if _opsin_available is False:
        return None
    try:
        import warnings
        from py2opsin import py2opsin

        # Ensure Java is discoverable by py2opsin's subprocess call.
        from cdxml_toolkit.resolve.jre_manager import ensure_java_on_path
        if not ensure_java_on_path():
            # Fall back to legacy _find_java for non-standard locations
            if _java_exe is None:
                _java_exe = _find_java()
            if _java_exe:
                java_bin_dir = os.path.dirname(_java_exe)
                current_path = os.environ.get("PATH", "")
                # Compare the *directory* against the individual PATH
                # entries.  Testing the executable path as a substring of
                # PATH practically never matches, which made PATH grow by
                # one duplicate entry on every call.
                known_dirs = {
                    os.path.normcase(entry)
                    for entry in current_path.split(os.pathsep)
                    if entry
                }
                if os.path.normcase(java_bin_dir) not in known_dirs:
                    os.environ["PATH"] = (
                        java_bin_dir + os.pathsep + current_path
                        if current_path else java_bin_dir
                    )
                os.environ["JAVA_HOME"] = os.path.dirname(java_bin_dir)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            result = py2opsin(name)
        # Reaching this point means py2opsin and Java both worked.
        _opsin_available = True
        return result if result else None
    except FileNotFoundError:
        # Report only the first time; afterwards the flag short-circuits.
        if _opsin_available is None:
            print(" [info] OPSIN unavailable (Java not found)", file=sys.stderr)
        _opsin_available = False
        return None
    except ImportError:
        if _opsin_available is None:
            print(" [info] py2opsin not installed", file=sys.stderr)
        _opsin_available = False
        return None
    except Exception as e:
        # Any other failure is treated as specific to this name; OPSIN
        # stays enabled for later calls.
        print(f" [info] OPSIN name->SMILES failed for '{name}': {e}",
              file=sys.stderr)
        return None
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# ---------------------------------------------------------------------------
|
|
347
|
+
# Tier 1 — Role Lookup
|
|
348
|
+
# ---------------------------------------------------------------------------
|
|
349
|
+
|
|
350
|
+
def _contains_catalyst_metal(smiles: str) -> bool:
    """Report whether *smiles* contains an atom listed in CATALYST_METALS.

    Returns False when the SMILES cannot be parsed or when anything raises
    (including RDKit not being importable).
    """
    try:
        from rdkit import Chem
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is not None:
            for atom in parsed.GetAtoms():
                if atom.GetAtomicNum() in CATALYST_METALS:
                    return True
        return False
    except Exception:
        return False
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _is_inorganic(smiles: str) -> bool:
    """Heuristic inorganic check.

    True when the molecule has no carbon atoms, or exactly one carbon
    together with at least four heavy atoms (likely carbonate, cyanide,
    etc.).  Returns False when the SMILES cannot be parsed or when
    anything raises (including RDKit not being importable).
    """
    try:
        from rdkit import Chem
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            return False
        n_carbon = len([a for a in parsed.GetAtoms()
                        if a.GetAtomicNum() == 6])
        n_heavy = parsed.GetNumHeavyAtoms()
        return n_carbon == 0 or (n_carbon == 1 and n_heavy >= 4)
    except Exception:
        return False
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def role_lookup(smiles: Optional[str], name: Optional[str]
                ) -> Optional[Tuple[str, str]]:
    """Tier 1 classification. Returns ``(role, method)`` or None.

    Checks run in this order and the first hit wins:
      1.  exact SMILES lookup in the reagent DB   -> method "role_lookup"
      1b. stereo-agnostic SMILES lookup           -> "role_lookup_no_stereo"
      2.  name lookup in the reagent DB           -> "role_lookup"
      3.  catalyst-metal atom present             -> ("catalyst", "metal_check")
      4.  inorganic heuristic        -> ("inorganic_salt", "inorganic_check")
    """
    db = get_reagent_db()

    if smiles:
        # 1. Exact canonical match.
        exact_role = db.role_for_smiles(smiles)
        if exact_role:
            return (exact_role, "role_lookup")

        # 1b. Stereo-agnostic match.  RDKit-only SMILES extraction often
        #     omits E/Z on double bonds (e.g. DEAD's N=N) because
        #     frag_to_mol doesn't set bond stereo from 2D coordinates.
        flat_role = _role_for_smiles_no_stereo(smiles, db)
        if flat_role:
            return (flat_role, "role_lookup_no_stereo")

    # 2. Name-based lookup.
    if name:
        named_role = db.role_for_name(name)
        if named_role:
            return (named_role, "role_lookup")

    if not smiles:
        return None

    # 3. Metal-containing -> catalyst.
    if _contains_catalyst_metal(smiles):
        return ("catalyst", "metal_check")

    # 4. Inorganic salt.
    if _is_inorganic(smiles):
        return ("inorganic_salt", "inorganic_check")

    return None
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _role_for_smiles_no_stereo(smiles: str, db) -> Optional[str]:
|
|
418
|
+
"""Match SMILES against DB after stripping stereochemistry."""
|
|
419
|
+
try:
|
|
420
|
+
from rdkit import Chem
|
|
421
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
422
|
+
if mol is None:
|
|
423
|
+
return None
|
|
424
|
+
Chem.RemoveStereochemistry(mol)
|
|
425
|
+
flat_smi = Chem.MolToSmiles(mol)
|
|
426
|
+
|
|
427
|
+
for smi_key, entry in db._by_smiles.items():
|
|
428
|
+
mol2 = Chem.MolFromSmiles(smi_key)
|
|
429
|
+
if mol2 is None:
|
|
430
|
+
continue
|
|
431
|
+
Chem.RemoveStereochemistry(mol2)
|
|
432
|
+
if flat_smi == Chem.MolToSmiles(mol2):
|
|
433
|
+
return entry.get("role")
|
|
434
|
+
except ImportError:
|
|
435
|
+
pass
|
|
436
|
+
except Exception:
|
|
437
|
+
pass
|
|
438
|
+
return None
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
# ---------------------------------------------------------------------------
|
|
442
|
+
# Tier 2 — RDKit MCS (kept for alignment use; no longer used for classification)
|
|
443
|
+
# ---------------------------------------------------------------------------
|
|
444
|
+
|
|
445
|
+
def mcs_ratio(reagent_smiles: str, product_smiles: str) -> Optional[float]:
    """Compute MCS heavy-atom ratio: MCS_atoms / reagent_heavy_atoms.

    NOTE: No longer used for classification (replaced by Schneider FP).
    Kept because alignment.py may call it for 2D coordinate matching.

    Returns:
        None when either SMILES does not parse, the reagent has no heavy
        atoms, or anything raises (a warning is printed to stderr in that
        last case); 0.0 when the MCS search was cancelled or found no
        atoms; otherwise the ratio.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdFMCS

        reagent = Chem.MolFromSmiles(reagent_smiles)
        product = Chem.MolFromSmiles(product_smiles)
        if reagent is None or product is None:
            return None

        n_reagent_heavy = reagent.GetNumHeavyAtoms()
        if n_reagent_heavy == 0:
            return None

        # Element-only atom matching with any bond order, restricted to
        # complete rings; the search is given a 10 second timeout.
        search = rdFMCS.FindMCS(
            [reagent, product],
            atomCompare=rdFMCS.AtomCompare.CompareElements,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            ringMatchesRingOnly=True,
            completeRingsOnly=True,
            timeout=10,
        )
        if search.canceled or search.numAtoms == 0:
            return 0.0
        return search.numAtoms / n_reagent_heavy
    except Exception as exc:
        print(f" [warn] MCS failed: {exc}", file=sys.stderr)
        return None
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
# ---------------------------------------------------------------------------
|
|
484
|
+
# Tier 1 — Schneider FP-based reaction role assignment
|
|
485
|
+
# ---------------------------------------------------------------------------
|
|
486
|
+
# Implements the algorithm from Schneider et al., JCIM 2016:
|
|
487
|
+
# "What's What: The (Nearly) Definitive Guide to Reaction Role Assignment"
|
|
488
|
+
#
|
|
489
|
+
# Context-aware: considers the specific product to determine which candidates
|
|
490
|
+
# are atom-contributing (reactants) vs non-contributing (reagents).
|
|
491
|
+
|
|
492
|
+
# Common reagents mined from 1.3M USPTO patent reactions (appear in >1000
|
|
493
|
+
# reactions across >100 reaction types). Canonical SMILES.
|
|
494
|
+
_SCHNEIDER_COMMON_REAGENTS: Optional[set] = None
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _get_common_reagents() -> set:
    """Return the set of canonical SMILES of common reagents.

    The set is built on the first call and cached in the module-level
    ``_SCHNEIDER_COMMON_REAGENTS``.  If RDKit cannot be imported, an empty
    set is cached and returned instead.
    """
    global _SCHNEIDER_COMMON_REAGENTS
    if _SCHNEIDER_COMMON_REAGENTS is None:
        try:
            from rdkit import Chem
        except ImportError:
            _SCHNEIDER_COMMON_REAGENTS = set()
            return _SCHNEIDER_COMMON_REAGENTS

        raw = [
            # Solvents
            "ClCCl", "C(Cl)(Cl)Cl", "CS(C)=O", "CCOC(C)=O", "CC#N",
            "C1CCOC1", "C1COCCO1", "CO", "CCO", "CC(C)=O",
            "c1ccncc1", "CN(C)C=O", "c1ccccc1", "Cc1ccccc1", "CCOCC",
            "CC(C)O", "ClC(Cl)Cl", "O", "CC(=O)O",
            # Bases
            "CCN(CC)CC", "CN(C)C",
            # Common ions / salts
            "[Na+]", "[K+]", "[Li+]", "[Cs+]",
            "[OH-]", "[Cl-]", "[Br-]", "[I-]", "[F-]", "[H-]",
            "[NH4+]", "O=C([O-])[O-]", "O=S([O-])([O-])=O",
            # Catalyst metals
            "[Pd]", "[Pt]", "[Ni]",
        ]
        # Canonicalize through RDKit so membership tests can compare against
        # Chem.MolToSmiles output; entries that fail to parse are dropped.
        parsed = (Chem.MolFromSmiles(smi) for smi in raw)
        _SCHNEIDER_COMMON_REAGENTS = {
            Chem.MolToSmiles(mol) for mol in parsed if mol
        }
    return _SCHNEIDER_COMMON_REAGENTS
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def _is_schneider_common_reagent(mol) -> bool:
    """Check whether a molecule is a common reagent.

    True when the canonical SMILES of *mol* is in the common-reagent set,
    or when *mol* has more than one disconnected fragment and every
    fragment's canonical SMILES is in that set.
    """
    from rdkit import Chem

    known = _get_common_reagents()
    if Chem.MolToSmiles(mol) in known:
        return True

    parts = Chem.GetMolFrags(mol, asMols=True)
    if len(parts) <= 1:
        return False
    for part in parts:
        if Chem.MolToSmiles(part) not in known:
            return False
    return True
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
def _schneider_fp(mol, scaffold: bool = False):
    """Return the count-based Morgan fingerprint (radius 1) of *mol* as a dict.

    With ``scaffold=True`` the generator is given Morgan atom invariants
    created with ``includeRingMembership=False``; otherwise the generator's
    default invariants are used.  The dict holds the fingerprint's non-zero
    elements.
    """
    from rdkit.Chem import rdFingerprintGenerator as rfg

    invariants = None
    if scaffold:
        invariants = rfg.GetMorganAtomInvGen(includeRingMembership=False)

    generator = rfg.GetMorganGenerator(
        radius=1,
        atomInvariantsGenerator=invariants,
    )
    counts = generator.GetCountFingerprint(mol)
    return dict(counts.GetNonzeroElements())
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _schneider_sum_fps(fps):
|
|
559
|
+
"""Sum multiple count fingerprints."""
|
|
560
|
+
from collections import Counter
|
|
561
|
+
r = Counter()
|
|
562
|
+
for fp in fps:
|
|
563
|
+
for k, v in fp.items():
|
|
564
|
+
r[k] += v
|
|
565
|
+
return dict(r)
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def _schneider_score(prod_fp: dict, react_fp: dict) -> float:
|
|
569
|
+
"""Score a reactant combination against the product FP.
|
|
570
|
+
|
|
571
|
+
First term: coverage (how well reactants explain the product)
|
|
572
|
+
Second term: leaving-group penalty (weighted less — sqrt)
|
|
573
|
+
"""
|
|
574
|
+
keys = set(prod_fp) | set(react_fp)
|
|
575
|
+
total = sum(prod_fp.values())
|
|
576
|
+
if not keys or total == 0:
|
|
577
|
+
return 0.0
|
|
578
|
+
pos = sum(max(0, prod_fp.get(k, 0) - react_fp.get(k, 0)) for k in keys)
|
|
579
|
+
neg = sum(max(0, react_fp.get(k, 0) - prod_fp.get(k, 0)) for k in keys)
|
|
580
|
+
return max(0.0, (1.0 - pos / total) - 0.5 * (neg / total) ** 0.5)
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def _schneider_classify(reagents: List[ReagentInfo],
                        product_smiles: str) -> None:
    """Tier 1: assign reaction roles from Schneider fingerprint scores.

    Every reagent that has no classification yet and whose SMILES RDKit can
    parse becomes a candidate.  Combinations of one to five candidates are
    scored against the product by adding ``_schneider_score`` for the
    ``scaffold=False`` and ``scaffold=True`` fingerprints.  Members of the
    highest-scoring combination are labelled ``atom_contributing``; every
    other candidate is labelled ``non_contributing``.  Reagents that still
    have no classification afterwards are labelled ``unclassified``.

    The function returns early, leaving ``reagents`` untouched, when
    ``product_smiles`` is empty or unparseable, RDKit cannot be imported,
    the product has no heavy atoms, or no candidate exists.

    Args:
        reagents: Reagent records; updated in place.
        product_smiles: SMILES of the reaction product.
    """
    if not product_smiles:
        return

    try:
        from rdkit import Chem
    except ImportError:
        # Without RDKit this tier cannot run; leave the reagents as they are.
        return

    import itertools

    product = Chem.MolFromSmiles(product_smiles)
    if product is None:
        return

    product_fp_plain = _schneider_fp(product, scaffold=False)
    product_fp_scaffold = _schneider_fp(product, scaffold=True)
    product_heavy_atoms = product.GetNumHeavyAtoms()
    if product_heavy_atoms == 0:
        return

    # Gather the still-unclassified reagents whose SMILES can be parsed.
    candidates = []
    for reagent in reagents:
        if reagent.classification or not reagent.smiles:
            continue
        parsed = Chem.MolFromSmiles(reagent.smiles)
        if parsed is None:
            continue
        candidates.append(dict(
            reagent=reagent,
            mol=parsed,
            fp_d=_schneider_fp(parsed, scaffold=False),
            fp_s=_schneider_fp(parsed, scaffold=True),
            n_atoms=parsed.GetNumHeavyAtoms(),
            is_common=_is_schneider_common_reagent(parsed),
        ))

    if not candidates:
        return

    # A combination is only scored when its heavy-atom total lies between
    # half and six times the product's heavy-atom count.
    fewest_atoms = product_heavy_atoms * 0.5
    most_atoms = product_heavy_atoms * 6

    def _best_combination(pool):
        """Return ``(combination, score)`` for the top-scoring subset of *pool*."""
        top_score, top_combo = -1.0, None
        pool_size = len(pool)
        if not 1 <= pool_size <= 18:
            # Empty pools have nothing to score; pools above 18 are not searched.
            return top_combo, top_score

        # Subsets of size 1..5, smallest first, in itertools' ordering.
        subsets = itertools.chain.from_iterable(
            itertools.combinations(pool, size)
            for size in range(1, min(pool_size, 5) + 1)
        )
        for subset in subsets:
            atom_total = sum(member["n_atoms"] for member in subset)
            if not fewest_atoms <= atom_total <= most_atoms:
                continue
            summed_plain = _schneider_sum_fps(
                [member["fp_d"] for member in subset])
            summed_scaffold = _schneider_sum_fps(
                [member["fp_s"] for member in subset])
            score = (_schneider_score(product_fp_plain, summed_plain) +
                     _schneider_score(product_fp_scaffold, summed_scaffold))
            # Strict comparison: the first subset reaching a score keeps it.
            if score > top_score:
                top_score, top_combo = score, subset
        return top_combo, top_score

    # Phase 1: search only among candidates not flagged as common reagents.
    uncommon = [entry for entry in candidates if not entry["is_common"]]
    best_combo, best_score = _best_combination(uncommon)

    # Phase 2: if that found nothing, or scored below 0.5, search all candidates.
    if best_combo is None or best_score < 0.5:
        fallback_combo, fallback_score = _best_combination(candidates)
        if fallback_score > best_score:
            best_combo, best_score = fallback_combo, fallback_score

    # Label every candidate according to membership in the winning subset.
    if best_combo:
        contributing_ids = {id(entry["reagent"]) for entry in best_combo}
    else:
        contributing_ids = set()

    rounded_score = round(best_score, 4)
    n_reactants = 0
    n_spectators = 0
    for entry in candidates:
        reagent = entry["reagent"]
        if id(reagent) in contributing_ids:
            reagent.classification = "atom_contributing"
            n_reactants += 1
        else:
            reagent.classification = "non_contributing"
            n_spectators += 1
        reagent.classification_method = "schneider_fp"
        reagent.schneider_score = rounded_score

    # Anything that never became a candidate and is still unlabelled.
    for reagent in reagents:
        if reagent.classification:
            continue
        reagent.classification = "unclassified"
        reagent.classification_method = "none"

    summary = (
        f" Schneider FP classification (score={best_score:.3f}): "
        f"{n_reactants} "
        f"reactant(s), "
        f"{n_spectators} "
        f"reagent(s)"
    )
    print(summary, file=sys.stderr)
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
# ---------------------------------------------------------------------------
|
|
695
|
+
# Main Classification Logic
|
|
696
|
+
# ---------------------------------------------------------------------------
|
|
697
|
+
|
|
698
|
+
def classify_reagents(reagents: List[ReagentInfo],
                      product_smiles: str,
                      mcs_threshold: float = 0.3,
                      use_rxnmapper: bool = True) -> List[ReagentInfo]:
    """Classify reagents in two tiers and return the same list.

    Tier 1 runs ``_schneider_classify``, which decides the binary question
    (``atom_contributing`` vs ``non_contributing``).  Tier 2 then calls
    ``role_lookup`` for each ``non_contributing`` reagent that has no role
    yet and stores the returned role label (e.g. 'base', 'catalyst',
    'solvent').  Tier 2 only ever fills in ``role``; it never changes the
    Tier 1 classification.

    Args:
        reagents: Reagent records; updated in place.
        product_smiles: SMILES of the reaction product.
        mcs_threshold: deprecated, ignored (kept for API compat)
        use_rxnmapper: deprecated, ignored (kept for API compat)

    Returns:
        The ``reagents`` list that was passed in.
    """
    # --- Tier 1: binary classification from fingerprints ---
    _schneider_classify(reagents, product_smiles)

    # --- Tier 2: attach a semantic role where one is still missing ---
    for reagent in reagents:
        if reagent.classification != "non_contributing" or reagent.role:
            continue
        found = role_lookup(reagent.smiles, reagent.name)
        if not found:
            continue
        semantic_role, _lookup_method = found
        reagent.role = semantic_role

    return reagents
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _try_rxnmapper_classification(reagents: List[ReagentInfo],
                                  product_smiles: str) -> None:
    """Tier 1.5: classify still-unclassified reagents from RXNMapper output.

    Joins the SMILES of every unclassified reagent into
    ``<reagents>>><product>``, passes it to ``classify_roles`` from
    ``experiments.atom_mapping.rxn_atom_mapper`` and copies each component's
    ``atom_contributing`` flag back onto the reagent with the same canonical
    SMILES.

    Reagents are modified in place.  Nothing happens when the product SMILES
    is empty, no reagent qualifies, the mapper module cannot be imported,
    the call raises, or the result carries no components.

    Args:
        reagents: Reagent records; updated in place.
        product_smiles: SMILES of the reaction product.
    """
    if not product_smiles:
        return

    # Only reagents that are unclassified and carry a SMILES take part.
    pending = [r for r in reagents if not r.classification and r.smiles]
    if not pending:
        return

    rxn_smi = ".".join(r.smiles for r in pending) + ">>" + product_smiles

    try:
        from experiments.atom_mapping.rxn_atom_mapper import classify_roles
    except ImportError:
        # Mapper module not installed: skip this tier without any output.
        return

    try:
        mapping = classify_roles(rxn_smi)
    except Exception as exc:
        # Best effort: report the failure and leave the reagents untouched.
        print(f" [info] RXNMapper classification failed: {exc}",
              file=sys.stderr)
        return

    if mapping is None:
        return

    confidence = mapping.get("confidence", 0.0)
    components = mapping.get("components", [])
    if not components:
        return

    print(f" RXNMapper classification (confidence={confidence:.4f}):",
          file=sys.stderr)

    # Canonicalise with RDKit when available; otherwise compare raw strings.
    try:
        from rdkit import Chem

        def _canon(smi):
            parsed = Chem.MolFromSmiles(smi)
            return Chem.MolToSmiles(parsed) if parsed else smi
    except ImportError:
        def _canon(smi):
            return smi

    # canonical SMILES -> component record (a later duplicate replaces an earlier one)
    by_canonical = {}
    for component in components:
        by_canonical[_canon(component["smiles"])] = component

    for reagent in pending:
        match = by_canonical.get(_canon(reagent.smiles))
        if match is None:
            continue

        contributes = match.get("atom_contributing")
        if contributes is None:
            continue

        if contributes:
            reagent.classification = "atom_contributing"
            reagent.classification_method = "rxnmapper"
            mapped_atoms = match.get("n_product_atoms", 0)
            print(f" {reagent.smiles[:50]:50s} → atom_contributing "
                  f"({mapped_atoms} atoms in product)", file=sys.stderr)
        else:
            reagent.classification = "non_contributing"
            reagent.classification_method = "rxnmapper"
            print(f" {reagent.smiles[:50]:50s} → non_contributing",
                  file=sys.stderr)

        reagent.rxnmapper_confidence = confidence
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
# ---------------------------------------------------------------------------
|
|
820
|
+
# CDXML Mode Entry Point
|
|
821
|
+
# ---------------------------------------------------------------------------
|
|
822
|
+
|
|
823
|
+
def classify_from_cdxml(cdxml_path: str,
                        mcs_threshold: float = 0.3,
                        use_rxnmapper: bool = False) -> Dict[str, Any]:
    """Classify the reagents of the reaction drawn in a CDXML file.

    Reads the first ``<step>`` of the page's first ``<scheme>``, takes the
    SMILES of the first product fragment that yields one, collects the
    fragments and text objects referenced as reactants or as above/below
    arrow objects, and hands them to ``classify_reagents``.

    Args:
        cdxml_path: Path of the CDXML reaction file.
        mcs_threshold: Deprecated and ignored (kept for API compat);
            forwarded unchanged to ``classify_reagents``.
        use_rxnmapper: Deprecated and ignored (kept for API compat);
            forwarded unchanged to ``classify_reagents``.

    Returns:
        A dict with the keys ``cdxml_file`` (base name of the input path),
        ``product_smiles`` and ``reagents`` (one dict per reagent, built by
        ``_reagent_to_dict``).

    Raises:
        SystemExit: If the file has no ``<scheme><step>`` or no product
            SMILES could be extracted.
    """
    page = _get_page(ET.parse(cdxml_path).getroot())

    # --- Reaction step metadata ---
    scheme = page.find("scheme")
    step = None if scheme is None else scheme.find("step")
    if step is None:
        raise SystemExit("ERROR: no <scheme><step> found in CDXML")

    def _id_list(attribute):
        """Split a whitespace-separated id attribute of the step."""
        return step.get(attribute, "").split()

    reactant_ids = _id_list("ReactionStepReactants")
    product_ids = _id_list("ReactionStepProducts")
    above_ids = _id_list("ReactionStepObjectsAboveArrow")
    below_ids = _id_list("ReactionStepObjectsBelowArrow")

    # id -> element, covering the direct children of the page only.
    id_to_el: Dict[str, ET.Element] = {
        child.get("id", ""): child
        for child in page
        if child.get("id", "")
    }

    # --- Product SMILES: first product fragment that converts ---
    product_smiles = None
    for product_id in product_ids:
        node = id_to_el.get(product_id)
        if node is None or node.tag != "fragment":
            continue
        product_smiles = _fragment_to_smiles(node)
        if product_smiles:
            break
    if not product_smiles:
        raise SystemExit("ERROR: could not extract product SMILES")

    print(f"Product SMILES: {product_smiles}", file=sys.stderr)

    # --- Reagent collection ---
    reagents: List[ReagentInfo] = []
    seen_ids: set = set()

    def _collect(element_id: str, position: str):
        """Turn one referenced fragment or text object into a ReagentInfo."""
        if element_id in seen_ids:
            return
        seen_ids.add(element_id)

        node = id_to_el.get(element_id)
        if node is None:
            return

        fm_type = _get_fm_molecule_type(node)
        # Types 3 and 2 mark products and condition text; neither is a reagent.
        if fm_type in (3, 2):
            return

        info = ReagentInfo(source_id=element_id, position=position)
        tag = node.tag

        if fm_type == 1:
            # Solvent hint only; the classification itself is decided later.
            info.source_type = tag
            info.role = "solvent"
            if tag == "t":
                info.name = _get_text_content(node)

        if tag == "fragment":
            # Drawn structure: convert the fragment to SMILES.
            info.source_type = "fragment"
            info.smiles = _fragment_to_smiles(node)
            if info.smiles:
                print(f" Fragment {element_id}: {info.smiles}", file=sys.stderr)
        elif tag == "t":
            # Text label: keep the name and try to resolve it to SMILES.
            info.source_type = "text"
            text = _get_text_content(node)
            info.name = text
            info.smiles = _text_to_smiles(text)
            if info.smiles:
                print(f" Text '{text}' → {info.smiles}", file=sys.stderr)
            else:
                print(f" Text '{text}' → no SMILES (name-only)", file=sys.stderr)
        else:
            return

        reagents.append(info)

    # Reactants first, then objects above and below the arrow.
    ordered_sources = (
        ("reactant", reactant_ids),
        ("above_arrow", above_ids),
        ("below_arrow", below_ids),
    )
    for position, element_ids in ordered_sources:
        for element_id in element_ids:
            _collect(element_id, position)

    # --- Classification ---
    classify_reagents(reagents, product_smiles, mcs_threshold,
                      use_rxnmapper=use_rxnmapper)

    return {
        "cdxml_file": os.path.basename(cdxml_path),
        "product_smiles": product_smiles,
        "reagents": [_reagent_to_dict(item) for item in reagents],
    }
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
# ---------------------------------------------------------------------------
|
|
939
|
+
# SMILES Mode Entry Point
|
|
940
|
+
# ---------------------------------------------------------------------------
|
|
941
|
+
|
|
942
|
+
def classify_from_smiles(reagent_smiles: List[str],
                         product_smiles: str,
                         reagent_names: Optional[List[str]] = None,
                         mcs_threshold: float = 0.3,
                         use_rxnmapper: bool = True) -> Dict[str, Any]:
    """Classify reagents supplied directly as SMILES strings.

    Args:
        reagent_smiles: One SMILES per reagent.
        product_smiles: SMILES of the reaction product.
        reagent_names: Optional names parallel to ``reagent_smiles``; a
            reagent whose index has no name gets ``None``.
        mcs_threshold: Forwarded unchanged to ``classify_reagents``.
        use_rxnmapper: Forwarded unchanged to ``classify_reagents``.

    Returns:
        A dict with the keys ``product_smiles`` and ``reagents`` (one dict
        per reagent, built by ``_reagent_to_dict``).
    """
    have_names = bool(reagent_names)

    reagents: List[ReagentInfo] = []
    for index, smiles in enumerate(reagent_smiles):
        label = None
        if have_names and index < len(reagent_names):
            label = reagent_names[index]
        reagents.append(
            ReagentInfo(source_type="smiles_input", smiles=smiles, name=label)
        )

    classify_reagents(reagents, product_smiles, mcs_threshold,
                      use_rxnmapper=use_rxnmapper)

    serialised = [_reagent_to_dict(item) for item in reagents]
    return {
        "product_smiles": product_smiles,
        "reagents": serialised,
    }
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
# ---------------------------------------------------------------------------
|
|
962
|
+
# Output Helpers
|
|
963
|
+
# ---------------------------------------------------------------------------
|
|
964
|
+
|
|
965
|
+
def _reagent_to_dict(r: ReagentInfo) -> Dict[str, Any]:
    """Convert a reagent record to a dict, omitting optional fields that are None.

    ``mcs_ratio``, ``role`` and ``name`` must exist on the record (a missing
    key raises ``KeyError``); ``rxnmapper_confidence`` and
    ``schneider_score`` are tolerated as absent.
    """
    d = asdict(r)

    # Fields read with plain indexing: always expected to be present.
    for key in ("mcs_ratio", "role", "name"):
        if d[key] is None:
            del d[key]

    # Fields read defensively: may be missing from the record altogether.
    for key in ("rxnmapper_confidence", "schneider_score"):
        if d.get(key) is None:
            d.pop(key, None)

    return d
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
# ---------------------------------------------------------------------------
|
|
982
|
+
# CLI
|
|
983
|
+
# ---------------------------------------------------------------------------
|
|
984
|
+
|
|
985
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Run the reagent classifier from the command line.

    Two sub-commands are offered: ``cdxml`` (classify the reaction in a
    CDXML file) and ``smiles`` (classify reagent SMILES against a product
    SMILES).  The result is serialised as JSON and written to ``--output``
    or, when no output file is given, to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        0 on success, 1 if no recognised mode was selected.
    """
    parser = argparse.ArgumentParser(
        # The classifier uses Schneider FP scoring plus a role lookup; the
        # previous text still advertised the retired MCS approach.
        description="Classify reaction reagents as atom-contributing "
                    "or non-contributing (Schneider FP scoring + role lookup).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    sub = parser.add_subparsers(dest="mode", required=True,
                                help="Input mode")

    # Shared args for both modes
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument("-o", "--output",
                        help="Output JSON file (default: stdout)")
    common.add_argument("--pretty", action="store_true",
                        help="Pretty-print JSON output")
    # Still accepted so existing command lines keep working, but the value
    # is forwarded as mcs_threshold, which the classifier documents as
    # deprecated and ignored.  The help text now says so.
    common.add_argument("--threshold", type=float, default=0.5,
                        help="Deprecated and ignored "
                             "(kept for CLI compatibility)")

    # CDXML mode
    p_cdxml = sub.add_parser("cdxml", parents=[common],
                             help="Classify from a CDXML reaction file")
    p_cdxml.add_argument("-i", "--input", required=True,
                         help="Input CDXML file")

    # SMILES mode
    p_smi = sub.add_parser("smiles", parents=[common],
                           help="Classify from SMILES strings")
    p_smi.add_argument("--reagents", nargs="+", required=True,
                       help="Reagent SMILES strings")
    p_smi.add_argument("--product", required=True,
                       help="Product SMILES")
    p_smi.add_argument("--names", nargs="+", default=None,
                       help="Reagent names (parallel to --reagents)")

    args = parser.parse_args(argv)

    if args.mode == "cdxml":
        result = classify_from_cdxml(args.input, args.threshold)
    elif args.mode == "smiles":
        result = classify_from_smiles(
            args.reagents, args.product, args.names, args.threshold)
    else:
        # Defensive only: the sub-command is required, so argparse normally
        # exits before an unknown or missing mode can reach this point.
        parser.print_help()
        return 1

    indent = 2 if args.pretty else None
    json_str = json.dumps(result, indent=indent, ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(json_str + "\n")
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(json_str)

    return 0
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|