cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,664 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SciFinder RDF Reaction Parser
|
|
4
|
+
Parses SciFinder .rdf reaction export files (V3000 MOL blocks) into structured JSON.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python rdf_parser.py reaction.rdf
|
|
8
|
+
python rdf_parser.py reaction.rdf --output parsed.json
|
|
9
|
+
python rdf_parser.py reaction.rdf --resolve-cas # also resolve CAS via PubChem
|
|
10
|
+
python rdf_parser.py reaction.rdf --pretty
|
|
11
|
+
|
|
12
|
+
Output: JSON with reactants, products, reagents/catalysts/solvents, conditions,
|
|
13
|
+
literature references, and yield data.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
from dataclasses import dataclass, field, asdict
|
|
21
|
+
from typing import List, Optional, Dict, Any
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Data structures
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
@dataclass
class Atom:
    """One atom record taken from the ATOM section of a V3000 connection table."""
    # Atom number exactly as written on the atom line
    index: int
    # Atom type token (normally the element symbol)
    symbol: str
    # Coordinates, in the order they appear on the atom line
    x: float
    y: float
    z: float
    # Value of the optional CFG= property, 0 when the property is absent.
    # In the CTfile V3000 format the atom CFG is a stereo parity
    # (0=none, 1=odd, 2=even, 3=either).
    cfg: int = 0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class Bond:
    """One bond record taken from the BOND section of a V3000 connection table."""
    # Bond number exactly as written on the bond line
    index: int
    # Bond type: 1=single, 2=double, 3=triple
    order: int
    # Indices of the two atoms joined by this bond
    atom1: int
    atom2: int
    # Value of the optional CFG= property (bond stereo configuration),
    # 0 when the property is absent
    cfg: int = 0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class StereoCollection:
    """A stereo group read from the COLLECTION section of a V3000 block."""
    # Text that follows "MDLV30/STE" in the collection name: "ABS", "REL", "RAC"
    stereo_type: str
    # Atom indices listed for the group; each instance owns its own list
    atom_indices: List[int] = field(default_factory=list)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class Molecule:
    """Header fields and connection table read from one $MOL block."""
    # Header values; each stays "" until the parser fills it in
    name: str = ""
    formula: str = ""
    cas: str = ""
    # Side of the reaction this molecule is on: "reactant" or "product"
    role: str = ""
    # Connection-table contents; every instance starts with its own empty lists
    atoms: List[Atom] = field(default_factory=list)
    bonds: List[Bond] = field(default_factory=list)
    stereo: List[StereoCollection] = field(default_factory=list)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class ReagentEntry:
    """A reagent, catalyst or solvent known by the CAS number in a $DTYPE/$DATUM pair."""
    # CAS number text taken from the $DATUM value
    cas: str = ""
    # One of "reagent", "catalyst", "solvent"
    role: str = ""
    # The remaining fields stay at their defaults unless --resolve-cas is used
    name: str = ""
    mw: Optional[float] = None
    formula: str = ""
    smiles: str = ""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class Reference:
    """Bibliographic data for one literature reference of a reaction record."""
    # Each field holds the matching $DATUM text, or "" when the record has none
    title: str = ""
    authors: str = ""
    citation: str = ""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class ReactionVariation:
    """A single experimental variation of a reaction (a SciFinder VAR block)."""
    cas_reaction_number: str = ""
    # Step and stage counts both default to 1
    steps: int = 1
    stages: int = 1
    # None means no usable yield value was recorded
    yield_pct: Optional[float] = None
    # Participants grouped by role; every instance gets its own lists
    reagents: List[ReagentEntry] = field(default_factory=list)
    catalysts: List[ReagentEntry] = field(default_factory=list)
    solvents: List[ReagentEntry] = field(default_factory=list)
    references: List[Reference] = field(default_factory=list)
    # Condition name -> raw $DATUM text
    conditions: Dict[str, str] = field(default_factory=dict)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass
class Reaction:
    """Everything parsed from one reaction record of an RDF file."""
    # Text of the file-level $DATM line, shared by all records of the file
    file_date: str = ""
    # Identifier that follows $RIREG on the record's first line
    scheme_id: str = ""
    # Counts as declared on the counts line after $RXN
    num_reactants: int = 0
    num_products: int = 0
    # Parsed molecules and variations; every instance gets its own lists
    reactants: List[Molecule] = field(default_factory=list)
    products: List[Molecule] = field(default_factory=list)
    variations: List[ReactionVariation] = field(default_factory=list)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# V3000 MOL block parsing
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
def _v30_cfg(properties: List[str]) -> int:
    """Return the value of the last ``CFG=n`` token in *properties*, or 0 if none."""
    cfg = 0
    for token in properties:
        if token.startswith("CFG="):
            cfg = int(token.split("=")[1])
    return cfg


def _v30_list_members(text: str) -> List[int]:
    """Return the members of a V3000 ``(n v1 ... vn)`` list body without the count."""
    numbers = [int(tok) for tok in text.split()]
    # V3000 lists lead with the number of entries. Drop it when it agrees with
    # what follows; otherwise hand back the raw numbers unchanged.
    if numbers and numbers[0] == len(numbers) - 1:
        return numbers[1:]
    return numbers


def parse_v3000_mol(lines: List[str]) -> tuple:
    """
    Parse a V3000 MOL block into atoms, bonds, and stereo collections.

    Only the ATOM, BOND and COLLECTION sections are interpreted; lines of any
    other section are skipped. Parsing stops at ``END CTAB`` or ``M  END``.

    The ``M  V30`` prefix is recognised with any amount of whitespace between
    ``M`` and ``V30``. A line whose content ends in ``-`` is joined with the
    content of the next ``M  V30`` line (V3000 continuation). In a collection
    line ``ATOMS=(n a1 ... an)`` the leading ``n`` is the entry count and is
    not reported as an atom index.

    Args:
        lines: Lines of the MOL block starting from the counts line
            (after name/formula/CAS header lines).

    Returns:
        (atoms, bonds, stereo_collections)

    Raises:
        ValueError: If a numeric field of an atom, bond or collection line
            cannot be converted.
    """
    atoms = []
    bonds = []
    stereo = []

    section = None  # name of the section currently open, e.g. "ATOM"
    carried = ""    # content held back from a line that ended with "-"

    for line in lines:
        stripped = line.strip()

        # "M  END" closes the whole molfile
        if stripped.split() == ["M", "END"]:
            break

        prefix = re.match(r'M\s+V30\b\s?(.*)', stripped)
        if prefix is None:
            continue

        payload = carried + prefix.group(1)
        if payload.endswith("-"):
            # Continuation: drop the dash and wait for the rest of the line
            carried = payload[:-1]
            continue
        carried = ""

        parts = payload.split()
        if not parts:
            continue

        # Section boundaries
        if parts[0] in ("BEGIN", "END") and len(parts) >= 2:
            if parts[0] == "END" and parts[1] == "CTAB":
                break
            section = parts[1] if parts[0] == "BEGIN" else None
            continue

        if section == "ATOM":
            # index type x y z aamap [KEY=value ...]
            if len(parts) >= 6:
                atoms.append(Atom(
                    index=int(parts[0]),
                    symbol=parts[1],
                    x=float(parts[2]),
                    y=float(parts[3]),
                    z=float(parts[4]),
                    cfg=_v30_cfg(parts[5:]),
                ))

        elif section == "BOND":
            # index order atom1 atom2 [KEY=value ...]
            if len(parts) >= 4:
                bonds.append(Bond(
                    index=int(parts[0]),
                    order=int(parts[1]),
                    atom1=int(parts[2]),
                    atom2=int(parts[3]),
                    cfg=_v30_cfg(parts[4:]),
                ))

        elif section == "COLLECTION":
            # e.g. MDLV30/STEABS ATOMS=(1 9)  ->  one atom, number 9
            m = re.match(r'MDLV30/STE(\w+)\s+ATOMS=\((.+?)\)', payload.strip())
            if m:
                stereo.append(StereoCollection(
                    stereo_type=m.group(1),
                    atom_indices=_v30_list_members(m.group(2)),
                ))

    return atoms, bonds, stereo
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
# RDF file parsing
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
def _rxn_counts(record: str):
    """Return (num_reactants, num_products) for a record, or None if it has no $RXN line."""
    rxn_match = re.search(r'\$RXN\s*\n', record)
    if not rxn_match:
        return None
    # The counts line is the first line after $RXN that starts with two integers
    for line in record[rxn_match.end():].split("\n"):
        stripped = line.strip()
        if re.match(r'^\d+\s+\d+', stripped):
            fields = stripped.split()
            return int(fields[0]), int(fields[1])
    return 0, 0


def _parse_mol_block(mol_text: str, role: str) -> Molecule:
    """Build a Molecule from the text that follows one ``$MOL`` delimiter line."""
    # Remove only the line break that ended the "$MOL" line itself, so the
    # three header lines keep their positions even when the first is blank.
    if mol_text.startswith("\n"):
        mol_text = mol_text[1:]
    mol_lines = mol_text.split("\n")

    mol = Molecule(role=role)

    # First three lines: name, formula, CAS/copyright
    mol.name = mol_lines[0].strip()
    if len(mol_lines) >= 2:
        mol.formula = mol_lines[1].strip()
    if len(mol_lines) >= 3:
        cas_match = re.match(r'([\d-]+)', mol_lines[2].strip())
        if cas_match:
            mol.cas = cas_match.group(1)

    # The connection table starts right after the counts line containing "V3000"
    for i, line in enumerate(mol_lines):
        if "V3000" in line:
            mol.atoms, mol.bonds, mol.stereo = parse_v3000_mol(mol_lines[i + 1:])
            break

    return mol


def parse_rdf(filepath: str) -> List[Reaction]:
    """
    Parse a SciFinder .rdf file and return a list of Reaction objects.

    The RDF format consists of:
    - $RDFILE header
    - $DATM timestamp
    - One or more reaction records starting with $RFMT
    - Each record has $RXN header, $MOL blocks, and $DTYPE/$DATUM metadata

    Records without a $RXN line are skipped. Within a record, only the first
    ``num_reactants + num_products`` $MOL blocks are read: the leading ones as
    reactants, the rest as products. The three header lines of a $MOL block
    are taken by position, so a blank name line yields an empty name instead
    of shifting the formula into the name field.

    Args:
        filepath: Path to the .rdf file.

    Returns:
        List of Reaction objects.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()

    # File header: the $DATM timestamp is shared by every record
    datm_match = re.search(r'\$DATM\s+(.+)', content)
    file_date = datm_match.group(1).strip() if datm_match else ""

    reactions = []

    # Everything before the first $RFMT is the file header, hence [1:]
    records = re.split(r'^\$RFMT\b', content, flags=re.MULTILINE)
    for record in records[1:]:
        counts = _rxn_counts(record)
        if counts is None:
            continue

        rxn = Reaction(file_date=file_date)
        rxn.num_reactants, rxn.num_products = counts

        # Scheme ID from the remainder of the $RFMT line
        first_line = record.split("\n")[0].strip()
        scheme_match = re.search(r'\$RIREG\s+(\S+)', first_line)
        if scheme_match:
            rxn.scheme_id = scheme_match.group(1)

        # [ \t]* rather than \s* so the delimiter cannot swallow a blank
        # line that follows "$MOL".  Element 0 is the RXN header.
        mol_blocks = re.split(r'^\$MOL[ \t]*$', record, flags=re.MULTILINE)[1:]
        total_mols = rxn.num_reactants + rxn.num_products
        for position, mol_text in enumerate(mol_blocks[:total_mols], start=1):
            if position <= rxn.num_reactants:
                rxn.reactants.append(_parse_mol_block(mol_text, "reactant"))
            else:
                rxn.products.append(_parse_mol_block(mol_text, "product"))

        # $DTYPE / $DATUM metadata
        _parse_dtype_datum(record, rxn)

        reactions.append(rxn)

    return reactions
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _collect_dtype_datum_pairs(text: str) -> list:
    """Return (dtype, datum) tuples for every $DTYPE line in *text*, in file order."""
    pairs = []
    lines = text.split("\n")
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i].strip()
        i += 1
        if not line.startswith("$DTYPE"):
            continue

        dtype = line[6:].strip()
        datum_lines = []
        # Look for this $DTYPE's $DATUM
        while i < total:
            candidate = lines[i].strip()
            if candidate.startswith("$DATUM"):
                datum_lines.append(candidate[6:].strip())
                i += 1
                # Continuation: non-blank lines up to the next "$" line
                while i < total:
                    cont = lines[i].strip()
                    if cont.startswith("$"):
                        break
                    if cont:
                        datum_lines.append(cont)
                    i += 1
                break
            if candidate.startswith("$"):
                # Another directive (e.g. the next $DTYPE) before any $DATUM:
                # this $DTYPE has no value. Leave the line for the outer loop
                # so the following pair is not lost.
                break
            i += 1
        pairs.append((dtype, " ".join(datum_lines)))
    return pairs


def _apply_variation_field(var: ReactionVariation, var_key: str, datum: str) -> None:
    """Store one ``RXN:VAR(n):<var_key>`` value on *var*; unknown keys are ignored."""
    # Yield: PRO(n):YIELD
    if re.match(r'PRO\(\d+\):YIELD', var_key):
        try:
            var.yield_pct = float(datum)
        except ValueError:
            var.yield_pct = None
        return

    if var_key == "CAS_Reaction_Number":
        var.cas_reaction_number = datum
        return

    if var_key in ("STEPS", "STAGES"):
        try:
            count = int(datum)
        except ValueError:
            return  # not a number: leave the current value untouched
        if var_key == "STEPS":
            var.steps = count
        else:
            var.stages = count
        return

    # RGT(n):CAS_RN / CAT(n):CAS_RN / SOL(n):CAS_RN
    for tag, bucket, role in (
        ("RGT", var.reagents, "reagent"),
        ("CAT", var.catalysts, "catalyst"),
        ("SOL", var.solvents, "solvent"),
    ):
        if re.match(tag + r'\((\d+)\):CAS_RN', var_key):
            bucket.append(ReagentEntry(cas=datum, role=role))
            return

    # References: REFERENCE(n):TITLE / AUTHOR / CITATION
    ref_match = re.match(r'REFERENCE\((\d+)\):(\w+)', var_key)
    if ref_match:
        ref_num = int(ref_match.group(1))
        if ref_num < 1:
            # Reference numbers index from 1; 0 would address references[-1]
            return
        ref_field = ref_match.group(2).upper()
        # Grow the list so that slot ref_num - 1 exists
        while len(var.references) < ref_num:
            var.references.append(Reference())
        ref = var.references[ref_num - 1]
        if ref_field == "TITLE":
            ref.title = datum
        elif ref_field == "AUTHOR":
            ref.authors = datum
        elif ref_field == "CITATION":
            ref.citation = datum
        return

    # Temperature, time, pressure, pH, etc. given as COND(n):<name>
    cond_match = re.match(r'COND\((\d+)\):(.+)', var_key)
    if cond_match:
        var.conditions[cond_match.group(2)] = datum
        return

    # Bare condition keys used by some exports
    if var_key in ("TEMP", "TIME", "PRESSURE", "PH", "ATMOSPHERE"):
        var.conditions[var_key] = datum


def _parse_dtype_datum(text: str, rxn: Reaction):
    """
    Parse all $DTYPE/$DATUM pairs from a reaction record and populate
    the reaction's variation data (reagents, catalysts, solvents,
    yield, references, conditions).

    Handles multiline $DATUM values (continuation lines without $DTYPE prefix).
    A $DTYPE that is followed by another ``$`` directive before any $DATUM is
    recorded with an empty value. Only ``RXN:VAR(n):...`` entries are used;
    one ReactionVariation is created per distinct ``n`` and the variations are
    appended to ``rxn.variations`` in ascending order of ``n``.

    Args:
        text: Text of one reaction record.
        rxn: Reaction whose ``variations`` list is extended in place.
    """
    variations = {}  # var_num -> ReactionVariation, created on first sight

    for dtype, datum in _collect_dtype_datum_pairs(text):
        # Reactant/product CAS entries (RXN:RCT(n):CAS_RN, RXN:PRO(n):CAS_RN)
        # and every other non-VAR dtype are not used here.
        var_match = re.match(r'RXN:VAR\((\d+)\):(.+)', dtype)
        if not var_match:
            continue

        var_num = int(var_match.group(1))
        if var_num not in variations:
            variations[var_num] = ReactionVariation()
        _apply_variation_field(variations[var_num], var_match.group(2), datum)

    # Add all variations sorted by var number
    for var_num in sorted(variations):
        rxn.variations.append(variations[var_num])
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
# ---------------------------------------------------------------------------
|
|
472
|
+
# CAS resolution (optional, delegates to cas_resolver.py)
|
|
473
|
+
# ---------------------------------------------------------------------------
|
|
474
|
+
|
|
475
|
+
def resolve_cas_numbers(reaction: Reaction) -> None:
    """
    Look up every reagent, catalyst and solvent CAS number of a reaction.

    Each entry with a non-empty CAS number is passed to
    ``cas_resolver.resolve_cas``; when that returns a truthy result, the
    entry's name, MW, formula and SMILES are overwritten from it. If the
    resolver module cannot be imported, a warning is written to stderr and
    the reaction is left untouched.
    """
    try:
        from ..resolve.cas_resolver import resolve_cas
    except ImportError:
        print("Warning: cas_resolver.py not found. Skipping CAS resolution.",
              file=sys.stderr)
        return

    for variation in reaction.variations:
        # Same visiting order as the three lists: reagents, catalysts, solvents
        participants = (*variation.reagents, *variation.catalysts, *variation.solvents)
        for participant in participants:
            if not participant.cas:
                continue
            info = resolve_cas(participant.cas)
            if not info:
                continue
            participant.name = info.get("name", "")
            participant.mw = info.get("mw")
            participant.formula = info.get("formula", "")
            participant.smiles = info.get("smiles", "")
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
# ---------------------------------------------------------------------------
|
|
501
|
+
# Output formatting
|
|
502
|
+
# ---------------------------------------------------------------------------
|
|
503
|
+
|
|
504
|
+
def reaction_to_dict(rxn: Reaction) -> Dict[str, Any]:
    """Build the JSON-ready dictionary for one Reaction.

    Optional values (cfg of 0, empty strings, empty lists, a missing yield,
    ...) are left out of the output rather than written as empty entries.
    """

    def atom_to_dict(atom: Atom) -> Dict[str, Any]:
        out = {
            "index": atom.index,
            "symbol": atom.symbol,
            "x": atom.x,
            "y": atom.y,
            "z": atom.z,
        }
        if atom.cfg:
            out["cfg"] = atom.cfg
        return out

    def bond_to_dict(bond: Bond) -> Dict[str, Any]:
        out = {
            "index": bond.index,
            "order": bond.order,
            "atom1": bond.atom1,
            "atom2": bond.atom2,
        }
        if bond.cfg:
            out["cfg"] = bond.cfg
        return out

    def mol_to_dict(mol: Molecule) -> Dict[str, Any]:
        out = {
            "name": mol.name,
            "formula": mol.formula,
            "cas": mol.cas,
            "role": mol.role,
            "atom_count": len(mol.atoms),
            "bond_count": len(mol.bonds),
            "atoms": [atom_to_dict(a) for a in mol.atoms],
            "bonds": [bond_to_dict(b) for b in mol.bonds],
        }
        if mol.stereo:
            out["stereo"] = [
                {"type": group.stereo_type, "atoms": group.atom_indices}
                for group in mol.stereo
            ]
        return out

    def entry_to_dict(entry: ReagentEntry) -> Dict[str, Any]:
        out = {"cas": entry.cas, "role": entry.role}
        if entry.name:
            out["name"] = entry.name
        if entry.mw is not None:
            out["mw"] = entry.mw
        if entry.formula:
            out["formula"] = entry.formula
        if entry.smiles:
            out["smiles"] = entry.smiles
        return out

    def ref_to_dict(ref: Reference) -> Dict[str, Any]:
        out = {}
        if ref.title:
            out["title"] = ref.title
        if ref.authors:
            out["authors"] = ref.authors
        if ref.citation:
            out["citation"] = ref.citation
        return out

    def variation_to_dict(var: ReactionVariation) -> Dict[str, Any]:
        out = {}
        if var.cas_reaction_number:
            out["cas_reaction_number"] = var.cas_reaction_number
        out["steps"] = var.steps
        out["stages"] = var.stages
        if var.yield_pct is not None:
            out["yield_pct"] = var.yield_pct
        # Role lists share one shape and are emitted in this fixed order
        for key, entries in (
            ("reagents", var.reagents),
            ("catalysts", var.catalysts),
            ("solvents", var.solvents),
        ):
            if entries:
                out[key] = [entry_to_dict(e) for e in entries]
        if var.references:
            out["references"] = [ref_to_dict(r) for r in var.references]
        if var.conditions:
            # The variation's own dict is passed through, not copied
            out["conditions"] = var.conditions
        return out

    return {
        "file_date": rxn.file_date,
        "scheme_id": rxn.scheme_id,
        "num_reactants": rxn.num_reactants,
        "num_products": rxn.num_products,
        "reactants": [mol_to_dict(m) for m in rxn.reactants],
        "products": [mol_to_dict(m) for m in rxn.products],
        "variations": [variation_to_dict(v) for v in rxn.variations],
    }
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
# ---------------------------------------------------------------------------
|
|
600
|
+
# CLI
|
|
601
|
+
# ---------------------------------------------------------------------------
|
|
602
|
+
|
|
603
|
+
def main(argv=None) -> int:
    """
    Command-line entry point: parse an RDF file and emit JSON.

    A file with exactly one reaction is written as a single JSON object;
    several reactions are written as a JSON array. Progress and error
    messages go to stderr so stdout carries only the JSON.

    Args:
        argv: Argument list to parse; None makes argparse use ``sys.argv[1:]``.

    Returns:
        0 on success; 1 when the input cannot be read, contains no
        reactions, or the output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="Parse SciFinder .rdf reaction exports into structured JSON."
    )
    parser.add_argument(
        "rdf_file",
        help="Path to the SciFinder .rdf file",
    )
    parser.add_argument(
        "--output", "-o",
        help="Output JSON file (default: print to stdout)",
    )
    parser.add_argument(
        "--resolve-cas",
        action="store_true",
        help="Resolve reagent/catalyst/solvent CAS numbers via PubChem "
             "(requires cas_resolver.py)",
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output",
    )
    args = parser.parse_args(argv)

    # Parse; a missing or unreadable file is a user error, not a crash
    try:
        reactions = parse_rdf(args.rdf_file)
    except OSError as exc:
        print(f"Error: cannot read {args.rdf_file}: {exc}", file=sys.stderr)
        return 1

    if not reactions:
        print(f"No reactions found in {args.rdf_file}", file=sys.stderr)
        return 1

    print(f"Parsed {len(reactions)} reaction(s) from {args.rdf_file}",
          file=sys.stderr)

    # Optionally resolve CAS numbers
    if args.resolve_cas:
        for rxn in reactions:
            resolve_cas_numbers(rxn)

    # Convert to JSON: one object for a single reaction, a list otherwise
    if len(reactions) == 1:
        output = reaction_to_dict(reactions[0])
    else:
        output = [reaction_to_dict(r) for r in reactions]

    indent = 2 if args.pretty else None
    json_str = json.dumps(output, indent=indent, ensure_ascii=False)

    if args.output:
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(json_str)
                f.write("\n")
        except OSError as exc:
            print(f"Error: cannot write {args.output}: {exc}", file=sys.stderr)
            return 1
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(json_str)

    return 0
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status
    raise SystemExit(main())
|