cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,664 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SciFinder RDF Reaction Parser
4
+ Parses SciFinder .rdf reaction export files (V3000 MOL blocks) into structured JSON.
5
+
6
+ Usage:
7
+ python rdf_parser.py reaction.rdf
8
+ python rdf_parser.py reaction.rdf --output parsed.json
9
+ python rdf_parser.py reaction.rdf --resolve-cas # also resolve CAS via PubChem
10
+ python rdf_parser.py reaction.rdf --pretty
11
+
12
+ Output: JSON with reactants, products, reagents/catalysts/solvents, conditions,
13
+ literature references, and yield data.
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import re
19
+ import sys
20
+ from dataclasses import dataclass, field, asdict
21
+ from typing import List, Optional, Dict, Any
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Data structures
26
+ # ---------------------------------------------------------------------------
27
+
@dataclass
class Atom:
    """One atom record read from the ATOM section of a V3000 MOL block."""
    index: int    # atom number as written in the record
    symbol: str   # atom symbol token
    x: float      # coordinates, in the order they appear in the record
    y: float
    z: float
    cfg: int = 0  # value of the optional CFG=n token; 0 when the record has none
37
+
38
+
@dataclass
class Bond:
    """One bond record read from the BOND section of a V3000 MOL block."""
    index: int    # bond number as written in the record
    order: int    # bond order: 1=single, 2=double, 3=triple
    atom1: int    # index of the first bonded atom
    atom2: int    # index of the second bonded atom
    cfg: int = 0  # value of the optional CFG=n token; 0 when the record has none
47
+
48
+
@dataclass
class StereoCollection:
    """A stereo collection entry (MDLV30/STE...) from a V3000 COLLECTION section."""
    stereo_type: str  # text following "MDLV30/STE", e.g. "ABS", "REL", "RAC"
    atom_indices: List[int] = field(default_factory=list)  # atoms named in ATOMS=(...)
54
+
55
+
@dataclass
class Molecule:
    """A reactant or product structure read from one $MOL block."""
    name: str = ""      # first header line of the block
    formula: str = ""   # second header line of the block
    cas: str = ""       # leading digits/hyphens of the third header line, if any
    role: str = ""      # "reactant" or "product"
    atoms: List[Atom] = field(default_factory=list)
    bonds: List[Bond] = field(default_factory=list)
    stereo: List[StereoCollection] = field(default_factory=list)
66
+
67
+
@dataclass
class ReagentEntry:
    """A reagent, catalyst, or solvent given by CAS number in $DTYPE/$DATUM data."""
    cas: str = ""               # CAS number taken from the $DATUM value
    role: str = ""              # "reagent", "catalyst", or "solvent"
    name: str = ""              # filled in only when CAS resolution is run
    mw: Optional[float] = None  # filled in only when CAS resolution is run
    formula: str = ""           # filled in only when CAS resolution is run
    smiles: str = ""            # filled in only when CAS resolution is run
77
+
78
+
@dataclass
class Reference:
    """A literature reference attached to a reaction variation."""
    title: str = ""     # value of the REFERENCE(n):TITLE field
    authors: str = ""   # value of the REFERENCE(n):AUTHOR field
    citation: str = ""  # value of the REFERENCE(n):CITATION field
85
+
86
+
@dataclass
class ReactionVariation:
    """One RXN:VAR(n) variation of a reaction record."""
    cas_reaction_number: str = ""
    steps: int = 1    # kept at 1 unless a parsable STEPS value is present
    stages: int = 1   # kept at 1 unless a parsable STAGES value is present
    yield_pct: Optional[float] = None  # None when no numeric yield was found
    reagents: List[ReagentEntry] = field(default_factory=list)
    catalysts: List[ReagentEntry] = field(default_factory=list)
    solvents: List[ReagentEntry] = field(default_factory=list)
    references: List[Reference] = field(default_factory=list)
    conditions: Dict[str, str] = field(default_factory=dict)  # condition key -> raw value
99
+
100
+
@dataclass
class Reaction:
    """One reaction record ($RFMT section) parsed from an RDF file."""
    file_date: str = ""     # text following $DATM in the file header
    scheme_id: str = ""     # token following $RIREG on the $RFMT line
    num_reactants: int = 0  # first number of the $RXN counts line
    num_products: int = 0   # second number of the $RXN counts line
    reactants: List[Molecule] = field(default_factory=list)
    products: List[Molecule] = field(default_factory=list)
    variations: List[ReactionVariation] = field(default_factory=list)
111
+
112
+
113
+ # ---------------------------------------------------------------------------
114
+ # V3000 MOL block parsing
115
+ # ---------------------------------------------------------------------------
116
+
def parse_v3000_mol(lines: List[str]) -> tuple:
    """
    Parse a V3000 MOL block into atoms, bonds, and stereo collections.

    Only "M V30" records inside the ATOM, BOND and COLLECTION sections are
    interpreted; parsing stops at "END CTAB" or "M END".  The "M" and
    "V30"/"END" tokens may be separated by any run of whitespace, so both
    the two-space form used by the CTfile specification and a single-space
    form are accepted.

    Args:
        lines: Lines of the MOL block starting from the counts line
            (after name/formula/CAS header lines).

    Returns:
        (atoms, bonds, stereo_collections)

    Raises:
        ValueError: If a numeric field of an atom, bond, or collection
            record cannot be converted.
    """
    record_re = re.compile(r'M\s+V30\b\s*(.*)')
    end_re = re.compile(r'M\s+END')
    collection_re = re.compile(r'MDLV30/STE(\w+)\s+ATOMS=\((.+?)\)')

    def read_cfg(tokens):
        # The last CFG=n token wins; 0 means "not specified".
        cfg = 0
        for token in tokens:
            if token.startswith("CFG="):
                cfg = int(token.split("=")[1])
        return cfg

    atoms = []
    bonds = []
    stereo = []

    in_atom = False
    in_bond = False
    in_collection = False

    for line in lines:
        stripped = line.strip()

        # Section boundaries only toggle state; they carry no data.
        if "BEGIN ATOM" in stripped:
            in_atom = True
            continue
        elif "END ATOM" in stripped:
            in_atom = False
            continue
        elif "BEGIN BOND" in stripped:
            in_bond = True
            continue
        elif "END BOND" in stripped:
            in_bond = False
            continue
        elif "BEGIN COLLECTION" in stripped:
            in_collection = True
            continue
        elif "END COLLECTION" in stripped:
            in_collection = False
            continue
        elif "END CTAB" in stripped or end_re.fullmatch(stripped):
            break

        record = record_re.match(stripped)
        if record is None:
            continue
        payload = record.group(1)

        # Atom record: index symbol x y z aamap [CFG=n ...]
        if in_atom:
            parts = payload.split()
            if len(parts) >= 6:
                atoms.append(Atom(
                    index=int(parts[0]),
                    symbol=parts[1],
                    x=float(parts[2]),
                    y=float(parts[3]),
                    z=float(parts[4]),
                    cfg=read_cfg(parts[5:]),
                ))

        # Bond record: index order atom1 atom2 [CFG=n ...]
        elif in_bond:
            parts = payload.split()
            if len(parts) >= 4:
                bonds.append(Bond(
                    index=int(parts[0]),
                    order=int(parts[1]),
                    atom1=int(parts[2]),
                    atom2=int(parts[3]),
                    cfg=read_cfg(parts[4:]),
                ))

        # Stereo collection record: MDLV30/STEABS ATOMS=(1 9)
        elif in_collection:
            m = collection_re.match(payload.strip())
            if m:
                values = [int(tok) for tok in m.group(2).split()]
                # V3000 lists are written "(count v1 ... vcount)"; drop the
                # leading count so only real atom indices are stored.  The
                # check keeps lists that do not follow that layout intact.
                if values and values[0] == len(values) - 1:
                    values = values[1:]
                stereo.append(
                    StereoCollection(stereo_type=m.group(1), atom_indices=values)
                )

    return atoms, bonds, stereo
201
+
202
+
203
+ # ---------------------------------------------------------------------------
204
+ # RDF file parsing
205
+ # ---------------------------------------------------------------------------
206
+
def parse_rdf(filepath: str) -> List[Reaction]:
    """
    Read a SciFinder .rdf export and return its reaction records.

    The file is split at each line starting with $RFMT; the text before the
    first $RFMT is the file header and is only searched for the $DATM date.
    For every record that contains a $RXN line, the counts line gives the
    number of reactants and products, the $MOL blocks are parsed in order
    (reactants first, then products), and the $DTYPE/$DATUM metadata is
    attached as variations.  Records without a $RXN line are skipped.

    Args:
        filepath: Path to the .rdf file.

    Returns:
        List of Reaction objects.
    """
    with open(filepath, "r", encoding="utf-8", errors="replace") as handle:
        content = handle.read()

    # The $DATM date lives in the header and is copied to every reaction.
    header_date = re.search(r'\$DATM\s+(.+)', content)
    file_date = header_date.group(1).strip() if header_date else ""

    reactions = []

    # records[0] is the file header; every later element is one reaction.
    records = re.split(r'^\$RFMT\b', content, flags=re.MULTILINE)
    for record in records[1:]:
        rxn = Reaction(file_date=file_date)

        # The remainder of the $RFMT line may carry "$RIREG <id>".
        rireg = re.search(r'\$RIREG\s+(\S+)', record.split("\n")[0].strip())
        if rireg:
            rxn.scheme_id = rireg.group(1)

        rxn_header = re.search(r'\$RXN\s*\n', record)
        if rxn_header is None:
            continue

        # Counts line: first line after $RXN that starts with two integers.
        for raw in record[rxn_header.end():].split("\n"):
            candidate = raw.strip()
            if candidate and re.match(r'^\d+\s+\d+', candidate):
                counts = candidate.split()
                rxn.num_reactants = int(counts[0])
                rxn.num_products = int(counts[1])
                break

        # Element 0 is the RXN header; the following elements are MOL blocks.
        mol_blocks = re.split(r'^\$MOL\s*$', record, flags=re.MULTILINE)
        expected = rxn.num_reactants + rxn.num_products

        for position, block_text in enumerate(mol_blocks[1:expected + 1], start=1):
            mol_lines = block_text.strip().split("\n")

            mol = Molecule()

            # Header lines: name, formula, CAS/copyright.
            if mol_lines:
                mol.name = mol_lines[0].strip()
            if len(mol_lines) > 1:
                mol.formula = mol_lines[1].strip()
            if len(mol_lines) > 2:
                cas = re.match(r'([\d-]+)', mol_lines[2].strip())
                if cas:
                    mol.cas = cas.group(1)

            # Reactant blocks come first, product blocks after them.
            is_reactant = position <= rxn.num_reactants
            mol.role = "reactant" if is_reactant else "product"

            # The CTAB starts on the line after the one containing "V3000".
            ctab_start = next(
                (i for i, text in enumerate(mol_lines) if "V3000" in text),
                None,
            )
            if ctab_start is not None:
                mol.atoms, mol.bonds, mol.stereo = parse_v3000_mol(
                    mol_lines[ctab_start + 1:]
                )

            (rxn.reactants if is_reactant else rxn.products).append(mol)

        # Attach $DTYPE / $DATUM metadata.
        _parse_dtype_datum(record, rxn)

        reactions.append(rxn)

    return reactions
331
+
332
+
def _parse_dtype_datum(text: str, rxn: Reaction):
    """
    Parse all $DTYPE/$DATUM pairs from a reaction record and populate
    the reaction's variation data (reagents, catalysts, solvents,
    yield, references, conditions).

    Handles multiline $DATUM values: non-blank lines after $DATUM that do
    not start with "$" are joined to the value with single spaces.  A
    $DTYPE that is followed by another "$" directive before any $DATUM is
    recorded with an empty value, and the following directive is parsed
    normally.

    Only keys of the form RXN:VAR(n):... are used.  One ReactionVariation
    is created per distinct n and the variations are appended to
    ``rxn.variations`` in ascending n; nothing is appended when the record
    has no such keys.

    Args:
        text: Text of one reaction record.
        rxn: Reaction whose ``variations`` list is extended in place.
    """
    lines = text.split("\n")
    total = len(lines)

    # Pass 1: collect (dtype, datum) pairs.
    dtype_datum_pairs = []
    i = 0
    while i < total:
        line = lines[i].strip()
        i += 1
        if not line.startswith("$DTYPE"):
            continue

        dtype = line[6:].strip()
        datum_lines = []
        while i < total:
            candidate = lines[i].strip()
            if candidate.startswith("$DATUM"):
                datum_lines.append(candidate[6:].strip())
                i += 1
                # Continuation: lines that don't start with $ belong to this datum.
                while i < total:
                    cont = lines[i].strip()
                    if cont.startswith("$"):
                        break
                    if cont:
                        datum_lines.append(cont)
                    i += 1
                break
            if candidate.startswith("$"):
                # Another directive arrived before any $DATUM.  Leave it for
                # the outer loop so its own $DATUM is not attributed to this
                # $DTYPE.
                break
            i += 1
        dtype_datum_pairs.append((dtype, " ".join(datum_lines)))

    # Pass 2: route each RXN:VAR(n):... pair into its variation.
    variations = {}  # var_num -> ReactionVariation, created on first use

    # key prefix -> (ReactionVariation list attribute, ReagentEntry role)
    entry_kinds = (
        ("RGT", "reagents", "reagent"),
        ("CAT", "catalysts", "catalyst"),
        ("SOL", "solvents", "solvent"),
    )

    for dtype, datum in dtype_datum_pairs:
        # Reactant/product keys (RXN:RCT(n):..., RXN:PRO(n):...) are not
        # variation data and are ignored here.
        var_match = re.match(r'RXN:VAR\((\d+)\):(.+)', dtype)
        if not var_match:
            continue

        var_num = int(var_match.group(1))
        var_key = var_match.group(2)

        if var_num not in variations:
            variations[var_num] = ReactionVariation()
        var = variations[var_num]

        # Yield: PRO(n):YIELD
        if re.match(r'PRO\(\d+\):YIELD', var_key):
            try:
                var.yield_pct = float(datum)
            except ValueError:
                var.yield_pct = None
            continue

        # CAS Reaction Number
        if var_key == "CAS_Reaction_Number":
            var.cas_reaction_number = datum
            continue

        # Steps / Stages: keep the default when the value is not an integer.
        if var_key in ("STEPS", "STAGES"):
            try:
                count = int(datum)
            except ValueError:
                pass
            else:
                if var_key == "STEPS":
                    var.steps = count
                else:
                    var.stages = count
            continue

        # Reagents / catalysts / solvents: RGT(n):CAS_RN, CAT(n):CAS_RN, SOL(n):CAS_RN
        entry_kind = next(
            (
                (attr, role)
                for prefix, attr, role in entry_kinds
                if re.match(prefix + r'\((\d+)\):CAS_RN', var_key)
            ),
            None,
        )
        if entry_kind is not None:
            attr, role = entry_kind
            getattr(var, attr).append(ReagentEntry(cas=datum, role=role))
            continue

        # References: REFERENCE(n):TITLE / AUTHOR / CITATION
        ref_match = re.match(r'REFERENCE\((\d+)\):(\w+)', var_key)
        if ref_match:
            ref_num = int(ref_match.group(1))
            if ref_num < 1:
                # Reference numbers are used as 1-based positions; 0 would
                # index from the end of the list (or fail on an empty one).
                continue
            ref_field = ref_match.group(2).upper()
            # Grow the list so that position ref_num exists.
            while len(var.references) < ref_num:
                var.references.append(Reference())
            ref = var.references[ref_num - 1]
            if ref_field == "TITLE":
                ref.title = datum
            elif ref_field == "AUTHOR":
                ref.authors = datum
            elif ref_field == "CITATION":
                ref.citation = datum
            continue

        # Conditions: COND(n):<key>
        cond_match = re.match(r'COND\((\d+)\):(.+)', var_key)
        if cond_match:
            var.conditions[cond_match.group(2)] = datum
            continue

        # Bare condition keys used by some exports.
        if var_key in ("TEMP", "TIME", "PRESSURE", "PH", "ATMOSPHERE"):
            var.conditions[var_key] = datum
            continue

    # Add all variations sorted by variation number.
    rxn.variations.extend(variations[num] for num in sorted(variations))
469
+
470
+
471
+ # ---------------------------------------------------------------------------
472
+ # CAS resolution (optional, delegates to cas_resolver.py)
473
+ # ---------------------------------------------------------------------------
474
+
def resolve_cas_numbers(reaction: Reaction) -> None:
    """
    Look up every reagent, catalyst, and solvent CAS number of a reaction.

    Each entry with a non-empty CAS number is passed to
    ``resolve_cas``; when a truthy result comes back, the entry's name,
    MW, formula, and SMILES are overwritten from it.  If the resolver
    module cannot be imported, a warning is written to stderr and the
    reaction is left untouched.
    """
    try:
        from ..resolve.cas_resolver import resolve_cas
    except ImportError:
        print("Warning: cas_resolver.py not found. Skipping CAS resolution.",
              file=sys.stderr)
        return

    for variation in reaction.variations:
        # Same visiting order as the three lists: reagents, catalysts, solvents.
        entries = variation.reagents + variation.catalysts + variation.solvents
        for entry in entries:
            if not entry.cas:
                continue
            info = resolve_cas(entry.cas)
            if not info:
                continue
            entry.name = info.get("name", "")
            entry.mw = info.get("mw")
            entry.formula = info.get("formula", "")
            entry.smiles = info.get("smiles", "")
498
+
499
+
500
+ # ---------------------------------------------------------------------------
501
+ # Output formatting
502
+ # ---------------------------------------------------------------------------
503
+
def reaction_to_dict(rxn: Reaction) -> Dict[str, Any]:
    """Build the JSON-ready dictionary for a Reaction.

    Optional values (atom/bond cfg, stereo, resolved reagent details,
    reference fields, yield, and empty per-variation lists) are left out of
    the output when they are unset or empty.
    """

    def with_cfg(record, cfg):
        # "cfg" is only emitted when it is non-zero, and always as the last key.
        if cfg:
            record["cfg"] = cfg
        return record

    def mol_to_dict(mol: Molecule) -> Dict[str, Any]:
        atom_records = [
            with_cfg(
                {"index": a.index, "symbol": a.symbol, "x": a.x, "y": a.y, "z": a.z},
                a.cfg,
            )
            for a in mol.atoms
        ]
        bond_records = [
            with_cfg(
                {"index": b.index, "order": b.order, "atom1": b.atom1, "atom2": b.atom2},
                b.cfg,
            )
            for b in mol.bonds
        ]
        record = {
            "name": mol.name,
            "formula": mol.formula,
            "cas": mol.cas,
            "role": mol.role,
            "atom_count": len(mol.atoms),
            "bond_count": len(mol.bonds),
            "atoms": atom_records,
            "bonds": bond_records,
        }
        if mol.stereo:
            record["stereo"] = [
                {"type": s.stereo_type, "atoms": s.atom_indices}
                for s in mol.stereo
            ]
        return record

    def entry_to_dict(e: ReagentEntry) -> Dict[str, Any]:
        record = {"cas": e.cas, "role": e.role}
        for key in ("name", "mw", "formula", "smiles"):
            value = getattr(e, key)
            # mw is kept whenever it is set (even 0); the strings only when non-empty.
            keep = value is not None if key == "mw" else bool(value)
            if keep:
                record[key] = value
        return record

    def ref_to_dict(r: Reference) -> Dict[str, Any]:
        candidates = (
            ("title", r.title),
            ("authors", r.authors),
            ("citation", r.citation),
        )
        return {key: value for key, value in candidates if value}

    def variation_to_dict(var):
        record = {}
        if var.cas_reaction_number:
            record["cas_reaction_number"] = var.cas_reaction_number
        record["steps"] = var.steps
        record["stages"] = var.stages
        if var.yield_pct is not None:
            record["yield_pct"] = var.yield_pct
        for key, entries in (
            ("reagents", var.reagents),
            ("catalysts", var.catalysts),
            ("solvents", var.solvents),
        ):
            if entries:
                record[key] = [entry_to_dict(e) for e in entries]
        if var.references:
            record["references"] = [ref_to_dict(r) for r in var.references]
        if var.conditions:
            record["conditions"] = var.conditions
        return record

    return {
        "file_date": rxn.file_date,
        "scheme_id": rxn.scheme_id,
        "num_reactants": rxn.num_reactants,
        "num_products": rxn.num_products,
        "reactants": [mol_to_dict(m) for m in rxn.reactants],
        "products": [mol_to_dict(m) for m in rxn.products],
        "variations": [variation_to_dict(v) for v in rxn.variations],
    }
597
+
598
+
599
+ # ---------------------------------------------------------------------------
600
+ # CLI
601
+ # ---------------------------------------------------------------------------
602
+
def main(argv=None) -> int:
    """Command-line entry point: parse an .rdf file and emit JSON.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv.

    Returns:
        1 when the file yields no reactions, otherwise 0.
    """
    cli = argparse.ArgumentParser(
        description="Parse SciFinder .rdf reaction exports into structured JSON."
    )
    cli.add_argument("rdf_file", help="Path to the SciFinder .rdf file")
    cli.add_argument(
        "--output", "-o",
        help="Output JSON file (default: print to stdout)",
    )
    cli.add_argument(
        "--resolve-cas",
        action="store_true",
        help="Resolve reagent/catalyst/solvent CAS numbers via PubChem "
             "(requires cas_resolver.py)",
    )
    cli.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output",
    )
    options = cli.parse_args(argv)

    reactions = parse_rdf(options.rdf_file)
    if not reactions:
        print(f"No reactions found in {options.rdf_file}", file=sys.stderr)
        return 1

    # Status messages go to stderr so stdout carries only the JSON.
    print(f"Parsed {len(reactions)} reaction(s) from {options.rdf_file}",
          file=sys.stderr)

    if options.resolve_cas:
        for rxn in reactions:
            resolve_cas_numbers(rxn)

    # A single reaction is emitted as an object, several as a list.
    converted = [reaction_to_dict(r) for r in reactions]
    payload = converted[0] if len(converted) == 1 else converted
    json_str = json.dumps(
        payload,
        indent=2 if options.pretty else None,
        ensure_ascii=False,
    )

    if not options.output:
        print(json_str)
        return 0

    with open(options.output, "w", encoding="utf-8") as out:
        out.write(json_str + "\n")
    print(f"Written to {options.output}", file=sys.stderr)
    return 0
661
+
662
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())