cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1045 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ reactant_heuristic.py — Classify reaction reagents as atom-contributing
4
+ or non-contributing using Schneider fingerprint scoring + reagent-DB role lookup.
5
+
6
+ Two input modes:
7
+ cdxml Parse a CDXML reaction file; extract fragments + text from <step>
8
+ smiles Accept reagent SMILES + product SMILES directly on the CLI
9
+
10
+ Examples:
11
+ python reactant_heuristic.py cdxml -i reaction.cdxml --pretty
12
+ python reactant_heuristic.py smiles --reagents "C1COCCN1" "c1cc2scnc2Br" \\
13
+ --product "c1cc2scnc2N1CCOCC1" --pretty
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import sys
20
+ import tempfile
21
+ from dataclasses import dataclass, field, asdict
22
+ from typing import Any, Dict, List, Optional, Tuple
23
+ from xml.etree import ElementTree as ET
24
+
25
+ from ..constants import CDXML_FOOTER, CDXML_MINIMAL_HEADER
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Data classes
30
+ # ---------------------------------------------------------------------------
31
+
32
@dataclass
class ReagentInfo:
    """One reagent plus the result of classifying it.

    All fields have defaults, so an instance can be created empty and
    populated step by step. Field order is part of the positional
    constructor signature and must not be rearranged.
    """

    source_id: str = ""
    # One of: "fragment", "text", "smiles_input"
    source_type: str = ""
    name: Optional[str] = None
    smiles: Optional[str] = None
    # One of: "reactant", "above_arrow", "below_arrow"
    position: str = ""
    # One of: "atom_contributing", "non_contributing", "unclassified"
    classification: str = ""
    # How the classification was reached: "schneider_fp", "role_lookup",
    # "fm_type", etc.
    classification_method: str = ""
    mcs_ratio: Optional[float] = None
    # Deprecated — kept for compatibility.
    rxnmapper_confidence: Optional[float] = None
    # Schneider FP combination score.
    schneider_score: Optional[float] = None
    # Semantic role: "catalyst", "ligand", "base", "solvent", etc.
    role: Optional[str] = None
46
+
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Role Lookup (Tier 2 — semantic role enrichment) — via shared reagent database
50
+ # ---------------------------------------------------------------------------
51
+
52
+ from ..resolve.reagent_db import get_reagent_db
53
+
54
# Atomic numbers of transition metals commonly used as catalysts.
CATALYST_METALS = {
    46,  # Pd
    28,  # Ni
    29,  # Cu
    77,  # Ir
    45,  # Rh
    44,  # Ru
    78,  # Pt
    76,  # Os
    79,  # Au
}
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # CDXML Parsing Helpers (adapted from cdxml_combiner.py)
61
+ # ---------------------------------------------------------------------------
62
+
63
def _get_page(root: ET.Element) -> ET.Element:
    """Return the direct ``<page>`` child of a CDXML root element.

    Raises:
        SystemExit: if *root* has no ``<page>`` child.
    """
    page = root.find("page")
    if page is not None:
        return page
    raise SystemExit("ERROR: no <page> element in CDXML")
69
+
70
+
71
def _count_heavy_atoms(frag: ET.Element) -> int:
    """Count non-hydrogen atoms in a fragment.

    Walks every ``<n>`` node below *frag* (including nodes of nested
    fragments) and counts it unless:

    * its ``NodeType`` is ``ExternalConnectionPoint``, ``Fragment`` or
      ``Unspecified`` (placeholders rather than atoms), or
    * its ``Element`` attribute is ``"1"``, i.e. an explicitly drawn
      hydrogen. Nodes without an ``Element`` attribute are counted.

    Args:
        frag: the ``<fragment>`` element to inspect.

    Returns:
        Number of heavy-atom nodes found.
    """
    non_atom_types = ("ExternalConnectionPoint", "Fragment", "Unspecified")
    return sum(
        1
        for node in frag.iter("n")
        if node.get("NodeType") not in non_atom_types
        # Explicit hydrogens are not heavy atoms.
        and (node.get("Element") or "").strip() != "1"
    )
80
+
81
+
82
def _get_text_content(el: ET.Element) -> str:
    """Return the text of every ``<s>`` descendant of *el*, space-joined.

    Each run is stripped individually; runs with no text are skipped.
    The joined result is stripped once more before being returned.
    """
    pieces = [run.text.strip() for run in el.iter("s") if run.text]
    return " ".join(pieces).strip()
89
+
90
+
91
def _get_fm_molecule_type(el: ET.Element) -> Optional[int]:
    """Read the Findmolecule ``FM MOLECULE TYPE`` objecttag below *el*.

    Values: 0=molecule, 1=solvent, 2=condition text, 3=product.

    Only the first matching ``<objecttag>`` is consulted. Returns its
    ``Value`` as an int, or ``None`` when no such tag exists or the value
    is missing or not an integer.
    """
    for tag in el.iter("objecttag"):
        if tag.get("Name") != "FM MOLECULE TYPE":
            continue
        raw_value = tag.get("Value", "")
        try:
            return int(raw_value)
        except ValueError:
            return None
    return None
101
+
102
+
103
def _attrs_to_str(el: ET.Element) -> str:
    """Serialise the attributes of *el* as ``key="value"`` pairs.

    Pairs are space-separated and keep the element's attribute order.
    In values, ``&``, ``"`` and ``<`` are replaced by XML entities.
    """
    def _escape(value: str) -> str:
        # "&" must be replaced first so the entities added below survive.
        value = value.replace("&", "&amp;")
        value = value.replace('"', "&quot;")
        return value.replace("<", "&lt;")

    return " ".join(
        f'{key}="{_escape(value)}"' for key, value in el.attrib.items()
    )
109
+
110
+
111
def _element_to_string(el: ET.Element) -> str:
    """Recursively serialise *el* and its children to an XML string.

    An element with no children and no non-blank text is written in
    self-closing form. Text is emitted only when it is not pure
    whitespace, with ``&``, ``<`` and ``>`` escaped. Tail text is not
    emitted.
    """
    attr_text = _attrs_to_str(el)
    head = f"<{el.tag} {attr_text}" if attr_text else f"<{el.tag}"

    body_text = el.text or ""
    has_text = bool(body_text.strip())
    kids = list(el)

    if not kids and not has_text:
        return f"{head}/>"

    pieces = [f"{head}>"]
    if has_text:
        pieces.append(
            body_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        )
    pieces.extend(_element_to_string(kid) for kid in kids)
    pieces.append(f"</{el.tag}>")
    return "".join(pieces)
130
+
131
+
132
def _fragment_to_cdxml(frag: ET.Element) -> str:
    """Return a minimal CDXML document whose only page holds *frag*."""
    document_parts = (
        CDXML_MINIMAL_HEADER,
        "\n<page id=\"1\">\n",
        _element_to_string(frag),
        "\n</page>\n",
        CDXML_FOOTER,
    )
    return "".join(document_parts)
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # SMILES Extraction
143
+ # ---------------------------------------------------------------------------
144
+
145
# Lazy ChemScript singleton: the bridge is built at most once per process.
_cs_instance = None
# Set to True after the first construction attempt, successful or not.
_cs_tried = False


def _get_chemscript():
    """Return the shared ChemScriptBridge, or None if it cannot be created.

    Construction is attempted only on the first call. A failure is
    reported once on stderr and later calls return None without retrying.
    """
    global _cs_instance, _cs_tried
    if not _cs_tried:
        _cs_tried = True
        try:
            from ..chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_instance = ChemScriptBridge()
        except Exception as e:
            print(f" [warn] ChemScript not available: {e}", file=sys.stderr)
            _cs_instance = None
    return _cs_instance
163
+
164
+
165
def _fragment_to_smiles(frag: ET.Element) -> Optional[str]:
    """Convert a CDXML <fragment> to SMILES via ChemScript.

    The fragment is wrapped in a minimal CDXML document, written to a
    temporary ``.cdxml`` file and handed to the ChemScript bridge. The
    temporary file is removed afterwards, including when the write or the
    conversion fails.

    Args:
        frag: the ``<fragment>`` element to convert.

    Returns:
        The stripped SMILES string, or None when ChemScript is unavailable,
        the conversion fails, or the result is empty.
    """
    cs = _get_chemscript()
    if cs is None:
        return None
    cdxml_str = _fragment_to_cdxml(frag)
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".cdxml", mode="w",
                                         delete=False, encoding="utf-8") as f:
            # Record the path before writing: with delete=False the file
            # already exists, so a failed write must still be cleaned up.
            tmp_path = f.name
            f.write(cdxml_str)
        smiles = cs.write_data(tmp_path, "smiles")
        return smiles.strip() if smiles else None
    except Exception as e:
        print(f" [warn] fragment→SMILES failed: {e}", file=sys.stderr)
        return None
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass
            except OSError as e:
                # Cleanup is best-effort; never let it replace the result.
                print(f" [warn] could not remove temp file {tmp_path}: {e}",
                      file=sys.stderr)
185
+
186
+
187
def _text_to_smiles(text_content: str) -> Optional[str]:
    """Resolve a reagent name to SMILES.

    Resolvers are tried in order and the first truthy answer is returned:

    1. OPSIN via ``_opsin_name_to_smiles`` (offline).
    2. ``resolve_name_to_smiles`` from ``cas_resolver`` (online fallback).

    Any exception from the fallback is reported on stderr and turned into
    a None result.
    """
    opsin_hit = _opsin_name_to_smiles(text_content)
    if opsin_hit:
        return opsin_hit

    try:
        from ..resolve.cas_resolver import resolve_name_to_smiles
        fallback_hit = resolve_name_to_smiles(text_content)
    except Exception as e:
        print(f" [warn] name->SMILES failed for '{text_content}': {e}",
              file=sys.stderr)
        return None
    return fallback_hit
207
+
208
+
209
+ # ---------------------------------------------------------------------------
210
+ # OPSIN name resolution (offline)
211
+ # ---------------------------------------------------------------------------
212
+
213
# Tri-state OPSIN availability flag: None until the first attempt.
_opsin_available: Optional[bool] = None
# Cached result of _find_java().
_java_exe: Optional[str] = None


def _find_java() -> Optional[str]:
    """Locate a Java executable for OPSIN.

    Search order (first hit wins):

    1. ``java`` on PATH.
    2. ``$JAVA_HOME/bin/java.exe`` then ``$JAVA_HOME/bin/java``.
    3. A JRE found by ``_scan_for_jre`` under ``$CHEM_TEST_DATA``.
    4. A JRE found by ``_scan_for_jre`` under the project's default
       ``~/chem-test-data/OpenJDK21U-...`` directory.

    Returns:
        Path to the java binary, or None if none of the above exists.
    """
    import shutil

    on_path = shutil.which("java")
    if on_path:
        return on_path

    java_home = os.environ.get("JAVA_HOME")
    if java_home:
        for exe_name in ("java.exe", "java"):
            candidate = os.path.join(java_home, "bin", exe_name)
            if os.path.isfile(candidate):
                return candidate

    test_data = os.environ.get("CHEM_TEST_DATA")
    if test_data:
        bundled = _scan_for_jre(test_data)
        if bundled:
            return bundled

    default_jre_dir = os.path.expanduser(
        os.path.join("~", "chem-test-data",
                     "OpenJDK21U-jre_x64_windows_hotspot_21.0.10_7"))
    if os.path.isdir(default_jre_dir):
        default_hit = _scan_for_jre(default_jre_dir)
        if default_hit:
            return default_hit

    return None
264
+
265
+
266
def _scan_for_jre(base_dir: str) -> Optional[str]:
    """Look for ``bin/java.exe`` or ``bin/java`` in *base_dir*.

    *base_dir* itself is checked first, then each of its immediate
    subdirectories in ``os.listdir`` order. ``java.exe`` is preferred over
    ``java`` at each location. Returns the first path found, or None
    (also when *base_dir* cannot be listed).
    """
    def _java_under(root: str) -> Optional[str]:
        for exe_name in ("java.exe", "java"):
            path = os.path.join(root, "bin", exe_name)
            if os.path.isfile(path):
                return path
        return None

    direct_hit = _java_under(base_dir)
    if direct_hit:
        return direct_hit

    try:
        for child in os.listdir(base_dir):
            child_path = os.path.join(base_dir, child)
            if not os.path.isdir(child_path):
                continue
            nested_hit = _java_under(child_path)
            if nested_hit:
                return nested_hit
    except OSError:
        pass
    return None
290
+
291
+
292
def _opsin_name_to_smiles(name: str) -> Optional[str]:
    """Try to resolve a chemical name to SMILES via OPSIN (offline).

    OPSIN handles systematic/IUPAC names and many common names well
    (e.g. "cesium carbonate", "triethylamine", "sodium tert-butoxide").
    Fails on abbreviations (BINAP, Pd2dba3) and some organometallics.

    Requires the py2opsin package. Java is made discoverable through
    ``ensure_java_on_path``; when that reports failure, ``_find_java`` is
    used and the directory of the java binary it finds is prepended to
    ``PATH`` (and ``JAVA_HOME`` set) unless that directory is already a
    ``PATH`` entry, so repeated calls do not keep growing ``PATH``.

    Once py2opsin or Java has been found missing, every later call returns
    None immediately.

    Returns:
        The SMILES string, or None if OPSIN is unavailable or cannot
        resolve *name*.
    """
    global _opsin_available, _java_exe
    if _opsin_available is False:
        return None
    try:
        import warnings
        from py2opsin import py2opsin

        # Ensure Java is discoverable by py2opsin's subprocess call.
        from cdxml_toolkit.resolve.jre_manager import ensure_java_on_path
        if not ensure_java_on_path():
            # Fall back to legacy _find_java for non-standard locations
            if _java_exe is None:
                _java_exe = _find_java()
            if _java_exe:
                java_bin_dir = os.path.dirname(_java_exe)
                current_path = os.environ.get("PATH", "")
                # Compare against whole PATH entries, not substrings of the
                # raw PATH string.
                known_dirs = {
                    os.path.normcase(entry)
                    for entry in current_path.split(os.pathsep)
                    if entry
                }
                if java_bin_dir and os.path.normcase(java_bin_dir) not in known_dirs:
                    os.environ["PATH"] = (
                        java_bin_dir + os.pathsep + current_path
                        if current_path else java_bin_dir
                    )
                    os.environ["JAVA_HOME"] = os.path.dirname(java_bin_dir)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            result = py2opsin(name)
        # Reaching this point means py2opsin imported and ran.
        _opsin_available = True
        return result if result else None
    except FileNotFoundError:
        if _opsin_available is None:
            print(" [info] OPSIN unavailable (Java not found)", file=sys.stderr)
        _opsin_available = False
        return None
    except ImportError:
        if _opsin_available is None:
            print(" [info] py2opsin not installed", file=sys.stderr)
        _opsin_available = False
        return None
    except Exception as e:
        print(f" [info] OPSIN name->SMILES failed for '{name}': {e}",
              file=sys.stderr)
        return None
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # Tier 2 — Role Lookup (semantic role enrichment)
348
+ # ---------------------------------------------------------------------------
349
+
350
def _contains_catalyst_metal(smiles: str) -> bool:
    """Return True if *smiles* contains an atom listed in CATALYST_METALS.

    Returns False when the SMILES cannot be parsed or when anything goes
    wrong, including RDKit not being importable.
    """
    try:
        from rdkit import Chem
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False
        for atom in mol.GetAtoms():
            if atom.GetAtomicNum() in CATALYST_METALS:
                return True
        return False
    except Exception:
        return False
360
+
361
+
362
def _is_inorganic(smiles: str) -> bool:
    """Heuristic test for an inorganic species.

    True when the molecule has no carbon atoms, or exactly one carbon and
    at least four heavy atoms (likely carbonate, cyanide, etc.).
    Returns False for unparsable SMILES and on any error, including RDKit
    not being importable.
    """
    try:
        from rdkit import Chem
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False
        carbon_count = 0
        for atom in mol.GetAtoms():
            if atom.GetAtomicNum() == 6:
                carbon_count += 1
        heavy_count = mol.GetNumHeavyAtoms()
        return carbon_count == 0 or (carbon_count == 1 and heavy_count >= 4)
    except Exception:
        return False
379
+
380
+
381
def role_lookup(smiles: Optional[str], name: Optional[str]
                ) -> Optional[Tuple[str, str]]:
    """Look up the semantic role of a reagent.

    Checks, in order, stopping at the first hit:

    1. Exact SMILES match in the reagent DB       -> method "role_lookup"
    2. Stereo-stripped SMILES match in the DB     -> "role_lookup_no_stereo"
       (extracted SMILES may lack E/Z on double bonds, e.g. DEAD's N=N)
    3. Name match in the reagent DB               -> "role_lookup"
    4. Contains a catalyst metal                  -> ("catalyst", "metal_check")
    5. Looks inorganic          -> ("inorganic_salt", "inorganic_check")

    Returns:
        ``(role, method)`` or None when nothing matches.
    """
    db = get_reagent_db()

    if smiles:
        exact_role = db.role_for_smiles(smiles)
        if exact_role:
            return (exact_role, "role_lookup")

        flat_role = _role_for_smiles_no_stereo(smiles, db)
        if flat_role:
            return (flat_role, "role_lookup_no_stereo")

    if name:
        name_role = db.role_for_name(name)
        if name_role:
            return (name_role, "role_lookup")

    if smiles:
        if _contains_catalyst_metal(smiles):
            return ("catalyst", "metal_check")
        if _is_inorganic(smiles):
            return ("inorganic_salt", "inorganic_check")

    return None
415
+
416
+
417
def _role_for_smiles_no_stereo(smiles: str, db) -> Optional[str]:
    """Match *smiles* against the DB after stripping stereochemistry.

    Both the query and every key of ``db._by_smiles`` are parsed, stripped
    of stereochemistry and compared as SMILES written by RDKit. The
    ``"role"`` of the first matching entry is returned. Returns None when
    nothing matches, the query cannot be parsed, or any error occurs
    (including RDKit not being importable).
    """
    try:
        from rdkit import Chem

        def _flatten(smi):
            parsed = Chem.MolFromSmiles(smi)
            if parsed is None:
                return None
            Chem.RemoveStereochemistry(parsed)
            return Chem.MolToSmiles(parsed)

        query = _flatten(smiles)
        if query is None:
            return None

        for db_smiles, entry in db._by_smiles.items():
            db_flat = _flatten(db_smiles)
            if db_flat is not None and db_flat == query:
                return entry.get("role")
    except Exception:
        pass
    return None
439
+
440
+
441
+ # ---------------------------------------------------------------------------
442
+ # RDKit MCS (legacy — kept for alignment use; no longer a classification tier)
443
+ # ---------------------------------------------------------------------------
444
+
445
def mcs_ratio(reagent_smiles: str, product_smiles: str) -> Optional[float]:
    """Compute MCS heavy-atom ratio: MCS_atoms / reagent_heavy_atoms.

    NOTE: No longer used for classification (replaced by Schneider FP).
    Kept because alignment.py may call it for 2D coordinate matching.

    Returns:
        The ratio; ``0.0`` when the MCS search is cancelled or finds no
        atoms; None when either SMILES is unparsable, the reagent has no
        heavy atoms, or any error occurs (reported on stderr).
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdFMCS

        reagent = Chem.MolFromSmiles(reagent_smiles)
        product = Chem.MolFromSmiles(product_smiles)
        if reagent is None or product is None:
            return None

        n_reagent_heavy = reagent.GetNumHeavyAtoms()
        if n_reagent_heavy == 0:
            return None

        mcs = rdFMCS.FindMCS(
            [reagent, product],
            atomCompare=rdFMCS.AtomCompare.CompareElements,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            ringMatchesRingOnly=True,
            completeRingsOnly=True,
            timeout=10,
        )

        if mcs.canceled or mcs.numAtoms == 0:
            return 0.0
        return mcs.numAtoms / n_reagent_heavy

    except Exception as e:
        print(f" [warn] MCS failed: {e}", file=sys.stderr)
        return None
481
+
482
+
483
+ # ---------------------------------------------------------------------------
484
+ # Tier 1 — Schneider FP-based reaction role assignment
485
+ # ---------------------------------------------------------------------------
486
+ # Implements the algorithm from Schneider et al., JCIM 2016:
487
+ # "What's What: The (Nearly) Definitive Guide to Reaction Role Assignment"
488
+ #
489
+ # Context-aware: considers the specific product to determine which candidates
490
+ # are atom-contributing (reactants) vs non-contributing (reagents).
491
+
492
+ # Common reagents mined from 1.3M USPTO patent reactions (appear in >1000
493
+ # reactions across >100 reaction types). Canonical SMILES.
494
# Cache for _get_common_reagents(); None until first built.
_SCHNEIDER_COMMON_REAGENTS: Optional[set] = None


def _get_common_reagents() -> set:
    """Return the cached set of common-reagent SMILES, building it once.

    Each raw SMILES is parsed and re-written by RDKit so membership tests
    can compare RDKit-written SMILES. If RDKit cannot be imported the
    cache is an empty set.
    """
    global _SCHNEIDER_COMMON_REAGENTS
    if _SCHNEIDER_COMMON_REAGENTS is not None:
        return _SCHNEIDER_COMMON_REAGENTS

    try:
        from rdkit import Chem
    except ImportError:
        _SCHNEIDER_COMMON_REAGENTS = set()
        return _SCHNEIDER_COMMON_REAGENTS

    solvents = (
        "ClCCl", "C(Cl)(Cl)Cl", "CS(C)=O", "CCOC(C)=O", "CC#N",
        "C1CCOC1", "C1COCCO1", "CO", "CCO", "CC(C)=O",
        "c1ccncc1", "CN(C)C=O", "c1ccccc1", "Cc1ccccc1", "CCOCC",
        "CC(C)O", "ClC(Cl)Cl", "O", "CC(=O)O",
    )
    bases = ("CCN(CC)CC", "CN(C)C")
    ions_and_salts = (
        "[Na+]", "[K+]", "[Li+]", "[Cs+]",
        "[OH-]", "[Cl-]", "[Br-]", "[I-]", "[F-]", "[H-]",
        "[NH4+]", "O=C([O-])[O-]", "O=S([O-])([O-])=O",
    )
    catalyst_metals = ("[Pd]", "[Pt]", "[Ni]")

    parsed = (
        Chem.MolFromSmiles(smi)
        for smi in solvents + bases + ions_and_salts + catalyst_metals
    )
    canonical = {Chem.MolToSmiles(mol) for mol in parsed if mol}

    _SCHNEIDER_COMMON_REAGENTS = canonical
    return canonical
530
+
531
+
532
def _is_schneider_common_reagent(mol) -> bool:
    """Return True if *mol* is a common reagent.

    True when the SMILES of the whole molecule is in the common-reagent
    set, or when the molecule has more than one disconnected fragment and
    every fragment is in that set.
    """
    from rdkit import Chem
    common = _get_common_reagents()

    if Chem.MolToSmiles(mol) in common:
        return True

    pieces = Chem.GetMolFrags(mol, asMols=True)
    if len(pieces) <= 1:
        return False
    for piece in pieces:
        if Chem.MolToSmiles(piece) not in common:
            return False
    return True
543
+
544
+
545
def _schneider_fp(mol, scaffold: bool = False):
    """Return a count-based Morgan fingerprint (radius 1) as a dict.

    With ``scaffold=True`` the generator uses Morgan atom invariants
    created with ``includeRingMembership=False``; otherwise no custom
    invariant generator is supplied.
    """
    from rdkit.Chem import rdFingerprintGenerator as rfg

    if scaffold:
        invariants = rfg.GetMorganAtomInvGen(includeRingMembership=False)
    else:
        invariants = None

    generator = rfg.GetMorganGenerator(
        radius=1,
        atomInvariantsGenerator=invariants,
    )
    counts = generator.GetCountFingerprint(mol)
    return dict(counts.GetNonzeroElements())
556
+
557
+
558
def _schneider_sum_fps(fps):
    """Add several count fingerprints (dicts) key by key.

    Returns a new dict whose keys appear in first-seen order.
    """
    totals = {}
    for fingerprint in fps:
        for bit, count in fingerprint.items():
            totals[bit] = totals.get(bit, 0) + count
    return totals
566
+
567
+
568
def _schneider_score(prod_fp: dict, react_fp: dict) -> float:
    """Score how well a reactant fingerprint explains the product's.

    * Coverage term: ``1 - missing / total``, where ``missing`` is the
      product count not matched by the reactants.
    * Leaving-group penalty: ``0.5 * sqrt(excess / total)``, where
      ``excess`` is the reactant count absent from the product.

    ``total`` is the sum of the product counts. The result is clamped at
    0.0; 0.0 is also returned when both fingerprints are empty or the
    product total is zero.
    """
    all_bits = set(prod_fp) | set(react_fp)
    prod_total = sum(prod_fp.values())
    if not all_bits or prod_total == 0:
        return 0.0

    missing = 0
    excess = 0
    for bit in all_bits:
        in_product = prod_fp.get(bit, 0)
        in_reactants = react_fp.get(bit, 0)
        if in_product > in_reactants:
            missing += in_product - in_reactants
        elif in_reactants > in_product:
            excess += in_reactants - in_product

    coverage = 1.0 - missing / prod_total
    penalty = 0.5 * (excess / prod_total) ** 0.5
    return max(0.0, coverage - penalty)
581
+
582
+
583
def _schneider_classify(reagents: List[ReagentInfo],
                        product_smiles: str) -> None:
    """Tier 1: Schneider FP-based reaction role assignment.

    Looks for the combination of still-unclassified reagents (one to five
    of them) whose summed Morgan fingerprints best explain the product's,
    then marks the members of that combination ``atom_contributing`` and
    the other candidates ``non_contributing``. Reagents left without a
    classification afterwards are marked ``unclassified``.

    Returns without touching anything when the product SMILES is empty or
    unparsable, the product has no heavy atoms, RDKit is missing, or no
    reagent is a usable candidate.

    Modifies *reagents* in place and prints a summary on stderr.
    """
    if not product_smiles:
        return

    try:
        from rdkit import Chem
    except ImportError:
        return

    import itertools

    # The product may contain fragments separated by '.'.
    product = Chem.MolFromSmiles(product_smiles)
    if product is None:
        return

    prod_fp_d = _schneider_fp(product, scaffold=False)
    prod_fp_s = _schneider_fp(product, scaffold=True)
    total_prod_atoms = product.GetNumHeavyAtoms()
    if total_prod_atoms == 0:
        return

    # Candidates: not yet classified and with a parseable SMILES.
    candidates = []
    for reagent in reagents:
        if reagent.classification or not reagent.smiles:
            continue
        mol = Chem.MolFromSmiles(reagent.smiles)
        if mol is None:
            continue
        candidates.append({
            "reagent": reagent,
            "mol": mol,
            "fp_d": _schneider_fp(mol, scaffold=False),
            "fp_s": _schneider_fp(mol, scaffold=True),
            "n_atoms": mol.GetNumHeavyAtoms(),
            "is_common": _is_schneider_common_reagent(mol),
        })

    if not candidates:
        return

    def _find_best(pool):
        """Return (best combination, its score) over *pool*."""
        top_score, top_combo = -1.0, None
        pool_size = len(pool)
        # Too many candidates would make the search explode; give up.
        if pool_size == 0 or pool_size > 18:
            return top_combo, top_score
        for size in range(1, min(pool_size + 1, 6)):  # max 5 reactants
            for combo in itertools.combinations(pool, size):
                combo_atoms = sum(member["n_atoms"] for member in combo)
                # Skip combinations far too small or far too large.
                if (combo_atoms < total_prod_atoms * 0.5
                        or combo_atoms > total_prod_atoms * 6):
                    continue
                combo_fp_d = _schneider_sum_fps([m["fp_d"] for m in combo])
                combo_fp_s = _schneider_sum_fps([m["fp_s"] for m in combo])
                score = (_schneider_score(prod_fp_d, combo_fp_d) +
                         _schneider_score(prod_fp_s, combo_fp_s))
                if score > top_score:
                    top_score, top_combo = score, combo
        return top_combo, top_score

    # Phase 1: search without the common reagents.
    uncommon = [c for c in candidates if not c["is_common"]]
    best_combo, best_score = _find_best(uncommon)

    # Phase 2: if that gave nothing good, search all candidates.
    if best_combo is None or best_score < 0.5:
        wider_combo, wider_score = _find_best(candidates)
        if wider_score > best_score:
            best_combo, best_score = wider_combo, wider_score

    # Identify the winners by object identity.
    if best_combo:
        winners = {id(member["reagent"]) for member in best_combo}
    else:
        winners = set()

    rounded_score = round(best_score, 4)
    for cand in candidates:
        reagent = cand["reagent"]
        if id(reagent) in winners:
            reagent.classification = "atom_contributing"
        else:
            reagent.classification = "non_contributing"
        reagent.classification_method = "schneider_fp"
        reagent.schneider_score = rounded_score

    # Anything still unclassified (e.g. no SMILES) is marked explicitly.
    for reagent in reagents:
        if not reagent.classification:
            reagent.classification = "unclassified"
            reagent.classification_method = "none"

    n_reactants = sum(
        1 for c in candidates
        if c["reagent"].classification == "atom_contributing")
    n_reagents = sum(
        1 for c in candidates
        if c["reagent"].classification == "non_contributing")
    print(f" Schneider FP classification (score={best_score:.3f}): "
          f"{n_reactants} "
          f"reactant(s), "
          f"{n_reagents} "
          f"reagent(s)",
          file=sys.stderr)
692
+
693
+
694
+ # ---------------------------------------------------------------------------
695
+ # Main Classification Logic
696
+ # ---------------------------------------------------------------------------
697
+
698
def classify_reagents(reagents: List[ReagentInfo],
                      product_smiles: str,
                      mcs_threshold: float = 0.3,
                      use_rxnmapper: bool = True) -> List[ReagentInfo]:
    """Classify each reagent using a two-tier strategy.

    Tier 1: Schneider FP scoring decides atom_contributing versus
            non_contributing, taking the product into account.
    Tier 2: Reagent DB lookup adds a semantic role ('base', 'catalyst',
            'solvent', ...) to non-contributing reagents without one.

    Tier 2 only sets ``role``; it never changes the Tier 1 classification.

    Args:
        reagents: reagents to classify; modified in place.
        product_smiles: SMILES of the reaction product.
        mcs_threshold: deprecated, ignored (kept for API compat)
        use_rxnmapper: deprecated, ignored (kept for API compat)

    Returns:
        The same *reagents* list.
    """
    # Tier 1: binary classification against the product.
    _schneider_classify(reagents, product_smiles)

    # Tier 2: role enrichment for non-contributing species.
    for reagent in reagents:
        if reagent.classification != "non_contributing" or reagent.role:
            continue
        hit = role_lookup(reagent.smiles, reagent.name)
        if hit:
            role, _method = hit
            reagent.role = role  # "base", "catalyst", "solvent", etc.

    return reagents
727
+
728
+
729
def _try_rxnmapper_classification(reagents: List[ReagentInfo],
                                  product_smiles: str) -> None:
    """Tier 1.5: Use RXNMapper atom maps to classify unclassified reagents.

    Builds ``reagent1.reagent2...>>product`` from every unclassified
    reagent with SMILES, passes it to ``classify_roles`` from
    ``experiments.atom_mapping.rxn_atom_mapper``, and copies the
    per-component ``atom_contributing`` verdicts back onto the reagents,
    matched by canonical SMILES when RDKit is available.

    Modifies *reagents* in place. Returns without changes when the product
    SMILES is empty, nothing is unclassified, the mapper is unavailable or
    fails, or it reports no components.
    """
    if not product_smiles:
        return

    pending = [r for r in reagents if not r.classification and r.smiles]
    if not pending:
        return

    # Reaction SMILES: all pending reagents >> product
    rxn_smi = ".".join([r.smiles for r in pending]) + ">>" + product_smiles

    try:
        from experiments.atom_mapping.rxn_atom_mapper import classify_roles
    except ImportError:
        # rxn_atom_mapper not available — skip silently
        return

    try:
        mapping = classify_roles(rxn_smi)
    except Exception as exc:
        print(f" [info] RXNMapper classification failed: {exc}",
              file=sys.stderr)
        return

    if mapping is None:
        return

    confidence = mapping.get("confidence", 0.0)
    components = mapping.get("components", [])
    if not components:
        return

    print(f" RXNMapper classification (confidence={confidence:.4f}):",
          file=sys.stderr)

    # Canonicalise with RDKit when possible; otherwise compare raw SMILES.
    try:
        from rdkit import Chem

        def _canon(smi):
            parsed = Chem.MolFromSmiles(smi)
            return Chem.MolToSmiles(parsed) if parsed else smi
    except ImportError:
        def _canon(smi):
            return smi

    # Canonical SMILES -> component info (later duplicates win).
    component_by_smiles = {_canon(comp["smiles"]): comp for comp in components}

    for reagent in pending:
        comp = component_by_smiles.get(_canon(reagent.smiles))
        if comp is None:
            continue

        contributes = comp.get("atom_contributing")
        if contributes is None:
            continue

        reagent.classification = (
            "atom_contributing" if contributes else "non_contributing"
        )
        reagent.classification_method = "rxnmapper"
        if contributes:
            n_atoms = comp.get("n_product_atoms", 0)
            print(f" {reagent.smiles[:50]:50s} → atom_contributing "
                  f"({n_atoms} atoms in product)", file=sys.stderr)
        else:
            print(f" {reagent.smiles[:50]:50s} → non_contributing",
                  file=sys.stderr)

        reagent.rxnmapper_confidence = confidence
817
+
818
+
819
+ # ---------------------------------------------------------------------------
820
+ # CDXML Mode Entry Point
821
+ # ---------------------------------------------------------------------------
822
+
823
def classify_from_cdxml(cdxml_path: str,
                        mcs_threshold: float = 0.3,
                        use_rxnmapper: bool = False) -> Dict[str, Any]:
    """Parse a CDXML reaction file and classify all reagents.

    The first ``<scheme><step>`` on the page defines the reaction. The
    product SMILES is taken from the first product fragment that yields
    one. Reagents are gathered from the step's reactants, then the objects
    above the arrow, then those below it; each element id is handled at
    most once.

    Args:
        cdxml_path: Path to the CDXML file.
        mcs_threshold: Deprecated and ignored (kept for API compat);
            forwarded unchanged to ``classify_reagents``.
        use_rxnmapper: Deprecated and ignored (kept for API compat);
            forwarded unchanged to ``classify_reagents``.

    Returns:
        A dict with ``"cdxml_file"`` (base name of the input),
        ``"product_smiles"`` and ``"reagents"`` (one dict per reagent, as
        produced by ``_reagent_to_dict``).

    Raises:
        SystemExit: If the file has no ``<scheme><step>`` or no product
            SMILES could be extracted.
    """
    tree = ET.parse(cdxml_path)
    root = tree.getroot()
    page = _get_page(root)

    # --- Parse <step> metadata ---
    scheme = page.find("scheme")
    step = scheme.find("step") if scheme is not None else None
    if step is None:
        raise SystemExit("ERROR: no <scheme><step> found in CDXML")

    reactant_ids = step.get("ReactionStepReactants", "").split()
    product_ids = step.get("ReactionStepProducts", "").split()
    above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
    below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()

    # Build id → element map (direct children of the page only).
    id_to_el: Dict[str, ET.Element] = {}
    for el in page:
        eid = el.get("id", "")
        if eid:
            id_to_el[eid] = el

    # --- Extract product SMILES: first product fragment that yields one ---
    product_smiles = None
    for pid in product_ids:
        el = id_to_el.get(pid)
        if el is not None and el.tag == "fragment":
            product_smiles = _fragment_to_smiles(el)
            if product_smiles:
                break
    if not product_smiles:
        raise SystemExit("ERROR: could not extract product SMILES")

    print(f"Product SMILES: {product_smiles}", file=sys.stderr)

    # --- Collect reagents ---
    reagents: List[ReagentInfo] = []
    seen_ids: set = set()

    def _process_element(eid: str, position: str):
        """Process a single element (fragment or text) as a potential reagent."""
        if eid in seen_ids:
            return
        seen_ids.add(eid)

        el = id_to_el.get(eid)
        if el is None:
            return

        fm_type = _get_fm_molecule_type(el)

        # Skip products (3) and condition text (2)
        if fm_type in (2, 3):
            return

        ri = ReagentInfo(source_id=eid, position=position)

        # FM type = 1 → solvent hint only; Schneider decides classification.
        # source_type and name are filled in by the tag-specific branch
        # below, so they are not set here.
        if fm_type == 1:
            ri.role = "solvent"

        if el.tag == "fragment":
            # Drawn structure → SMILES via _fragment_to_smiles
            ri.source_type = "fragment"
            ri.smiles = _fragment_to_smiles(el)
            if ri.smiles:
                print(f" Fragment {eid}: {ri.smiles}", file=sys.stderr)
        elif el.tag == "t":
            # Text label → keep the name, try to resolve it to a SMILES
            ri.source_type = "text"
            text = _get_text_content(el)
            ri.name = text
            ri.smiles = _text_to_smiles(text)
            if ri.smiles:
                print(f" Text '{text}' → {ri.smiles}", file=sys.stderr)
            else:
                print(f" Text '{text}' → no SMILES (name-only)", file=sys.stderr)
        else:
            # Neither a structure nor a text label: not a reagent.
            return

        reagents.append(ri)

    # Process reactants first, then above/below arrow
    for rid in reactant_ids:
        _process_element(rid, "reactant")
    for eid in above_ids:
        _process_element(eid, "above_arrow")
    for eid in below_ids:
        _process_element(eid, "below_arrow")

    # --- Classify ---
    classify_reagents(reagents, product_smiles, mcs_threshold,
                      use_rxnmapper=use_rxnmapper)

    return {
        "cdxml_file": os.path.basename(cdxml_path),
        "product_smiles": product_smiles,
        "reagents": [_reagent_to_dict(r) for r in reagents],
    }
936
+
937
+
938
+ # ---------------------------------------------------------------------------
939
+ # SMILES Mode Entry Point
940
+ # ---------------------------------------------------------------------------
941
+
942
def classify_from_smiles(reagent_smiles: List[str],
                         product_smiles: str,
                         reagent_names: Optional[List[str]] = None,
                         mcs_threshold: float = 0.3,
                         use_rxnmapper: bool = True) -> Dict[str, Any]:
    """Classify reagents supplied directly as SMILES strings.

    Args:
        reagent_smiles: One SMILES per reagent.
        product_smiles: SMILES of the reaction product.
        reagent_names: Optional names parallel to ``reagent_smiles``;
            reagents beyond the end of this list get no name.
        mcs_threshold: Forwarded unchanged to ``classify_reagents``.
        use_rxnmapper: Forwarded unchanged to ``classify_reagents``.

    Returns:
        A dict with ``"product_smiles"`` and ``"reagents"`` (one dict per
        reagent, as produced by ``_reagent_to_dict``).
    """
    # A missing or empty name list simply means "no names at all".
    names = reagent_names or []
    reagents: List[ReagentInfo] = [
        ReagentInfo(
            source_type="smiles_input",
            smiles=smi,
            name=names[idx] if idx < len(names) else None,
        )
        for idx, smi in enumerate(reagent_smiles)
    ]

    classify_reagents(reagents, product_smiles, mcs_threshold,
                      use_rxnmapper=use_rxnmapper)

    serialised = [_reagent_to_dict(entry) for entry in reagents]
    return {
        "product_smiles": product_smiles,
        "reagents": serialised,
    }
959
+
960
+
961
+ # ---------------------------------------------------------------------------
962
+ # Output Helpers
963
+ # ---------------------------------------------------------------------------
964
+
965
def _reagent_to_dict(r: ReagentInfo) -> Dict[str, Any]:
    """Convert a reagent record to a plain dict for JSON output.

    The record is expanded with ``dataclasses.asdict`` and the optional
    fields ``mcs_ratio``, ``rxnmapper_confidence``, ``schneider_score``,
    ``role`` and ``name`` are dropped when their value is ``None``, which
    keeps the output compact. All five are handled the same tolerant way,
    so a field that is absent from the record is ignored rather than
    raising ``KeyError``.

    Args:
        r: The reagent record to serialise.

    Returns:
        The dict form of ``r`` without ``None``-valued optional fields.
    """
    d = asdict(r)
    # Drop empty/None optional fields for cleaner output
    for key in ("mcs_ratio", "rxnmapper_confidence", "schneider_score",
                "role", "name"):
        if d.get(key) is None:
            d.pop(key, None)
    return d
979
+
980
+
981
+ # ---------------------------------------------------------------------------
982
+ # CLI
983
+ # ---------------------------------------------------------------------------
984
+
985
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: classify reagents and emit JSON.

    Two sub-commands are available: ``cdxml`` (read a CDXML reaction file)
    and ``smiles`` (take reagent/product SMILES from the command line).
    The result is written to ``--output`` when given, otherwise to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        0 on success, 1 if no known mode was selected. Invalid arguments
        make argparse exit the process itself.
    """
    parser = argparse.ArgumentParser(
        description="Classify reaction reagents as atom-contributing "
                    "or non-contributing (Schneider FP scoring + role lookup).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    sub = parser.add_subparsers(dest="mode", required=True,
                                help="Input mode")

    # Shared args for both modes
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument("-o", "--output",
                        help="Output JSON file (default: stdout)")
    common.add_argument("--pretty", action="store_true",
                        help="Pretty-print JSON output")
    # Still accepted so existing command lines keep working, but the value
    # is only passed through as the deprecated mcs_threshold argument.
    common.add_argument("--threshold", type=float, default=0.5,
                        help="Deprecated and ignored; accepted only for "
                             "backward compatibility (default: 0.5)")

    # CDXML mode
    p_cdxml = sub.add_parser("cdxml", parents=[common],
                             help="Classify from a CDXML reaction file")
    p_cdxml.add_argument("-i", "--input", required=True,
                         help="Input CDXML file")

    # SMILES mode
    p_smi = sub.add_parser("smiles", parents=[common],
                           help="Classify from SMILES strings")
    p_smi.add_argument("--reagents", nargs="+", required=True,
                       help="Reagent SMILES strings")
    p_smi.add_argument("--product", required=True,
                       help="Product SMILES")
    p_smi.add_argument("--names", nargs="+", default=None,
                       help="Reagent names (parallel to --reagents)")

    args = parser.parse_args(argv)

    if args.mode == "cdxml":
        result = classify_from_cdxml(args.input, args.threshold)
    elif args.mode == "smiles":
        result = classify_from_smiles(
            args.reagents, args.product, args.names, args.threshold)
    else:
        # Defensive only: the sub-command is required, so argparse has
        # already rejected anything else.
        parser.print_help()
        return 1

    indent = 2 if args.pretty else None
    json_str = json.dumps(result, indent=indent, ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(json_str + "\n")
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(json_str)

    return 0


if __name__ == "__main__":
    sys.exit(main())