cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
reagent_db.py — Shared reagent database loader (two-tier).
|
|
4
|
+
|
|
5
|
+
Loads a curated ``reagent_abbreviations.json`` (tier-1, ~172 entries with
|
|
6
|
+
roles) and a larger ``chemscanner_abbreviations.json`` (tier-2, ~5,800
|
|
7
|
+
entries, no roles) for name/SMILES → display/role resolution.
|
|
8
|
+
|
|
9
|
+
Tier-1 always wins. Tier-2 is consulted only when tier-1 returns None.
|
|
10
|
+
Role lookups are tier-1 only (ChemScanner has no role data).
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from cdxml_toolkit.resolve.reagent_db import get_reagent_db
|
|
14
|
+
|
|
15
|
+
db = get_reagent_db()
|
|
16
|
+
db.display_for_name("cs2co3") # "Cs2CO3" (tier-1)
|
|
17
|
+
db.role_for_name("cs2co3") # "base" (tier-1 only)
|
|
18
|
+
db.display_for_name("hatu") # "HATU" (tier-2 fallback)
|
|
19
|
+
db.display_for_smiles(canon_smi) # "Et3N" (if SMILES matches)
|
|
20
|
+
db.resolve_display("cs2co3") # "Cs2CO3" (or original if unknown)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import re
|
|
26
|
+
import sys
|
|
27
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Name normalization helpers
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
# Translation table mapping the Unicode subscript digits ₀-₉ onto ASCII 0-9.
_SUBSCRIPT_MAP = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")

# Matches one trailing solvate/salt suffix introduced by a middle dot or an
# ASCII dot (e.g. "·CHCl3", ".DCM"), including any trailing whitespace.
_SOLVATE_SUFFIXES = re.compile(
    r"[·.](?:chcl3|dcm|ch2cl2|h2o|hcl|thf|dme|meoh|etoh|et2o)\s*$",
    flags=re.IGNORECASE,
)

# Matches a leading racemic marker: "rac-" or "(±)-".
_RAC_PREFIX = re.compile(r"^(?:rac-|\(±\)-)", flags=re.IGNORECASE)


def _normalize_name(name: str) -> str:
    """Return the canonical lookup key for a reagent name.

    Three steps are applied in order: Unicode subscript digits are mapped
    to their ASCII counterparts, surrounding whitespace is removed, and the
    result is lowercased.
    """
    ascii_digits = name.translate(_SUBSCRIPT_MAP)
    trimmed = ascii_digits.strip()
    return trimmed.lower()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ReagentDB:
    """In-memory reagent database with two-tier lookup.

    Tier-1: ``reagent_abbreviations.json`` (curated, with roles).
    Tier-2: ``chemscanner_abbreviations.json`` (large, no roles).

    Each tier keeps two indexes: one keyed by name/alias and one keyed by
    SMILES.  Lookups never raise for unknown, empty or malformed input;
    they return ``None`` (or, for :meth:`resolve_display`, the input).
    """

    def __init__(self, json_path: Optional[str] = None,
                 secondary_path: Optional[str] = None):
        """Load both tiers and build the lookup indexes.

        Args:
            json_path: Path to the tier-1 JSON file.  Defaults to
                ``reagent_abbreviations.json`` next to this module.
            secondary_path: Path to the tier-2 JSON file.  Defaults to
                ``chemscanner_abbreviations.json`` next to this module.

        Raises:
            OSError, ValueError: If the tier-1 file cannot be opened or
                parsed (tier-1 is mandatory).  Tier-2 problems are only
                reported on stderr.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))

        if json_path is None:
            json_path = os.path.join(module_dir, "reagent_abbreviations.json")
        if secondary_path is None:
            secondary_path = os.path.join(module_dir,
                                          "chemscanner_abbreviations.json")

        # --- Tier-1: curated ---
        with open(json_path, encoding="utf-8") as f:
            raw: Dict[str, dict] = json.load(f)

        self._by_name: Dict[str, dict] = {}
        self._by_smiles: Dict[str, dict] = {}

        # RDKit is optional; without it SMILES are matched verbatim only.
        self._rdkit_Chem = None
        try:
            from rdkit import Chem
            self._rdkit_Chem = Chem
        except ImportError:
            pass

        self._index_entries(raw, self._by_name, self._by_smiles)

        # --- Tier-2: ChemScanner (optional) ---
        self._cs_by_name: Dict[str, dict] = {}
        self._cs_by_smiles: Dict[str, dict] = {}

        if os.path.exists(secondary_path):
            try:
                with open(secondary_path, encoding="utf-8") as f:
                    cs_raw: Dict[str, dict] = json.load(f)
                self._index_entries(cs_raw, self._cs_by_name,
                                    self._cs_by_smiles)
            except Exception as exc:
                # Tier-2 is a best-effort fallback: a broken file must not
                # prevent tier-1 from being usable.
                print(f"[reagent_db] Warning: could not load tier-2 "
                      f"database: {exc}", file=sys.stderr)

    def _canonical_smiles(self, smiles: str) -> Optional[str]:
        """Return the RDKit canonical form of *smiles*, or None.

        None is returned when RDKit is not installed, when the SMILES does
        not parse, or when RDKit raises while processing it.
        """
        chem = self._rdkit_Chem
        if chem is None:
            return None
        try:
            mol = chem.MolFromSmiles(smiles)
            if mol:
                return chem.MolToSmiles(mol)
        except Exception:
            # Canonicalization only widens matching; any failure simply
            # means the verbatim SMILES is the sole key.
            pass
        return None

    def _index_entries(self, raw: Dict[str, dict],
                       by_name: Dict[str, dict],
                       by_smiles: Dict[str, dict]):
        """Index a set of JSON entries into name and SMILES lookup dicts.

        Args:
            raw: Mapping of primary key -> entry dict as loaded from JSON.
            by_name: Dict to fill with primary keys and aliases.
            by_smiles: Dict to fill with verbatim and (when RDKit is
                available) canonical SMILES.
        """
        for key, entry in raw.items():
            # NOTE(review): keys and aliases are stored verbatim while
            # lookups use the normalized (lowercased) name -- presumably
            # the JSON files already hold normalized keys; confirm.
            by_name[key] = entry

            for alias in entry.get("aliases", []):
                by_name[alias] = entry

            smiles_val = entry.get("smiles")
            if smiles_val is None:
                continue
            # The "smiles" field may be a single string or a list.
            smiles_list: List[str] = (
                smiles_val if isinstance(smiles_val, list)
                else [smiles_val]
            )
            for smi in smiles_list:
                if not smi:
                    # An empty SMILES would make "" resolve to this entry.
                    continue
                by_smiles[smi] = entry
                canon = self._canonical_smiles(smi)
                if canon is not None:
                    by_smiles[canon] = entry

    # ----- name-based lookups -----

    def _lookup_name_entry(self, name: str, tier1_only: bool = False
                           ) -> Optional[dict]:
        """Internal progressive-normalization name lookup.

        Tries (in order):
          1. Exact (lowered, subscript-normalized)
          2. With solvate suffix stripped ("·CHCl3", ".DCM", etc.)
          3. With rac-/(±)- prefix stripped
          4. With both solvate suffix and rac- prefix stripped

        Each step checks tier-1 first, then tier-2 (unless tier1_only).

        Returns None for unknown names and for input that is not a
        non-blank string.
        """
        # Mirror the SMILES lookups, which already tolerate None/empty
        # input instead of raising.
        if not isinstance(name, str) or not name.strip():
            return None

        key = _normalize_name(name)
        candidates = [key]

        # Strip solvate suffix
        stripped_solvate = _SOLVATE_SUFFIXES.sub("", key).strip()
        if stripped_solvate != key:
            candidates.append(stripped_solvate)

        # Strip rac-/(±)- prefix
        stripped_rac = _RAC_PREFIX.sub("", key).strip()
        if stripped_rac != key:
            candidates.append(stripped_rac)

        # Strip both
        stripped_both = _RAC_PREFIX.sub("", stripped_solvate).strip()
        if stripped_both not in candidates:
            candidates.append(stripped_both)

        for cand in candidates:
            entry = self._by_name.get(cand)
            if entry:
                return entry
            if not tier1_only:
                entry = self._cs_by_name.get(cand)
                if entry:
                    return entry

        return None

    def display_for_name(self, name: str) -> Optional[str]:
        """Return display string for a name/alias, or None if unknown.

        Checks tier-1 (curated) first, then tier-2 (ChemScanner).
        Uses progressive normalization (subscripts, solvate stripping,
        rac- prefix stripping).  Also returns None when the matched entry
        has no "display" field.
        """
        entry = self._lookup_name_entry(name)
        return entry.get("display") if entry else None

    def role_for_name(self, name: str) -> Optional[str]:
        """Return role string for a name/alias, or None.

        Tier-1 only — ChemScanner entries have no roles.
        """
        entry = self._lookup_name_entry(name, tier1_only=True)
        return entry.get("role") if entry else None

    def entry_for_name(self, name: str) -> Optional[dict]:
        """Return the full entry dict for a name/alias, or None.

        Checks tier-1 first, then tier-2.
        """
        return self._lookup_name_entry(name)

    # ----- SMILES-based lookups -----

    def _lookup_smiles_entry(self, smiles: str,
                             by_smiles: Dict[str, dict]) -> Optional[dict]:
        """Look up a SMILES in a single index (raw then canonicalized).

        Returns None for empty/None input or when neither form is indexed.
        """
        if not smiles:
            return None
        entry = by_smiles.get(smiles)
        if entry:
            return entry
        canon = self._canonical_smiles(smiles)
        if canon is not None:
            entry = by_smiles.get(canon)
            if entry:
                return entry
        return None

    def display_for_smiles(self, smiles: str) -> Optional[str]:
        """Return display string for a SMILES, or None if unknown.

        Checks tier-1 first, then tier-2.  Also returns None when the
        matched entry has no "display" field.
        """
        entry = self.entry_for_smiles(smiles)
        return entry.get("display") if entry else None

    def role_for_smiles(self, smiles: str) -> Optional[str]:
        """Return role string for a SMILES, or None.

        Tier-1 only — ChemScanner entries have no roles.
        """
        entry = self._lookup_smiles_entry(smiles, self._by_smiles)
        return entry.get("role") if entry else None

    def entry_for_smiles(self, smiles: str) -> Optional[dict]:
        """Return the full entry dict for a SMILES, or None.

        Checks tier-1 first, then tier-2.
        """
        entry = self._lookup_smiles_entry(smiles, self._by_smiles)
        if entry:
            return entry
        return self._lookup_smiles_entry(smiles, self._cs_by_smiles)

    # ----- convenience -----

    def resolve_display(self, name: str) -> str:
        """Return the display string for *name*, or *name* itself if unknown.

        Drop-in replacement for the old ``ABBREVIATIONS.get(key, text)``
        pattern.  Checks tier-1 first, then tier-2.
        """
        display = self.display_for_name(name)
        return display if display is not None else name

    def smiles_role_display(self, smiles: str) -> Optional[Tuple[str, str]]:
        """Return (role, display) for a SMILES, matching the old
        ROLE_BY_SMILES dict interface.

        Tier-1 only (requires role).  Returns None if the SMILES is
        unknown or the entry lacks either a "role" or a "display" field.
        """
        entry = self._lookup_smiles_entry(smiles, self._by_smiles)
        if entry and "role" in entry and "display" in entry:
            return (entry["role"], entry["display"])
        return None
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# ---------------------------------------------------------------------------
|
|
274
|
+
# Singleton accessor
|
|
275
|
+
# ---------------------------------------------------------------------------
|
|
276
|
+
|
|
277
|
+
# Module-level cache for the shared database; remains None until the first
# get_reagent_db() call constructs it.
_instance: Optional[ReagentDB] = None


def get_reagent_db() -> ReagentDB:
    """Return the shared ReagentDB, constructing it on the first call.

    Later calls return the same object without re-reading the JSON files.
    """
    global _instance
    if _instance is not None:
        return _instance
    _instance = ReagentDB()
    return _instance
|