cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,285 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ reagent_db.py — Shared reagent database loader (two-tier).
4
+
5
+ Loads a curated ``reagent_abbreviations.json`` (tier-1, ~172 entries with
6
+ roles) and a larger ``chemscanner_abbreviations.json`` (tier-2, ~5,800
7
+ entries, no roles) for name/SMILES → display/role resolution.
8
+
9
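+ Tier-1 is consulted before tier-2. For name lookups this holds per normalization step, so an exact tier-2 match is returned before a tier-1 match that requires suffix/prefix stripping.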
10
+ Role lookups are tier-1 only (ChemScanner has no role data).
11
+
12
+ Usage:
13
+ from cdxml_toolkit.resolve.reagent_db import get_reagent_db
14
+
15
+ db = get_reagent_db()
16
+ db.display_for_name("cs2co3") # "Cs2CO3" (tier-1)
17
+ db.role_for_name("cs2co3") # "base" (tier-1 only)
18
+ db.display_for_name("hatu") # "HATU" (tier-2 fallback)
19
+ db.display_for_smiles(canon_smi) # "Et3N" (if SMILES matches)
20
+ db.resolve_display("cs2co3") # "Cs2CO3" (or original if unknown)
21
+ """
22
+
23
+ import json
24
+ import os
25
+ import re
26
+ import sys
27
+ from typing import Dict, List, Optional, Tuple, Union
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Name normalization helpers
32
+ # ---------------------------------------------------------------------------
33
+
34
+ # Unicode subscript digits → ASCII digits
35
+ _SUBSCRIPT_MAP = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
36
+
37
+ # Common solvate/salt suffixes to strip (middle-dot or ASCII dot)
38
+ _SOLVATE_SUFFIXES = re.compile(
39
+ r"[·.](?:chcl3|dcm|ch2cl2|h2o|hcl|thf|dme|meoh|etoh|et2o)\s*$",
40
+ re.IGNORECASE,
41
+ )
42
+
43
+ # Racemic/stereochemical prefixes to strip
44
+ _RAC_PREFIX = re.compile(r"^(?:rac-|\(±\)-)", re.IGNORECASE)
45
+
46
+
47
+ def _normalize_name(name: str) -> str:
48
+ """Normalize a reagent name for lookup.
49
+
50
+ Converts Unicode subscript digits to ASCII and lowercases.
51
+ """
52
+ return name.translate(_SUBSCRIPT_MAP).strip().lower()
53
+
54
+
55
class ReagentDB:
    """In-memory reagent database with two-tier lookup.

    Tier-1: ``reagent_abbreviations.json`` (curated, with roles).
    Tier-2: ``chemscanner_abbreviations.json`` (large, no roles).

    Primary keys and aliases are indexed verbatim, whereas every name lookup
    is first passed through ``_normalize_name``.  Database keys are therefore
    only reachable when they are already stored in normalized form.

    SMILES are indexed as written and, when RDKit can be imported,
    additionally under their RDKit-canonical form.
    """

    def __init__(self, json_path: Optional[str] = None,
                 secondary_path: Optional[str] = None):
        """Load and index both tiers.

        Args:
            json_path: Tier-1 JSON file.  Defaults to
                ``reagent_abbreviations.json`` next to this module.
            secondary_path: Tier-2 JSON file.  Defaults to
                ``chemscanner_abbreviations.json`` next to this module.

        Tier-1 is mandatory: I/O errors, JSON errors and malformed entries
        propagate to the caller.  Tier-2 is best-effort: a missing file is
        ignored silently, malformed entries are skipped, and any other
        failure leaves tier-2 empty; both cases print a warning to stderr.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))

        if json_path is None:
            json_path = os.path.join(module_dir, "reagent_abbreviations.json")
        if secondary_path is None:
            secondary_path = os.path.join(module_dir,
                                          "chemscanner_abbreviations.json")

        # --- Tier-1: curated ---
        with open(json_path, encoding="utf-8") as f:
            raw: Dict[str, dict] = json.load(f)

        self._by_name: Dict[str, dict] = {}
        self._by_smiles: Dict[str, dict] = {}

        # RDKit is optional; without it SMILES are matched as written only.
        self._rdkit_Chem = None
        try:
            from rdkit import Chem
            self._rdkit_Chem = Chem
        except ImportError:
            pass

        self._index_entries(raw, self._by_name, self._by_smiles)

        # --- Tier-2: ChemScanner (optional) ---
        self._cs_by_name: Dict[str, dict] = {}
        self._cs_by_smiles: Dict[str, dict] = {}

        if os.path.exists(secondary_path):
            # Index into scratch dicts and publish them only on success, so
            # a failure half-way through never leaves a partial index behind.
            cs_by_name: Dict[str, dict] = {}
            cs_by_smiles: Dict[str, dict] = {}
            try:
                with open(secondary_path, encoding="utf-8") as f:
                    cs_raw: Dict[str, dict] = json.load(f)
                skipped = self._index_entries(cs_raw, cs_by_name,
                                              cs_by_smiles,
                                              skip_invalid=True)
            except Exception as exc:
                print(f"[reagent_db] Warning: could not load tier-2 "
                      f"database: {exc}", file=sys.stderr)
            else:
                self._cs_by_name = cs_by_name
                self._cs_by_smiles = cs_by_smiles
                if skipped:
                    print(f"[reagent_db] Warning: skipped {skipped} "
                          f"malformed tier-2 entries", file=sys.stderr)

    @staticmethod
    def _as_list(value) -> list:
        """Coerce a JSON field that may be null, a scalar or a list to a list."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def _canonical_smiles(self, smiles: str) -> Optional[str]:
        """Return the RDKit-canonical form of *smiles*.

        Returns None when RDKit is unavailable, the SMILES cannot be parsed,
        or RDKit raises.
        """
        chem = self._rdkit_Chem
        if chem is None:
            return None
        try:
            mol = chem.MolFromSmiles(smiles)
            if mol:
                return chem.MolToSmiles(mol)
        except Exception:
            # Deliberate best-effort: an unparsable SMILES must never break
            # indexing or a lookup.
            pass
        return None

    def _index_entries(self, raw: Dict[str, dict],
                       by_name: Dict[str, dict],
                       by_smiles: Dict[str, dict],
                       skip_invalid: bool = False) -> int:
        """Index a set of JSON entries into name and SMILES lookup dicts.

        Args:
            raw: Mapping of primary key to entry dict.
            by_name: Receives the primary key and every alias.
            by_smiles: Receives every SMILES as written plus, when RDKit is
                available, its canonical form.
            skip_invalid: When true, entries that are not dicts are skipped
                and counted instead of raising.

        Returns:
            The number of entries skipped (always 0 unless *skip_invalid*).

        Later entries overwrite earlier ones that share a key, alias or
        SMILES.  ``aliases`` and ``smiles`` may each be null, a single
        string, or a list.
        """
        skipped = 0
        for key, entry in raw.items():
            if skip_invalid and not isinstance(entry, dict):
                skipped += 1
                continue

            # Index by primary key
            by_name[key] = entry

            # Index by aliases (a bare string counts as one alias)
            for alias in self._as_list(entry.get("aliases")):
                by_name[alias] = entry

            # Index by SMILES, raw and canonical
            for smi in self._as_list(entry.get("smiles")):
                by_smiles[smi] = entry
                canon = self._canonical_smiles(smi)
                if canon is not None:
                    by_smiles[canon] = entry
        return skipped

    # ----- name-based lookups -----

    def _lookup_name_entry(self, name: str, tier1_only: bool = False
                           ) -> Optional[dict]:
        """Internal progressive-normalization name lookup.

        Tries (in order):
          1. Exact (lowered, subscript-normalized)
          2. With solvate suffix stripped ("·CHCl3", ".DCM", etc.)
          3. With rac-/(±)- prefix stripped
          4. With both solvate suffix and rac- prefix stripped

        Each step checks tier-1 first, then tier-2 (unless tier1_only), so
        an earlier step's tier-2 hit is returned before a later step's
        tier-1 hit.
        """
        key = _normalize_name(name)
        without_solvate = _SOLVATE_SUFFIXES.sub("", key).strip()
        without_rac = _RAC_PREFIX.sub("", key).strip()
        without_both = _RAC_PREFIX.sub("", without_solvate).strip()

        # Keep first occurrences only; order defines lookup priority.
        candidates = [key]
        for variant in (without_solvate, without_rac, without_both):
            if variant not in candidates:
                candidates.append(variant)

        for cand in candidates:
            entry = self._by_name.get(cand)
            if entry:
                return entry
            if not tier1_only:
                entry = self._cs_by_name.get(cand)
                if entry:
                    return entry

        return None

    def display_for_name(self, name: str) -> Optional[str]:
        """Return display string for a name/alias, or None if unknown.

        Checks tier-1 (curated) first, then tier-2 (ChemScanner).
        Uses progressive normalization (subscripts, solvate stripping,
        rac- prefix stripping).
        """
        entry = self._lookup_name_entry(name)
        return entry["display"] if entry else None

    def role_for_name(self, name: str) -> Optional[str]:
        """Return role string for a name/alias, or None.

        Tier-1 only — ChemScanner entries have no roles.
        """
        entry = self._lookup_name_entry(name, tier1_only=True)
        return entry.get("role") if entry else None

    def entry_for_name(self, name: str) -> Optional[dict]:
        """Return the full entry dict for a name/alias, or None.

        Checks tier-1 first, then tier-2.
        """
        return self._lookup_name_entry(name)

    # ----- SMILES-based lookups -----

    def _lookup_smiles_entry(self, smiles: str,
                             by_smiles: Dict[str, dict]) -> Optional[dict]:
        """Look up a SMILES in a single index (raw then canonicalized)."""
        entry = by_smiles.get(smiles)
        if entry:
            return entry
        canon = self._canonical_smiles(smiles)
        if canon is None:
            return None
        return by_smiles.get(canon) or None

    def display_for_smiles(self, smiles: str) -> Optional[str]:
        """Return display string for a SMILES, or None if unknown.

        Checks tier-1 first, then tier-2.
        """
        entry = self.entry_for_smiles(smiles)
        return entry["display"] if entry else None

    def role_for_smiles(self, smiles: str) -> Optional[str]:
        """Return role string for a SMILES, or None.

        Tier-1 only — ChemScanner entries have no roles.
        """
        entry = self._lookup_smiles_entry(smiles, self._by_smiles)
        return entry.get("role") if entry else None

    def entry_for_smiles(self, smiles: str) -> Optional[dict]:
        """Return the full entry dict for a SMILES, or None.

        Checks tier-1 first, then tier-2.
        """
        entry = self._lookup_smiles_entry(smiles, self._by_smiles)
        if entry:
            return entry
        return self._lookup_smiles_entry(smiles, self._cs_by_smiles)

    # ----- convenience -----

    def resolve_display(self, name: str) -> str:
        """Return the display string for *name*, or *name* itself if unknown.

        Drop-in replacement for the old ``ABBREVIATIONS.get(key, text)``
        pattern.  Checks tier-1 first, then tier-2.
        """
        display = self.display_for_name(name)
        return display if display is not None else name

    def smiles_role_display(self, smiles: str) -> Optional[Tuple[str, str]]:
        """Return (role, display) for a SMILES, matching the old
        ROLE_BY_SMILES dict interface.

        Tier-1 only (requires role).  Returns None if unknown.
        """
        entry = self._lookup_smiles_entry(smiles, self._by_smiles)
        if entry and "role" in entry:
            return (entry["role"], entry["display"])
        return None
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Singleton accessor
275
+ # ---------------------------------------------------------------------------
276
+
277
# Process-wide cache for the shared database; populated on first access.
_instance: Optional[ReagentDB] = None


def get_reagent_db() -> ReagentDB:
    """Return the shared ReagentDB, constructing it on the first call.

    The instance is built with the default JSON paths and then reused for
    every later call.
    """
    global _instance
    if _instance is not None:
        return _instance
    _instance = ReagentDB()
    return _instance