cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
"""Condensed structural formula parser.
|
|
2
|
+
|
|
3
|
+
Converts chemist-shorthand condensed formulae (PhB(OH)₂, Et₃N, MeI)
|
|
4
|
+
to canonical SMILES by tokenizing against the superatom fragment
|
|
5
|
+
vocabulary (~2,850 entries) and assembling via RDKit.
|
|
6
|
+
|
|
7
|
+
This is a *generative* parser — it handles novel combinations like
|
|
8
|
+
PhB(OMe)₂ or PhB(OEt)₂ without needing a dictionary entry for every
|
|
9
|
+
whole molecule.
|
|
10
|
+
|
|
11
|
+
Grammar patterns handled:
|
|
12
|
+
|
|
13
|
+
group + atom/group MeI, BzCl, EtOH
|
|
14
|
+
group_n + central (+ more) Et₃N, Ph₃P, Me₃SiCl
|
|
15
|
+
left + atom + (group)_n PhB(OH)₂, PhB(OMe)₂
|
|
16
|
+
elem_n + chain Cl₂CHOCH₃, PhCH₂Br
|
|
17
|
+
|
|
18
|
+
Usage::
|
|
19
|
+
|
|
20
|
+
>>> from cdxml_toolkit.resolve.condensed_formula import resolve_condensed_formula
|
|
21
|
+
>>> resolve_condensed_formula("PhB(OH)2")
|
|
22
|
+
'OB(O)c1ccccc1'
|
|
23
|
+
>>> resolve_condensed_formula("Et3N")
|
|
24
|
+
'CCN(CC)CC'
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import re
|
|
28
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Element table (symbols recognised as bare atoms in condensed formulae)
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
# Two-letter element symbols.  The tokenizer tests these before the
# single-letter symbols to avoid ambiguity.
_TWO_LETTER_ELEMENTS = set(
    "He Li Be Ne Na Mg Al Si Cl Ar "
    "Ca Sc Ti Cr Mn Fe Co Ni Cu Zn "
    "Ga Ge As Se Br Kr Rb Sr Zr Nb "
    "Mo Ru Rh Pd Ag Cd In Sn Sb Te "
    "Cs Ba La Ce Hf Ta Re Os Ir Pt "
    "Au Hg Tl Pb Bi".split()
)

# Single-letter element symbols (one character each).
_ONE_LETTER_ELEMENTS = set("HBCNOFPSKIVYWU")

# Symbols that are written as bracket atoms ("[Na]") in SMILES.
# "H" is listed because an explicit hydrogen needs brackets.
_BRACKET_ELEMENTS = set(
    "H "
    "Li Be Na Mg Al Si Ca Sc Ti Cr Mn "
    "Fe Co Ni Cu Zn Ga Ge As Se Rb Sr "
    "Zr Nb Mo Ru Rh Pd Ag Cd In Sn Sb "
    "Te Cs Ba La Ce Hf Ta Re Os Ir Pt "
    "Au Hg Tl Pb Bi K V Y W U".split()
)

# Organic-subset symbols, which SMILES allows without brackets.
_ORGANIC_SUBSET = set("B C N O P S F Cl Br I".split())

# Lowercased element symbols.  Superatom-table keys equal to one of these
# are skipped during abbreviation matching because they collide with an
# element: such entries map to bare atoms (n→N, o→O) or to the wrong
# molecule (co→CO carbonyl, sn→NS, zn→CBz-variant).  Element matching
# handles those spellings instead.
_ELEMENT_COLLISIONS = set(
    map(str.lower, _ONE_LETTER_ELEMENTS | _TWO_LETTER_ELEMENTS)
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Tokenizer
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
def _get_abbrev_table() -> Dict[str, str]:
    """Fetch the superatom abbreviation mapping (lowercase key → SMILES).

    The sibling ``superatom_table`` module is imported at call time
    rather than at module import time.
    """
    from . import superatom_table

    return superatom_table.get_superatom_table()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def tokenize(formula: str) -> List[Tuple[str, Any]]:
    """Tokenize a condensed structural formula.

    Returns a list of ``(token_type, value)`` tuples where *token_type*
    is one of ``'abbrev'``, ``'element'``, ``'count'``,
    ``'paren_open'``, ``'paren_close'``.

    Matching order at each position:

    1. a space (skipped), a parenthesis, or a run of decimal digits
       (emitted as an ``int`` count);
    2. a two-letter element symbol, exact case;
    3. a superatom abbreviation, greedy longest match, case-insensitive,
       ignoring table keys that collide with an element symbol;
    4. a single-letter element symbol, uppercase.

    Unicode subscript digits (``Et₃N``) are treated as their ASCII
    equivalents.

    Returns an empty list if the formula contains unrecognisable tokens.
    """
    table = _get_abbrev_table()
    tokens: List[Tuple[str, Any]] = []

    # Fold subscript digits to ASCII first.  ``str.isdigit`` is true for
    # "₃" but ``int("₃")`` raises ValueError, so without this step the
    # subscripted spellings crash instead of tokenizing.  The mapping is
    # one-to-one, so string positions are unchanged.
    s = formula.translate(str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789"))
    n = len(s)

    # Longest abbreviation key bounds the look-ahead window.
    max_abbrev_len = max((len(k) for k in table), default=0)

    i = 0
    while i < n:
        ch = s[i]

        # Skip spaces.
        if ch == " ":
            i += 1
            continue

        # Parentheses.
        if ch == "(":
            tokens.append(("paren_open", "("))
            i += 1
            continue
        if ch == ")":
            tokens.append(("paren_close", ")"))
            i += 1
            continue

        # Digit run → count.  ``isdecimal`` (not ``isdigit``) so that only
        # characters ``int()`` can actually parse are consumed here;
        # superscripts and similar fall through and are rejected below.
        if ch.isdecimal():
            j = i + 1
            while j < n and s[j].isdecimal():
                j += 1
            tokens.append(("count", int(s[i:j])))
            i = j
            continue

        # Two-letter element, exact case.  Tried before abbreviations so
        # that e.g. the superatom entry "co" (carbonyl) cannot shadow the
        # element Co (cobalt).
        pair = s[i:i + 2]
        if len(pair) == 2 and pair in _TWO_LETTER_ELEMENTS:
            tokens.append(("element", pair))
            i += 2
            continue

        # Abbreviation: longest candidate first, case-insensitive.  Keys
        # that are also element symbols are left to element matching.
        matched = False
        for length in range(min(max_abbrev_len, n - i), 0, -1):
            candidate = s[i:i + length]
            key = candidate.lower()
            if key in table and key not in _ELEMENT_COLLISIONS:
                tokens.append(("abbrev", candidate))
                i += length
                matched = True
                break
        if matched:
            continue

        # Single-letter element (uppercase only).
        if ch in _ONE_LETTER_ELEMENTS:
            tokens.append(("element", ch))
            i += 1
            continue

        # Nothing matched at this position → reject the whole formula.
        return []

    return tokens
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
# SMILES assembler
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
def _element_smiles(sym: str) -> str:
    """Return the SMILES atom string for element symbol *sym*.

    Symbols in ``_ORGANIC_SUBSET`` are returned bare; every other symbol
    is wrapped in brackets.  Deciding on the organic subset (rather than
    on membership of ``_BRACKET_ELEMENTS``) also brackets the noble gases
    He/Ne/Ar/Kr, which the tokenizer recognises but which are in neither
    set and are not valid SMILES when written without brackets.
    """
    if sym in _ORGANIC_SUBSET:
        return sym
    return f"[{sym}]"
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _mol_from_token(tok_type: str, tok_val: str,
                    table: Dict[str, str]) -> Optional["Chem.Mol"]:
    """Build an RDKit Mol for one ``'abbrev'`` or ``'element'`` token.

    Returns ``None`` for any other token type, for an abbreviation that
    is missing from *table*, or when RDKit cannot build the fragment.
    """
    from rdkit import Chem

    if tok_type == "element":
        return Chem.MolFromSmiles(_element_smiles(tok_val))

    if tok_type != "abbrev":
        return None

    fragment = table.get(tok_val.lower())
    if fragment is None:
        return None

    parsed = Chem.MolFromSmiles(fragment)
    if parsed is not None:
        return parsed

    # Some superatom entries are SMARTS: retry with the SMARTS parser and
    # sanitize the result so it can be used like a normal molecule.
    parsed = Chem.MolFromSmarts(fragment)
    if parsed is None:
        return None
    try:
        editable = Chem.RWMol(parsed)
        Chem.SanitizeMol(editable)
        return editable.GetMol()
    except Exception:
        return None
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _attachment_idx(mol: "Chem.Mol") -> int:
    """Return the index of the atom through which *mol* is bonded.

    The attachment point is always the first atom: superatom SMILES are
    written with the attachment atom first, and a Mol parsed from SMILES
    numbers that first-written atom 0.  *mol* itself is not inspected.
    """
    first_atom_index = 0
    return first_atom_index
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _combine(mol_a: "Chem.Mol", idx_a: int,
             mol_b: "Chem.Mol", idx_b: int) -> "Chem.Mol":
    """Join two molecules with one single bond and return the new Mol.

    The bond connects atom *idx_a* of *mol_a* to atom *idx_b* of
    *mol_b*.  In the merged molecule the atoms of *mol_b* follow those of
    *mol_a*, so *idx_b* is shifted by the atom count of *mol_a*.
    """
    from rdkit import Chem

    merged = Chem.RWMol(Chem.CombineMols(mol_a, mol_b))
    shifted_b = mol_a.GetNumAtoms() + idx_b
    merged.AddBond(idx_a, shifted_b, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(merged)
    except Exception:
        # Best effort: sanitization may fail for organometallics, and the
        # unsanitized molecule is still returned.
        pass
    return merged.GetMol()
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _assemble(tokens: List[Tuple[str, Any]]) -> Optional[str]:
    """Assemble a canonical SMILES from a token list.

    Walks the tokens produced by :func:`tokenize` once, growing a single
    molecule by adding single bonds.  Patterns handled:

    - Linear chaining (MeI, BzCl)
    - Multiplied prefix groups (Et₃N, Ph₃P)
    - Parenthesised branches with multiplier (PhB(OH)₂), including
      parentheses nested inside parentheses
    - Element subscripts in linear chains (Cl₂CH…)

    Returns ``None`` when *tokens* is empty, a token cannot be turned
    into a fragment, parentheses are unbalanced, a multiplied group has
    nothing to attach to, or the final molecule fails sanitization.
    """
    from rdkit import Chem

    table = _get_abbrev_table()

    if not tokens:
        return None

    # --- State ---
    mol = None          # Molecule built so far
    tip = None          # Atom index in mol that the next fragment bonds to
    pending = None      # (mol, attach_idx, count): group waiting for its central atom
    branch_stack = []   # Saved (mol, tip, branch_frags) for each open parenthesis
    branch_frags = []   # (fragment, attach_idx) pairs collected in the current parentheses
    in_branch = 0       # Nesting depth of parentheses

    i = 0
    while i < len(tokens):
        tok_type, tok_val = tokens[i]

        # --- Parenthesis open: save state, start a fresh fragment list ---
        if tok_type == "paren_open":
            branch_stack.append((mol, tip, branch_frags[:]))
            branch_frags = []
            in_branch += 1
            i += 1
            continue

        # --- Parenthesis close ---
        if tok_type == "paren_close":
            if not branch_stack:
                return None  # ")" without a matching "("

            # A count right after ")" multiplies the whole branch.
            count = 1
            if i + 1 < len(tokens) and tokens[i + 1][0] == "count":
                count = tokens[i + 1][1]
                i += 1  # consume the count

            # Chain the collected fragments into one branch molecule.
            branch_mol = None
            branch_tip = None
            for frag_mol, frag_attach in branch_frags:
                if branch_mol is None:
                    branch_mol = frag_mol
                    branch_tip = frag_attach
                else:
                    new_mol = _combine(branch_mol, branch_tip,
                                       frag_mol, frag_attach)
                    branch_tip = branch_mol.GetNumAtoms() + frag_attach
                    branch_mol = new_mol

            # Restore the state saved at the matching "(".
            parent_mol, parent_tip, parent_branch_frags = branch_stack.pop()
            branch_frags = parent_branch_frags
            in_branch -= 1

            if in_branch > 0:
                # Still inside an outer parenthesis.  The branch belongs to
                # the last fragment of that outer group, not to the main
                # molecule: bonding it to parent_mol here would be undone
                # when the outer ")" restores its own saved parent.
                if branch_mol is not None:
                    if branch_frags:
                        anchor_mol, anchor_attach = branch_frags[-1]
                        for _ in range(count):
                            anchor_mol = _combine(anchor_mol, anchor_attach,
                                                  branch_mol,
                                                  _attachment_idx(branch_mol))
                        branch_frags[-1] = (anchor_mol, anchor_attach)
                    elif count == 1:
                        # Inner group opens the outer one: it simply
                        # becomes the outer group's first fragment.
                        branch_frags.append(
                            (branch_mol, _attachment_idx(branch_mol)))
                    else:
                        return None  # multiplied group with no anchor atom
                i += 1
                continue

            if branch_mol is not None and parent_mol is not None:
                # Attach `count` copies of the branch at the parent's tip.
                for _ in range(count):
                    parent_mol = _combine(parent_mol, parent_tip,
                                          branch_mol,
                                          _attachment_idx(branch_mol))
            elif branch_mol is not None:
                # Formula starts with a parenthesised group.
                parent_mol = branch_mol
                parent_tip = _attachment_idx(branch_mol)

            mol = parent_mol
            tip = parent_tip
            i += 1
            continue

        # --- Count not consumed by a look-ahead elsewhere ---
        if tok_type == "count":
            # Only meaningful as the multiplier of a pending group;
            # otherwise it is skipped.
            if pending is not None:
                p_mol, p_attach, _ = pending
                pending = (p_mol, p_attach, tok_val)
            i += 1
            continue

        # --- Abbreviation or element ---
        if tok_type in ("abbrev", "element"):
            frag = _mol_from_token(tok_type, tok_val, table)
            if frag is None:
                return None
            frag_attach = _attachment_idx(frag)
            is_hydrogen = (tok_type == "element" and tok_val == "H")

            # Inside parentheses fragments are only collected; they are
            # joined when the ")" is reached.
            if in_branch > 0:
                branch_frags.append((frag, frag_attach))
                i += 1
                continue

            # A pending multiplied group makes this token the central
            # atom: bond `count` copies of the group to it.
            if pending is not None:
                p_mol, p_attach, p_count = pending
                central = frag
                central_tip = frag_attach
                for _ in range(p_count):
                    central = _combine(central, central_tip,
                                       p_mol, p_attach)
                if mol is not None:
                    # Bond the central atom to the existing molecule; the
                    # tip stays on the existing attachment atom.
                    central = _combine(mol, tip, central, central_tip)
                else:
                    tip = central_tip
                mol = central
                pending = None
                i += 1
                continue

            # Look ahead: is this token followed by a count?
            if i + 1 < len(tokens) and tokens[i + 1][0] == "count":
                count = tokens[i + 1][1]

                # Counted hydrogens always go on the current tip; H is
                # terminal and never a central atom.
                # E.g. CH₂Br → C gets 2 H, then Br bonds to C.
                if is_hydrogen:
                    if mol is not None:
                        for _ in range(count):
                            mol = _combine(mol, tip, frag, frag_attach)
                    i += 2
                    continue

                if i + 2 < len(tokens) and tokens[i + 2][0] in (
                        "abbrev", "element", "paren_open"):
                    # group_n followed by more tokens → wait for the
                    # central atom.
                    pending = (frag, frag_attach, count)
                    i += 2  # skip the group and its count
                    continue

                # Counted group at the very end: replicate it on the tip.
                if mol is not None:
                    for _ in range(count):
                        mol = _combine(mol, tip, frag, frag_attach)
                else:
                    mol = frag
                    tip = frag_attach
                i += 2
                continue

            # Plain linear attachment.
            if mol is None:
                mol = frag
                tip = frag_attach
            elif is_hydrogen:
                # A single H is terminal just like a counted one: bond it
                # to the tip but keep the tip where it is, otherwise the
                # next atom would be bonded to the hydrogen.
                mol = _combine(mol, tip, frag, frag_attach)
            else:
                new_mol = _combine(mol, tip, frag, frag_attach)
                # The new fragment's attachment atom becomes the tip.
                tip = mol.GetNumAtoms() + frag_attach
                mol = new_mol

            i += 1
            continue

        # Unknown token type → bail
        return None

    # A "(" that was never closed would silently lose its contents.
    if branch_stack:
        return None

    # --- Flush a multiplied group that never met a central atom ---
    if pending is not None:
        p_mol, p_attach, p_count = pending
        if mol is not None:
            for _ in range(p_count):
                mol = _combine(mol, tip, p_mol, p_attach)
        elif p_count == 1:
            mol = p_mol
        else:
            return None  # dangling multiplied fragment with no central atom

    if mol is None:
        return None

    # Validate and canonicalize.
    try:
        Chem.SanitizeMol(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        return None
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
# ---------------------------------------------------------------------------
|
|
438
|
+
# Public API
|
|
439
|
+
# ---------------------------------------------------------------------------
|
|
440
|
+
|
|
441
|
+
# Quick-reject: longer inputs are not treated as condensed formulae.
_MAX_FORMULA_LEN = 40

# Quick-reject: strings that look like IUPAC/common names or sentences.
# First alternative: begins with a letter and contains whitespace (the
# IGNORECASE flag makes ``[a-z]`` match either case).  Second alternative:
# ends with one of the listed name suffixes.
_IUPAC_LIKE = re.compile(
    r"(?:^[a-z].*\s)"
    + "|(?:"
    + "|".join(
        suffix + "$"
        for suffix in (
            "amine", "acid", "ether", "oxide", "chloride", "bromide",
            "iodide", "hydride", "phosphine", "carbonate", "aldehyde",
            "ketone", "alcohol",
        )
    )
    + ")",
    re.IGNORECASE,
)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def resolve_condensed_formula(formula: str) -> Optional[str]:
    """Convert a condensed structural formula to canonical SMILES.

    *formula* is tokenized against the superatom abbreviation vocabulary
    and the tokens are assembled into a molecule with RDKit.

    Returns the canonical SMILES, or ``None`` when the input is empty,
    longer than ``_MAX_FORMULA_LEN``, looks like a name rather than a
    formula, cannot be tokenized, has fewer than two non-count tokens,
    or cannot be assembled.

    Intended as one tier of the reagent resolution chain (after the
    reagent dictionary, before OPSIN): returning ``None`` lets the later
    tiers try.
    """
    if not formula:
        return None
    if len(formula) > _MAX_FORMULA_LEN:
        return None

    clean = formula.strip()

    # Reject blanks, name-like strings and anything with more than one
    # word (IUPAC names, reaction descriptions).
    if (not clean) or _IUPAC_LIKE.search(clean) or (" " in clean):
        return None

    tokens = tokenize(clean)
    if not tokens:
        return None

    # A compound needs at least two tokens besides counts; a lone
    # abbreviation is left to the reagent DB.
    non_count_tokens = sum(1 for kind, _ in tokens if kind != "count")
    if non_count_tokens < 2:
        return None

    return _assemble(tokens)
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Manage a bundled JRE for OPSIN.
|
|
2
|
+
|
|
3
|
+
A JRE zip is shipped inside the package (``cdxml_toolkit/_jre/``).
|
|
4
|
+
On first use it is extracted to ``~/.cdxml-toolkit/jre/`` so that
|
|
5
|
+
py2opsin can run without requiring the user to install Java.
|
|
6
|
+
|
|
7
|
+
Discovery order (used by :func:`get_java`):
|
|
8
|
+
1. System Java on PATH or JAVA_HOME
|
|
9
|
+
2. Already-extracted JRE at ``~/.cdxml-toolkit/jre/``
|
|
10
|
+
3. Extract from bundled zip (one-time, ~45 MB)
|
|
11
|
+
4. Download from Adoptium API (network fallback)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import io
|
|
17
|
+
import os
|
|
18
|
+
import shutil
|
|
19
|
+
import sys
|
|
20
|
+
import zipfile
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Optional
|
|
23
|
+
|
|
24
|
+
# Directory under the user's home into which the JRE is extracted.
_JRE_BASE = Path.home().joinpath(".cdxml-toolkit", "jre")

# JRE archive shipped inside the package: ``_jre/`` next to this module's
# parent directory.
# NOTE(review): the archive name and the download URL below both refer to
# a Windows x64 build; behaviour on other platforms should be confirmed.
_BUNDLED_ZIP = Path(__file__).resolve().parent.parent.joinpath(
    "_jre", "temurin-21-jre-win-x64.zip"
)

# Adoptium API endpoint used as the network fallback.
_ADOPTIUM_URL = (
    "https://api.adoptium.net/v3/binary/latest/21/ga/"
    "windows/x64/jre/hotspot/normal/eclipse"
    "?project=jdk"
)

# Path of the java executable once located (None until then).
_java_exe: Optional[str] = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _find_system_java() -> Optional[str]:
    """Locate an installed ``java`` via PATH, then via ``JAVA_HOME``.

    Returns the executable's path, or ``None`` when PATH has no ``java``
    and ``$JAVA_HOME/bin`` holds neither ``java.exe`` nor ``java``.
    """
    on_path = shutil.which("java")
    if on_path:
        return on_path

    home = os.environ.get("JAVA_HOME")
    if not home:
        return None

    bin_dir = os.path.join(home, "bin")
    for exe_name in ("java.exe", "java"):
        exe_path = os.path.join(bin_dir, exe_name)
        if os.path.isfile(exe_path):
            return exe_path
    return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _find_extracted_java() -> Optional[str]:
    """Look for a java executable in a JRE already unpacked under ``_JRE_BASE``.

    Each immediate subdirectory of ``_JRE_BASE`` is checked for
    ``bin/java.exe`` and then ``bin/java`` (the archive unpacks into a
    subdirectory such as ``jdk-21.0.10+7-jre/``).  Returns the first hit
    as a string, or ``None`` when the base directory is missing or holds
    no such executable.
    """
    if not _JRE_BASE.is_dir():
        return None

    for child in _JRE_BASE.iterdir():
        if not child.is_dir():
            continue
        bin_dir = child / "bin"
        for exe_name in ("java.exe", "java"):
            exe_path = bin_dir / exe_name
            if exe_path.is_file():
                return str(exe_path)
    return None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _extract_bundled_jre() -> Optional[str]:
    """Unpack the packaged JRE archive into ``_JRE_BASE``.

    Progress and failures are reported on stderr.  Returns the path of
    the extracted java executable, or ``None`` when the archive is
    missing, extraction raises, or no executable is found afterwards.
    """
    if not _BUNDLED_ZIP.is_file():
        return None

    _JRE_BASE.mkdir(parents=True, exist_ok=True)
    print(" [cdxml-toolkit] Extracting bundled JRE (one-time)...",
          file=sys.stderr)

    try:
        with zipfile.ZipFile(_BUNDLED_ZIP) as archive:
            archive.extractall(_JRE_BASE)
    except Exception as exc:
        print(f" [cdxml-toolkit] JRE extraction failed: {exc}",
              file=sys.stderr)
        return None

    java_path = _find_extracted_java()
    if java_path:
        print(f" [cdxml-toolkit] JRE ready: {java_path}", file=sys.stderr)
    return java_path
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _download_jre() -> Optional[str]:
    """Fetch the JRE archive from ``_ADOPTIUM_URL`` and unpack it.

    Network fallback for installs without the bundled zip.  The archive
    is read fully into memory and extracted into ``_JRE_BASE``; progress
    and failures are reported on stderr.

    Returns the path of the extracted java executable, or ``None`` when
    ``urllib.request`` cannot be imported, the download or extraction
    raises, or no executable is found afterwards.
    """
    try:
        import urllib.request
    except ImportError:
        return None

    _JRE_BASE.mkdir(parents=True, exist_ok=True)

    print(" [cdxml-toolkit] Downloading JRE for OPSIN (~45 MB)...",
          file=sys.stderr)
    try:
        request = urllib.request.Request(
            _ADOPTIUM_URL, headers={"User-Agent": "cdxml-toolkit/0.5"})
        with urllib.request.urlopen(request, timeout=120) as response:
            payload = response.read()
    except Exception as exc:
        print(f" [cdxml-toolkit] JRE download failed: {exc}",
              file=sys.stderr)
        return None

    print(" [cdxml-toolkit] Extracting JRE...", file=sys.stderr)
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as archive:
            archive.extractall(_JRE_BASE)
    except Exception as exc:
        print(f" [cdxml-toolkit] JRE extraction failed: {exc}",
              file=sys.stderr)
        return None

    java_path = _find_extracted_java()
    if java_path:
        print(f" [cdxml-toolkit] JRE ready: {java_path}", file=sys.stderr)
    return java_path
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def get_java(download: bool = True) -> Optional[str]:
    """Return the path to a ``java`` executable, locating one if needed.

    A path found earlier in the process is returned straight from the
    module-level cache.  Otherwise the sources are tried in this order:

    1. System Java (PATH / JAVA_HOME)
    2. JRE already extracted under ``~/.cdxml-toolkit/jre/``
    3. Extraction of the zip bundled with the package
    4. Download from the Adoptium API (only when *download* is true)

    Args:
        download: Allow the network download as the last resort.

    Returns:
        Path to ``java`` or ``java.exe``, or None when nothing is found.
    """
    global _java_exe
    if _java_exe is not None:
        return _java_exe

    # Local sources, cheapest first; the first truthy result wins.
    for locate in (_find_system_java, _find_extracted_java,
                   _extract_bundled_jre):
        _java_exe = locate()
        if _java_exe:
            return _java_exe

    if not download:
        return None

    # Network fallback.
    _java_exe = _download_jre()
    return _java_exe
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def ensure_java_on_path(download: bool = True) -> bool:
    """Make sure ``java`` is discoverable by subprocess calls.

    Finds (or extracts/downloads) a JRE via :func:`get_java`, prepends
    its ``bin/`` directory to ``PATH`` unless it is already one of the
    PATH entries, and sets ``JAVA_HOME`` to the parent of that
    directory, so that py2opsin's ``subprocess.run(["java", ...])``
    works.

    Args:
        download: Passed through to :func:`get_java`.

    Returns:
        True if Java is available, False otherwise.
    """
    java = get_java(download=download)
    if not java:
        return False

    java_bin_dir = os.path.dirname(java)
    path = os.environ.get("PATH", "")

    def _norm(entry: str) -> str:
        # Same directory regardless of trailing separators or, on
        # case-insensitive platforms, letter case.
        return os.path.normcase(os.path.normpath(entry))

    # Compare against individual PATH entries.  A substring test on the
    # whole PATH string would treat "/usr/bin" as present when only
    # "/usr/bin-extra" is listed, leaving java undiscoverable.
    entries = {_norm(entry) for entry in path.split(os.pathsep) if entry}
    if java_bin_dir and _norm(java_bin_dir) not in entries:
        # Avoid a trailing separator when PATH was empty.
        os.environ["PATH"] = (
            java_bin_dir + os.pathsep + path if path else java_bin_dir
        )

    os.environ["JAVA_HOME"] = os.path.dirname(java_bin_dir)
    return True
|