cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,493 @@
1
+ """Condensed structural formula parser.
2
+
3
+ Converts chemist-shorthand condensed formulae (PhB(OH)₂, Et₃N, MeI)
4
+ to canonical SMILES by tokenizing against the superatom fragment
5
+ vocabulary (~2,850 entries) and assembling via RDKit.
6
+
7
+ This is a *generative* parser — it handles novel combinations like
8
+ PhB(OMe)₂ or PhB(OEt)₂ without needing a dictionary entry for every
9
+ whole molecule.
10
+
11
+ Grammar patterns handled:
12
+
13
+ group + atom/group MeI, BzCl, EtOH
14
+ group_n + central (+ more) Et₃N, Ph₃P, Me₃SiCl
15
+ left + atom + (group)_n PhB(OH)₂, PhB(OMe)₂
16
+ elem_n + chain Cl₂CHOCH₃, PhCH₂Br
17
+
18
+ Usage::
19
+
20
+ >>> from cdxml_toolkit.resolve.condensed_formula import resolve_condensed_formula
21
+ >>> resolve_condensed_formula("PhB(OH)2")
22
+ 'OB(O)c1ccccc1'
23
+ >>> resolve_condensed_formula("Et3N")
24
+ 'CCN(CC)CC'
25
+ """
26
+
27
+ import re
28
+ from typing import Any, Dict, List, Optional, Tuple
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Element table (symbols recognised as bare atoms in condensed formulae)
32
+ # ---------------------------------------------------------------------------
33
+
34
# Two-letter element symbols.  The tokenizer tests these (exact case) before
# single-letter symbols so that e.g. "Cl" is not read as "C" plus a stray "l".
_TWO_LETTER_ELEMENTS = set(
    "He Li Be Ne Na Mg Al Si Cl Ar "
    "Ca Sc Ti Cr Mn Fe Co Ni Cu Zn "
    "Ga Ge As Se Br Kr Rb Sr Zr Nb "
    "Mo Ru Rh Pd Ag Cd In Sn Sb Te "
    "Cs Ba La Ce Hf Ta Re Os Ir Pt "
    "Au Hg Tl Pb Bi".split()
)

# One-letter element symbols (matched as uppercase only).
_ONE_LETTER_ELEMENTS = set("H B C N O F P S K I V Y W U".split())

# Symbols that are written inside square brackets when emitted as SMILES.
# "H" is listed because an explicit hydrogen atom needs brackets.
_BRACKET_ELEMENTS = set(
    "H "
    "Li Be Na Mg Al Si Ca Sc Ti Cr Mn "
    "Fe Co Ni Cu Zn Ga Ge As Se Rb Sr "
    "Zr Nb Mo Ru Rh Pd Ag Cd In Sn Sb "
    "Te Cs Ba La Ce Hf Ta Re Os Ir Pt "
    "Au Hg Tl Pb Bi K V Y W U".split()
)

# SMILES organic subset: these symbols may be written without brackets.
_ORGANIC_SUBSET = set("B C N O P S F Cl Br I".split())

# Lower-cased element symbols.  Superatom-table keys equal to one of these
# (e.g. "n", "o", "co", "sn", "zn") are skipped during abbreviation matching
# so that the text is interpreted as the element instead.
_ELEMENT_COLLISIONS = set()
for _symbols in (_ONE_LETTER_ELEMENTS, _TWO_LETTER_ELEMENTS):
    _ELEMENT_COLLISIONS.update(_sym.lower() for _sym in _symbols)
del _symbols
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Tokenizer
73
+ # ---------------------------------------------------------------------------
74
+
75
def _get_abbrev_table() -> Dict[str, str]:
    """Fetch the superatom lookup used for abbreviation matching.

    The import is done lazily inside the function.  Callers in this module
    look entries up by lower-cased key and treat the values as SMILES.
    """
    from .superatom_table import get_superatom_table

    abbrev_table = get_superatom_table()
    return abbrev_table
79
+
80
+
81
# Unicode subscript digits (U+2080..U+2089) and their ASCII equivalents.
# ``str.isdigit()`` accepts the subscripts but ``int()`` rejects them, so they
# are normalised before scanning.
_SUBSCRIPT_TO_ASCII = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")


def tokenize(formula: str) -> List[Tuple[str, Any]]:
    """Tokenize a condensed structural formula.

    Returns a list of ``(token_type, value)`` tuples where *token_type*
    is one of ``'abbrev'``, ``'element'``, ``'count'``,
    ``'paren_open'``, ``'paren_close'``.

    Matching order at each position:

    1. space (skipped), parentheses, digit runs (``'count'``, as ``int``);
    2. two-letter element symbols, exact case (so ``Co`` is cobalt even if
       the superatom table has a ``co`` entry);
    3. superatom abbreviations, longest match first, case-insensitive,
       ignoring keys that collide with an element symbol;
    4. single-letter element symbols (uppercase only).

    Unicode subscript digits (``PhB(OH)₂``) are converted to ASCII digits
    first, so the values of the returned tokens only ever contain ASCII
    digits.

    Returns an empty list if the formula contains unrecognisable tokens.
    """
    table = _get_abbrev_table()
    tokens: List[Tuple[str, Any]] = []

    # One-to-one character mapping, so positions are unchanged.
    s = formula.translate(_SUBSCRIPT_TO_ASCII)
    n = len(s)

    # Longest key bounds the look-ahead window for abbreviation matching.
    max_abbrev_len = max((len(k) for k in table), default=0)

    i = 0
    while i < n:
        ch = s[i]

        # Skip spaces.
        if ch == " ":
            i += 1
            continue

        # Parentheses.
        if ch == "(":
            tokens.append(("paren_open", "("))
            i += 1
            continue
        if ch == ")":
            tokens.append(("paren_close", ")"))
            i += 1
            continue

        # Digit run -> count.  isdecimal() (not isdigit()) is used because
        # it only accepts characters that int() can actually parse;
        # superscripts and similar fall through and are rejected below.
        if ch.isdecimal():
            j = i
            while j < n and s[j].isdecimal():
                j += 1
            tokens.append(("count", int(s[i:j])))
            i = j
            continue

        # Two-letter element, exact case, before abbreviations.
        pair = s[i:i + 2]
        if len(pair) == 2 and pair in _TWO_LETTER_ELEMENTS:
            tokens.append(("element", pair))
            i += 2
            continue

        # Abbreviation: greedy longest match, case-insensitive.  Keys that
        # are also (lower-cased) element symbols are left to element
        # matching.
        matched = False
        hi = min(max_abbrev_len, n - i)
        for length in range(hi, 0, -1):
            candidate = s[i:i + length]
            key = candidate.lower()
            if key in table and key not in _ELEMENT_COLLISIONS:
                tokens.append(("abbrev", candidate))
                i += length
                matched = True
                break
        if matched:
            continue

        # Single-letter element (uppercase only).
        if ch in _ONE_LETTER_ELEMENTS:
            tokens.append(("element", ch))
            i += 1
            continue

        # Unrecognised character -> bail out.
        return []

    return tokens
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # SMILES assembler
168
+ # ---------------------------------------------------------------------------
169
+
170
def _element_smiles(sym: str) -> str:
    """Render an element symbol as a SMILES atom.

    Symbols listed in ``_BRACKET_ELEMENTS`` are wrapped in square brackets;
    every other symbol is returned unchanged.
    """
    needs_brackets = sym in _BRACKET_ELEMENTS
    return f"[{sym}]" if needs_brackets else sym
175
+
176
+
177
def _mol_from_token(tok_type: str, tok_val: str,
                    table: Dict[str, str]) -> Optional["Chem.Mol"]:
    """Build an RDKit molecule for one ``'abbrev'`` or ``'element'`` token.

    Args:
        tok_type: Token type as produced by :func:`tokenize`.
        tok_val: Token text (abbreviation as written, or element symbol).
        table: Abbreviation table, keyed by lower-cased abbreviation.

    Returns:
        The molecule, or ``None`` when the token type is not handled, the
        abbreviation is not in *table*, or its string cannot be turned
        into a sanitised molecule.
    """
    from rdkit import Chem

    if tok_type == "element":
        return Chem.MolFromSmiles(_element_smiles(tok_val))

    if tok_type != "abbrev":
        return None

    fragment_str = table.get(tok_val.lower())
    if fragment_str is None:
        return None

    parsed = Chem.MolFromSmiles(fragment_str)
    if parsed is not None:
        return parsed

    # Not valid SMILES: retry as SMARTS (some table entries are SMARTS),
    # then sanitise so the result behaves like a regular molecule.
    query = Chem.MolFromSmarts(fragment_str)
    if query is None:
        return None
    try:
        editable = Chem.RWMol(query)
        Chem.SanitizeMol(editable)
        return editable.GetMol()
    except Exception:
        return None
204
+
205
+
206
def _attachment_idx(mol: "Chem.Mol") -> int:
    """Return the index of the atom through which *mol* is bonded.

    This is always ``0``: the assembler relies on the convention that the
    first atom written in a fragment's SMILES is its attachment point.
    The *mol* argument is not inspected.
    """
    first_atom = 0
    return first_atom
214
+
215
+
216
def _combine(mol_a: "Chem.Mol", idx_a: int,
             mol_b: "Chem.Mol", idx_b: int) -> "Chem.Mol":
    """Join two molecules with a single bond and return the result.

    The bond connects atom *idx_a* of *mol_a* to atom *idx_b* of *mol_b*.
    In the merged molecule the atoms of *mol_b* follow those of *mol_a*,
    so *idx_b* is shifted by the atom count of *mol_a*.
    """
    from rdkit import Chem

    shift = mol_a.GetNumAtoms()
    editable = Chem.RWMol(Chem.CombineMols(mol_a, mol_b))
    editable.AddBond(idx_a, shift + idx_b, Chem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(editable)
    except Exception:
        # Deliberately best-effort: intermediate (e.g. organometallic)
        # structures may not sanitise; the unsanitised result is returned.
        pass
    return editable.GetMol()
234
+
235
+
236
def _assemble(tokens: List[Tuple[str, Any]]) -> Optional[str]:
    """Assemble a canonical SMILES from a token list.

    Walks the tokens produced by :func:`tokenize` once, growing a molecule
    and tracking a "tip" atom to which the next fragment is bonded.
    Patterns handled:

    - Linear chaining (MeI, BzCl)
    - Multiplied prefix groups (Et₃N, Ph₃P): the group is held as
      *pending* until the following fragment (the central atom) arrives
    - Parenthesised branches with multiplier (PhB(OH)₂)
    - Element subscripts in linear chains (Cl₂CH…)

    Hydrogen is treated as terminal: attaching H (with or without a
    count) never moves the tip, so the next fragment bonds to the same
    heavy atom (CHCl₃, CH₂Br).

    Args:
        tokens: ``(token_type, value)`` tuples.

    Returns:
        Canonical SMILES, or ``None`` if *tokens* is empty, a token cannot
        be converted to a fragment, parentheses are unbalanced on the
        closing side, or the final molecule fails sanitisation.
    """
    if not tokens:
        return None

    from rdkit import Chem

    table = _get_abbrev_table()

    # --- State ---
    mol = None          # Molecule built so far
    tip = None          # Atom index in mol where the next fragment attaches
    pending = None      # (mol, attach_idx, count) waiting for a central atom
    branch_stack = []   # Saved (mol, tip, branch_frags) per open parenthesis
    branch_frags = []   # (fragment, attach_idx) collected in current parens
    in_branch = 0       # Parenthesis nesting depth

    i = 0
    while i < len(tokens):
        tok_type, tok_val = tokens[i]

        # --- Parenthesis open ---
        if tok_type == "paren_open":
            branch_stack.append((mol, tip, branch_frags[:]))
            branch_frags = []
            in_branch += 1
            i += 1
            continue

        # --- Parenthesis close ---
        if tok_type == "paren_close":
            if not branch_stack:
                return None  # unmatched paren

            # Multiplier directly after ")" applies to the whole branch.
            count = 1
            if i + 1 < len(tokens) and tokens[i + 1][0] == "count":
                count = tokens[i + 1][1]
                i += 1  # consume the count

            # Chain the collected fragments into one branch molecule.
            branch_mol = None
            branch_tip = None
            for frag_mol, frag_attach in branch_frags:
                if branch_mol is None:
                    branch_mol = frag_mol
                    branch_tip = frag_attach
                else:
                    new_mol = _combine(branch_mol, branch_tip,
                                       frag_mol, frag_attach)
                    branch_tip = branch_mol.GetNumAtoms() + frag_attach
                    branch_mol = new_mol

            # Restore the state saved at the matching "(".
            parent_mol, parent_tip, parent_branch_frags = branch_stack.pop()
            branch_frags = parent_branch_frags
            in_branch -= 1

            if branch_mol is not None and parent_mol is not None:
                # Attach `count` copies of the branch to the parent's tip.
                for _ in range(count):
                    parent_mol = _combine(parent_mol, parent_tip,
                                          branch_mol,
                                          _attachment_idx(branch_mol))
            elif branch_mol is not None:
                # Nothing before the parenthesis: the branch starts the
                # molecule (the multiplier is not applied in this case).
                parent_mol = branch_mol
                parent_tip = _attachment_idx(branch_mol)

            mol = parent_mol
            tip = parent_tip
            i += 1
            continue

        # --- Count not consumed by a look-ahead ---
        # Counts after a group/element or after ")" are consumed where
        # they are peeked, so this branch only sees the remaining ones
        # (e.g. inside parentheses or a leading digit).
        if tok_type == "count":
            if pending is not None:
                p_mol, p_attach, _ = pending
                pending = (p_mol, p_attach, tok_val)
            i += 1
            continue

        # --- Abbreviation or element ---
        if tok_type in ("abbrev", "element"):
            frag = _mol_from_token(tok_type, tok_val, table)
            if frag is None:
                return None
            frag_attach = _attachment_idx(frag)
            is_hydrogen = (tok_type == "element" and tok_val == "H")

            # Inside parentheses fragments are only collected; they are
            # joined when the parenthesis closes.
            if in_branch > 0:
                branch_frags.append((frag, frag_attach))
                i += 1
                continue

            # A pending multiplied group means this fragment is its
            # central atom: attach `count` copies of the group to it.
            if pending is not None:
                p_mol, p_attach, p_count = pending
                central = frag
                central_tip = frag_attach
                for _ in range(p_count):
                    central = _combine(central, central_tip,
                                       p_mol, p_attach)
                if mol is not None:
                    # Bond the central atom to the existing molecule; the
                    # tip intentionally stays where it was.
                    central = _combine(mol, tip, central, central_tip)
                else:
                    tip = central_tip
                mol = central
                pending = None
                i += 1
                continue

            # Is the next token a count for this fragment?
            if i + 1 < len(tokens) and tokens[i + 1][0] == "count":
                count = tokens[i + 1][1]

                # Counted hydrogen always goes onto the current tip
                # (CH₂Br, NaBH₄); it is never a central atom.
                if is_hydrogen:
                    if mol is not None:
                        for _ in range(count):
                            mol = _combine(mol, tip, frag, frag_attach)
                    i += 2
                    continue

                if i + 2 < len(tokens) and tokens[i + 2][0] in (
                        "abbrev", "element", "paren_open"):
                    # group_n followed by something: hold until the
                    # central atom is known.
                    pending = (frag, frag_attach, count)
                else:
                    # Trailing group_n: replicate onto the tip
                    # (e.g. a final Cl₂ puts two Cl on the tip).
                    if mol is not None:
                        for _ in range(count):
                            mol = _combine(mol, tip, frag, frag_attach)
                    else:
                        mol = frag
                        tip = frag_attach
                i += 2  # skip the fragment and its count
                continue

            # Simple linear attachment.
            if mol is None:
                mol = frag
                tip = frag_attach
            else:
                new_mol = _combine(mol, tip, frag, frag_attach)
                # Advance the tip to the new fragment, except for a lone
                # hydrogen: H is terminal, so whatever follows must bond
                # to the same heavy atom (CHCl₃ -> C bears H and 3 Cl).
                if not is_hydrogen:
                    tip = mol.GetNumAtoms() + frag_attach
                mol = new_mol

            i += 1
            continue

        # Unknown token type -> bail
        return None

    # --- Flush any remaining pending fragment ---
    if pending is not None:
        p_mol, p_attach, p_count = pending
        if mol is not None:
            for _ in range(p_count):
                mol = _combine(mol, tip, p_mol, p_attach)
        elif p_count == 1:
            mol = p_mol
        else:
            return None  # dangling multiplied fragment with no central

    if mol is None:
        return None

    # Validate and canonicalize
    try:
        Chem.SanitizeMol(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        return None
435
+
436
+
437
+ # ---------------------------------------------------------------------------
438
+ # Public API
439
+ # ---------------------------------------------------------------------------
440
+
441
# Quick-reject pattern for inputs that look like names or sentences rather
# than condensed formulae.  Two alternatives, matched case-insensitively:
#   1. the text starts with a letter and contains whitespace later on;
#   2. the text ends in a common compound-class word (amine, acid, ...).
_IUPAC_LIKE = re.compile(
    r"(?:^[a-z].*\s)"
    r"|(?:"
    r"amine$|acid$|ether$|oxide$|"
    r"chloride$|bromide$|iodide$|hydride$|"
    r"phosphine$|carbonate$|aldehyde$|ketone$|alcohol$"
    r")",
    flags=re.IGNORECASE,
)

# Inputs longer than this many characters are rejected without parsing.
_MAX_FORMULA_LEN = 40
451
+
452
+
453
def resolve_condensed_formula(formula: str) -> Optional[str]:
    """Resolve a condensed structural formula to canonical SMILES.

    The input is tokenized against the superatom abbreviation vocabulary
    and element symbols, then assembled into a molecule with RDKit.

    The function is meant to sit in a resolution chain: it gives up
    (returns ``None``) early for anything that does not look like a
    condensed formula so that other resolvers can try instead.

    Returns:
        Canonical SMILES, or ``None`` when the input is empty, longer
        than ``_MAX_FORMULA_LEN``, name-like, contains a space, cannot be
        tokenized, has fewer than two non-count tokens, or cannot be
        assembled.
    """
    # Cheap rejections first (length is checked on the raw input).
    if not formula:
        return None
    if len(formula) > _MAX_FORMULA_LEN:
        return None

    text = formula.strip()
    # Empty after stripping, name-like, or multi-word -> not a formula.
    if not text or _IUPAC_LIKE.search(text) or " " in text:
        return None

    tokens = tokenize(text)
    if not tokens:
        return None

    # A lone abbreviation or element is left to the reagent DB; counts do
    # not contribute towards the two-token minimum.
    structural = sum(1 for kind, _ in tokens if kind != "count")
    if structural < 2:
        return None

    return _assemble(tokens)
@@ -0,0 +1,195 @@
1
+ """Manage a bundled JRE for OPSIN.
2
+
3
+ A JRE zip is shipped inside the package (``cdxml_toolkit/_jre/``).
4
+ On first use it is extracted to ``~/.cdxml-toolkit/jre/`` so that
5
+ py2opsin can run without requiring the user to install Java.
6
+
7
+ Discovery order (used by :func:`get_java`):
8
+ 1. System Java on PATH or JAVA_HOME
9
+ 2. Already-extracted JRE at ``~/.cdxml-toolkit/jre/``
10
+ 3. Extract from bundled zip (one-time, ~45 MB)
11
+ 4. Download from Adoptium API (network fallback)
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import io
17
+ import os
18
+ import shutil
19
+ import sys
20
+ import zipfile
21
+ from pathlib import Path
22
+ from typing import Optional
23
+
24
# Directory (under the user's home) that holds the extracted JRE.
_JRE_BASE = Path.home().joinpath(".cdxml-toolkit", "jre")

# JRE archive shipped with the package: <package root>/_jre/<zip>, where the
# package root is the directory two levels above this file.
_BUNDLED_ZIP = (
    Path(__file__).resolve().parent.parent
    / "_jre"
    / "temurin-21-jre-win-x64.zip"
)

# Adoptium API endpoint used when no bundled archive is available.
_ADOPTIUM_URL = (
    "https://api.adoptium.net/v3/binary/latest/21/ga/"
    "windows/x64/jre/hotspot/normal/eclipse"
    "?project=jdk"
)

# Path of the java executable once found; None until then.
_java_exe: Optional[str] = None
38
+
39
+
40
def _find_system_java() -> Optional[str]:
    """Locate an already-installed ``java``.

    ``PATH`` is searched first (via :func:`shutil.which`); if that finds
    nothing, ``$JAVA_HOME/bin`` is checked for ``java.exe`` and then
    ``java``.  Returns the path found, or ``None``.
    """
    on_path = shutil.which("java")
    if on_path:
        return on_path

    home = os.environ.get("JAVA_HOME")
    if not home:
        return None

    bin_dir = os.path.join(home, "bin")
    for exe_name in ("java.exe", "java"):
        launcher = os.path.join(bin_dir, exe_name)
        if os.path.isfile(launcher):
            return launcher
    return None
53
+
54
+
55
def _find_extracted_java() -> Optional[str]:
    """Look for a java launcher inside a previously extracted JRE.

    Each immediate subdirectory of ``_JRE_BASE`` is checked for
    ``bin/java.exe`` and then ``bin/java`` (the archive unpacks into a
    versioned subdirectory).  Returns the first launcher found as a
    string, or ``None`` if ``_JRE_BASE`` is missing or holds no launcher.
    """
    if not _JRE_BASE.is_dir():
        return None

    subdirs = (child for child in _JRE_BASE.iterdir() if child.is_dir())
    for jre_root in subdirs:
        for exe_name in ("java.exe", "java"):
            launcher = jre_root / "bin" / exe_name
            if launcher.is_file():
                return str(launcher)
    return None
67
+
68
+
69
def _extract_bundled_jre() -> Optional[str]:
    """Unpack the packaged JRE archive into ``_JRE_BASE``.

    Progress and failures are reported on stderr.

    Returns:
        Path to the extracted java launcher, or ``None`` if the archive
        is not present, extraction fails, or no launcher is found
        afterwards.
    """
    if not _BUNDLED_ZIP.is_file():
        return None

    _JRE_BASE.mkdir(parents=True, exist_ok=True)

    print(" [cdxml-toolkit] Extracting bundled JRE (one-time)...",
          file=sys.stderr)
    try:
        with zipfile.ZipFile(_BUNDLED_ZIP) as archive:
            archive.extractall(_JRE_BASE)
    except Exception as exc:
        print(f" [cdxml-toolkit] JRE extraction failed: {exc}",
              file=sys.stderr)
        return None

    located = _find_extracted_java()
    if not located:
        return located
    print(f" [cdxml-toolkit] JRE ready: {located}", file=sys.stderr)
    return located
92
+
93
+
94
def _download_jre() -> Optional[str]:
    """Fetch a JRE archive from ``_ADOPTIUM_URL`` and unpack it.

    Network fallback for installs without the bundled archive.  The whole
    response is read into memory and extracted into ``_JRE_BASE``.
    Progress and failures are reported on stderr.

    Returns:
        Path to the extracted java launcher, or ``None`` if the download
        or extraction fails or no launcher is found afterwards.
    """
    try:
        import urllib.request
    except ImportError:
        return None

    _JRE_BASE.mkdir(parents=True, exist_ok=True)

    print(" [cdxml-toolkit] Downloading JRE for OPSIN (~45 MB)...",
          file=sys.stderr)
    try:
        request = urllib.request.Request(
            _ADOPTIUM_URL,
            headers={"User-Agent": "cdxml-toolkit/0.5"},
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            payload = response.read()
    except Exception as exc:
        print(f" [cdxml-toolkit] JRE download failed: {exc}",
              file=sys.stderr)
        return None

    print(" [cdxml-toolkit] Extracting JRE...", file=sys.stderr)
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as archive:
            archive.extractall(_JRE_BASE)
    except Exception as exc:
        print(f" [cdxml-toolkit] JRE extraction failed: {exc}",
              file=sys.stderr)
        return None

    located = _find_extracted_java()
    if not located:
        return located
    print(f" [cdxml-toolkit] JRE ready: {located}", file=sys.stderr)
    return located
132
+
133
+
134
def get_java(download: bool = True) -> Optional[str]:
    """Return the path to a ``java`` executable, locating one if needed.

    Sources are tried in this order and the first hit wins:
      1. System Java (PATH / JAVA_HOME)
      2. JRE already extracted under ``~/.cdxml-toolkit/jre/``
      3. Extraction of the archive bundled with the package
      4. Download from the Adoptium API (only when *download* is true)

    A successful result is remembered in the module-level ``_java_exe``
    and returned directly on later calls.

    Args:
        download: Permit the network download as a last resort.

    Returns:
        Path to ``java`` / ``java.exe``, or ``None`` if none was found.
    """
    global _java_exe

    if _java_exe is not None:
        return _java_exe

    # Steps 1-3: local sources, cheapest first.
    local_sources = (
        _find_system_java,
        _find_extracted_java,
        _extract_bundled_jre,
    )
    for locate in local_sources:
        _java_exe = locate()
        if _java_exe:
            return _java_exe

    # Step 4: network fallback, only if allowed.
    if not download:
        return None
    _java_exe = _download_jre()
    return _java_exe
175
+
176
+
177
def ensure_java_on_path(download: bool = True) -> bool:
    """Make sure ``java`` is discoverable by subprocess calls.

    Finds (or extracts/downloads) a JRE via :func:`get_java`.  If the
    directory containing the executable is not already one of the
    ``PATH`` entries, it is prepended to ``PATH`` and ``JAVA_HOME`` is
    set to its parent directory, so that py2opsin's
    ``subprocess.run(["java", ...])`` works.

    Args:
        download: Passed through to :func:`get_java`.

    Returns:
        True if Java is available, False otherwise.
    """
    java = get_java(download=download)
    if not java:
        return False

    java_bin_dir = os.path.dirname(java)
    current_path = os.environ.get("PATH", "")

    def _normalise(entry: str) -> str:
        # Compare directories, not raw strings: ignore trailing
        # separators and (on case-insensitive platforms) letter case.
        return os.path.normcase(os.path.normpath(entry))

    # Compare against individual PATH entries.  A plain substring test on
    # the whole PATH string would treat "/usr/bin" as present when PATH
    # only contains e.g. "/opt/usr/bin".
    known_dirs = {
        _normalise(entry)
        for entry in current_path.split(os.pathsep)
        if entry
    }

    # An empty dirname (bare "java") gives nothing meaningful to add.
    if java_bin_dir and _normalise(java_bin_dir) not in known_dirs:
        # Avoid a trailing separator (an empty PATH entry) when PATH was
        # previously empty.
        if current_path:
            os.environ["PATH"] = java_bin_dir + os.pathsep + current_path
        else:
            os.environ["PATH"] = java_bin_dir
        # JAVA_HOME is only pointed at this JRE when its bin directory had
        # to be added; an existing JAVA_HOME for a Java that is already on
        # PATH is left untouched.
        os.environ["JAVA_HOME"] = os.path.dirname(java_bin_dir)
    return True