cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,146 @@
1
+ """Superatom label → SMILES lookup table for abbreviation group expansion.
2
+
3
+ Provides a case-insensitive mapping from common abbreviation labels
4
+ (as they appear in ChemDraw CDXML ``NodeType="Fragment"`` nodes) to
5
+ SMILES strings. Used by ``rdkit_utils.frag_to_mw()`` to compute
6
+ molecular weight of fragments that contain abbreviated groups, which
7
+ would otherwise return None (element-0 dummy atoms).
8
+
9
+ Data sources:
10
+
11
+ 1. ``superatom_data.json`` (same directory as this module) — ~2,850 entries generated
12
+ from ChemScanner's superatom.txt (MIT/AGPL) plus additional common
13
+ ChemDraw abbreviations. Generated by
14
+ ``experiments/build_superatom_json.py``.
15
+
16
+ 2. RDKit built-in abbreviations (BSD):
17
+ ``rdkit.Chem.rdAbbreviations.GetDefaultAbbreviations()``
18
+ ~40 entries used as secondary source (only adds entries not already
19
+ in the JSON).
20
+
21
+ The SMILES represent standalone fragments whose first atom is the
22
+ attachment point. When computing MW contribution to a parent molecule,
23
+ callers must subtract 1.008 Da per attachment bond (one implicit H is
24
+ lost when the fragment bonds to the parent).
25
+ """
26
+
27
+ import json
28
+ import os
29
+ from typing import Dict, Optional
30
+
31
# Memo of molecular weights computed by ``lookup_mw``, keyed by the
# SMILES/SMARTS string that was looked up.  It starts out empty and is
# filled on demand, so importing this module never imports RDKit.
# Each value is Descriptors.MolWt(...) of the *standalone* fragment —
# callers subtract 1.008 per attachment bond.
_MW_CACHE: Dict[str, float] = {}

# The lookup table (lowercase label → SMILES); stays None until it is
# first requested through ``get_superatom_table``.
_TABLE: Optional[Dict[str, str]] = None

# Absolute path of the JSON data file, which sits next to this module.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
_JSON_PATH = os.path.join(_MODULE_DIR, "superatom_data.json")
42
+
43
+
44
def _build_table() -> Dict[str, str]:
    """Build the case-insensitive label → SMILES lookup table.

    Entries are gathered from two optional sources, in priority order:

    1. ``superatom_data.json`` at ``_JSON_PATH`` — loaded as-is when the
       file exists (its keys are expected to be lowercase already).
    2. RDKit's default abbreviations — each abbreviation's ``label`` and,
       when present, its ``displayLabel`` / ``displayLabelW`` are added in
       lowercase, but only for keys the table does not contain yet.  The
       stored value is the abbreviation's ``smarts`` string.

    An abbreviation without a SMARTS string contributes no entry at all,
    under any of its labels, so the table never maps a label to an empty
    value.

    Returns:
        A freshly built dict mapping lowercase labels to SMILES/SMARTS.
        It is empty when neither source is available.
    """
    table: Dict[str, str] = {}

    # Primary source: superatom_data.json
    if os.path.exists(_JSON_PATH):
        with open(_JSON_PATH, encoding="utf-8") as f:
            table.update(json.load(f))

    # Secondary source: RDKit built-in abbreviations
    try:
        from rdkit.Chem import rdAbbreviations

        for abbrev in rdAbbreviations.GetDefaultAbbreviations():
            smarts = abbrev.smarts
            if not smarts:
                # Nothing meaningful to map any of this entry's labels to.
                continue

            names = [abbrev.label]
            names.extend(
                getattr(abbrev, attr)
                for attr in ("displayLabel", "displayLabelW")
                if hasattr(abbrev, attr)
            )
            for name in names:
                key = name.lower()
                # Never overwrite: JSON entries and earlier labels win.
                if key and key not in table:
                    table[key] = smarts
    except (ImportError, AttributeError):
        pass  # RDKit not available (or too old); JSON entries suffice

    return table
76
+
77
+
78
def get_superatom_table() -> Dict[str, str]:
    """Return the shared label → SMILES lookup table.

    The table is built by ``_build_table()`` the first time this function
    is called and stored in the module-level ``_TABLE``; every later call
    returns that same dict object.

    Keys are lowercase.  Values are SMILES strings representing the
    standalone fragment (first atom = attachment point).
    """
    global _TABLE
    if _TABLE is not None:
        return _TABLE
    _TABLE = _build_table()
    return _TABLE
88
+
89
+
90
def lookup_smiles(label: str) -> Optional[str]:
    """Return the SMILES stored for *label*, or None when it is unknown.

    The label is lowercased before the lookup, so matching is
    case-insensitive.
    """
    table = get_superatom_table()
    key = label.lower()
    return table.get(key)
93
+
94
+
95
def lookup_mw(label: str) -> Optional[float]:
    """Return the standalone molecular weight for a superatom label.

    The weight is that of the free-standing fragment, i.e. it includes all
    implicit hydrogens.  When the group is attached to a parent molecule,
    callers should subtract 1.008 per attachment bond (one H is lost for
    each bond formed to the parent).

    Results are memoised in ``_MW_CACHE`` keyed by the looked-up string, so
    RDKit is only consulted once per distinct SMILES.

    Returns None when the label is unknown, when the stored string cannot
    be parsed, or when RDKit is unavailable or raises.
    """
    smiles = lookup_smiles(label)
    if smiles is None:
        return None

    # Serve repeated requests from the memo.
    try:
        return _MW_CACHE[smiles]
    except KeyError:
        pass

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors

        # Most entries are plain SMILES; fall back to SMARTS otherwise.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            mol = Chem.MolFromSmarts(smiles)
            if mol is None:
                return None
            mol = Chem.AddHs(mol)

        weight = Descriptors.MolWt(mol)
        _MW_CACHE[smiles] = weight
        return weight
    except Exception:
        # Covers a missing RDKit (ImportError) as well as any RDKit failure.
        return None
128
+
129
+
130
def get_abbrev_label(node) -> Optional[str]:
    """Return the visible abbreviation label of a CDXML node, or None.

    *node* is expected to be an ``<n NodeType="Fragment">`` element.  Only
    its direct ``<t>`` children are inspected (not those nested inside the
    inner ``<fragment>``), and within each only the direct ``<s>`` runs.

    The result is the concatenated text of the ``<s>`` runs of the first
    ``<t>`` that has any text; None is returned when no ``<t>`` does.
    """
    for text_elem in node.findall("t"):
        label = "".join(run.text for run in text_elem.findall("s") if run.text)
        if label:
            return label
    return None
@@ -0,0 +1,298 @@
1
+ """text_formatting.py — Shared chemical text formatting for ChemDraw CDXML.
2
+
3
+ Provides functions for building properly formatted <s> (styled text run)
4
+ elements in CDXML, handling two chemistry-specific typographic conventions:
5
+
6
+ 1. **Subscript digits in chemical formulas.**
7
+ In chemical notation, digits that follow letters are molecular counts and
8
+ must be rendered as subscripts: "CH3OH" → "CH₃OH", "Pd2(dba)3" → "Pd₂(dba)₃".
9
+ Plain numbers (temperatures "80 °C", durations "2 h", percentages "95%")
10
+ are left as normal text.
11
+
12
+ 2. **Italic prefixes in IUPAC / organic nomenclature.**
13
+ Stereochemical descriptors, positional locants, and heteroatom locants at
14
+ the start of a reagent name are italicised per IUPAC convention:
15
+ "n-BuLi" → "*n*-BuLi", "tert-BuOH" → "*tert*-BuOH", "N-Boc" → "*N*-Boc".
16
+
17
+ ChemDraw CDXML face codes used:
18
+ - face="96" (0x60 = Formula) — normal reagent text
19
+ - face="32" (0x20 = Subscript) — subscript digits
20
+ - face="2" (0x02 = Italic) — italic prefix runs
21
+
22
+ Previously duplicated across scheme_polisher.py and reaction_from_image.py.
23
+ Consolidated here for v0.3.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import re
29
+ from typing import Tuple
30
+ from xml.sax.saxutils import escape as xml_escape
31
+
32
# ---------------------------------------------------------------------------
# Subscriptable digit group in a chemical formula: a single letter or a
# closing parenthesis (group 1) directly followed by one or more digits
# (group 2).
#   "CH3" → ("H", "3")    "Pd2" → ("d", "2")    "(dba)3" → (")", "3")
# ---------------------------------------------------------------------------
SUBSCRIPT_RE = re.compile(r'([A-Za-z)])(\d+)')

# Underscore-prefixed alias, kept for callers that import the private name.
_SUBSCRIPT_RE = SUBSCRIPT_RE

# ---------------------------------------------------------------------------
# Italic prefixes recognised in organic chemistry nomenclature.
# They are compared case-sensitively against the start of the display name,
# in the order listed here (long alkyl forms ahead of the one-letter ones).
# NOTE: "S-" is listed twice (stereodescriptor and heteroatom locant); the
# second occurrence can never be the first match.
# ---------------------------------------------------------------------------
ITALIC_PREFIXES: list[str] = [
    # long-form alkyl descriptors
    "tert-",
    "sec-",
    "iso-",
    # single-letter alkyl descriptors
    "n-",
    "t-",
    "s-",
    "i-",
    # arene positions (ortho / meta / para)
    "o-",
    "m-",
    "p-",
    # stereochemical descriptors
    "cis-",
    "trans-",
    "rac-",
    "meso-",
    "R-",
    "S-",
    "syn-",
    "anti-",
    "exo-",
    "endo-",
    "E-",
    "Z-",
    "D-",
    "L-",
    # heteroatom locants (N-Boc, O-alkyl …)
    "N-",
    "O-",
    "S-",
    "C-",
    "P-",
]

# Underscore-prefixed alias, kept for backward compatibility.
_ITALIC_PREFIXES = ITALIC_PREFIXES
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Public helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def needs_subscript(text: str) -> bool:
    """Tell whether *text* holds formula digits that ChemDraw should subscript.

    The answer is ``True`` when a letter or closing parenthesis is directly
    followed by digits (``"CH3OH"``, ``"Cs2CO3"``, ``"Pd2(dba)3"``) and
    none of the "reaction conditions" patterns below occurs anywhere in the
    string.

    The answer is ``False`` as soon as *text* contains:

    * a temperature — digits before ``°`` (``"80 °C"``)
    * a duration — digits before ``h``/``m`` that is followed by
      whitespace, a comma or the end of the string (``"2 h"``)
    * a percentage — digits before ``%`` (``"95%"``)

    or when the whole string consists only of digits, whitespace,
    punctuation and unit letters (``"30 min"``, ``"120 °C, 2 h"``).

    Examples::

        >>> needs_subscript("Et3N")
        True
        >>> needs_subscript("DMF")
        False
        >>> needs_subscript("80 °C")
        False
    """
    condition_markers = (
        r'\d+\s*°',                # temperature
        r'\d+\s*[hm](?:\s|$|,)',   # duration
        r'\d+\s*%',                # percentage
    )
    if any(re.search(marker, text) for marker in condition_markers):
        return False

    # Strings built purely from numbers and unit characters.
    if re.fullmatch(r'[\d\s.,°ChmsMinHr/]+', text, re.IGNORECASE):
        return False

    return SUBSCRIPT_RE.search(text) is not None


# Private-name alias for callers that import ``_needs_subscript``.
_needs_subscript = needs_subscript
109
+
110
+
111
def split_italic_prefix(text: str) -> Tuple[str, str]:
    """Separate a leading italic nomenclature prefix from *text*.

    The entries of :data:`ITALIC_PREFIXES` are tried in list order and the
    first one that *text* starts with (case-sensitive) is split off.

    Returns ``(prefix, remainder)``, or ``("", text)`` when *text* starts
    with none of the known prefixes.

    Examples::

        >>> split_italic_prefix("n-BuLi")
        ('n-', 'BuLi')
        >>> split_italic_prefix("tert-BuOH")
        ('tert-', 'BuOH')
        >>> split_italic_prefix("Cs2CO3")
        ('', 'Cs2CO3')
    """
    matched = next(
        (candidate for candidate in ITALIC_PREFIXES if text.startswith(candidate)),
        "",
    )
    return matched, text[len(matched):]


_split_italic_prefix = split_italic_prefix  # backward-compat alias
133
+
134
+
135
def build_formatted_s_xml(
    text: str,
    font: str = "3",
    size: str = "10",
    color: str = "0",
    italic_font: str | None = None,
) -> str:
    """Build one or more CDXML ``<s>`` elements with correct chemical styling.

    This is the primary text-formatting entry point.  It handles:

    1. **Italic prefix** (``n-``, ``tert-``, ``sec-``, ``N-``, …) rendered
       with ``face="2"`` (Italic).
    2. **Subscript digits** after letters/closing-parens rendered with
       ``face="32"`` (Subscript).
    3. **Normal formula text** rendered with ``face="96"`` (Formula).

    Parameters
    ----------
    text : str
        The display text for a reagent or chemical name (e.g. ``"n-BuLi"``,
        ``"Cs2CO3"``, ``"Pd2(dba)3"``).
    font : str
        CDXML font id for normal + subscript runs (default ``"3"``).
    size : str
        Font size (default ``"10"``).
    color : str
        CDXML color id (default ``"0"``).
    italic_font : str or None
        If given, use this font id for the italic prefix run instead of
        *font*.  Useful when the italic style lives in a separate font entry.

    Returns
    -------
    str
        Raw XML string of ``<s>`` elements, concatenated without
        separators, ready to embed inside a ``<t>`` element.  An empty
        *text* yields an empty string.  Example for ``"n-BuLi"``::

            <s font="3" size="10" color="0" face="2">n-</s>
            <s font="3" size="10" color="0" face="96">BuLi</s>

    Notes
    -----
    The function is XML-safe: text content is escaped via
    ``xml.sax.saxutils.escape`` and the *font* / *size* / *color* attribute
    values are quoted via ``xml.sax.saxutils.quoteattr``, so a value
    containing ``"``, ``&`` or ``<`` cannot break the markup.  For ordinary
    ids the output is ``name="value"`` exactly as before.
    """
    from xml.sax.saxutils import quoteattr

    def styled_run(run_font, face: str, content: str) -> str:
        """Render one ``<s>`` run with the shared size/color attributes."""
        return (
            f'<s font={quoteattr(str(run_font))} size={quoteattr(str(size))} '
            f'color={quoteattr(str(color))} '
            f'face="{face}">{xml_escape(content)}</s>'
        )

    italic_prefix, rest = split_italic_prefix(text)
    parts: list[str] = []

    # ---- italic prefix ----
    if italic_prefix:
        prefix_font = italic_font if italic_font is not None else font
        parts.append(styled_run(prefix_font, "2", italic_prefix))

    # ---- remainder with subscript handling ----
    if rest:
        if needs_subscript(rest):
            pos = 0
            for m in SUBSCRIPT_RE.finditer(rest):
                # Everything up to the digits (including the letter/paren
                # that precedes them) stays in the normal formula face.
                digits_start = m.start(2)
                if pos < digits_start:
                    parts.append(styled_run(font, "96", rest[pos:digits_start]))
                parts.append(styled_run(font, "32", m.group(2)))
                pos = m.end()

            # Trailing text after the last subscripted digit group.
            if pos < len(rest):
                parts.append(styled_run(font, "96", rest[pos:]))
        else:
            parts.append(styled_run(font, "96", rest))

    return "".join(parts)


# Backward-compatible aliases (used by scheme_polisher and reaction_from_image).
_build_formatted_s_xml = build_formatted_s_xml
build_subscripted_s_xml = build_formatted_s_xml
_build_subscripted_s_xml = build_formatted_s_xml
231
+
232
+
233
+ # ---------------------------------------------------------------------------
234
+ # Self-test
235
+ # ---------------------------------------------------------------------------
236
if __name__ == "__main__":
    # Minimal self-test: prints one PASS/FAIL line per check and exits with
    # status 1 when any check failed.
    passed = 0
    failed = 0

    def check(label: str, got, expected):
        """Compare *got* with *expected*, report it and update the tallies."""
        global passed, failed
        if got == expected:
            print(f" PASS {label}")
            passed += 1
            return
        print(f" FAIL {label}")
        print(f" expected: {expected!r}")
        print(f" got: {got!r}")
        failed += 1

    print("text_formatting.py self-test")
    print("=" * 50)

    # --- needs_subscript ---
    subscript_cases = (
        ("CH3OH", True),
        ("DMF", False),
        ("Et3N", True),
        ("Cs2CO3", True),
        ("80 °C", False),
        ("2 h", False),
        ("95%", False),
    )
    for sample, want in subscript_cases:
        check(f"needs_subscript({sample!r})", needs_subscript(sample), want)

    # --- split_italic_prefix ---
    prefix_cases = (
        ("n-BuLi", ("n-", "BuLi")),
        ("Cs2CO3", ("", "Cs2CO3")),
        ("tert-BuOH", ("tert-", "BuOH")),
        ("N-Boc", ("N-", "Boc")),
    )
    for sample, want in prefix_cases:
        check(f"split_italic_prefix({sample!r})", split_italic_prefix(sample), want)

    # --- build_formatted_s_xml ---
    xml_et3n = build_formatted_s_xml("Et3N")
    et3n_checks = (
        ("build_formatted_s_xml('Et3N') contains <s>", "<s "),
        ("build_formatted_s_xml('Et3N') has subscript face", 'face="32"'),
        ("build_formatted_s_xml('Et3N') has formula face", 'face="96"'),
    )
    for label, needle in et3n_checks:
        check(label, needle in xml_et3n, True)

    xml_nbuli = build_formatted_s_xml("n-BuLi")
    nbuli_checks = (
        ("build_formatted_s_xml('n-BuLi') has italic face", 'face="2"'),
        ("build_formatted_s_xml('n-BuLi') italic run contains 'n-'",
         'face="2">n-</s>'),
    )
    for label, needle in nbuli_checks:
        check(label, needle in xml_nbuli, True)

    xml_dmf = build_formatted_s_xml("DMF")
    check("build_formatted_s_xml('DMF') — no subscript for plain text",
          'face="32"' in xml_dmf, False)

    # --- aliases ---
    check("build_subscripted_s_xml is build_formatted_s_xml",
          build_subscripted_s_xml is build_formatted_s_xml, True)
    check("_build_formatted_s_xml is build_formatted_s_xml",
          _build_formatted_s_xml is build_formatted_s_xml, True)

    print("=" * 50)
    print(f"Results: {passed} passed, {failed} failed")
    if failed:
        raise SystemExit(1)
    print("All tests passed.")