PyPI - cdxml-toolkit - Versions diffs - 0.5.0__py3-none-any.whl - Mend

cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91) hide show

cdxml_toolkit/__init__.py +18 -0
cdxml_toolkit/_jre/__init__.py +2 -0
cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
cdxml_toolkit/analysis/__init__.py +35 -0
cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
cdxml_toolkit/analysis/extract_nmr.py +47 -0
cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
cdxml_toolkit/cdxml_builder.py +920 -0
cdxml_toolkit/cdxml_utils.py +342 -0
cdxml_toolkit/chemdraw/__init__.py +5 -0
cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
cdxml_toolkit/constants.py +304 -0
cdxml_toolkit/coord_normalizer.py +438 -0
cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
cdxml_toolkit/image/__init__.py +15 -0
cdxml_toolkit/image/reaction_from_image.py +2103 -0
cdxml_toolkit/image/structure_from_image.py +1711 -0
cdxml_toolkit/layout/__init__.py +5 -0
cdxml_toolkit/layout/alignment.py +1642 -0
cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
cdxml_toolkit/layout/scheme_merger.py +2260 -0
cdxml_toolkit/mcp_server/__init__.py +0 -0
cdxml_toolkit/mcp_server/__main__.py +5 -0
cdxml_toolkit/mcp_server/server.py +1567 -0
cdxml_toolkit/naming/__init__.py +6 -0
cdxml_toolkit/naming/aligned_namer.py +2342 -0
cdxml_toolkit/naming/mol_builder.py +3722 -0
cdxml_toolkit/naming/name_decomposer.py +2843 -0
cdxml_toolkit/naming/reactions_datamol.json +2414 -0
cdxml_toolkit/office/__init__.py +5 -0
cdxml_toolkit/office/doc_from_template.py +722 -0
cdxml_toolkit/office/ole_embedder.py +808 -0
cdxml_toolkit/office/ole_extractor.py +272 -0
cdxml_toolkit/perception/__init__.py +10 -0
cdxml_toolkit/perception/compound_search.py +229 -0
cdxml_toolkit/perception/eln_csv_parser.py +240 -0
cdxml_toolkit/perception/rdf_parser.py +664 -0
cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
cdxml_toolkit/perception/reaction_parser.py +2150 -0
cdxml_toolkit/perception/scheme_reader.py +2948 -0
cdxml_toolkit/perception/scheme_refine.py +1404 -0
cdxml_toolkit/perception/scheme_segmenter.py +619 -0
cdxml_toolkit/perception/spatial_assignment.py +1013 -0
cdxml_toolkit/rdkit_utils.py +605 -0
cdxml_toolkit/render/__init__.py +17 -0
cdxml_toolkit/render/auto_layout.py +229 -0
cdxml_toolkit/render/compact_parser.py +632 -0
cdxml_toolkit/render/parser.py +706 -0
cdxml_toolkit/render/render_scheme.py +267 -0
cdxml_toolkit/render/renderer.py +2387 -0
cdxml_toolkit/render/schema.py +90 -0
cdxml_toolkit/render/scheme_maker.py +1043 -0
cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
cdxml_toolkit/resolve/__init__.py +13 -0
cdxml_toolkit/resolve/cas_resolver.py +430 -0
cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
cdxml_toolkit/resolve/condensed_formula.py +493 -0
cdxml_toolkit/resolve/jre_manager.py +195 -0
cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
cdxml_toolkit/resolve/reagent_db.py +285 -0
cdxml_toolkit/resolve/superatom_data.json +2856 -0
cdxml_toolkit/resolve/superatom_table.py +146 -0
cdxml_toolkit/text_formatting.py +298 -0
cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0

cdxml_toolkit/resolve/superatom_table.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""Superatom label → SMILES lookup table for abbreviation group expansion.
+Provides a case-insensitive mapping from common abbreviation labels
+(as they appear in ChemDraw CDXML ``NodeType="Fragment"`` nodes) to
+SMILES strings.  Used by ``rdkit_utils.frag_to_mw()`` to compute
+molecular weight of fragments that contain abbreviated groups, which
+would otherwise return None (element-0 dummy atoms).
+Data sources:
+  1. ``superatom_data.json`` (same directory as this module) — ~2,850 entries generated
+     from ChemScanner's superatom.txt (MIT/AGPL) plus additional common
+     ChemDraw abbreviations.  Generated by
+     ``experiments/build_superatom_json.py``.
+  2. RDKit built-in abbreviations (BSD):
+     ``rdkit.Chem.rdAbbreviations.GetDefaultAbbreviations()``
+     ~40 entries used as secondary source (only adds entries not already
+     in the JSON).
+The SMILES represent standalone fragments whose first atom is the
+attachment point.  When computing MW contribution to a parent molecule,
+callers must subtract 1.008 Da per attachment bond (one implicit H is
+lost when the fragment bonds to the parent).
+"""
+import json
+import os
+from typing import Dict, Optional
+# MW cache keyed by SMILES string.  It starts empty and is filled lazily by
+# ``lookup_mw``, so RDKit is not imported when this module is loaded.
+# Computed as: Descriptors.MolWt(Chem.MolFromSmiles(smiles)).
+# This is the "standalone" MW — callers subtract 1.008 per attachment bond.
+_MW_CACHE: Dict[str, float] = {}
+# The lookup table: lowercase label → SMILES.  Stays None until
+# ``get_superatom_table`` builds it on first use.
+_TABLE: Optional[Dict[str, str]] = None
+# Path to the JSON data file (same directory as this module)
+_JSON_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                          "superatom_data.json")
+def _build_table() -> Dict[str, str]:
+    """Build the case-insensitive label → SMILES lookup table."""
+    table: Dict[str, str] = {}
+    # Primary source: superatom_data.json
+    if os.path.exists(_JSON_PATH):
+        with open(_JSON_PATH, encoding="utf-8") as f:
+            data = json.load(f)
+        # JSON keys are already lowercase
+        table.update(data)
+    # Secondary source: RDKit built-in abbreviations
+    try:
+        from rdkit.Chem import rdAbbreviations
+        for abbrev in rdAbbreviations.GetDefaultAbbreviations():
+            label = abbrev.label.lower()
+            if label and label not in table:
+                sma = abbrev.smarts
+                if sma:
+                    table[label] = sma
+            if hasattr(abbrev, "displayLabel"):
+                dl = abbrev.displayLabel.lower()
+                if dl and dl not in table:
+                    table[dl] = abbrev.smarts
+            if hasattr(abbrev, "displayLabelW"):
+                dlw = abbrev.displayLabelW.lower()
+                if dlw and dlw not in table:
+                    table[dlw] = abbrev.smarts
+    except (ImportError, AttributeError):
+        pass  # RDKit not available; JSON entries suffice
+    return table
+def get_superatom_table() -> Dict[str, str]:
+    """Return the label → SMILES lookup table (singleton, built on first call).
+    Keys are lowercase.  Values are SMILES strings representing the
+    standalone fragment (first atom = attachment point).
+    """
+    global _TABLE
+    if _TABLE is None:
+        _TABLE = _build_table()
+    return _TABLE
+def lookup_smiles(label: str) -> Optional[str]:
+    """Look up a superatom label and return its SMILES, or None."""
+    return get_superatom_table().get(label.lower())
+def lookup_mw(label: str) -> Optional[float]:
+    """Look up a superatom label and return its standalone MW, or None.
+    The returned MW is for the standalone fragment (includes all implicit H).
+    Callers computing MW for an attached group should subtract 1.008 per
+    attachment bond (one H is lost when the group bonds to the parent).
+    Requires RDKit — returns None if RDKit is not available.
+    """
+    smiles = lookup_smiles(label)
+    if smiles is None:
+        return None
+    # Check cache
+    if smiles in _MW_CACHE:
+        return _MW_CACHE[smiles]
+    try:
+        from rdkit import Chem
+        from rdkit.Chem import Descriptors
+        # Try parsing as SMILES first (most entries), then as SMARTS
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            mol = Chem.MolFromSmarts(smiles)
+            if mol is not None:
+                mol = Chem.AddHs(mol)
+        if mol is None:
+            return None
+        mw = Descriptors.MolWt(mol)
+        _MW_CACHE[smiles] = mw
+        return mw
+    except (ImportError, Exception):
+        return None
+def get_abbrev_label(node) -> Optional[str]:
+    """Extract the visible abbreviation label text from a CDXML node.
+    Expects an ``<n NodeType="Fragment">`` element.  The label is in
+    a ``<t><s>...</s></t>`` child of the ``<n>`` (not inside the inner
+    ``<fragment>``).
+    Returns the concatenated text of all ``<s>`` elements, or None.
+    """
+    for t in node.findall("t"):
+        parts = []
+        for s in t.findall("s"):
+            if s.text:
+                parts.append(s.text)
+        if parts:
+            return "".join(parts)
+    return None

cdxml_toolkit/text_formatting.py ADDED Viewed

@@ -0,0 +1,298 @@
+"""text_formatting.py — Shared chemical text formatting for ChemDraw CDXML.
+Provides functions for building properly formatted <s> (styled text run)
+elements in CDXML, handling two chemistry-specific typographic conventions:
+1. **Subscript digits in chemical formulas.**
+   In chemical notation, digits that follow letters are molecular counts and
+   must be rendered as subscripts: "CH3OH" → "CH₃OH", "Pd2(dba)3" → "Pd₂(dba)₃".
+   Plain numbers (temperatures "80 °C", durations "2 h", percentages "95%")
+   are left as normal text.
+2. **Italic prefixes in IUPAC / organic nomenclature.**
+   Stereochemical descriptors, positional locants, and heteroatom locants at
+   the start of a reagent name are italicised per IUPAC convention:
+   "n-BuLi" → "*n*-BuLi", "tert-BuOH" → "*tert*-BuOH", "N-Boc" → "*N*-Boc".
+ChemDraw CDXML face codes used:
+  - face="96"  (0x60 = Formula)   — normal reagent text
+  - face="32"  (0x20 = Subscript) — subscript digits
+  - face="2"   (0x02 = Italic)    — italic prefix runs
+Previously duplicated across scheme_polisher.py and reaction_from_image.py.
+Consolidated here for v0.3.
+"""
+from __future__ import annotations
+import re
+from typing import Tuple
+from xml.sax.saxutils import escape as xml_escape
+# ---------------------------------------------------------------------------
+# Regex: letter (or closing paren) followed by one or more digits.
+# Matches subscriptable digit groups in chemical formulas.
+# Group 1 is the single character before the digits, group 2 the digit run.
+# Examples:  CH3 → ("H", "3"),  Pd2 → ("d", "2"),  (dba)3 → (")", "3")
+# ---------------------------------------------------------------------------
+SUBSCRIPT_RE = re.compile(r'([A-Za-z)])(\d+)')
+# Keep underscore-prefixed alias for backward compatibility with callers that
+# import the private name directly.
+_SUBSCRIPT_RE = SUBSCRIPT_RE
+# ---------------------------------------------------------------------------
+# Italic prefixes recognised in organic chemistry nomenclature.
+# Matched at the start of the display name, case-sensitive.
+# Longer forms come first so "tert-" is tried before "t-".
+# ---------------------------------------------------------------------------
+ITALIC_PREFIXES: list[str] = [
+    "tert-", "sec-", "iso-",          # long forms first
+    "n-", "t-", "s-", "i-",           # single-letter alkyl descriptors
+    "o-", "m-", "p-",                 # arene positional (ortho/meta/para)
+    "cis-", "trans-",
+    "rac-", "meso-",
+    "R-", "S-",
+    "syn-", "anti-",
+    "exo-", "endo-",
+    "E-", "Z-",
+    "D-", "L-",
+    "N-", "O-", "S-", "C-", "P-",    # heteroatom locants (N-Boc, O-alkyl …)
+]
+_ITALIC_PREFIXES = ITALIC_PREFIXES  # backward-compat alias
+# ---------------------------------------------------------------------------
+# Public helpers
+# ---------------------------------------------------------------------------
+def needs_subscript(text: str) -> bool:
+    """Determine whether *text* contains chemical-formula digits that should
+    be rendered as subscripts in ChemDraw.
+    Returns ``True`` for reagent formulas like ``"CH3OH"``, ``"Cs2CO3"``,
+    ``"Pd2(dba)3"`` where trailing digits represent atom counts.
+    Returns ``False`` for non-formula text that happens to contain digits:
+    * Temperatures — ``"80 °C"``
+    * Durations — ``"2 h"``, ``"30 min"``
+    * Percentages — ``"95%"``
+    * Pure-numeric / unit-only strings — ``"120 °C, 2 h"``
+    Examples::
+        >>> needs_subscript("Et3N")
+        True
+        >>> needs_subscript("DMF")
+        False
+        >>> needs_subscript("80 °C")
+        False
+    """
+    # Temperature (digits before °)
+    if re.search(r'\d+\s*°', text):
+        return False
+    # Duration (digits before h/m at word boundary)
+    if re.search(r'\d+\s*[hm](?:\s|$|,)', text):
+        return False
+    # Percentage
+    if re.search(r'\d+\s*%', text):
+        return False
+    # Pure numeric / unit strings like "reflux", "rt", "120 °C, 2 h"
+    if re.fullmatch(r'[\d\s.,°ChmsMinHr/]+', text, re.IGNORECASE):
+        return False
+    return bool(SUBSCRIPT_RE.search(text))
+# Private-name alias for callers that import ``_needs_subscript``.
+_needs_subscript = needs_subscript
+def split_italic_prefix(text: str) -> Tuple[str, str]:
+    """Split *text* into ``(italic_prefix, remainder)`` if it starts with a
+    recognised chemistry italic prefix (see :data:`ITALIC_PREFIXES`).
+    Returns ``("", text)`` when no prefix matches.
+    Examples::
+        >>> split_italic_prefix("n-BuLi")
+        ('n-', 'BuLi')
+        >>> split_italic_prefix("tert-BuOH")
+        ('tert-', 'BuOH')
+        >>> split_italic_prefix("Cs2CO3")
+        ('', 'Cs2CO3')
+    """
+    for prefix in ITALIC_PREFIXES:
+        if text.startswith(prefix):
+            return prefix, text[len(prefix):]
+    return "", text
+_split_italic_prefix = split_italic_prefix  # backward-compat alias
+def build_formatted_s_xml(
+    text: str,
+    font: str = "3",
+    size: str = "10",
+    color: str = "0",
+    italic_font: str | None = None,
+) -> str:
+    """Build one or more CDXML ``<s>`` elements with correct chemical styling.
+    This is the primary text-formatting entry point. It handles:
+    1. **Italic prefix** (``n-``, ``tert-``, ``sec-``, ``N-``, …) rendered
+       with ``face="2"`` (Italic).
+    2. **Subscript digits** after letters/closing-parens rendered with
+       ``face="32"`` (Subscript).
+    3. **Normal formula text** rendered with ``face="96"`` (Formula).
+    Parameters
+    ----------
+    text : str
+        The display text for a reagent or chemical name (e.g. ``"n-BuLi"``,
+        ``"Cs2CO3"``, ``"Pd2(dba)3"``).
+    font : str
+        CDXML font id for normal + subscript runs (default ``"3"`` = Arial).
+    size : str
+        Font size in points (default ``"10"``).
+    color : str
+        CDXML color id (default ``"0"`` = black).
+    italic_font : str or None
+        If given, use this font id for the italic prefix run instead of
+        *font*. Useful when the italic style lives in a separate font entry.
+    Returns
+    -------
+    str
+        Raw XML string of ``<s>`` elements ready to embed inside a ``<t>``
+        element.  Example for ``"n-BuLi"``::
+            <s font="3" size="10" color="0" face="2">n-</s>
+            <s font="3" size="10" color="0" face="96">BuLi</s>
+    Notes
+    -----
+    The function is XML-safe: all text content is escaped via
+    ``xml.sax.saxutils.escape``.
+    """
+    italic_prefix, rest = split_italic_prefix(text)
+    ifont = italic_font if italic_font is not None else font
+    parts: list[str] = []
+    # ---- italic prefix ----
+    if italic_prefix:
+        parts.append(
+            f'<s font="{ifont}" size="{size}" color="{color}" '
+            f'face="2">{xml_escape(italic_prefix)}</s>'
+        )
+    # ---- remainder with subscript handling ----
+    if rest:
+        if needs_subscript(rest):
+            pos = 0
+            for m in SUBSCRIPT_RE.finditer(rest):
+                normal_end = m.start(2)
+                if pos < normal_end:
+                    chunk = xml_escape(rest[pos:normal_end])
+                    parts.append(
+                        f'<s font="{font}" size="{size}" color="{color}" '
+                        f'face="96">{chunk}</s>'
+                    )
+                digits = xml_escape(m.group(2))
+                parts.append(
+                    f'<s font="{font}" size="{size}" color="{color}" '
+                    f'face="32">{digits}</s>'
+                )
+                pos = m.end()
+            if pos < len(rest):
+                chunk = xml_escape(rest[pos:])
+                parts.append(
+                    f'<s font="{font}" size="{size}" color="{color}" '
+                    f'face="96">{chunk}</s>'
+                )
+        else:
+            parts.append(
+                f'<s font="{font}" size="{size}" color="{color}" '
+                f'face="96">{xml_escape(rest)}</s>'
+            )
+    return "".join(parts)
+# Backward-compatible aliases (used by scheme_polisher and reaction_from_image).
+_build_formatted_s_xml = build_formatted_s_xml
+build_subscripted_s_xml = build_formatted_s_xml
+_build_subscripted_s_xml = build_formatted_s_xml
+# ---------------------------------------------------------------------------
+# Self-test
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    passed = 0
+    failed = 0
+    def check(label: str, got, expected):
+        global passed, failed
+        if got == expected:
+            print(f"  PASS  {label}")
+            passed += 1
+        else:
+            print(f"  FAIL  {label}")
+            print(f"        expected: {expected!r}")
+            print(f"        got:      {got!r}")
+            failed += 1
+    print("text_formatting.py self-test")
+    print("=" * 50)
+    # --- needs_subscript ---
+    check("needs_subscript('CH3OH')", needs_subscript("CH3OH"), True)
+    check("needs_subscript('DMF')", needs_subscript("DMF"), False)
+    check("needs_subscript('Et3N')", needs_subscript("Et3N"), True)
+    check("needs_subscript('Cs2CO3')", needs_subscript("Cs2CO3"), True)
+    check("needs_subscript('80 °C')", needs_subscript("80 °C"), False)
+    check("needs_subscript('2 h')", needs_subscript("2 h"), False)
+    check("needs_subscript('95%')", needs_subscript("95%"), False)
+    # --- split_italic_prefix ---
+    check("split_italic_prefix('n-BuLi')", split_italic_prefix("n-BuLi"), ("n-", "BuLi"))
+    check("split_italic_prefix('Cs2CO3')", split_italic_prefix("Cs2CO3"), ("", "Cs2CO3"))
+    check("split_italic_prefix('tert-BuOH')", split_italic_prefix("tert-BuOH"), ("tert-", "BuOH"))
+    check("split_italic_prefix('N-Boc')", split_italic_prefix("N-Boc"), ("N-", "Boc"))
+    # --- build_formatted_s_xml ---
+    xml_et3n = build_formatted_s_xml("Et3N")
+    check("build_formatted_s_xml('Et3N') contains <s>",
+          "<s " in xml_et3n, True)
+    check("build_formatted_s_xml('Et3N') has subscript face",
+          'face="32"' in xml_et3n, True)
+    check("build_formatted_s_xml('Et3N') has formula face",
+          'face="96"' in xml_et3n, True)
+    xml_nbuli = build_formatted_s_xml("n-BuLi")
+    check("build_formatted_s_xml('n-BuLi') has italic face",
+          'face="2"' in xml_nbuli, True)
+    check("build_formatted_s_xml('n-BuLi') italic run contains 'n-'",
+          'face="2">n-</s>' in xml_nbuli, True)
+    xml_dmf = build_formatted_s_xml("DMF")
+    check("build_formatted_s_xml('DMF') — no subscript for plain text",
+          'face="32"' in xml_dmf, False)
+    # --- aliases ---
+    check("build_subscripted_s_xml is build_formatted_s_xml",
+          build_subscripted_s_xml is build_formatted_s_xml, True)
+    check("_build_formatted_s_xml is build_formatted_s_xml",
+          _build_formatted_s_xml is build_formatted_s_xml, True)
+    print("=" * 50)
+    print(f"Results: {passed} passed, {failed} failed")
+    if failed:
+        raise SystemExit(1)
+    print("All tests passed.")