PyPI - chemap - Versions diffs - 0.3.0__tar.gz → 0.3.2__tar.gz - Mend

chemap 0.3.0tar.gz → 0.3.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{chemap-0.3.0 → chemap-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chemap
-Version: 0.3.0
+Version: 0.3.2
 Summary: Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations.
 License-Expression: MIT
 License-File: LICENSE

chemap-0.3.2/chemap/benchmarking/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+from .fingerprint_duplicates import (
+    load_duplicates_npz,
+    load_precomputed_duplicates_folder,
+    save_duplicates_npz,
+)
+from .utils import compute_compound_max_mass_differences, compute_duplicate_max_mass_differences
+__all__ = [
+    "compute_compound_max_mass_differences",
+    "compute_duplicate_max_mass_differences",
+    "load_duplicates_npz",
+    "load_precomputed_duplicates_folder",
+    "save_duplicates_npz",
+]

{chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_computation.py RENAMED Viewed

@@ -7,6 +7,7 @@ from joblib import Parallel, delayed
 from rdkit import Chem
 from sklearn.base import BaseEstimator, TransformerMixin
 from tqdm import tqdm
+from chemap.types import UnfoldedBinary, UnfoldedCount
 # -----------------------------
@@ -16,9 +17,6 @@ from tqdm import tqdm
 InvalidPolicy = Literal["drop", "keep", "raise"]
 Scaling = Optional[Literal["log"]]
-UnfoldedBinary = List[np.ndarray]  # list of int64 feature IDs per molecule
-UnfoldedCount = List[Tuple[np.ndarray, np.ndarray]]  # list of (int64 feature IDs, float32 values)
 FingerprintResult = Union[np.ndarray, sp.csr_matrix, UnfoldedBinary, UnfoldedCount]
@@ -532,9 +530,13 @@ def _skfp_configure_output(
     """
     Configure scikit-fingerprints/sklearn transformer to match (folded, return_csr).
-    - folded=True : use the transformer's folded output
-    - folded=False: require variant='raw_bits' if supported
-    - return_csr=True (only when folded=True): prefer transformer sparse CSR if supported
+    Supports two modes:
+    1) scikit-fingerprints classic:
+       - unfolded via variant='raw_bits' (if available)
+       - folded via variant='folded' (if variant exists)
+    2) ChemapBaseFingerprint style:
+       - unfolded via folded=False (no variant)
+       - folded via folded=True
     """
     params = fpgen.get_params(deep=False)
     updates: Dict[str, Any] = {}
@@ -545,22 +547,42 @@ def _skfp_configure_output(
     if "n_jobs" in params:
         updates["n_jobs"] = n_jobs
+    chemap_style = "folded" in params  # ChemapBaseFingerprint exposes folded param
+    # -------------------------
+    # UNFOLDED (cfg.folded=False)
+    # -------------------------
     if not cfg.folded:
+        if chemap_style:
+            if params.get("folded") is not False:
+                updates["folded"] = False
+            # We don't force sparse; unfolded returns lists anyway.
+            return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
+        # classic scikit-fingerprints route: needs variant='raw_bits'
         if "variant" not in params:
             raise NotImplementedError(
-                "Requested folded=False (unfolded), but this transformer does not expose a `variant` parameter "
-                "for an unfolded feature space (e.g., variant='raw_bits')."
+                "Requested folded=False (unfolded), but this transformer does not support "
+                "either chemap-style `folded` switching or an skfp-style `variant='raw_bits'`."
             )
         if params.get("variant") != "raw_bits":
             updates["variant"] = "raw_bits"
-        # For unfolded conversion we can accept either dense or CSR outputs, so we do not force "sparse".
         return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
-    # folded=True
+    # ------------------------
+    # FOLDED (cfg.folded=True)
+    # ------------------------
+    if chemap_style:
+        if params.get("folded") is not True:
+            updates["folded"] = True
+    # If it's classic skfp and currently set to raw_bits, restore folded variant
     if "variant" in params and params.get("variant") == "raw_bits":
         updates["variant"] = "folded"
+    # Prefer CSR if requested and supported
     if "sparse" in params:
         desired = bool(cfg.return_csr)
         if params.get("sparse") != desired:
@@ -577,35 +599,83 @@ def _compute_sklearn(
     show_progress: bool = False,
     n_jobs: int,
 ) -> FingerprintResult:
+    """
+    Compute fingerprints using sklearn/scikit-fingerprints style transformers.
+    Supports two kinds of transformers for unfolded output (cfg.folded=False):
+      1) Classic scikit-fingerprints: unfolded via variant='raw_bits' (matrix output)
+      2) ChemapBaseFingerprint style: unfolded via folded=False (list output)
+    Invalid-policy behavior:
+      - drop: returns only valid rows (shorter output)
+      - keep: aligns output to input, inserting empty rows for invalid smiles
+      - raise: raises on first invalid smiles
+    """
     fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs)
+    # Parse molecules with robust handling (None for invalid SMILES)
     mol_transformer = RobustMolTransformer(n_jobs=n_jobs)
     mols = mol_transformer.transform(smiles)
-    # Determine valid/invalid molecules and handle invalid according to policy.
     valid_idx = [i for i, m in enumerate(mols) if m is not None]
     invalid_idx = [i for i, m in enumerate(mols) if m is None]
     if invalid_idx and cfg.invalid_policy == "raise":
         raise ValueError(f"Invalid SMILES: {smiles[invalid_idx[0]]}")
-    # Fit/transform only valid mols (safe for most transformers)
     valid_mols = [mols[i] for i in valid_idx]
+    # Most skfp transformers are "fit-less" but expose fit; keep consistent behavior.
     fp.fit(valid_mols)
     X_valid = fp.transform(valid_mols)
-    # If policy is drop: just return X_valid (current behavior)
+    # -----------------------------
+    # Case A: transformer returns chemap-unfolded formats directly (list output)
+    # -----------------------------
+    is_list_unfolded = isinstance(X_valid, list) and (
+        len(X_valid) == 0
+        or isinstance(X_valid[0], np.ndarray)
+        or (isinstance(X_valid[0], tuple) and len(X_valid[0]) == 2)
+    )
+    if is_list_unfolded:
+        # In this case, we assume we are already in unfolded mode (cfg.folded=False).
+        # If cfg.folded=True but the transformer returns lists, that's an API mismatch.
+        if cfg.folded:
+            raise TypeError(
+                "Transformer returned chemap-unfolded list output while cfg.folded=True. "
+                "This likely indicates a misconfigured transformer."
+            )
+        if cfg.invalid_policy == "drop":
+            return X_valid
+        # keep alignment: reinsert empty rows
+        N = len(smiles)
+        if cfg.count:
+            X_full: UnfoldedCount = [_empty_unfolded_count() for _ in range(N)]
+        else:
+            X_full: UnfoldedBinary = [_empty_unfolded_binary() for _ in range(N)]
+        for out_i, orig_i in enumerate(valid_idx):
+            X_full[orig_i] = X_valid[out_i]  # type: ignore[index]
+        return X_full
+    # -----------------------------
+    # Case B: transformer returns a matrix (dense or sparse)
+    # -----------------------------
+    # Handle invalid-policy re-insertion for matrix outputs
     if cfg.invalid_policy == "drop":
         X = X_valid
     else:
-        # policy keep: reinsert empty rows to match input length
         N = len(smiles)
         if sp.issparse(X_valid):
             X_valid = X_valid.tocsr().astype(np.float32)
             D = X_valid.shape[1]
             X = sp.csr_matrix((N, D), dtype=np.float32)
-            # place valid rows
             X[valid_idx, :] = X_valid
         else:
             X_valid = np.asarray(X_valid, dtype=np.float32)
@@ -613,8 +683,8 @@ def _compute_sklearn(
             X = np.zeros((N, D), dtype=np.float32)
             X[valid_idx, :] = X_valid
+    # If unfolded requested, convert matrix -> chemap unfolded formats
     if not cfg.folded:
-        # unfolded output
         if sp.issparse(X):
             return _csr_matrix_to_unfolded(X.tocsr().astype(np.float32), cfg)
         return _dense_matrix_to_unfolded(np.asarray(X, dtype=np.float32), cfg)

chemap-0.3.2/chemap/fingerprints/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .chemap_base_fingerprint import ChemapBaseFingerprint
+from .element_count_fp import ElementCountFingerprint
+from .lingo import LingoFingerprint
+from .map4 import MAP4FPGen
+from .mhfp import MHFPEncoderLite
+__all__ = [
+    "ChemapBaseFingerprint",
+    "ElementCountFingerprint",
+    "LingoFingerprint",
+    "MAP4FPGen",
+    "MHFPEncoderLite",
+]

chemap-0.3.2/chemap/fingerprints/chemap_base_fingerprint.py ADDED Viewed

@@ -0,0 +1,76 @@
+from collections.abc import Sequence
+from typing import Any
+from joblib import Parallel, delayed
+from rdkit.Chem import Mol
+from skfp.bases import BaseFingerprintTransformer
+from skfp.utils import ensure_smiles
+from chemap.types import UnfoldedBinary, UnfoldedCount
+class ChemapBaseFingerprint(BaseFingerprintTransformer):
+    """
+    scikit-fingerprints BaseFingerprintTransformer extended with a `folded` switch.
+    The switch selects what `transform` returns:
+    - folded=True : whatever BaseFingerprintTransformer.transform returns (matrix output)
+    - folded=False: the result of the `_calculate_unfolded` hook (chemap unfolded formats,
+      i.e. lists of feature IDs / (IDs, values))
+    The class subclasses the scikit-fingerprints base on purpose, so that the folded path
+    keeps the upstream behavior wherever possible.
+    """
+    def __init__(
+        self,
+        *,
+        n_features_out: int,
+        count: bool = False,
+        sparse: bool = False,
+        folded: bool = True,
+        n_jobs: int | None = None,
+        batch_size: int | None = None,
+        verbose: int | dict = 0,
+    ):
+        # Everything except `folded` is handed to the parent constructor unchanged.
+        parent_kwargs = dict(
+            n_features_out=n_features_out,
+            count=count,
+            sparse=sparse,
+            n_jobs=n_jobs,
+            batch_size=batch_size,
+            verbose=verbose,
+        )
+        super().__init__(**parent_kwargs)
+        self.folded = folded
+    def transform(self, X: Sequence[str | Mol], copy: bool = False) -> Any:
+        """
+        Compute fingerprints for X.
+        folded=True : delegate to BaseFingerprintTransformer.transform (matrix output).
+        folded=False: convert X with ensure_smiles and return `_calculate_unfolded(...)`;
+                      `copy` is not used on this route.
+        """
+        if not self.folded:
+            # unfolded route: we accept SMILES or Mol, but Lingo-like methods want SMILES
+            return self._calculate_unfolded(ensure_smiles(X))
+        return super().transform(X, copy=copy)
+    # ---- hooks for subclasses ----
+    def _calculate_unfolded(self, X_smiles: Sequence[str]) -> UnfoldedBinary | UnfoldedCount:
+        """
+        Hook used by `transform` when folded=False; the base version always raises.
+        Overrides must return chemap unfolded formats:
+          - count=False: List[np.ndarray[int64]]
+          - count=True : List[Tuple[np.ndarray[int64], np.ndarray[float32]]]
+        """
+        raise NotImplementedError
+    # ---- helpers ----
+    def _parallel_map(self, fn, items):
+        """
+        Apply `fn` to every element of `items` and return the results.
+        Runs a plain list comprehension when the effective n_jobs is 1 (n_jobs=None counts
+        as 1); otherwise dispatches through joblib.Parallel, using batch_size="auto" when
+        self.batch_size is None.
+        """
+        workers = 1 if self.n_jobs is None else self.n_jobs
+        if workers == 1:
+            return [fn(item) for item in items]
+        chunk = "auto" if self.batch_size is None else self.batch_size
+        runner = Parallel(n_jobs=workers, batch_size=chunk)
+        return runner(delayed(fn)(item) for item in items)

chemap-0.3.2/chemap/fingerprints/lingo.py ADDED Viewed

@@ -0,0 +1,154 @@
+import hashlib
+import re
+from collections import defaultdict
+from collections.abc import Sequence
+from numbers import Integral
+import numpy as np
+from scipy.sparse import csr_array
+from skfp.utils import ensure_smiles
+from sklearn.utils._param_validation import Interval
+from chemap.fingerprints import ChemapBaseFingerprint
+class LingoFingerprint(ChemapBaseFingerprint):
+    """
+    Lingo fingerprint with chemap unfolded support.
+    folded=True:
+        fixed-size hashed vector of length `fp_size` (dense ndarray, or csr_array when sparse=True)
+    folded=False:
+        chemap unfolded formats. Feature IDs are stable 32-bit values (first 4 bytes of the
+        substring's SHA-1 digest, big-endian) stored in int64 arrays:
+          - count=False: List[np.ndarray[int64]] (sorted unique feature IDs)
+          - count=True : List[Tuple[np.ndarray[int64], np.ndarray[float32]]] (sorted IDs + counts)
+    Parameters
+    ----------
+    fp_size:
+        Length of the folded vector (also passed to the base class as n_features_out).
+    substring_length:
+        Length of the overlapping SMILES substrings that are counted.
+    count, sparse, folded, n_jobs, batch_size, verbose:
+        Forwarded unchanged to ChemapBaseFingerprint.
+    """
+    _parameter_constraints: dict = {
+        **ChemapBaseFingerprint._parameter_constraints,
+        "fp_size": [Interval(Integral, 1, None, closed="left")],
+        "substring_length": [Interval(Integral, 1, None, closed="left")],
+    }
+    def __init__(
+        self,
+        fp_size: int = 4096,
+        substring_length: int = 4,
+        count: bool = False,
+        sparse: bool = False,
+        folded: bool = True,
+        n_jobs: int | None = None,
+        batch_size: int | None = None,
+        verbose: int | dict = 0,
+    ):
+        super().__init__(
+            n_features_out=fp_size,
+            count=count,
+            sparse=sparse,
+            folded=folded,
+            n_jobs=n_jobs,
+            batch_size=batch_size,
+            verbose=verbose,
+        )
+        self.fp_size = fp_size
+        self.substring_length = substring_length
+    # --------------------
+    # Shared preprocessing
+    # --------------------
+    def smiles_to_dicts(self, X: Sequence[str]) -> list[dict[str, int]]:
+        """
+        Convert SMILES to dicts of substring counts (original Lingo raw features).
+        Each SMILES is normalized (digits 1-9 -> "0", "Cl" -> "L", "Br" -> "R") and then
+        every overlapping substring of length `substring_length` is counted. A SMILES
+        shorter than `substring_length` yields an empty dict.
+        """
+        X = ensure_smiles(X)
+        # same canonicalization as skfp
+        X = [re.sub(r"[123456789]", "0", smi) for smi in X]
+        X = [re.sub(r"Cl", "L", smi) for smi in X]
+        X = [re.sub(r"Br", "R", smi) for smi in X]
+        result: list[dict[str, int]] = []
+        L = self.substring_length
+        for smi in X:
+            d: defaultdict[str, int] = defaultdict(int)
+            # overlapping substrings
+            for i in range(len(smi) - L + 1):
+                d[smi[i : i + L]] += 1
+            result.append(dict(d))
+        return result
+    # --------------------
+    # Folded (matrix) path
+    # --------------------
+    def _calculate_fingerprint(self, X: Sequence[str]) -> np.ndarray | csr_array:
+        """
+        Folded (matrix) computation; presumably invoked by BaseFingerprintTransformer
+        when folded=True. Returns a csr_array when self.sparse is set, else a dense ndarray.
+        """
+        dicts = self.smiles_to_dicts(X)
+        arr = self._dicts_to_folded_array(dicts)
+        return csr_array(arr) if self.sparse else arr
+    def _dicts_to_folded_array(self, dicts: list[dict[str, int]]) -> np.ndarray:
+        """
+        Hash every substring with SHA-1 and fold it into [0..fp_size-1].
+        The bin index is the full digest (big-endian integer) modulo fp_size. With
+        count=True the substring counts are summed per bin (uint32), otherwise the bin
+        is set to 1 (uint8).
+        """
+        dtype = np.uint32 if self.count else np.uint8
+        out = np.zeros((len(dicts), self.fp_size), dtype=dtype)
+        for i, d in enumerate(dicts):
+            for token, c in d.items():
+                digest = hashlib.sha1(token.encode("utf-8"), usedforsecurity=False).digest()
+                hash_index = int.from_bytes(digest, byteorder="big") % self.fp_size
+                if self.count:
+                    out[i, hash_index] += c
+                else:
+                    out[i, hash_index] = 1
+        return out
+    # -----------------------
+    # Unfolded (chemap) path
+    # -----------------------
+    def _calculate_unfolded(
+        self, X_smiles: Sequence[str]
+    ) -> list[np.ndarray] | list[tuple[np.ndarray, np.ndarray]]:
+        """
+        Return chemap unfolded formats.
+        Feature IDs are stable 32-bit values derived from the SHA-1 digest:
+          id32 = int.from_bytes(digest[:4], "big")
+        They are non-negative and stored in int64 arrays, sorted ascending per molecule.
+        Substrings whose IDs collide within one molecule are merged (counts are summed
+        when count=True).
+        """
+        dicts = self.smiles_to_dicts(X_smiles)
+        def token_to_id32(token: str) -> int:
+            digest = hashlib.sha1(token.encode("utf-8"), usedforsecurity=False).digest()
+            return int.from_bytes(digest[:4], byteorder="big", signed=False)
+        if self.count:
+            def one(d: dict[str, int]) -> tuple[np.ndarray, np.ndarray]:
+                if not d:
+                    return (np.array([], dtype=np.int64), np.array([], dtype=np.float32))
+                agg: dict[int, float] = {}
+                for token, c in d.items():
+                    fid = token_to_id32(token)
+                    agg[fid] = agg.get(fid, 0.0) + float(c)
+                # IDs are unique dict keys, so sorting the items orders them by ID
+                ordered = sorted(agg.items())
+                keys = np.array([fid for fid, _ in ordered], dtype=np.int64)
+                vals = np.array([total for _, total in ordered], dtype=np.float32)
+                return keys, vals
+            return self._parallel_map(one, dicts)
+        def one_bin(d: dict[str, int]) -> np.ndarray:
+            if not d:
+                return np.array([], dtype=np.int64)
+            ids = np.fromiter((token_to_id32(t) for t in d), dtype=np.int64)
+            # np.unique sorts ascending
+            return np.unique(ids).astype(np.int64, copy=False)
+        return self._parallel_map(one_bin, dicts)

chemap-0.3.2/chemap/fingerprints/map4.py ADDED Viewed

@@ -0,0 +1,349 @@
+"""
+chemap-compatible MAP4 FP generator, in parts based on Luca Cappelletti's implementation of MAP4:
+https://github.com/LucaCappelletti94/map4/blob/master/map4/map4.py
+which in turn is based on the original MAP4 implementation:
+`Alice Capecchi, Daniel Probst, Jean-Louis Reymond
+        "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome"
+        J Cheminform 12, 43 (2020)
+        <https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00445-4>`_
+There are a few particular aspects of this implementation, though:
+- Folded output:
+    * binary (uint8) hashes each unique shingle (first 4 bytes of its SHA-1 digest) and sets bit (hash mod dimensions)
+    * count  (float32) accumulates true shingle multiplicities into folded bins (not a MinHash signature,
+    so different from the original implementation!)
+- Unfolded output:
+    * count=True  -> true counts per raw feature id
+    * count=False -> keys only (chemap will read keys from GetSparseCountFingerprint)
+    * feature ids are truncated SHA-1 digests: `unfolded_bits` (32 or 64) bits by default, always the first 4 bytes if minhash_for_unfolded=True
+"""
+import itertools
+from collections import defaultdict
+from dataclasses import dataclass
+from hashlib import sha1
+from typing import Dict, List, Optional, Set
+import numpy as np
+from rdkit.Chem import Mol, MolToSmiles, PathToSubmol
+from rdkit.Chem.rdmolops import FindAtomEnvironmentOfRadiusN, GetDistanceMatrix
+from chemap.fingerprints.mhfp import MHFPEncoderLite
+# -----------------------------
+# Minimal RDKit-like return types
+# -----------------------------
+@dataclass(frozen=True)
+class _SparseCountFingerprint:
+    """
+    RDKit SparseIntVect-like shim for chemap.
+    Immutable wrapper around a plain ``{feature_id: count}`` mapping that exposes it
+    through the ``GetNonzeroElements`` accessor.
+    """
+    # feature id -> count; handed out as-is (no copy) by GetNonzeroElements
+    nz: Dict[int, int]
+    def GetNonzeroElements(self) -> Dict[int, int]:
+        """Return the wrapped ``{feature_id: count}`` mapping."""
+        nonzero = self.nz
+        return nonzero
+@dataclass(frozen=True)
+class _BitFingerprint:
+    """
+    RDKit ExplicitBitVect-like shim for chemap size inference.
+    Carries only the vector length; no bits are stored.
+    """
+    # number of bits reported by GetNumBits
+    n_bits: int
+    def GetNumBits(self) -> int:
+        """Return the stored bit-vector length."""
+        size = self.n_bits
+        return size
+@dataclass(frozen=True)
+class _CountFingerprint:
+    """
+    RDKit IntSparseIntVect-like shim for chemap size inference.
+    Carries only the vector length; no counts are stored.
+    """
+    # vector length reported by GetLength
+    length: int
+    def GetLength(self) -> int:
+        """Return the stored count-vector length."""
+        size = self.length
+        return size
+# -----------------------------
+# MAP4 shingling core
+# -----------------------------
+class _MAP4Shingler:
+    """
+    Builds MAP4 shingles (UTF-8 encoded bytes) for a molecule.
+    For every atom one environment SMILES is computed per radius 1..R. Then, for each
+    atom pair (i<j) and each radius index k in [0..R-1], the shingle
+        f"{smaller_env}|{dist}|{larger_env}"
+    is emitted. "Smaller"/"larger" is decided by string length, and on ties the
+    environment of atom j counts as the larger one.
+    With include_duplicated_shingles=True a running occurrence number is appended to every
+    shingle (the "suffix trick"); `shingles_with_counts_true` always bypasses it, because
+    chemap counts need the true multiplicities.
+    """
+    def __init__(
+        self,
+        radius: int = 2,
+        *,
+        include_duplicated_shingles: bool = False,
+        max_dist: Optional[int] = None,
+        dist_binning: Optional[np.ndarray] = None,
+    ):
+        """
+        radius must be positive (ValueError otherwise). max_dist, if given, drops atom
+        pairs whose distance exceeds it; dist_binning, if given, maps distances to bin
+        indices via np.digitize.
+        """
+        if radius <= 0:
+            raise ValueError("radius must be > 0.")
+        self.radius = int(radius)
+        self.include_duplicated_shingles = bool(include_duplicated_shingles)
+        self.max_dist = max_dist
+        self.dist_binning = dist_binning
+    def shingles_unique(self, mol: Mol) -> Set[bytes]:
+        """Return the set of distinct shingles of `mol` (suffix trick applied if enabled)."""
+        envs = self._get_atom_envs(mol)
+        return set(self._all_pairs(mol, envs))
+    def shingles_with_counts_true(self, mol: Mol) -> Dict[bytes, int]:
+        """
+        True multiplicities (counts) WITHOUT suffix trick, regardless of include_duplicated_shingles.
+        """
+        envs = self._get_atom_envs(mol)
+        tally: Dict[bytes, int] = {}
+        for shingle in self._all_pairs(mol, envs, force_no_suffix=True):
+            tally[shingle] = tally.get(shingle, 0) + 1
+        return tally
+    def _convert_dist(self, dist: float) -> int:
+        """Truncate `dist` to int, or return its np.digitize bin when dist_binning is set."""
+        if self.dist_binning is not None:
+            return int(np.digitize(dist, self.dist_binning, right=True))
+        return int(dist)
+    def _get_atom_envs(self, mol: Mol) -> Dict[int, List[Optional[str]]]:
+        """Map each atom index to its environments for radii 1..self.radius (in that order)."""
+        radii = range(1, self.radius + 1)
+        return {
+            atom_idx: [self._find_env(mol, atom_idx, r) for r in radii]
+            for atom_idx in (atom.GetIdx() for atom in mol.GetAtoms())
+        }
+    @staticmethod
+    def _find_env(mol: Mol, atom_identifier: int, radius: int) -> Optional[str]:
+        """
+        Return the non-isomeric canonical SMILES of the environment of the given radius,
+        rooted at `atom_identifier`, or None when the root atom is missing from the atom
+        map filled by PathToSubmol.
+        """
+        env_path: List[int] = FindAtomEnvironmentOfRadiusN(
+            mol=mol, radius=radius, rootedAtAtom=atom_identifier
+        )
+        index_map: Dict[int, int] = {}
+        fragment: Mol = PathToSubmol(mol, env_path, atomMap=index_map)
+        if atom_identifier in index_map:
+            return MolToSmiles(
+                fragment,
+                rootedAtAtom=index_map[atom_identifier],
+                canonical=True,
+                isomericSmiles=False,
+            )
+        return None
+    def _all_pairs(
+        self,
+        mol: Mol,
+        atoms_env: Dict[int, List[Optional[str]]],
+        *,
+        force_no_suffix: bool = False,
+    ) -> List[bytes]:
+        """
+        Return shingles as bytes, in atom-pair order (itertools.combinations) and, within
+        a pair, in radius order. If include_duplicated_shingles is enabled and not forced
+        off, the suffix trick makes duplicates unique (MAP4C-style behavior).
+        """
+        shingles: List[bytes] = []
+        distances = GetDistanceMatrix(mol)
+        atom_count = mol.GetNumAtoms()
+        occurrences: Dict[str, int] = defaultdict(int)
+        use_suffix = self.include_duplicated_shingles and not force_no_suffix
+        for i, j in itertools.combinations(range(atom_count), 2):
+            raw_dist = float(distances[i][j])
+            if self.max_dist is not None and raw_dist > self.max_dist:
+                continue
+            dist = str(self._convert_dist(raw_dist))
+            for k in range(self.radius):
+                # a missing environment (None) is treated as the empty string
+                env_a = atoms_env[i][k] or ""
+                env_b = atoms_env[j][k] or ""
+                # compare by length, not lexicographic; ties keep env_b as the larger one
+                if len(env_b) >= len(env_a):
+                    smaller_env, larger_env = env_a, env_b
+                else:
+                    smaller_env, larger_env = env_b, env_a
+                shingle = f"{smaller_env}|{dist}|{larger_env}"
+                if use_suffix:
+                    occurrences[shingle] += 1
+                    shingle = f"{shingle}|{occurrences[shingle]}"
+                shingles.append(shingle.encode("utf-8"))
+        return shingles
+# -----------------------------
+# MAP4 fpgen for chemap
+# -----------------------------
+class MAP4FPGen:
+    """
+    chemap-compatible MAP4 fingerprint generator.
+    Folded outputs (fixed length D = dimensions):
+      - GetFingerprintAsNumPy: uint8[D] binary
+          each unique shingle is hashed (first 4 bytes of its SHA-1, little-endian) and
+          bit (hash mod D) is set
+      - GetCountFingerprintAsNumPy: float32[D] counts
+          same per-shingle hash -> bin (mod D), summing true shingle counts
+    Unfolded outputs (raw feature ids):
+      - GetSparseCountFingerprint returns {feature_id: count}
+          feature_id is a truncated SHA-1 digest (little-endian):
+            * first `unfolded_bits` bits (default), OR
+            * always the first 4 bytes if minhash_for_unfolded=True
+          (with unfolded_bits=32 both settings give the same ids)
+    Parameters
+    ----------
+    dimensions:
+        Length D of the folded vectors; must be > 0.
+    radius:
+        Maximum environment radius used by the shingler; must be > 0.
+    seed:
+        Seed handed to the MHFPEncoderLite instance stored on `_mhfp`.
+    folded:
+        Whether folded functions are meaningful (chemap controls this, but we keep for safety).
+        When False, the folded NumPy methods return all-zero vectors.
+    minhash_for_unfolded:
+        If True, unfolded ids always use the 32-bit token hash (SHA-1 first 4 bytes).
+    unfolded_bits:
+        32 or 64 (only used when minhash_for_unfolded=False).
+    include_duplicated_shingles:
+        For MAP4C-like behavior in *set shingles* (folded binary). For true counts we ignore suffix.
+    max_dist, dist_binning:
+        Optional distance handling, forwarded to the shingler.
+    """
+    def __init__(
+        self,
+        dimensions: int = 1024,
+        radius: int = 2,
+        *,
+        seed: int = 75434278,
+        folded: bool = True,
+        # counts/dup behavior
+        include_duplicated_shingles: bool = False,
+        # unfolded hashing behavior
+        minhash_for_unfolded: bool = False,
+        unfolded_bits: int = 32,  # 32 or 64, only if minhash_for_unfolded=False
+        # optional distance handling
+        max_dist: Optional[int] = None,
+        dist_binning: Optional[np.ndarray] = None,
+    ):
+        self.dimensions = int(dimensions)
+        self.radius = int(radius)
+        self.seed = int(seed)
+        self.folded = bool(folded)
+        self.include_duplicated_shingles = bool(include_duplicated_shingles)
+        self.minhash_for_unfolded = bool(minhash_for_unfolded)
+        self.unfolded_bits = int(unfolded_bits)
+        if self.dimensions <= 0:
+            raise ValueError("dimensions must be > 0.")
+        if self.radius <= 0:
+            raise ValueError("radius must be > 0.")
+        if self.unfolded_bits not in (32, 64):
+            raise ValueError("unfolded_bits must be 32 or 64.")
+        self._shingler = _MAP4Shingler(
+            radius=self.radius,
+            include_duplicated_shingles=self.include_duplicated_shingles,
+            max_dist=max_dist,
+            dist_binning=dist_binning,
+        )
+        # NOTE(review): no method of this class reads `_mhfp`; the attribute is kept so
+        # that existing code accessing it keeps working.
+        self._mhfp = MHFPEncoderLite(
+            n_permutations=self.dimensions,
+            seed=self.seed,
+        )
+    # --------- chemap size inference ---------
+    def GetFingerprint(self, mol: Mol) -> _BitFingerprint:
+        """Return a size-only bit-vector shim; `mol` is not inspected."""
+        return _BitFingerprint(self.dimensions)
+    def GetCountFingerprint(self, mol: Mol) -> _CountFingerprint:
+        """Return a size-only count-vector shim; `mol` is not inspected."""
+        return _CountFingerprint(self.dimensions)
+    # --------- unfolded API ---------
+    def GetSparseCountFingerprint(self, mol: Mol) -> _SparseCountFingerprint:
+        """
+        Returns {feature_id: count} for unfolded outputs.
+        - count=True in chemap: keys+values used
+        - count=False in chemap: keys used, values ignored
+        Counts of shingles that map to the same feature id are summed.
+        """
+        counts = self._shingler.shingles_with_counts_true(mol)
+        if not counts:
+            return _SparseCountFingerprint({})
+        nz: Dict[int, int] = defaultdict(int)
+        for sh, c in counts.items():
+            if self.minhash_for_unfolded:
+                # MHFP token-hash domain: sha1 first 4 bytes (little endian)
+                fid = self._token_hash32(sh)
+            else:
+                fid = int(self._sha1_to_int(sh, bits=self.unfolded_bits))
+            nz[fid] += int(c)
+        return _SparseCountFingerprint(dict(nz))
+    # --------- folded API ---------
+    def GetFingerprintAsNumPy(self, mol: Mol) -> np.ndarray:
+        """
+        Folded binary vector uint8[D]:
+            folded = fold(hash(set(shingles)), D)
+        i.e. hash each unique shingle token -> set bit at (hash % D).
+        Returns all zeros when folded=False or when the molecule yields no shingles.
+        """
+        if not self.folded:
+            return np.zeros(self.dimensions, dtype=np.uint8)
+        shingles = self._shingler.shingles_unique(mol)  # set[bytes]
+        if not shingles:
+            return np.zeros(self.dimensions, dtype=np.uint8)
+        # Per-shingle 32-bit hash (sha1 truncated to its first 4 bytes)
+        hashed = np.fromiter(
+            (self._token_hash32(sh) for sh in shingles),
+            dtype=np.uint32,
+            count=len(shingles),
+        )
+        fp = np.zeros(self.dimensions, dtype=np.uint8)
+        fp[(hashed % np.uint32(self.dimensions)).astype(np.int64, copy=False)] = 1
+        return fp
+    def GetCountFingerprintAsNumPy(self, mol: Mol) -> np.ndarray:
+        """
+        Folded counts float32[D] using TRUE multiplicities.
+        This is *not* a MinHash (classic MAP4 is set-based). We instead provide a stable
+        count-fold baseline:
+          bin = token_hash32(shingle) % D
+          fp[bin] += count
+        Returns all zeros when folded=False or when the molecule yields no shingles.
+        """
+        if not self.folded:
+            return np.zeros(self.dimensions, dtype=np.float32)
+        counts = self._shingler.shingles_with_counts_true(mol)
+        if not counts:
+            return np.zeros(self.dimensions, dtype=np.float32)
+        fp = np.zeros(self.dimensions, dtype=np.float32)
+        for sh, c in counts.items():
+            fp[self._token_hash32(sh) % self.dimensions] += float(c)
+        return fp
+    # -----------------------------
+    # Hash utilities
+    # -----------------------------
+    @staticmethod
+    def _token_hash32(data: bytes) -> int:
+        """Return the first 4 bytes of sha1(data), read as an unsigned little-endian integer."""
+        return int.from_bytes(sha1(data).digest()[:4], "little", signed=False)
+    @staticmethod
+    def _sha1_to_int(data: bytes, *, bits: int = 64) -> np.uint64:
+        """
+        Return sha1(data) truncated to its first 4 bytes (bits == 32) or first 8 bytes
+        (any other value), read as an unsigned little-endian integer and wrapped in np.uint64.
+        """
+        d = sha1(data).digest()
+        if bits == 32:
+            return np.uint64(int.from_bytes(d[:4], byteorder="little", signed=False))
+        return np.uint64(int.from_bytes(d[:8], byteorder="little", signed=False))

chemap-0.3.2/chemap/fingerprints/mhfp.py ADDED Viewed

@@ -0,0 +1,100 @@
+import struct
+from hashlib import sha1
+from typing import Iterable, Sequence, Union
+import numpy as np
+BytesLike = Union[bytes, bytearray, memoryview]
+class MHFPEncoderLite:
+    """
+    Compatibility-focused reimplementation of the original mhfp.encoder.MHFPEncoder.
+    (Original is from the Reymond group: https://github.com/reymond-group/mhfp)
+    Notes
+    -----
+    - The original uses:
+        prime    = 2^61 - 1
+        max_hash = 2^32 - 1
+      and outputs uint32 signatures.
+    - Token hash is:
+        struct.unpack("<I", sha1(token).digest()[:4])[0]
+    """
+    # 2^61 - 1
+    prime: int = (1 << 61) - 1
+    # 2^32 - 1; also used as the (exclusive) upper bound when drawing a and b in __init__.
+    max_hash: int = (1 << 32) - 1
+    def __init__(self, n_permutations: int = 2048, seed: int = 42):
+        """
+        Draw the per-permutation parameters ``a`` and ``b`` from a seeded RNG.
+        Parameters
+        ----------
+        n_permutations:
+            Number of (a, b) pairs to generate; must be > 0.
+        seed:
+            Seed passed to ``np.random.RandomState``.
+        Raises
+        ------
+        ValueError
+            If ``n_permutations`` is not positive.
+        """
+        if n_permutations <= 0:
+            raise ValueError("n_permutations must be > 0.")
+        self.n_permutations = int(n_permutations)
+        self.seed = int(seed)
+        # Match original: generate uint32 a,b with uniqueness constraints
+        rand = np.random.RandomState(self.seed)
+        a = np.zeros(self.n_permutations, dtype=np.uint32)
+        b = np.zeros(self.n_permutations, dtype=np.uint32)
+        # Original code used `while a in self.permutations_a` checks (O(n)),
+        # but that behavior means "no duplicates". We'll enforce the same.
+        used_a = set()
+        used_b = set()
+        for i in range(self.n_permutations):
+            # The order of draws (a, then b, then re-draws for a, then for b) fixes how the
+            # RNG stream is consumed; reordering would change the parameters for a given seed.
+            # a is drawn starting at 1, b starting at 0.
+            ai = int(rand.randint(1, MHFPEncoderLite.max_hash, dtype=np.uint32))
+            bi = int(rand.randint(0, MHFPEncoderLite.max_hash, dtype=np.uint32))
+            while ai in used_a:
+                ai = int(rand.randint(1, MHFPEncoderLite.max_hash, dtype=np.uint32))
+            while bi in used_b:
+                bi = int(rand.randint(0, MHFPEncoderLite.max_hash, dtype=np.uint32))
+            used_a.add(ai)
+            used_b.add(bi)
+            a[i] = np.uint32(ai)
+            b[i] = np.uint32(bi)
+        # Match original: reshape to column vectors (n_perm, 1)
+        # Stored as uint64 arrays in self._a / self._b.
+        self._a = a.reshape((self.n_permutations, 1)).astype(np.uint64, copy=False)
+        self._b = b.reshape((self.n_permutations, 1)).astype(np.uint64, copy=False)
+    # -----------------------------
+    # Token hashing (exact)
+    # -----------------------------
+    @staticmethod
+    def _token_hash32(token: BytesLike) -> np.uint32:
+        """
+        Hash one token to a 32-bit value.
+        The token is converted with ``bytes(token)``, hashed with SHA-1, and the first
+        four digest bytes are unpacked as a little-endian unsigned int (``"<I"``).
+        """
+        # EXACT original semantics: struct.unpack("<I", sha1(t).digest()[:4])[0]
+        return np.uint32(struct.unpack("<I", sha1(bytes(token)).digest()[:4])[0])
+    # -----------------------------
+    # Original helper API: hash / fold / merge / distance
+    # -----------------------------
+    @staticmethod
+    def hash(tokens: Iterable[BytesLike]) -> np.ndarray:
+        """
+        For compatibility with original MHFPEncoder.hash(shingling):
+        returns per-token uint32 hash values (NOT minhash signature).
+        The result is a 1-D uint32 array with one entry per token, in iteration order.
+        """
+        return np.fromiter(
+            (MHFPEncoderLite._token_hash32(t) for t in tokens),
+            dtype=np.uint32,
+        )
+    @staticmethod
+    def fold(hash_values: Sequence[int], length: int = 2048) -> np.ndarray:
+        """
+        Compatibility with original fold(): binary uint8 vector with bits set at hash % length.
+        Parameters
+        ----------
+        hash_values:
+            Hash values to fold; must support ``len()``.
+        length:
+            Size of the output vector; must be > 0.
+        Returns
+        -------
+        uint8 array of shape (length,) with 1 at every index ``h % length`` and 0 elsewhere.
+        An empty ``hash_values`` yields an all-zero vector.
+        Raises
+        ------
+        ValueError
+            If ``length`` is not positive.
+        """
+        length = int(length)
+        if length <= 0:
+            raise ValueError("length must be > 0.")
+        folded = np.zeros(length, dtype=np.uint8)
+        if len(hash_values) == 0:
+            return folded
+        # Do the modulo in uint64, then cast to int64 for use as an index array.
+        hv = np.asarray(hash_values, dtype=np.uint64)
+        # Several hashes may map to the same index; the bit is simply set to 1.
+        folded[(hv % np.uint64(length)).astype(np.int64, copy=False)] = 1
+        return folded

{chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/benchmark_duplicates.py RENAMED Viewed

@@ -114,6 +114,7 @@ def plot_duplicate_bins(
     xlabel: str = "Compounds with Fingerprint Duplicates",
     title: str = "Duplicate Statistics by Experiment",
     legend_title: str = "Maximum mass difference\n(for identical fingerprints)",
+    ax: Optional[plt.Axes] = None,
 ) -> Tuple[plt.Figure, plt.Axes]:
     """Plot stacked horizontal bars of duplicate counts across bins.
@@ -135,9 +136,6 @@ def plot_duplicate_bins(
     xlabel, title, legend_title:
         Plot labels.
-    Returns
-    -------
-    (fig, ax)
     """
     if len(results) == 0:
         raise ValueError("results must be non-empty")
@@ -157,7 +155,10 @@ def plot_duplicate_bins(
     bin_labels = res[0].bin_labels
     colors = n_colors_from_cmap(n_bins, cmap)
-    fig, ax = plt.subplots(figsize=figsize)
+    if ax is None:
+        fig, ax = plt.subplots(figsize=figsize)
+    else:
+        fig = ax.figure
     y_positions = np.arange(len(res))
     left_stack = np.zeros(len(res), dtype=float)
@@ -207,6 +208,7 @@ def plot_duplicates_by_experiment(
     cmap = green_yellow_red,
     title: str = "Duplicate fingerprints plot",
     figsize: Tuple[float, float] = (10, 6),
+    ax: Optional[plt.Axes] = None,
     sort_by_total: bool = True,
 ) -> Tuple[plt.Figure, plt.Axes, List[DuplicateBinResult]]:
     """Compute binned duplicate stats per experiment and plot them.
@@ -238,5 +240,6 @@ def plot_duplicates_by_experiment(
         cmap=cmap,
         sort_by_total=sort_by_total,
         title=title,
+        ax=ax,
     )
     return fig, ax, results

{chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/chem_space_umap.py RENAMED Viewed

@@ -6,8 +6,7 @@ from chemap import FingerprintConfig, compute_fingerprints
 from chemap.fingerprint_conversions import fingerprints_to_csr
 from chemap.metrics import (
     tanimoto_distance_dense,
-    tanimoto_distance_unfolded_binary,
-    tanimoto_distance_unfolded_count,
+    tanimoto_distance_sparse,
 )
@@ -25,18 +24,17 @@ def _choose_cpu_metric(config: FingerprintConfig, distance_function: str) -> Any
     - unfolded + binary => tanimoto_distance_unfolded_binary
     - folded (usually dense/packed) => tanimoto_distance_dense
     """
+    if distance_function.lower() == "cosine":
+        return "cosine"
     if distance_function.lower() != "tanimoto":
         raise ValueError(
             f"Unsupported distance_function={distance_function!r}. "
-            "Currently only 'tanimoto' is supported here."
+            "Currently only 'tanimoto' and 'cosine' is supported here."
         )
     if getattr(config, "folded", False):
         return tanimoto_distance_dense
-    if getattr(config, "count", False):
-        return tanimoto_distance_unfolded_count
-    return tanimoto_distance_unfolded_binary
+    return tanimoto_distance_sparse
 def _log1p_csr_inplace(X) -> Any:
@@ -61,7 +59,7 @@ def create_chem_space_umap(
     n_neighbors: int = 15,
     min_dist: float = 0.25,
     n_jobs: int = -1,
-    umap_random_state: Optional[int] = 40476,
+    umap_random_state: Optional[int] = None,
     distance_function: str = "tanimoto",
 ) -> pd.DataFrame:
     """Compute fingerprints (CPU) and create 2D UMAP coordinates (CPU).
@@ -220,17 +218,16 @@ def create_chem_space_umap_gpu(
         show_progress=show_progress,
     )
-    # Convert to numeric matrix.
-    fps_csr = fingerprints_to_csr(fingerprints).X
-    fps = fps_csr.toarray()
+    # Convert to sparse array
+    # fps_csr = fingerprints_to_csr(fingerprints).X
     # Reduce memory footprint (works well for count fingerprints)
     if not log_count:
         # stays integer-like
-        fps = fps.astype(np.int8, copy=False)
+        fps = fingerprints.astype(np.int8, copy=False)
     else:
         # log1p returns float
-        fps = np.log1p(fps).astype(np.float32, copy=False)
+        fps = np.log1p(fingerprints).astype(np.float32, copy=False)
     umap_model = cuUMAP(
         n_neighbors=int(n_neighbors),

{chemap-0.3.0 → chemap-0.3.2}/chemap/types.py RENAMED Viewed

@@ -1,7 +1,10 @@
 from typing import Mapping, Sequence, Tuple, Union
+import numpy as np
 Bins = Sequence[Tuple[float, float]]
 Color = Tuple[float, float, float]  # RGB
 ColorA = Tuple[float, float, float, float]  # RGBA
 Palette = Mapping[str, Union[Color, ColorA]]
+# Container types for unfolded fingerprints, one entry per molecule.
+# chemap.fingerprint_computation imports both aliases from this module.
+UnfoldedBinary = list[np.ndarray]  # list of int64 feature IDs per molecule
+UnfoldedCount = list[tuple[np.ndarray, np.ndarray]]  # (int64 feature IDs, float32 values)

{chemap-0.3.0 → chemap-0.3.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "chemap"
-version = "0.3.0"
+version = "0.3.2"
 description = "Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations. "
 authors = [
   { name="Florian Huber", email="florian.huber@hs-duesseldorf.de" },

chemap-0.3.0/chemap/additional_fingerprints/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-from .element_count_fp import ElementCountFingerprint
-__all__ = [
-    "ElementCountFingerprint",
-]

chemap-0.3.0/chemap/benchmarking/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-from .utils import compute_compound_max_mass_differences, compute_duplicate_max_mass_differences
-__all__ = [
-    "compute_compound_max_mass_differences",
-    "compute_duplicate_max_mass_differences",
-]