PyPI - bblean - Versions diffs - 0.7.8__tar.gz → 0.8.1__tar.gz - Mend

bblean 0.7.8.tar.gz → 0.8.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)

{bblean-0.7.8 → bblean-0.8.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bblean
-Version: 0.7.8
+Version: 0.8.1
 Summary: BitBirch-Lean Python package
 Author: The Miranda-Quintana Lab and other BitBirch developers
 Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>

{bblean-0.7.8 → bblean-0.8.1}/bblean/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.7.8'
-__version_tuple__ = version_tuple = (0, 7, 8)
+__version__ = version = '0.8.1'
+__version_tuple__ = version_tuple = (0, 8, 1)
 __commit_id__ = commit_id = None

{bblean-0.7.8 → bblean-0.8.1}/bblean/bitbirch.py RENAMED Viewed

@@ -47,6 +47,7 @@
 # ./LICENSES/GPL-3.0-only.txt.  If not, see <http://www.gnu.org/licenses/gpl-3.0.html>.
 r"""BitBirch 'Lean' class for fast, memory-efficient O(N) clustering"""
 from __future__ import annotations  # Stringize type annotations for no runtime overhead
+import itertools
 import pickle
 import sys
 import typing_extensions as tpx
@@ -171,8 +172,8 @@ def _split_node(node: "_BFNode") -> tuple["_BFSubcluster", "_BFSubcluster"]:
     """
     n_features = node.n_features
     branching_factor = node.branching_factor
-    new_subcluster1 = _BFSubcluster(n_features=n_features)
-    new_subcluster2 = _BFSubcluster(n_features=n_features)
+    new_subcluster1 = _BFSubcluster.empty(n_features)
+    new_subcluster2 = _BFSubcluster.empty(n_features)
     node1 = _BFNode(branching_factor, n_features)
     node2 = node  # Rename for clarity
@@ -394,13 +395,15 @@ class _BFSubcluster:
     def __init__(
         self,
-        *,
-        linear_sum: NDArray[np.integer] | None = None,
-        mol_indices: tp.Sequence[int] = (),
-        n_features: int = 2048,
-        buffer: NDArray[np.integer] | None = None,
+        buffer: NDArray[np.integer],
+        mol_indices: tp.Sequence[int],
+        packed_centroid: NDArray[np.uint8] | None = None,
         check_indices: bool = True,
-    ):
+    ) -> None:
+        # If packed centroid is passed, it must be equal to the packed centroid
+        # of the linear sum (this is not checked)
+        if mol_indices and check_indices and buffer[-1] != len(mol_indices):
+            raise ValueError("len mol_indices must be equal to buffer[-1] if specified")
         # NOTE: Internally, _buffer holds both "linear_sum" and "n_samples". It is
         # guaranteed to always have the minimum required uint dtype. It should not be
         # accessed by external classes, only used internally. The individual parts can
@@ -409,44 +412,40 @@ class _BFSubcluster:
         #
         # IMPORTANT: To mutate instances of this class, *always* use the public API
         # given by replace|add_to_n_samples_and_linear_sum(...)
-        if buffer is not None:
-            if linear_sum is not None:
-                raise ValueError("'linear_sum' and 'buffer' are mutually exclusive")
-            if check_indices and len(mol_indices) != buffer[-1]:
-                raise ValueError(
-                    "Expected len(mol_indices) == buffer[-1],"
-                    f" but found {len(mol_indices)} != {buffer[-1]}"
-                )
-            self._buffer = buffer
-            self.packed_centroid = centroid_from_sum(buffer[:-1], buffer[-1], pack=True)
-        else:
-            if linear_sum is not None:
-                if check_indices and len(mol_indices) != 1:
-                    raise ValueError(
-                        "Expected len(mol_indices) == 1,"
-                        f" but found {len(mol_indices)} != 1"
-                    )
-                buffer = np.empty((len(linear_sum) + 1,), dtype=np.uint8)
-                buffer[:-1] = linear_sum
-                buffer[-1] = 1
-                self._buffer = buffer
-                self.packed_centroid = pack_fingerprints(
-                    linear_sum.astype(np.uint8, copy=False)
-                )
-            else:
-                # Empty subcluster
-                if check_indices and len(mol_indices) != 0:
-                    raise ValueError(
-                        "Expected len(mol_indices) == 0 for empty subcluster,"
-                        f" but found {len(mol_indices)} != 0"
-                    )
-                self._buffer = np.zeros((n_features + 1,), dtype=np.uint8)
-                self.packed_centroid = np.empty(
-                    0, dtype=np.uint8
-                )  # Will be overwritten
+        self._buffer = buffer
         self.mol_indices = list(mol_indices)
+        if packed_centroid is not None:
+            self.packed_centroid = packed_centroid
+        else:
+            self.packed_centroid = centroid_from_sum(buffer[:-1], buffer[-1], pack=True)
         self.child: tp.Optional["_BFNode"] = None
+    @classmethod
+    def empty(cls, n_features: int) -> tpx.Self:
+        packed_centroid = np.empty(0, dtype=np.uint8)  # Will be overwritten
+        return cls(
+            np.zeros((n_features + 1,), dtype=np.uint8),
+            [],
+            packed_centroid,
+            check_indices=False,
+        )
+    @classmethod
+    def from_fingerprint(
+        cls, fp: NDArray[np.uint8], index: int, weight: int | None = None
+    ) -> tpx.Self:
+        if weight is not None:
+            buffer = np.empty((len(fp) + 1,), dtype=min_safe_uint(weight))
+            buffer[:-1] = fp
+            buffer[-1] = 1
+            buffer *= weight
+        else:
+            buffer = np.empty((len(fp) + 1,), dtype=np.uint8)
+            buffer[:-1] = fp
+            buffer[-1] = 1
+        packed_centroid = pack_fingerprints(fp)
+        return cls(buffer, [index], packed_centroid, check_indices=False)
     @property
     def unpacked_centroid(self) -> NDArray[np.uint8]:
         return _unpack_fingerprints(self.packed_centroid, self.n_features)
@@ -711,6 +710,7 @@ class BitBirch:
         input_is_packed: bool = True,
         n_features: int | None = None,
         max_fps: int | None = None,
+        weights: tp.Iterable[int] | None = None,
     ) -> tpx.Self:
         r"""Build a BF Tree for the input data.
@@ -763,15 +763,19 @@ class BitBirch:
         else:
             iterable = zip(reinsert_indices, arr_iterable)
+        it_weights: tp.Iterator[int | None]
+        if weights is None:
+            it_weights = itertools.repeat(None)
+        else:
+            it_weights = iter(weights)
         threshold = self.threshold
         branching_factor = self.branching_factor
         merge_accept_fn = self._merge_accept_fn
         arr_idx = 0
         for idx, fp in iterable:
-            subcluster = _BFSubcluster(
-                linear_sum=fp, mol_indices=[idx], n_features=n_features
-            )
+            subcluster = _BFSubcluster.from_fingerprint(fp, idx, next(it_weights))
             split = self._root.insert_bf_subcluster(
                 subcluster, merge_accept_fn, threshold
             )
@@ -791,22 +795,22 @@ class BitBirch:
     def _fit_buffers(
         self,
         X: _Input | Path | str,
-        reinsert_index_seqs: (
-            tp.Iterable[tp.Sequence[int]] | tp.Literal["omit"]
-        ) = "omit",
+        reinsert_index_seqs: tp.Iterable[tp.Sequence[int]] | None,
+        check_indices: bool = True,
     ) -> tpx.Self:
         r"""Build a BF Tree starting from buffers
         Buffers are arrays of the form:
             - buffer[0:-1] = linear_sum
             - buffer[-1] = n_samples
-        And X is either an array or a list of such buffers
+        X is either an array or a list of such buffers
         If `reinsert_index_seqs` is passed, X corresponds only to the buffers to be
         reinserted into the tree, and `reinsert_index_seqs` are the sequences
         of indices associated with such buffers.
-        If `reinsert_index_seqs` is "omit", then no indices are collected in the tree.
+        If `reinsert_index_seqs` is None, then no indices are collected in the tree.
+        Num samples is mutually exclusive with reinsert_index_seqs.
         Parameters
         ----------
@@ -840,16 +844,13 @@ class BitBirch:
         branching_factor = self.branching_factor
         idx_provider: tp.Iterable[tp.Sequence[int]]
         arr_idx = 0
-        if reinsert_index_seqs == "omit":
-            idx_provider = (() for idx in range(self.num_fitted_fps))
-            check = False
+        if reinsert_index_seqs is None:
+            idx_provider = itertools.repeat(())
         else:
             idx_provider = reinsert_index_seqs
-            check = True
         for idxs, buf in zip(idx_provider, arr_iterable):
-            subcluster = _BFSubcluster(
-                buffer=buf, mol_indices=idxs, n_features=n_features, check_indices=check
-            )
+            subcluster = _BFSubcluster(buf, idxs, check_indices=check_indices)
             split = self._root.insert_bf_subcluster(
                 subcluster, merge_accept_fn, threshold
             )

{bblean-0.7.8 → bblean-0.8.1}/bblean/fingerprints.py RENAMED Viewed

@@ -1,11 +1,15 @@
 r"""Utilites for manipulating fingerprints and fingerprint files"""
+import sys
+import math
+import weakref
 import warnings
 import dataclasses
 from pathlib import Path
 from numpy.typing import NDArray, DTypeLike
 import numpy as np
 import typing as tp
+import multiprocessing as mp
 import multiprocessing.shared_memory as shmem
 from rich.console import Console
@@ -13,6 +17,8 @@ from rdkit.Chem import rdFingerprintGenerator, MolFromSmiles, SanitizeFlags, San
 from bblean._config import DEFAULTS
 from bblean._console import get_console
+from bblean.smiles import _iter_ranges_and_smiles_batches
+from bblean.utils import _num_avail_cpus
 __all__ = [
     "make_fake_fingerprints",
@@ -441,3 +447,112 @@ class _FingerprintArrayFiller:
             fps[i, :] = fp
         fps_shmem.close()
         invalid_mask_shmem.close()
+@tp.overload
+def fps_from_smiles_parallel(
+    smiles: tp.Iterable[str],
+    kind: str = DEFAULTS.fp_kind,
+    n_features: int = DEFAULTS.n_features,
+    dtype: DTypeLike = np.uint8,
+    sanitize: str = "all",
+    skip_invalid: tp.Literal[False] = False,
+    pack: bool = True,
+    num_ps: int = 1,
+    replace_dummy_atoms: bool = False,
+    tab_separated: bool = False,
+    mp_context: tp.Any = None,
+) -> NDArray[np.uint8]:
+    pass
+@tp.overload
+def fps_from_smiles_parallel(
+    smiles: tp.Iterable[str],
+    kind: str = DEFAULTS.fp_kind,
+    n_features: int = DEFAULTS.n_features,
+    dtype: DTypeLike = np.uint8,
+    sanitize: str = "all",
+    skip_invalid: tp.Literal[True] = True,
+    pack: bool = True,
+    num_ps: int = 1,
+    replace_dummy_atoms: bool = False,
+    tab_separated: bool = False,
+    mp_context: tp.Any = None,
+) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
+    pass
+# NOTE: This function is a proof of concept and is somewhat dangerous, since it
+# registers a custom destructor for the numpy array.
+# It is also *only usable if called inside an if __name__ == "__main__" guard*.
+# For now, let's hide it.
+def fps_from_smiles_parallel(
+    smiles: tp.Iterable[str],
+    kind: str = DEFAULTS.fp_kind,
+    n_features: int = DEFAULTS.n_features,
+    dtype: DTypeLike = np.uint8,
+    sanitize: str = "all",
+    skip_invalid: bool = False,
+    pack: bool = True,
+    num_ps: int | None = None,
+    replace_dummy_atoms: bool = False,
+    tab_separated: bool = False,
+    mp_context: tp.Any = None,
+) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
+    r""":meta private:"""
+    if mp_context is None:
+        mp_context = mp.get_context("forkserver" if sys.platform == "linux" else None)
+    if isinstance(smiles, str):
+        smiles = [smiles]
+    smiles = list(smiles)
+    smiles_num = len(smiles)
+    if num_ps is None:
+        num_ps = _num_avail_cpus()
+    if pack:
+        out_dim = (n_features + 7) // 8
+    else:
+        out_dim = n_features
+    shmem_size = smiles_num * out_dim * np.dtype(dtype).itemsize
+    fps_shmem = shmem.SharedMemory(create=True, size=shmem_size)
+    invalid_mask_shmem = shmem.SharedMemory(create=True, size=smiles_num)
+    fps_array_filler = _FingerprintArrayFiller(
+        shmem_name=fps_shmem.name,
+        invalid_mask_shmem_name=invalid_mask_shmem.name,
+        kind=kind,
+        fp_size=n_features,
+        num_smiles=smiles_num,
+        dtype=np.dtype(dtype).name,
+        pack=pack,
+        sanitize=sanitize,
+        skip_invalid=skip_invalid,
+    )
+    num_per_batch = math.ceil(smiles_num / num_ps)
+    with mp_context.Pool(processes=num_ps) as pool:
+        pool.starmap(
+            fps_array_filler,
+            _iter_ranges_and_smiles_batches(
+                smiles,
+                num_per_batch,
+                tab_separated,
+                replace_dummy_atoms,
+                assume_paths=False,
+            ),
+        )
+    fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
+    mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
+    if skip_invalid:
+        fps = np.delete(fps, mask, axis=0)
+        weakref.finalize(mask, invalid_mask_shmem.close)
+        weakref.finalize(mask, invalid_mask_shmem.unlink)
+        weakref.finalize(fps, fps_shmem.close)
+        weakref.finalize(fps, fps_shmem.unlink)
+        return fps, mask
+    del mask
+    invalid_mask_shmem.close()
+    invalid_mask_shmem.unlink()
+    weakref.finalize(fps, fps_shmem.close)
+    weakref.finalize(fps, fps_shmem.unlink)
+    return fps

{bblean-0.7.8 → bblean-0.8.1}/bblean/smiles.py RENAMED Viewed

@@ -71,12 +71,14 @@ def _iter_ranges_and_smiles_batches(
     num_per_batch: int,
     tab_separated: bool = False,
     replace_dummy_atoms: bool = False,
+    assume_paths: bool = True,
 ) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
+    if assume_paths:
+        it = iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms)
+    else:
+        it = tp.cast(tp.Iterator[str], smiles_paths)
     start_idx = 0
-    for batch in batched(
-        iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms),
-        num_per_batch,
-    ):
+    for batch in batched(it, num_per_batch):
         size = len(batch)
         end_idx = start_idx + size
         yield (start_idx, end_idx), batch

{bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bblean
-Version: 0.7.8
+Version: 0.8.1
 Summary: BitBirch-Lean Python package
 Author: The Miranda-Quintana Lab and other BitBirch developers
 Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>

{bblean-0.7.8 → bblean-0.8.1}/tests/test_simple.py RENAMED Viewed

@@ -1,8 +1,11 @@
+import itertools
 import pytest
 import numpy as np
-from bblean.bitbirch import BitBirch  # type: ignore
-from bblean.fingerprints import pack_fingerprints
+from bblean.bitbirch import BitBirch
+from bblean.fingerprints import pack_fingerprints, make_fake_fingerprints
+from inline_snapshot import snapshot
 # NOTE: Results on this file don't depend on branching factor / threshold
@@ -37,3 +40,16 @@ def test_bb_cluster_simple_repeated_fps() -> None:
         )
         ids = BitBirch().fit(mixed_fp, n_features=2048).get_cluster_mol_ids()
         assert ids == [list(range(repeats))]
+def test_bb_cluster_3_fps() -> None:
+    fps = make_fake_fingerprints(3, n_features=8, seed=12620509540149709235, pack=True)
+    data = BitBirch().fit(fps).get_cluster_mol_ids()
+    assert data == snapshot([[0], [1], [2]])
+    data = BitBirch().fit(fps, weights=itertools.repeat(5)).get_cluster_mol_ids()
+    assert data == snapshot([[1, 2], [0]])
+    data = BitBirch().fit(fps, weights=itertools.repeat(10000)).get_cluster_mol_ids()
+    assert data == snapshot([[1, 2], [0]])
+    data = BitBirch().fit(fps, weights=itertools.repeat(1000000)).get_cluster_mol_ids()
+    assert data == snapshot([[1, 2], [0]])