bblean-0.6.0b2-cp312-cp312-win_amd64.whl → bblean-0.7.2b0-cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bblean/_cpp_similarity.cp312-win_amd64.pyd CHANGED
Binary file
bblean/_legacy/bb_int64.py CHANGED
@@ -633,6 +633,7 @@ class BitBirch:
         X = X[:max_fps]
         threshold = self.threshold
         branching_factor = self.branching_factor
+
         n_features = _validate_n_features(X, input_is_packed, n_features)
         d_type = X.dtype
 
@@ -718,6 +719,7 @@ class BitBirch:
         """
         threshold = self.threshold
         branching_factor = self.branching_factor
+
         n_features = _validate_n_features(X, input_is_packed, n_features)
         d_type = X.dtype
 
bblean/_py_similarity.py CHANGED
@@ -76,18 +76,10 @@ def jt_compl_isim(
         warnings.warn(msg, RuntimeWarning, stacklevel=2)
         return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
     linear_sum = np.sum(fps, axis=0)
-    n_objects = len(fps) - 1
     comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
-
     return np.array(comp_sims, dtype=np.float64)
 
 
-def _jt_isim_medoid_index(
-    fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
-) -> int:
-    return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
-
-
 def jt_isim_medoid(
     fps: NDArray[np.uint8],
     input_is_packed: bool = True,
@@ -110,7 +102,7 @@ def jt_isim_medoid(
     if len(fps) < 3:
         idx = 0  # Medoid undefined for sets of 3 or more fingerprints
     else:
-        idx = _jt_isim_medoid_index(fps, input_is_packed=False)
+        idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
     m = fps[idx]
     if pack:
         return idx, pack_fingerprints(m)
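For orientation, a minimal sketch (not part of the diff) of the relationship the rewrite above relies on: `jt_compl_isim` is a leave-one-out iSIM, where entry i is the iSIM of the set with fingerprint i removed, and the medoid is its argmin. The tiny 0/1 array is hypothetical; imports go through the public `bblean.similarity` module:

```python
import numpy as np
from bblean.similarity import jt_compl_isim, jt_isim_medoid

# Three unpacked (one 0/1 byte per feature) fingerprints; a medoid
# is only meaningful for sets of at least 3
fps = np.array(
    [[1, 1, 0, 0],
     [1, 1, 1, 0],
     [0, 1, 1, 1]],
    dtype=np.uint8,
)
comp = jt_compl_isim(fps, input_is_packed=False)
idx, medoid = jt_isim_medoid(fps, input_is_packed=False, pack=False)
assert idx == int(np.argmin(comp))  # medoid = argmin of complementary iSIM
```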
bblean/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.6.0b2'
-__version_tuple__ = version_tuple = (0, 6, 0, 'b2')
+__version__ = version = '0.7.2b0'
+__version_tuple__ = version_tuple = (0, 7, 2, 'b0')
 
-__commit_id__ = commit_id = None
+__commit_id__ = commit_id = 'g36216813a'
bblean/bitbirch.py CHANGED
@@ -47,6 +47,8 @@
 # ./LICENSES/GPL-3.0-only.txt. If not, see <http://www.gnu.org/licenses/gpl-3.0.html>.
 r"""BitBirch 'Lean' class for fast, memory-efficient O(N) clustering"""
 from __future__ import annotations  # Stringize type annotations for no runtime overhead
+import pickle
+import sys
 import typing_extensions as tpx
 import os
 import random
@@ -646,7 +648,7 @@ class BitBirch:
 
     @merge_criterion.setter
     def merge_criterion(self, value: str) -> None:
-        self.set_merge(criterion=value)
+        self.set_merge(merge_criterion=value)
 
     @property
     def tolerance(self) -> float | None:
@@ -671,7 +673,7 @@
 
     def set_merge(
         self,
-        criterion: str | MergeAcceptFunction | None = None,
+        merge_criterion: str | MergeAcceptFunction | None = None,
         *,
         tolerance: float | None = None,
         threshold: float | None = None,
@@ -687,10 +689,10 @@
                 "the global set_merge() function has *not* been used"
             )
         _tolerance = 0.05 if tolerance is None else tolerance
-        if isinstance(criterion, MergeAcceptFunction):
-            self._merge_accept_fn = criterion
-        elif isinstance(criterion, str):
-            self._merge_accept_fn = get_merge_accept_fn(criterion, _tolerance)
+        if isinstance(merge_criterion, MergeAcceptFunction):
+            self._merge_accept_fn = merge_criterion
+        elif isinstance(merge_criterion, str):
+            self._merge_accept_fn = get_merge_accept_fn(merge_criterion, _tolerance)
         if hasattr(self._merge_accept_fn, "tolerance"):
             self._merge_accept_fn.tolerance = _tolerance
         elif tolerance is not None:
@@ -1316,6 +1318,40 @@ class BitBirch:
         parts.append(f"tolerance={self.tolerance}")
         return f"{self.__class__.__name__}({', '.join(parts)})"
 
+    def save(self, path: Path | str) -> None:
+        r""":meta private:"""
+        # TODO: BitBIRCH is highly recursive. pickling may crash python,
+        # an alternative solution would be better
+        msg = (
+            "Saving large BitBIRCH trees may result in large memory peaks."
+            " An alternative serialization method may be implemented in the future"
+        )
+        warnings.warn(msg)
+        _old_limit = sys.getrecursionlimit()
+        sys.setrecursionlimit(1_000_000_000)
+        with open(path, mode="wb") as f:
+            pickle.dump(self, f)
+        sys.setrecursionlimit(_old_limit)
+
+    @classmethod
+    def load(cls, path: Path | str) -> tpx.Self:
+        r""":meta private:"""
+        # TODO: BitBIRCH is highly recursive. pickling may crash python,
+        # an alternative solution would be better
+        msg = (
+            "Loading large BitBIRCH trees may result in large memory peaks."
+            " An alternative serialization method may be implemented in the future"
+        )
+        warnings.warn(msg)
+        _old_limit = sys.getrecursionlimit()
+        sys.setrecursionlimit(1_000_000_000)
+        with open(path, mode="rb") as f:
+            tree = pickle.load(f)
+        sys.setrecursionlimit(_old_limit)
+        if not isinstance(tree, cls):
+            raise ValueError("Path does not contain a bitbirch object")
+        return tree
+
     def global_clustering(
         self,
         n_clusters: int,
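Taken together, a hedged usage sketch of the renamed keyword and the new persistence helpers (file names and the `fps` array are hypothetical; both helpers warn about memory peaks and temporarily raise the recursion limit, as shown above):

```python
import numpy as np
from bblean.bitbirch import BitBirch

fps = np.load("fps.npy")  # hypothetical packed fingerprint file

tree = BitBirch(branching_factor=50, threshold=0.65, merge_criterion="diameter")
tree.fit(fps)

# 0.7.x spelling: the first set_merge() parameter is now `merge_criterion`
tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)

tree.save("bitbirch.pkl")                 # pickle the whole tree
restored = BitBirch.load("bitbirch.pkl")  # raises ValueError on foreign pickles
```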
bblean/cli.py CHANGED
@@ -1096,26 +1096,29 @@ def _run(
 
    timer.end_timing("total", console, indent=False)
    console.print_peak_mem(out_dir, indent=False)
+   if save_tree:
+       if variant != "lean":
+           console.print("Can't save tree for non-lean variants", style="red")
+       else:
+           # TODO: Find alternative solution
+           tree.save(out_dir / "bitbirch.pkl")
    if variant == "lean":
-       if save_tree:
-           # TODO: BitBIRCH is highly recursive. pickling may crash python,
-           # an alternative solution would be better
-           _old_limit = sys.getrecursionlimit()
-           sys.setrecursionlimit(100_000)
-           with open(out_dir / "bitbirch.pkl", mode="wb") as f:
-               pickle.dump(tree, f)
-           sys.setrecursionlimit(_old_limit)
        tree.delete_internal_nodes()
-       # Dump outputs (peak memory, timings, config, cluster ids)
-       if save_centroids:
+   # Dump outputs (peak memory, timings, config, cluster ids)
+   if save_centroids:
+       if variant != "lean":
+           console.print("Can't save centroids for non-lean variants", style="red")
+           with open(out_dir / "clusters.pkl", mode="wb") as f:
+               pickle.dump(tree.get_cluster_mol_ids(), f)
+       else:
            output = tree.get_centroids_mol_ids()
            with open(out_dir / "clusters.pkl", mode="wb") as f:
                pickle.dump(output["mol_ids"], f)
            with open(out_dir / "cluster-centroids-packed.pkl", mode="wb") as f:
                pickle.dump(output["centroids"], f)
-       else:
-           with open(out_dir / "clusters.pkl", mode="wb") as f:
-               pickle.dump(tree.get_cluster_mol_ids(), f)
+   else:
+       with open(out_dir / "clusters.pkl", mode="wb") as f:
+           pickle.dump(tree.get_cluster_mol_ids(), f)
 
    collect_system_specs_and_dump_config(ctx.params)
    timer.dump(out_dir / "timings.json")
@@ -1193,6 +1196,14 @@ def _multiround(
        bool,
        Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
    ] = True,
+   sort_fps: Annotated[
+       bool,
+       Option(
+           "--sort-fps/--no-sort-fps",
+           help="Sort the fingerprints by popcount before launching the initial round",
+           rich_help_panel="Advanced",
+       ),
+   ] = False,
    mid_merge_criterion: Annotated[
        str,
        Option(
@@ -1386,6 +1397,7 @@ def _multiround(
        midsection_threshold_change=mid_threshold_change,
        tolerance=tolerance,
        # Advanced
+       sort_fps=sort_fps,
        save_tree=save_tree,
        save_centroids=save_centroids,
        bin_size=bin_size,
@@ -1526,6 +1538,13 @@ def _fps_from_smiles(
            ),
        ),
    ] = False,
+   tab_separated: Annotated[
+       bool,
+       Option(
+           "--tab-sep/--no-tab-sep",
+           help="Whether the smiles file has the format <smiles><tab><field><tab>...",
+       ),
+   ] = False,
 ) -> None:
    r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
 
@@ -1631,7 +1650,9 @@
    with mp_context.Pool(processes=num_ps) as pool:
        pool.map(
            create_fp_file,
-           _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
+           _iter_idxs_and_smiles_batches(
+               smiles_paths, num_per_batch, tab_separated
+           ),
        )
    timer.end_timing("total", console, indent=False)
    stem = out_name.split(".")[0]
@@ -1671,7 +1692,9 @@
    with mp_context.Pool(processes=num_ps) as pool:
        pool.starmap(
            fps_array_filler,
-           _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
+           _iter_ranges_and_smiles_batches(
+               smiles_paths, num_per_batch, tab_separated
+           ),
        )
    fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
    mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
@@ -1848,3 +1871,33 @@ def _merge_fps(
        return
    np.save(out_dir / stem, np.concatenate(arrays))
    console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
+
+
+@app.command("fps-sort", rich_help_panel="Fingerprints")
+def _sort_fps(
+    in_file: Annotated[
+        Path,
+        Argument(help="`*.npy` file with packed fingerprints"),
+    ],
+    out_dir: Annotated[
+        Path | None,
+        Option("-o", "--out-dir", show_default=False),
+    ] = None,
+    seed: Annotated[
+        int | None,
+        Option("--seed", hidden=True, rich_help_panel="Debug"),
+    ] = None,
+) -> None:
+    import numpy as np
+    from bblean._py_similarity import _popcount
+
+    fps = np.load(in_file)
+    stem = in_file.stem
+    counts = _popcount(fps)
+    sort_idxs = np.argsort(counts)
+    fps = fps[sort_idxs]
+    if out_dir is None:
+        out_dir = Path.cwd()
+    out_dir.mkdir(exist_ok=True)
+    out_dir = out_dir.resolve()
+    np.save(out_dir / f"sorted-{stem}.npy", fps)
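The new `fps-sort` command amounts to a popcount-ordered permutation of a packed fingerprint file; an equivalent standalone sketch (file names hypothetical, `_popcount` is the private helper the command itself imports):

```python
import numpy as np
from bblean._py_similarity import _popcount

fps = np.load("fps.npy")            # packed uint8 fingerprints
order = np.argsort(_popcount(fps))  # ascending number of on-bits
np.save("sorted-fps.npy", fps[order])
```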
bblean/csrc/similarity.cpp CHANGED
@@ -300,6 +300,75 @@ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
   return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
 }
 
+// NOTE: This is only *slightly* faster for C++ than numpy, **only if the
+// array is uint8_t** if the array is uint64 already, it is slower
+template <typename T>
+py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
+  if (arr.ndim() != 2) {
+    throw std::runtime_error("Input array must be 2-dimensional");
+  }
+  auto arr_ptr = arr.data();
+  auto out = py::array_t<uint64_t>(arr.shape(1));
+  auto out_ptr = out.mutable_data();
+  std::memset(out_ptr, 0, out.nbytes());
+  py::ssize_t n_samples = arr.shape(0);
+  py::ssize_t n_features = arr.shape(1);
+  // Check GCC / CLang vectorize this
+  for (py::ssize_t i = 0; i < n_samples; ++i) {
+    const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
+    for (py::ssize_t j = 0; j < n_features; ++j) {
+      out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
+    }
+  }
+  return out;
+}
+py::array_t<double> _nochecks_jt_compl_isim_unpacked_u8(
+    const py::array_t<uint8_t, py::array::c_style>& fps) {
+  py::ssize_t n_objects = fps.shape(0);
+  py::ssize_t n_features = fps.shape(1);
+  auto out = py::array_t<double>(n_objects);
+  auto out_ptr = out.mutable_data();
+
+  if (n_objects < 3) {
+    PyErr_WarnEx(PyExc_RuntimeWarning,
+                 "Invalid num fps in compl_isim. Expected n_objects >= 3",
+                 1);
+    for (py::ssize_t i{0}; i != n_objects; ++i) {
+      out_ptr[i] = std::numeric_limits<double>::quiet_NaN();
+    }
+    return out;
+  }
+
+  auto linear_sum = add_rows<uint8_t>(fps);
+  auto ls_cptr = linear_sum.data();
+
+  py::array_t<uint64_t> shifted_linear_sum(n_features);
+  auto shifted_ls_ptr = shifted_linear_sum.mutable_data();
+
+  auto in_cptr = fps.data();
+  for (py::ssize_t i{0}; i != n_objects; ++i) {
+    for (py::ssize_t j{0}; j != n_features; ++j) {
+      shifted_ls_ptr[j] = ls_cptr[j] - in_cptr[i * n_features + j];
+    }
+    // For all compl isim N is n_objects - 1
+    out_ptr[i] = jt_isim_from_sum(shifted_linear_sum, n_objects - 1);
+  }
+  return out;
+}
+
+py::array_t<double> jt_compl_isim(
+    const CArrayForcecast<uint8_t>& fps, bool input_is_packed = true,
+    std::optional<py::ssize_t> n_features_opt = std::nullopt) {
+  if (fps.ndim() != 2) {
+    throw std::runtime_error("fps arr must be 2D");
+  }
+  if (input_is_packed) {
+    return _nochecks_jt_compl_isim_unpacked_u8(
+        _nochecks_unpack_fingerprints_2d(fps, n_features_opt));
+  }
+  return _nochecks_jt_compl_isim_unpacked_u8(fps);
+}
+
 // Contraint: T must be uint64_t or uint8_t
 template <typename T>
 void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,
@@ -372,33 +441,10 @@ py::array_t<double> jt_sim_packed_precalc_cardinalities(
 }
 
 py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
-                                           const py::array_t<uint8_t>& vec) {
+                                            const py::array_t<uint8_t>& vec) {
   return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
 }
 
-// NOTE: This is only *slightly* faster for C++ than numpy, **only if the
-// array is uint8_t** if the array is uint64 already, it is slower
-template <typename T>
-py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
-  if (arr.ndim() != 2) {
-    throw std::runtime_error("Input array must be 2-dimensional");
-  }
-  auto arr_ptr = arr.data();
-  auto out = py::array_t<uint64_t>(arr.shape(1));
-  auto out_ptr = out.mutable_data();
-  std::memset(out_ptr, 0, out.nbytes());
-  py::ssize_t n_samples = arr.shape(0);
-  py::ssize_t n_features = arr.shape(1);
-  // Check GCC / CLang vectorize this
-  for (py::ssize_t i = 0; i < n_samples; ++i) {
-    const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
-    for (py::ssize_t j = 0; j < n_features; ++j) {
-      out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
-    }
-  }
-  return out;
-}
-
 double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
   return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
 }
@@ -406,8 +452,9 @@ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
 double jt_isim_packed_u8(
     const CArrayForcecast<uint8_t>& arr,
     std::optional<py::ssize_t> n_features_opt = std::nullopt) {
-  return jt_isim_from_sum(add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
-                          arr.shape(0));
+  return jt_isim_from_sum(
+      add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
+      arr.shape(0));
 }
 
 py::tuple jt_most_dissimilar_packed(
@@ -510,6 +557,10 @@ PYBIND11_MODULE(_cpp_similarity, m) {
   m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
         "iSIM Tanimoto calculation", py::arg("arr"));
 
+  m.def("jt_compl_isim", &jt_compl_isim, "Complementary iSIM tanimoto",
+        py::arg("fps"), py::arg("input_is_packed") = true,
+        py::arg("n_features") = std::nullopt);
+
   m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
        "Tanimoto similarity between a matrix of packed fps and a single "
        "packed fp",
bblean/fingerprints.py CHANGED
@@ -115,7 +115,11 @@ def _get_generator(kind: str, n_features: int) -> tp.Any:
        return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_features)
    elif kind == "ecfp6":
        return rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_features)
-   raise ValueError(f"Unknonw kind {kind}. Should be one of 'rdkit|ecfp4|ecfp6'")
+   elif kind == "topological":
+       return rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_features)
+   elif kind == "ap":
+       return rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_features)
+   raise ValueError(f"Unknown kind {kind}. Use 'rdkit|ecfp4|ecfp6|topological|ap'")
 
 
 def _get_sanitize_flags(sanitize: str) -> tp.Any:
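The two new kinds map directly onto RDKit generators, as the branch above shows; a small sketch of what a caller gets back (molecule and size arbitrary):

```python
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

mol = Chem.MolFromSmiles("c1ccccc1O")  # phenol, arbitrary example
for gen in (
    rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048),  # kind="topological"
    rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048),            # kind="ap"
):
    fp = gen.GetFingerprintAsNumPy(mol)  # 0/1 array of length 2048
    print(fp.sum(), "bits set")
```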
bblean/multiround.py CHANGED
@@ -65,6 +65,7 @@ from bblean._config import DEFAULTS
 from bblean.utils import batched
 from bblean.bitbirch import BitBirch
 from bblean.fingerprints import _get_fps_file_num
+from bblean._py_similarity import _popcount
 
 __all__ = ["run_multiround_bitbirch"]
 
@@ -157,6 +158,7 @@ class _InitialRound:
        max_fps: int | None = None,
        merge_criterion: str = DEFAULTS.merge_criterion,
        input_is_packed: bool = True,
+       sort_fps: bool = False,
    ) -> None:
        self.n_features = n_features
        self.refinement_before_midsection = refinement_before_midsection
@@ -171,6 +173,7 @@ class _InitialRound:
        self.refine_merge_criterion = refine_merge_criterion
        self.input_is_packed = input_is_packed
        self.refine_threshold_change = refine_threshold_change
+       self._sort_fps = sort_fps
 
    def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
        file_label, fp_file, start_idx, end_idx = file_info
@@ -182,6 +185,14 @@ class _InitialRound:
            threshold=self.threshold,
            merge_criterion=self.merge_criterion,
        )
+       if self._sort_fps:
+           fp_input = np.load(fp_file)
+           counts = _popcount(fp_input)
+           sort_idxs = np.argsort(counts)
+           fp_input = fp_input[sort_idxs]
+       else:
+           fp_input = fp_file
+
        range_ = range(start_idx, end_idx)
        tree.fit(
            fp_file,
@@ -201,7 +212,7 @@
        # Finish the first refinement step internally in this round
        tree.reset()
        tree.set_merge(
-           self.refine_merge_criterion,
+           merge_criterion=self.refine_merge_criterion,
            tolerance=self.tolerance,
            threshold=self.threshold + self.refine_threshold_change,
        )
@@ -225,7 +236,7 @@ class _TreeMergingRound:
        round_idx: int,
        out_dir: Path | str,
        split_largest_cluster: bool,
-       criterion: str,
+       merge_criterion: str,
        all_fp_paths: tp.Sequence[Path] = (),
    ) -> None:
        self.all_fp_paths = list(all_fp_paths)
@@ -235,14 +246,14 @@
        self.round_idx = round_idx
        self.out_dir = Path(out_dir)
        self.split_largest_cluster = split_largest_cluster
-       self.criterion = criterion
+       self.merge_criterion = merge_criterion
 
    def __call__(self, batch_info: tuple[str, tp.Sequence[tuple[Path, Path]]]) -> None:
        batch_label, batch_path_pairs = batch_info
        tree = BitBirch(
            branching_factor=self.branching_factor,
            threshold=self.threshold,
-           merge_criterion=self.criterion,
+           merge_criterion=self.merge_criterion,
            tolerance=self.tolerance,
        )
        # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -270,13 +281,20 @@ class _FinalTreeMergingRound(_TreeMergingRound):
        branching_factor: int,
        threshold: float,
        tolerance: float,
-       criterion: str,
+       merge_criterion: str,
        out_dir: Path | str,
        save_tree: bool,
        save_centroids: bool,
    ) -> None:
        super().__init__(
-           branching_factor, threshold, tolerance, -1, out_dir, False, criterion, ()
+           branching_factor,
+           threshold,
+           tolerance,
+           -1,
+           out_dir,
+           False,
+           merge_criterion,
+           (),
        )
        self.save_tree = save_tree
        self.save_centroids = save_centroids
@@ -286,7 +304,7 @@
        tree = BitBirch(
            branching_factor=self.branching_factor,
            threshold=self.threshold,
-           merge_criterion=self.criterion,
+           merge_criterion=self.merge_criterion,
            tolerance=self.tolerance,
        )
        # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -298,13 +316,8 @@
 
        # Save clusters and exit
        if self.save_tree:
-           # TODO: BitBIRCH is highly recursive. pickling may crash python,
-           # an alternative solution would be better
-           _old_limit = sys.getrecursionlimit()
-           sys.setrecursionlimit(100_000)
-           with open(self.out_dir / "bitbirch.pkl", mode="wb") as f:
-               pickle.dump(tree, f)
-           sys.setrecursionlimit(_old_limit)
+           # TODO: Find alternative solution
+           tree.save(self.out_dir / "bitbirch.pkl")
        tree.delete_internal_nodes()
        if self.save_centroids:
            output = tree.get_centroids_mol_ids()
@@ -358,6 +371,7 @@ def run_multiround_bitbirch(
    mp_context: tp.Any = None,
    save_tree: bool = False,
    save_centroids: bool = True,
+   sort_fps: bool = False,
    # Debug
    max_fps: int | None = None,
    verbose: bool = False,
@@ -404,6 +418,7 @@
    console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
 
    initial_fn = _InitialRound(
+       sort_fps=sort_fps,
        n_features=n_features,
        refinement_before_midsection=refinement_before_midsection,
        max_fps=max_fps,
@@ -441,7 +456,7 @@
        round_idx=round_idx,
        all_fp_paths=input_files,
        split_largest_cluster=split_largest_after_each_midsection_round,
-       criterion=midsection_merge_criterion,
+       merge_criterion=midsection_merge_criterion,
        threshold=threshold + midsection_threshold_change,
        **common_kwargs,
    )
@@ -469,7 +484,7 @@
    final_fn = _FinalTreeMergingRound(
        save_tree=save_tree,
        save_centroids=save_centroids,
-       criterion=final_merge_criterion,
+       merge_criterion=final_merge_criterion,
        threshold=threshold + midsection_threshold_change,
        **common_kwargs,
    )
bblean/plotting.py CHANGED
@@ -399,13 +399,17 @@ def dump_mol_images(
    clusters: list[list[int]],
    cluster_idx: int = 0,
    batch_size: int = 30,
+   limit: int = -1,
 ) -> None:
    r"""Dump smiles associated with a specific cluster as ``*.png`` image files"""
    if isinstance(smiles, str):
        smiles = [smiles]
    smiles = np.asarray(smiles)
    idxs = clusters[cluster_idx]
+   num = 0
    for i, idx_seq in enumerate(batched(idxs, batch_size)):
+       if num + len(idx_seq) > limit:
+           idx_seq = idx_seq[: num + len(idx_seq) - limit]
        mols = []
        for smi in smiles[list(idx_seq)]:
            mol = Chem.MolFromSmiles(smi)
@@ -415,6 +419,9 @@
        img = Draw.MolsToGridImage(mols, molsPerRow=5)
        with open(f"cluster_{cluster_idx}_{i}.png", "wb") as f:
            f.write(img.data)
+       num += len(idx_seq)
+       if num >= limit:
+           break
 
 
 # For internal use, dispatches a visualization workflow and optionally saves
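A minimal usage sketch of the extended signature (inputs hypothetical; the function writes `cluster_<idx>_<batch>.png` grids into the working directory, and `limit` caps how many molecules are rendered):

```python
from bblean.plotting import dump_mol_images

smiles = ["CCO", "CCN", "CCC", "c1ccccc1"]  # hypothetical library
clusters = [[0, 2, 3], [1]]                 # molecule indices per cluster
# Render at most 2 molecules of cluster 0, 2 per image grid
dump_mol_images(smiles, clusters, cluster_idx=0, batch_size=2, limit=2)
```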
bblean/similarity.py CHANGED
@@ -34,12 +34,8 @@ __all__ = [
    "jt_sim_matrix_packed",
 ]
 
-from bblean._py_similarity import (
-    centroid_from_sum,
-    centroid,
-    jt_compl_isim,
-    jt_isim_medoid,
-)
+from bblean._py_similarity import centroid_from_sum, centroid
+from bblean.fingerprints import pack_fingerprints, unpack_fingerprints
 
 # jt_isim_packed and jt_isim_unpacked are not exposed, only used within functions for
 # speed
@@ -49,6 +45,7 @@ if os.getenv("BITBIRCH_NO_EXTENSIONS"):
        jt_isim_from_sum,
        jt_isim_unpacked,
        jt_isim_packed,
+       jt_compl_isim,
        _jt_sim_arr_vec_packed,
        jt_most_dissimilar_packed,
    )
@@ -56,11 +53,13 @@ else:
    try:
        from bblean._cpp_similarity import (  # type: ignore
            jt_isim_from_sum,
-           _jt_sim_arr_vec_packed,
            jt_isim_unpacked_u8,
            jt_isim_packed_u8,
+           jt_compl_isim,  # TODO: Does it need wrappers for non-uint8?
+           _jt_sim_arr_vec_packed,
            jt_most_dissimilar_packed,
-           unpack_fingerprints,
+           # Needed for wrappers
+           unpack_fingerprints as _unpack_fingerprints,
        )
 
        # Wrap these two since doing
@@ -80,7 +79,7 @@ else:
        if arr.dtype == np.uint64:
            return jt_isim_from_sum(
                np.sum(
-                   unpack_fingerprints(arr, n_features),  # type: ignore
+                   _unpack_fingerprints(arr, n_features),  # type: ignore
                    axis=0,
                    dtype=np.uint64,
                ),
@@ -93,6 +92,7 @@ else:
            jt_isim_from_sum,
            jt_isim_unpacked,
            jt_isim_packed,
+           jt_compl_isim,
            _jt_sim_arr_vec_packed,
            jt_most_dissimilar_packed,
        )
@@ -103,6 +103,35 @@ else:
        )
 
 
+def jt_isim_medoid(
+    fps: NDArray[np.uint8],
+    input_is_packed: bool = True,
+    n_features: int | None = None,
+    pack: bool = True,
+) -> tuple[int, NDArray[np.uint8]]:
+    r"""Calculate the (Tanimoto) medoid of a set of fingerprints, using iSIM
+
+    Returns both the index of the medoid in the input array and the medoid itself
+
+    .. note::
+        Returns the first (or only) fingerprint for array of size 2 and 1 respectively.
+        Raises ValueError for arrays of size 0
+
+    """
+    if not fps.size:
+        raise ValueError("Size of fingerprints set must be > 0")
+    if input_is_packed:
+        fps = unpack_fingerprints(fps, n_features)
+    if len(fps) < 3:
+        idx = 0  # Medoid undefined for sets of 3 or more fingerprints
+    else:
+        idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
+    m = fps[idx]
+    if pack:
+        return idx, pack_fingerprints(m)
+    return idx, m
+
+
 def jt_isim(
    fps: NDArray[np.integer],
    input_is_packed: bool = True,
@@ -149,7 +178,11 @@ def jt_isim_diameter(
    r"""Calculate the Tanimoto diameter of a set of fingerprints"""
    return jt_isim_diameter_from_sum(
        np.sum(
-           unpack_fingerprints(arr, n_features) if input_is_packed else arr,
+           (
+               unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
+               if input_is_packed
+               else arr
+           ),
            axis=0,
            dtype=np.uint64,
        ),  # type: ignore
@@ -165,7 +198,11 @@ def jt_isim_radius(
    r"""Calculate the Tanimoto radius of a set of fingerprints"""
    return jt_isim_radius_from_sum(
        np.sum(
-           unpack_fingerprints(arr, n_features) if input_is_packed else arr,
+           (
+               unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
+               if input_is_packed
+               else arr
+           ),
            axis=0,
            dtype=np.uint64,
        ),  # type: ignore
@@ -181,7 +218,11 @@ def jt_isim_radius_compl(
    r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
    return jt_isim_radius_compl_from_sum(
        np.sum(
-           unpack_fingerprints(arr, n_features) if input_is_packed else arr,
+           (
+               unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
+               if input_is_packed
+               else arr
+           ),
            axis=0,
            dtype=np.uint64,
        ),  # type: ignore
@@ -252,14 +293,28 @@ def estimate_jt_std(
    n_samples: int | None = None,
    input_is_packed: bool = True,
    n_features: int | None = None,
+   min_samples: int = 1_000_000,
 ) -> float:
-   r"""Estimate std of tanimoto sim using a deterministic sample"""
+   r"""Estimate the std of all pairwise Tanimoto.
+
+   Returns
+   -------
+   std : float
+       The standard deviation of all pairwise Tanimoto among the sampled fingerprints.
+   """
    num_fps = len(fps)
+   if num_fps > min_samples:
+       np.random.seed(42)
+       random_choices = np.random.choice(num_fps, size=min_samples, replace=False)
+       fps = fps[random_choices]
+       num_fps = len(fps)
    if n_samples is None:
-       n_samples = max(num_fps // 1000, 50)
+       # Heuristic: use at least 50 samples, or 1 per 10,000 fingerprints,
+       # to balance statistical representativeness and computational efficiency
+       n_samples = max(num_fps // 10_000, 50)
    sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
 
-   # Work with sample from now on
+   # Work with only the sampled fingerprints
    fps = fps[sample_idxs]
    num_fps = len(fps)
    pairs = np.empty(num_fps * (num_fps - 1) // 2, dtype=np.float64)
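Putting the reshuffled exports to use, a hedged sketch (random unpacked fingerprints stand in for real data; all three names are importable from `bblean.similarity` after this change):

```python
import numpy as np
from bblean.similarity import jt_isim, jt_isim_medoid, estimate_jt_std

rng = np.random.default_rng(0)
fps = (rng.random((100, 256)) > 0.9).astype(np.uint8)  # unpacked 0/1 matrix

avg_sim = jt_isim(fps, input_is_packed=False)      # average pairwise Tanimoto
idx, medoid = jt_isim_medoid(fps, input_is_packed=False)
std = estimate_jt_std(fps, input_is_packed=False)  # subsampled estimate
```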
bblean/sklearn.py CHANGED
@@ -131,8 +131,7 @@ class BitBirch(
            .astype(np.uint8, copy=False)
            .view(np.bool)
        )
-       # TODO: Even when both inputs are bool, this function warns for some reason
-       # I believe this may be a sklearn bug
+       # TODO: Due to a sklearn bug this performs unnecessary casts
        centers = self.subcluster_centers_.astype(np.uint8, copy=False).view(np.bool)
        argmin = pairwise_distances_argmin(X, centers, metric="jaccard")
        return self.subcluster_labels_[argmin]
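The `predict` path above bottoms out in scikit-learn's `pairwise_distances_argmin` over boolean arrays; a standalone sketch of that call (random data; `np.bool_` spelled explicitly for portability across NumPy versions):

```python
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

rng = np.random.default_rng(0)
X = (rng.random((5, 64)) > 0.8).astype(np.bool_)        # query fingerprints
centers = (rng.random((3, 64)) > 0.8).astype(np.bool_)  # subcluster centers
labels = pairwise_distances_argmin(X, centers, metric="jaccard")
```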
bblean/smiles.py CHANGED
@@ -32,23 +32,34 @@ def calc_num_smiles(smiles_paths: SmilesPaths) -> int:
 
 
 def iter_smiles_from_paths(
-    smiles_paths: SmilesPaths,
+    smiles_paths: SmilesPaths, tab_separated: bool = False
 ) -> tp.Iterator[str]:
-    r"""Iterate over smiles in a sequence of smiles paths"""
+    r"""Iterate over smiles in a sequence of smiles paths
+
+    If tab_separated = True the file is assumed to have the format
+    <smiles><tab><field><tab><field>..., and only the smiles is returned
+    """
    if isinstance(smiles_paths, (Path, str)):
        smiles_paths = [smiles_paths]
    for smi_path in smiles_paths:
        with open(smi_path, mode="rt", encoding="utf-8") as f:
            for smi in f:
+               smi = smi if not tab_separated else smi.split("\t")[0]
+               # Skip headers
+               if smi.lower().strip() == "smiles":
+                   continue
                yield smi
 
 
 def _iter_ranges_and_smiles_batches(
    smiles_paths: SmilesPaths,
    num_per_batch: int,
+   tab_separated: bool = False,
 ) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
    start_idx = 0
-   for batch in batched(iter_smiles_from_paths(smiles_paths), num_per_batch):
+   for batch in batched(
+       iter_smiles_from_paths(smiles_paths, tab_separated), num_per_batch
+   ):
        size = len(batch)
        end_idx = start_idx + size
        yield (start_idx, end_idx), batch
@@ -56,6 +67,10 @@ def _iter_ranges_and_smiles_batches(
 
 
 def _iter_idxs_and_smiles_batches(
-    smiles_paths: SmilesPaths, num_per_batch: int
+    smiles_paths: SmilesPaths,
+    num_per_batch: int,
+    tab_separated: bool = False,
 ) -> tp.Iterable[tuple[int, tuple[str, ...]]]:
-    yield from enumerate(batched(iter_smiles_from_paths(smiles_paths), num_per_batch))
+    yield from enumerate(
+        batched(iter_smiles_from_paths(smiles_paths, tab_separated), num_per_batch)
+    )
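A hedged sketch of what `tab_separated=True` (the CLI's `--tab-sep`) now tolerates: an optional 'smiles' header row and extra tab-delimited fields after the SMILES column (file contents hypothetical):

```python
from pathlib import Path
from bblean.smiles import iter_smiles_from_paths

path = Path("library.smi")  # hypothetical input
path.write_text("smiles\tname\nCCO\tethanol\nc1ccccc1\tbenzene\n")

for smi in iter_smiles_from_paths(path, tab_separated=True):
    print(smi.strip())  # CCO, then c1ccccc1 (the header row is skipped)
```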
bblean-0.7.2b0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bblean
-Version: 0.6.0b2
+Version: 0.7.2b0
 Summary: BitBirch-Lean Python package
 Author: The Miranda-Quintana Lab and other BitBirch developers
 Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -90,6 +90,7 @@ macOS via pip, which automatically includes C++ extensions:
 
 ```bash
 pip install bblean
+# Alternatively you can use 'uv pip install'
 bb --help
 ```
 
@@ -235,7 +236,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
 tree.fit(fps)
 
 # Refine the tree (if needed)
-tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
+tree.set_merge("tolerance-diameter", tolerance=0.0)
 tree.refine_inplace(fps)
 
 # Visualize the results
bblean-0.7.2b0.dist-info/RECORD ADDED
@@ -0,0 +1,31 @@
+bblean/__init__.py,sha256=9cudBHEt0H5p0jKEvgrhLZIHPSzwNAx0uJRp-_iM32I,686
+bblean/_config.py,sha256=WaONZilOWCLFdZulqWLKRqNM-ZLhY0YCXfwk-84FYmQ,1813
+bblean/_console.py,sha256=Mk1hi1NdPw2HDmjWj1LLbCuV3vCxL5l6u2gXaEeOFBM,8021
+bblean/_cpp_similarity.cp312-win_amd64.pyd,sha256=1tgp4zCFzFZ2F3a99wGfnaXC5dDWZtfwZoujOgm8d9I,182272
+bblean/_memory.py,sha256=eycXzXV_O_VEyIKpAv3QpbxtpB5WkBLChzm_e2Dqaw0,6892
+bblean/_merges.py,sha256=xwFMJUPJ9VMujf2nSROx0NhsPoQ_R84KIxBF81x2hks,6432
+bblean/_py_similarity.py,sha256=VYWu7gVCEDjNaRLgxiCxCGjCfmTity86UPC0dfT83Ok,9633
+bblean/_timer.py,sha256=D1-_tTQFJqIQgzl4HSE__-P3Scw72EIVlNDaChJT8Qs,1402
+bblean/_version.py,sha256=lDvwo76PevPSZqRGkRtOEgFcX8LHv1s_-G_abs3gvZk,754
+bblean/analysis.py,sha256=apD5OgSoNGbIuBLSJFFzlUkVjZHBtb3fVEeEUJGbyqc,8118
+bblean/bitbirch.py,sha256=OjK0IhdXT83dMdtsEcpQQLbAq6yEBb7z-7QojAkgelA,60279
+bblean/cli.py,sha256=3thYaVWDfiMP8Crs7ShJnNa5E2MCbFoPeK5tVwQVY1w,64043
+bblean/fingerprints.py,sha256=IvIzs2ETnQlUW8nNe_sk3GIgrhGBhrhBBAfubtRkS6A,15542
+bblean/metrics.py,sha256=4KB-PIQJtFMsNg7lG2uM1HEId_eR5vhqcdLpCVLuI5Y,7280
+bblean/multiround.py,sha256=rJMdwUJ6p5hBeNDWuoJMBMzo2doCTcxOjOhC1ZfcS7U,20278
+bblean/plotting.py,sha256=OfVVdmvxaVVeyT7iAIL5QinYZwx5Ivzf8OcsAuY-qp4,15886
+bblean/similarity.py,sha256=O2OTW5Dw64go177jwzF5skvDSJEzDS7UImyIQ2nShig,12192
+bblean/sklearn.py,sha256=KK7rbF3gENjlv5-9uOvH-Q0LEW1RUY__xClcnLznuE0,7450
+bblean/smiles.py,sha256=zyLWXzTLebeFmltDMuJcneJqaLLgGOYw0118889nn7A,2356
+bblean/utils.py,sha256=K0ttSPf54nxrKD1TwbLFuwDIRlAD0jdr6KnuTqXs-HQ,3836
+bblean/_legacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+bblean/_legacy/bb_int64.py,sha256=Otqxu8NBLrfOMpJoMrLgWtDP_9Hn4joQXZVkU1hjges,45774
+bblean/_legacy/bb_uint8.py,sha256=8kbeVAq7MxiR8hS_6lKhSDhVWc6acjLmLzNFCR466iA,41573
+bblean/csrc/README.md,sha256=qOPPK6sTqkYgnlPWtcNu9P3PwuLH8cCNJ1FwJeewsrk,59
+bblean/csrc/similarity.cpp,sha256=q6oMg9Vd0REPmqze8xToTmeXZiEuHTmOfL6QsTRFkDE,23122
+bblean-0.7.2b0.dist-info/licenses/LICENSE,sha256=Dq9t2XHr5wSrykVuVo8etKsAS35ENnDobU1h7t3H_-k,2598
+bblean-0.7.2b0.dist-info/METADATA,sha256=-aZ6OJ4RYBlH3mb6w4c-wOaoFn-4T5u4PmqdeJyjjM8,13053
+bblean-0.7.2b0.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
+bblean-0.7.2b0.dist-info/entry_points.txt,sha256=a0jb2L5JFKioMD6CqbvJiI2unaArGzi-AMZsyY-uyGg,38
+bblean-0.7.2b0.dist-info/top_level.txt,sha256=ybxTonvTC9zR25yR5B27aEDLl6CiwID093ZyS_--Cq4,7
+bblean-0.7.2b0.dist-info/RECORD,,
bblean-0.6.0b2.dist-info/RECORD DELETED
@@ -1,31 +0,0 @@
-bblean/__init__.py,sha256=9cudBHEt0H5p0jKEvgrhLZIHPSzwNAx0uJRp-_iM32I,686
-bblean/_config.py,sha256=WaONZilOWCLFdZulqWLKRqNM-ZLhY0YCXfwk-84FYmQ,1813
-bblean/_console.py,sha256=Mk1hi1NdPw2HDmjWj1LLbCuV3vCxL5l6u2gXaEeOFBM,8021
-bblean/_cpp_similarity.cp312-win_amd64.pyd,sha256=GncQ3lReLTUxYnx66NaGgDA3pjJja2FUiju2NG6hr2g,178688
-bblean/_memory.py,sha256=eycXzXV_O_VEyIKpAv3QpbxtpB5WkBLChzm_e2Dqaw0,6892
-bblean/_merges.py,sha256=xwFMJUPJ9VMujf2nSROx0NhsPoQ_R84KIxBF81x2hks,6432
-bblean/_py_similarity.py,sha256=d1kbEc8lc0MgYsmW6nkFI-tV1Plo12e3bml32_8dkoU,9859
-bblean/_timer.py,sha256=D1-_tTQFJqIQgzl4HSE__-P3Scw72EIVlNDaChJT8Qs,1402
-bblean/_version.py,sha256=Z6NaqO7AvzfKUsoqEpOi7eBkzR_-GLsbF8CpiRFwVJo,746
-bblean/analysis.py,sha256=apD5OgSoNGbIuBLSJFFzlUkVjZHBtb3fVEeEUJGbyqc,8118
-bblean/bitbirch.py,sha256=fRS9dIHu3wx7rJztPYUyEINuv5KsridRpqLYh_DlmT0,58792
-bblean/cli.py,sha256=FwO-jWO9Wt-1CGP8mL_PmbEyJyHPnQxo9BaGT2zLVjE,62506
-bblean/fingerprints.py,sha256=cArsOt-946xjvoKM8qTXc0wfKA39ZFhzIht6MW9x-kQ,15315
-bblean/metrics.py,sha256=4KB-PIQJtFMsNg7lG2uM1HEId_eR5vhqcdLpCVLuI5Y,7280
-bblean/multiround.py,sha256=_-pr5LG_GLSBNZ60uLcy8XZ-qo7lr0Y048Kp041_ug8,19980
-bblean/plotting.py,sha256=1ryJbWJBVY7gkoX_JDyhY4k62spjumz1_V8IhpObzbY,15676
-bblean/similarity.py,sha256=nCrUH0t6k5GMNNWf6gD4r7ZszQEPR3b2qyk5Im7Naa8,10203
-bblean/sklearn.py,sha256=USE5qfGrWLZokz4Ati_RsRIGn1mOwHSCAw82VXD7qhA,7512
-bblean/smiles.py,sha256=fBoU41eLGmxq_uPkX-yWM9SBoPqb7_sWXmy0eo0MtNs,1855
-bblean/utils.py,sha256=K0ttSPf54nxrKD1TwbLFuwDIRlAD0jdr6KnuTqXs-HQ,3836
-bblean/_legacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bblean/_legacy/bb_int64.py,sha256=TJ5vd71iVLHZW1gEit_tAd4nwpJ8PMoWys84e9E8RIk,45770
-bblean/_legacy/bb_uint8.py,sha256=8kbeVAq7MxiR8hS_6lKhSDhVWc6acjLmLzNFCR466iA,41573
-bblean/csrc/README.md,sha256=qOPPK6sTqkYgnlPWtcNu9P3PwuLH8cCNJ1FwJeewsrk,59
-bblean/csrc/similarity.cpp,sha256=7zS76zHywEOnxPqK0kFPxrgsRjTKAD_YrSCYMgb1DJ4,21231
-bblean-0.6.0b2.dist-info/licenses/LICENSE,sha256=Dq9t2XHr5wSrykVuVo8etKsAS35ENnDobU1h7t3H_-k,2598
-bblean-0.6.0b2.dist-info/METADATA,sha256=9TcsxKr-RZCJGp6IFRXERdSsPbkO9GuYDYfx31kKg5w,13023
-bblean-0.6.0b2.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
-bblean-0.6.0b2.dist-info/entry_points.txt,sha256=a0jb2L5JFKioMD6CqbvJiI2unaArGzi-AMZsyY-uyGg,38
-bblean-0.6.0b2.dist-info/top_level.txt,sha256=ybxTonvTC9zR25yR5B27aEDLl6CiwID093ZyS_--Cq4,7
-bblean-0.6.0b2.dist-info/RECORD,,