PyPI - bblean - Versions diffs - 0.6.0b2__tar.gz → 0.7.2b0__tar.gz - Mend

bblean 0.6.0b2__tar.gz → 0.7.2b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

{bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/upload-to-pypi.yaml RENAMED Viewed

@@ -15,10 +15,12 @@ on:
         required: false
         default: false
         type: boolean
+  release:
+    types: [published]
 env:
   PYTHON_VERSION: '3.11'
-  SETUPTOOLS_SCM_PRETEND_VERSION: ${{ github.event.inputs.version }}
+  SETUPTOOLS_SCM_PRETEND_VERSION: ${{ github.event_name == 'release' && github.event.release.tag_name || github.event.inputs.version }}
   # cibuildwheel configuration:
   # Skip py 3.14, 32 bit and musllinux (Alpine) wheels
   CIBW_SKIP: "cp314-* cp314t-* *-manylinux_i686 *-win32 *-musllinux_*"
@@ -29,7 +31,7 @@ env:
   # Build wheels that support both aarch64 and x86_64 on macOS
   CIBW_ARCHS_MACOS: "universal2"
   CIBW_BUILD_VERBOSITY: 3
+  PIP_ONLY_BINARY: "llvmlite,numba"
 jobs:
   make_sdist:
     name: make-source-distribution
@@ -93,7 +95,7 @@ jobs:
   publish_to_testpypi:
     needs: [build_wheels, make_sdist]
     runs-on: ubuntu-latest
-    if: ${{ github.event.inputs.upload-testpypi == 'true' }}
+    if: ${{ github.event_name != 'release' && github.event.inputs.upload-testpypi }}
     environment:
       name: testpypi
       url: https://test.pypi.org/p/bblean
@@ -115,7 +117,7 @@ jobs:
   publish_to_pypi:
     needs: [build_wheels, make_sdist]
     runs-on: ubuntu-latest
-    if: ${{ github.event.inputs.upload-pypi == 'true' }}
+    if: ${{ github.event_name == 'release' || github.event.inputs.upload-pypi }}
     environment:
       name: pypi
       url: https://pypi.org/p/bblean

{bblean-0.6.0b2 → bblean-0.7.2b0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bblean
-Version: 0.6.0b2
+Version: 0.7.2b0
 Summary: BitBirch-Lean Python package
 Author: The Miranda-Quintana Lab and other BitBirch developers
 Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -90,6 +90,7 @@ macOS via pip, which automatically includes C++ extensions:
 ```bash
 pip install bblean
+# Alternatively you can use 'uv pip install'
 bb --help
 ```
@@ -235,7 +236,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
 tree.fit(fps)
 # Refine the tree (if needed)
-tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
+tree.set_merge("tolerance-diameter", tolerance=0.0)
 tree.refine_inplace(fps)
 # Visualize the results

{bblean-0.6.0b2 → bblean-0.7.2b0}/README.md RENAMED Viewed

@@ -47,6 +47,7 @@ macOS via pip, which automatically includes C++ extensions:
 ```bash
 pip install bblean
+# Alternatively you can use 'uv pip install'
 bb --help
 ```
@@ -192,7 +193,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
 tree.fit(fps)
 # Refine the tree (if needed)
-tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
+tree.set_merge("tolerance-diameter", tolerance=0.0)
 tree.refine_inplace(fps)
 # Visualize the results

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/bb_int64.py RENAMED Viewed

@@ -633,6 +633,7 @@ class BitBirch:
             X = X[:max_fps]
         threshold = self.threshold
         branching_factor = self.branching_factor
         n_features = _validate_n_features(X, input_is_packed, n_features)
         d_type = X.dtype
@@ -718,6 +719,7 @@ class BitBirch:
         """
         threshold = self.threshold
         branching_factor = self.branching_factor
         n_features = _validate_n_features(X, input_is_packed, n_features)
         d_type = X.dtype

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_py_similarity.py RENAMED Viewed

@@ -76,18 +76,10 @@ def jt_compl_isim(
         warnings.warn(msg, RuntimeWarning, stacklevel=2)
         return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
     linear_sum = np.sum(fps, axis=0)
-    n_objects = len(fps) - 1
     comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
     return np.array(comp_sims, dtype=np.float64)
-def _jt_isim_medoid_index(
-    fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
-) -> int:
-    return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
 def jt_isim_medoid(
     fps: NDArray[np.uint8],
     input_is_packed: bool = True,
@@ -110,7 +102,7 @@ def jt_isim_medoid(
     if len(fps) < 3:
         idx = 0  # Medoid undefined for sets of fewer than 3 fingerprints
     else:
-        idx = _jt_isim_medoid_index(fps, input_is_packed=False)
+        idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
     m = fps[idx]
     if pack:
         return idx, pack_fingerprints(m)

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.6.0b2'
-__version_tuple__ = version_tuple = (0, 6, 0, 'b2')
+__version__ = version = '0.7.2.b0'
+__version_tuple__ = version_tuple = (0, 7, 2, 'b0')
 __commit_id__ = commit_id = None

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/bitbirch.py RENAMED Viewed

@@ -47,6 +47,8 @@
 # ./LICENSES/GPL-3.0-only.txt.  If not, see <http://www.gnu.org/licenses/gpl-3.0.html>.
 r"""BitBirch 'Lean' class for fast, memory-efficient O(N) clustering"""
 from __future__ import annotations  # Stringize type annotations for no runtime overhead
+import pickle
+import sys
 import typing_extensions as tpx
 import os
 import random
@@ -646,7 +648,7 @@ class BitBirch:
     @merge_criterion.setter
     def merge_criterion(self, value: str) -> None:
-        self.set_merge(criterion=value)
+        self.set_merge(merge_criterion=value)
     @property
     def tolerance(self) -> float | None:
@@ -671,7 +673,7 @@ class BitBirch:
     def set_merge(
         self,
-        criterion: str | MergeAcceptFunction | None = None,
+        merge_criterion: str | MergeAcceptFunction | None = None,
         *,
         tolerance: float | None = None,
         threshold: float | None = None,
@@ -687,10 +689,10 @@ class BitBirch:
                 "the global set_merge() function has *not* been used"
             )
         _tolerance = 0.05 if tolerance is None else tolerance
-        if isinstance(criterion, MergeAcceptFunction):
-            self._merge_accept_fn = criterion
-        elif isinstance(criterion, str):
-            self._merge_accept_fn = get_merge_accept_fn(criterion, _tolerance)
+        if isinstance(merge_criterion, MergeAcceptFunction):
+            self._merge_accept_fn = merge_criterion
+        elif isinstance(merge_criterion, str):
+            self._merge_accept_fn = get_merge_accept_fn(merge_criterion, _tolerance)
         if hasattr(self._merge_accept_fn, "tolerance"):
             self._merge_accept_fn.tolerance = _tolerance
         elif tolerance is not None:
@@ -1316,6 +1318,40 @@ class BitBirch:
             parts.append(f"tolerance={self.tolerance}")
         return f"{self.__class__.__name__}({', '.join(parts)})"
+    def save(self, path: Path | str) -> None:
+        r""":meta private:"""
+        # TODO: BitBIRCH is highly recursive. pickling may crash python,
+        # an alternative solution would be better
+        msg = (
+            "Saving large BitBIRCH trees may result in large memory peaks."
+            " An alternative serialization method may be implemented in the future"
+        )
+        warnings.warn(msg)
+        _old_limit = sys.getrecursionlimit()
+        sys.setrecursionlimit(1_000_000_000)
+        with open(path, mode="wb") as f:
+            pickle.dump(self, f)
+        sys.setrecursionlimit(_old_limit)
+    @classmethod
+    def load(cls, path: Path | str) -> tpx.Self:
+        r""":meta private:"""
+        # TODO: BitBIRCH is highly recursive. pickling may crash python,
+        # an alternative solution would be better
+        msg = (
+            "Loading large BitBIRCH trees may result in large memory peaks."
+            " An alternative serialization method may be implemented in the future"
+        )
+        warnings.warn(msg)
+        _old_limit = sys.getrecursionlimit()
+        sys.setrecursionlimit(1_000_000_000)
+        with open(path, mode="rb") as f:
+            tree = pickle.load(f)
+        sys.setrecursionlimit(_old_limit)
+        if not isinstance(tree, cls):
+            raise ValueError("Path does not contain a bitbirch object")
+        return tree
     def global_clustering(
         self,
         n_clusters: int,

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/cli.py RENAMED Viewed

@@ -1096,26 +1096,29 @@ def _run(
     timer.end_timing("total", console, indent=False)
     console.print_peak_mem(out_dir, indent=False)
+    if save_tree:
+        if variant != "lean":
+            console.print("Can't save tree for non-lean variants", style="red")
+        else:
+            # TODO: Find alternative solution
+            tree.save(out_dir / "bitbirch.pkl")
     if variant == "lean":
-        if save_tree:
-            # TODO: BitBIRCH is highly recursive. pickling may crash python,
-            # an alternative solution would be better
-            _old_limit = sys.getrecursionlimit()
-            sys.setrecursionlimit(100_000)
-            with open(out_dir / "bitbirch.pkl", mode="wb") as f:
-                pickle.dump(tree, f)
-            sys.setrecursionlimit(_old_limit)
         tree.delete_internal_nodes()
-        # Dump outputs (peak memory, timings, config, cluster ids)
-        if save_centroids:
+    # Dump outputs (peak memory, timings, config, cluster ids)
+    if save_centroids:
+        if variant != "lean":
+            console.print("Can't save centroids for non-lean variants", style="red")
+            with open(out_dir / "clusters.pkl", mode="wb") as f:
+                pickle.dump(tree.get_cluster_mol_ids(), f)
+        else:
             output = tree.get_centroids_mol_ids()
             with open(out_dir / "clusters.pkl", mode="wb") as f:
                 pickle.dump(output["mol_ids"], f)
             with open(out_dir / "cluster-centroids-packed.pkl", mode="wb") as f:
                 pickle.dump(output["centroids"], f)
-        else:
-            with open(out_dir / "clusters.pkl", mode="wb") as f:
-                pickle.dump(tree.get_cluster_mol_ids(), f)
+    else:
+        with open(out_dir / "clusters.pkl", mode="wb") as f:
+            pickle.dump(tree.get_cluster_mol_ids(), f)
     collect_system_specs_and_dump_config(ctx.params)
     timer.dump(out_dir / "timings.json")
@@ -1193,6 +1196,14 @@ def _multiround(
         bool,
         Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
     ] = True,
+    sort_fps: Annotated[
+        bool,
+        Option(
+            "--sort-fps/--no-sort-fps",
+            help="Sort the fingerprints by popcount before launching the initial round",
+            rich_help_panel="Advanced",
+        ),
+    ] = False,
     mid_merge_criterion: Annotated[
         str,
         Option(
@@ -1386,6 +1397,7 @@ def _multiround(
         midsection_threshold_change=mid_threshold_change,
         tolerance=tolerance,
         # Advanced
+        sort_fps=sort_fps,
         save_tree=save_tree,
         save_centroids=save_centroids,
         bin_size=bin_size,
@@ -1526,6 +1538,13 @@ def _fps_from_smiles(
             ),
         ),
     ] = False,
+    tab_separated: Annotated[
+        bool,
+        Option(
+            "--tab-sep/--no-tab-sep",
+            help="Whether the smiles file has the format <smiles><tab><field><tab>...",
+        ),
+    ] = False,
 ) -> None:
     r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
@@ -1631,7 +1650,9 @@ def _fps_from_smiles(
             with mp_context.Pool(processes=num_ps) as pool:
                 pool.map(
                     create_fp_file,
-                    _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
+                    _iter_idxs_and_smiles_batches(
+                        smiles_paths, num_per_batch, tab_separated
+                    ),
                 )
         timer.end_timing("total", console, indent=False)
         stem = out_name.split(".")[0]
@@ -1671,7 +1692,9 @@ def _fps_from_smiles(
         with mp_context.Pool(processes=num_ps) as pool:
             pool.starmap(
                 fps_array_filler,
-                _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
+                _iter_ranges_and_smiles_batches(
+                    smiles_paths, num_per_batch, tab_separated
+                ),
             )
         fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
         mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
@@ -1848,3 +1871,33 @@ def _merge_fps(
             return
         np.save(out_dir / stem, np.concatenate(arrays))
     console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
+@app.command("fps-sort", rich_help_panel="Fingerprints")
+def _sort_fps(
+    in_file: Annotated[
+        Path,
+        Argument(help="`*.npy` file with packed fingerprints"),
+    ],
+    out_dir: Annotated[
+        Path | None,
+        Option("-o", "--out-dir", show_default=False),
+    ] = None,
+    seed: Annotated[
+        int | None,
+        Option("--seed", hidden=True, rich_help_panel="Debug"),
+    ] = None,
+) -> None:
+    import numpy as np
+    from bblean._py_similarity import _popcount
+    fps = np.load(in_file)
+    stem = in_file.stem
+    counts = _popcount(fps)
+    sort_idxs = np.argsort(counts)
+    fps = fps[sort_idxs]
+    if out_dir is None:
+        out_dir = Path.cwd()
+    out_dir.mkdir(exist_ok=True)
+    out_dir = out_dir.resolve()
+    np.save(out_dir / f"sorted-{stem}.npy", fps)

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/csrc/similarity.cpp RENAMED Viewed

@@ -300,6 +300,75 @@ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
     return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
 }
+// NOTE: In C++ this is only *slightly* faster than numpy, and **only if the
+// array is uint8_t**; if the array is already uint64, it is slower
+template <typename T>
+py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
+    if (arr.ndim() != 2) {
+        throw std::runtime_error("Input array must be 2-dimensional");
+    }
+    auto arr_ptr = arr.data();
+    auto out = py::array_t<uint64_t>(arr.shape(1));
+    auto out_ptr = out.mutable_data();
+    std::memset(out_ptr, 0, out.nbytes());
+    py::ssize_t n_samples = arr.shape(0);
+    py::ssize_t n_features = arr.shape(1);
+    // Check GCC / CLang vectorize this
+    for (py::ssize_t i = 0; i < n_samples; ++i) {
+        const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
+        for (py::ssize_t j = 0; j < n_features; ++j) {
+            out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
+        }
+    }
+    return out;
+}
+py::array_t<double> _nochecks_jt_compl_isim_unpacked_u8(
+    const py::array_t<uint8_t, py::array::c_style>& fps) {
+    py::ssize_t n_objects = fps.shape(0);
+    py::ssize_t n_features = fps.shape(1);
+    auto out = py::array_t<double>(n_objects);
+    auto out_ptr = out.mutable_data();
+    if (n_objects < 3) {
+        PyErr_WarnEx(PyExc_RuntimeWarning,
+                     "Invalid num fps in compl_isim. Expected n_objects >= 3",
+                     1);
+        for (py::ssize_t i{0}; i != n_objects; ++i) {
+            out_ptr[i] = std::numeric_limits<double>::quiet_NaN();
+        }
+        return out;
+    }
+    auto linear_sum = add_rows<uint8_t>(fps);
+    auto ls_cptr = linear_sum.data();
+    py::array_t<uint64_t> shifted_linear_sum(n_features);
+    auto shifted_ls_ptr = shifted_linear_sum.mutable_data();
+    auto in_cptr = fps.data();
+    for (py::ssize_t i{0}; i != n_objects; ++i) {
+        for (py::ssize_t j{0}; j != n_features; ++j) {
+            shifted_ls_ptr[j] = ls_cptr[j] - in_cptr[i * n_features + j];
+        }
+        // For all compl isim N is n_objects - 1
+        out_ptr[i] = jt_isim_from_sum(shifted_linear_sum, n_objects - 1);
+    }
+    return out;
+}
+py::array_t<double> jt_compl_isim(
+    const CArrayForcecast<uint8_t>& fps, bool input_is_packed = true,
+    std::optional<py::ssize_t> n_features_opt = std::nullopt) {
+    if (fps.ndim() != 2) {
+        throw std::runtime_error("fps arr must be 2D");
+    }
+    if (input_is_packed) {
+        return _nochecks_jt_compl_isim_unpacked_u8(
+            _nochecks_unpack_fingerprints_2d(fps, n_features_opt));
+    }
+    return _nochecks_jt_compl_isim_unpacked_u8(fps);
+}
 // Constraint: T must be uint64_t or uint8_t
 template <typename T>
 void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,
@@ -372,33 +441,10 @@ py::array_t<double> jt_sim_packed_precalc_cardinalities(
 }
 py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
-                                  const py::array_t<uint8_t>& vec) {
+                                           const py::array_t<uint8_t>& vec) {
     return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
 }
-// NOTE: This is only *slightly* faster for C++ than numpy, **only if the
-// array is uint8_t** if the array is uint64 already, it is slower
-template <typename T>
-py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
-    if (arr.ndim() != 2) {
-        throw std::runtime_error("Input array must be 2-dimensional");
-    }
-    auto arr_ptr = arr.data();
-    auto out = py::array_t<uint64_t>(arr.shape(1));
-    auto out_ptr = out.mutable_data();
-    std::memset(out_ptr, 0, out.nbytes());
-    py::ssize_t n_samples = arr.shape(0);
-    py::ssize_t n_features = arr.shape(1);
-    // Check GCC / CLang vectorize this
-    for (py::ssize_t i = 0; i < n_samples; ++i) {
-        const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
-        for (py::ssize_t j = 0; j < n_features; ++j) {
-            out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
-        }
-    }
-    return out;
-}
 double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
     return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
 }
@@ -406,8 +452,9 @@ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
 double jt_isim_packed_u8(
     const CArrayForcecast<uint8_t>& arr,
     std::optional<py::ssize_t> n_features_opt = std::nullopt) {
-    return jt_isim_from_sum(add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
-                            arr.shape(0));
+    return jt_isim_from_sum(
+        add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
+        arr.shape(0));
 }
 py::tuple jt_most_dissimilar_packed(
@@ -510,6 +557,10 @@ PYBIND11_MODULE(_cpp_similarity, m) {
     m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
           "iSIM Tanimoto calculation", py::arg("arr"));
+    m.def("jt_compl_isim", &jt_compl_isim, "Complementary iSIM tanimoto",
+          py::arg("fps"), py::arg("input_is_packed") = true,
+          py::arg("n_features") = std::nullopt);
     m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
           "Tanimoto similarity between a matrix of packed fps and a single "
           "packed fp",

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/fingerprints.py RENAMED Viewed

@@ -115,7 +115,11 @@ def _get_generator(kind: str, n_features: int) -> tp.Any:
         return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_features)
     elif kind == "ecfp6":
         return rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_features)
-    raise ValueError(f"Unknonw kind {kind}. Should be one of 'rdkit|ecfp4|ecfp6'")
+    elif kind == "topological":
+        return rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_features)
+    elif kind == "ap":
+        return rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_features)
+    raise ValueError(f"Unknown kind {kind}. Use 'rdkit|ecfp4|ecfp6|topological|ap'")
 def _get_sanitize_flags(sanitize: str) -> tp.Any:

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/multiround.py RENAMED Viewed

@@ -65,6 +65,7 @@ from bblean._config import DEFAULTS
 from bblean.utils import batched
 from bblean.bitbirch import BitBirch
 from bblean.fingerprints import _get_fps_file_num
+from bblean._py_similarity import _popcount
 __all__ = ["run_multiround_bitbirch"]
@@ -157,6 +158,7 @@ class _InitialRound:
         max_fps: int | None = None,
         merge_criterion: str = DEFAULTS.merge_criterion,
         input_is_packed: bool = True,
+        sort_fps: bool = False,
     ) -> None:
         self.n_features = n_features
         self.refinement_before_midsection = refinement_before_midsection
@@ -171,6 +173,7 @@ class _InitialRound:
         self.refine_merge_criterion = refine_merge_criterion
         self.input_is_packed = input_is_packed
         self.refine_threshold_change = refine_threshold_change
+        self._sort_fps = sort_fps
     def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
         file_label, fp_file, start_idx, end_idx = file_info
@@ -182,6 +185,14 @@ class _InitialRound:
             threshold=self.threshold,
             merge_criterion=self.merge_criterion,
         )
+        if self._sort_fps:
+            fp_input = np.load(fp_file)
+            counts = _popcount(fp_input)
+            sort_idxs = np.argsort(counts)
+            fp_input = fp_input[sort_idxs]
+        else:
+            fp_input = fp_file
         range_ = range(start_idx, end_idx)
         tree.fit(
             fp_file,
@@ -201,7 +212,7 @@ class _InitialRound:
                 # Finish the first refinement step internally in this round
                 tree.reset()
                 tree.set_merge(
-                    self.refine_merge_criterion,
+                    merge_criterion=self.refine_merge_criterion,
                     tolerance=self.tolerance,
                     threshold=self.threshold + self.refine_threshold_change,
                 )
@@ -225,7 +236,7 @@ class _TreeMergingRound:
         round_idx: int,
         out_dir: Path | str,
         split_largest_cluster: bool,
-        criterion: str,
+        merge_criterion: str,
         all_fp_paths: tp.Sequence[Path] = (),
     ) -> None:
         self.all_fp_paths = list(all_fp_paths)
@@ -235,14 +246,14 @@ class _TreeMergingRound:
         self.round_idx = round_idx
         self.out_dir = Path(out_dir)
         self.split_largest_cluster = split_largest_cluster
-        self.criterion = criterion
+        self.merge_criterion = merge_criterion
     def __call__(self, batch_info: tuple[str, tp.Sequence[tuple[Path, Path]]]) -> None:
         batch_label, batch_path_pairs = batch_info
         tree = BitBirch(
             branching_factor=self.branching_factor,
             threshold=self.threshold,
-            merge_criterion=self.criterion,
+            merge_criterion=self.merge_criterion,
             tolerance=self.tolerance,
         )
         # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -270,13 +281,20 @@ class _FinalTreeMergingRound(_TreeMergingRound):
         branching_factor: int,
         threshold: float,
         tolerance: float,
-        criterion: str,
+        merge_criterion: str,
         out_dir: Path | str,
         save_tree: bool,
         save_centroids: bool,
     ) -> None:
         super().__init__(
-            branching_factor, threshold, tolerance, -1, out_dir, False, criterion, ()
+            branching_factor,
+            threshold,
+            tolerance,
+            -1,
+            out_dir,
+            False,
+            merge_criterion,
+            (),
         )
         self.save_tree = save_tree
         self.save_centroids = save_centroids
@@ -286,7 +304,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
         tree = BitBirch(
             branching_factor=self.branching_factor,
             threshold=self.threshold,
-            merge_criterion=self.criterion,
+            merge_criterion=self.merge_criterion,
             tolerance=self.tolerance,
         )
         # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -298,13 +316,8 @@ class _FinalTreeMergingRound(_TreeMergingRound):
         # Save clusters and exit
         if self.save_tree:
-            # TODO: BitBIRCH is highly recursive. pickling may crash python,
-            # an alternative solution would be better
-            _old_limit = sys.getrecursionlimit()
-            sys.setrecursionlimit(100_000)
-            with open(self.out_dir / "bitbirch.pkl", mode="wb") as f:
-                pickle.dump(tree, f)
-            sys.setrecursionlimit(_old_limit)
+            # TODO: Find alternative solution
+            tree.save(self.out_dir / "bitbirch.pkl")
         tree.delete_internal_nodes()
         if self.save_centroids:
             output = tree.get_centroids_mol_ids()
@@ -358,6 +371,7 @@ def run_multiround_bitbirch(
     mp_context: tp.Any = None,
     save_tree: bool = False,
     save_centroids: bool = True,
+    sort_fps: bool = False,
     # Debug
     max_fps: int | None = None,
     verbose: bool = False,
@@ -404,6 +418,7 @@ def run_multiround_bitbirch(
     console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
     initial_fn = _InitialRound(
+        sort_fps=sort_fps,
         n_features=n_features,
         refinement_before_midsection=refinement_before_midsection,
         max_fps=max_fps,
@@ -441,7 +456,7 @@ def run_multiround_bitbirch(
             round_idx=round_idx,
             all_fp_paths=input_files,
             split_largest_cluster=split_largest_after_each_midsection_round,
-            criterion=midsection_merge_criterion,
+            merge_criterion=midsection_merge_criterion,
             threshold=threshold + midsection_threshold_change,
             **common_kwargs,
         )
@@ -469,7 +484,7 @@ def run_multiround_bitbirch(
     final_fn = _FinalTreeMergingRound(
         save_tree=save_tree,
         save_centroids=save_centroids,
-        criterion=final_merge_criterion,
+        merge_criterion=final_merge_criterion,
         threshold=threshold + midsection_threshold_change,
         **common_kwargs,
     )

{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/plotting.py RENAMED Viewed

@@ -399,13 +399,17 @@ def dump_mol_images(
     clusters: list[list[int]],
     cluster_idx: int = 0,
     batch_size: int = 30,
+    limit: int = -1,
 ) -> None:
     r"""Dump smiles associated with a specific cluster as ``*.png`` image files"""
     if isinstance(smiles, str):
         smiles = [smiles]
     smiles = np.asarray(smiles)
     idxs = clusters[cluster_idx]
+    num = 0
     for i, idx_seq in enumerate(batched(idxs, batch_size)):
+        if num + len(idx_seq) > limit:
+            idx_seq = idx_seq[: num + len(idx_seq) - limit]
         mols = []
         for smi in smiles[list(idx_seq)]:
             mol = Chem.MolFromSmiles(smi)
@@ -415,6 +419,9 @@ def dump_mol_images(
         img = Draw.MolsToGridImage(mols, molsPerRow=5)
         with open(f"cluster_{cluster_idx}_{i}.png", "wb") as f:
             f.write(img.data)
+        num += len(idx_seq)
+        if num >= limit:
+            break
 # For internal use, dispatches a visualization workflow and optionally saves

bblean 0.6.0b2__tar.gz → 0.7.2b0__tar.gz

bblean 0.6.0b2__tar.gz → 0.7.2b0__tar.gz