bblean 0.6.1b0__tar.gz → 0.7.2b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/workflows/upload-to-pypi.yaml +1 -1
  2. {bblean-0.6.1b0 → bblean-0.7.2b0}/PKG-INFO +2 -2
  3. {bblean-0.6.1b0 → bblean-0.7.2b0}/README.md +1 -1
  4. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_py_similarity.py +1 -9
  5. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_version.py +2 -2
  6. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/bitbirch.py +6 -6
  7. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/cli.py +53 -3
  8. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/csrc/similarity.cpp +77 -26
  9. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/fingerprints.py +5 -1
  10. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/multiround.py +30 -10
  11. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/similarity.py +70 -15
  12. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/smiles.py +20 -5
  13. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/PKG-INFO +2 -2
  14. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/SOURCES.txt +5 -2
  15. {bblean-0.6.1b0/examples → bblean-0.7.2b0/docs/src/user-guide/notebooks}/bitbirch_quickstart.ipynb +1 -1
  16. bblean-0.7.2b0/examples/best_practices/best_practices_functions.py +188 -0
  17. bblean-0.7.2b0/examples/best_practices/best_practices_plots.py +465 -0
  18. bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices.ipynb +601 -0
  19. bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +571 -0
  20. bblean-0.7.2b0/examples/best_practices/bitbirch_parameter.ipynb +1755 -0
  21. {bblean-0.6.1b0/docs/src/user-guide/notebooks → bblean-0.7.2b0/examples}/bitbirch_quickstart.ipynb +1 -1
  22. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_cli.py +60 -5
  23. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_global_clustering.py +5 -25
  24. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_similarity.py +12 -1
  25. bblean-0.6.1b0/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -526
  26. bblean-0.6.1b0/examples/bitbirch_best_practices.ipynb +0 -526
  27. {bblean-0.6.1b0 → bblean-0.7.2b0}/.cruft.json +0 -0
  28. {bblean-0.6.1b0 → bblean-0.7.2b0}/.flake8 +0 -0
  29. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/CODEOWNERS +0 -0
  30. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/workflows/ci-cpp.yaml +0 -0
  31. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/workflows/ci.yaml +0 -0
  32. {bblean-0.6.1b0 → bblean-0.7.2b0}/.gitignore +0 -0
  33. {bblean-0.6.1b0 → bblean-0.7.2b0}/.pre-commit-config.yaml +0 -0
  34. {bblean-0.6.1b0 → bblean-0.7.2b0}/LICENSE +0 -0
  35. {bblean-0.6.1b0 → bblean-0.7.2b0}/LICENSES/BSD-3-Clause.txt +0 -0
  36. {bblean-0.6.1b0 → bblean-0.7.2b0}/LICENSES/GPL-3.0-only.txt +0 -0
  37. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/__init__.py +0 -0
  38. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_config.py +0 -0
  39. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_console.py +0 -0
  40. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_legacy/__init__.py +0 -0
  41. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_legacy/bb_int64.py +0 -0
  42. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_legacy/bb_uint8.py +0 -0
  43. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_memory.py +0 -0
  44. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_merges.py +0 -0
  45. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_timer.py +0 -0
  46. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/analysis.py +0 -0
  47. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/csrc/README.md +0 -0
  48. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/metrics.py +0 -0
  49. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/plotting.py +0 -0
  50. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/sklearn.py +0 -0
  51. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/utils.py +0 -0
  52. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean-demo-v2.gif +0 -0
  53. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean-demo.cast +0 -0
  54. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/dependency_links.txt +0 -0
  55. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/entry_points.txt +0 -0
  56. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/requires.txt +0 -0
  57. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/top_level.txt +0 -0
  58. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/api.svg +0 -0
  59. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/installing.svg +0 -0
  60. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/logo-dark-bw.svg +0 -0
  61. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/logo-light-bw.svg +0 -0
  62. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/publications.svg +0 -0
  63. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/style.css +0 -0
  64. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/user-guide.svg +0 -0
  65. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_templates/module.rst +0 -0
  66. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/api-reference.rst +0 -0
  67. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/conf.py +0 -0
  68. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/index.rst +0 -0
  69. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/installing.rst +0 -0
  70. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/publications.rst +0 -0
  71. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/user-guide/linux_memory_setup.rst +0 -0
  72. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/user-guide/parameters.rst +0 -0
  73. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/user-guide.rst +0 -0
  74. {bblean-0.6.1b0 → bblean-0.7.2b0}/environment.yaml +0 -0
  75. {bblean-0.6.1b0 → bblean-0.7.2b0}/examples/biogen_logS.csv +0 -0
  76. {bblean-0.6.1b0 → bblean-0.7.2b0}/examples/chembl-33-natural-products-subset.smi +0 -0
  77. {bblean-0.6.1b0 → bblean-0.7.2b0}/examples/dataset_splitting.ipynb +0 -0
  78. {bblean-0.6.1b0 → bblean-0.7.2b0}/pyproject.toml +0 -0
  79. {bblean-0.6.1b0 → bblean-0.7.2b0}/setup.cfg +0 -0
  80. {bblean-0.6.1b0 → bblean-0.7.2b0}/setup.py +0 -0
  81. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/chembl-sample-3k.smi +0 -0
  82. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/chembl-sample-bad.smi +0 -0
  83. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/legacy_merges.py +0 -0
  84. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/legacy_metrics.py +0 -0
  85. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_bb_consistency.py +0 -0
  86. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_fake_fps.py +0 -0
  87. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_fingerprints.py +0 -0
  88. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_import_bblean.py +0 -0
  89. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_merges.py +0 -0
  90. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_metrics.py +0 -0
  91. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_multiround.py +0 -0
  92. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_refine.py +0 -0
  93. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_regression.py +0 -0
  94. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_sampling.py +0 -0
  95. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_simple.py +0 -0
  96. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_sklearn.py +0 -0
  97. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_utils.py +0 -0
@@ -31,7 +31,7 @@ env:
31
31
  # Build wheels that support both aarch64 and x86_64 on macOS
32
32
  CIBW_ARCHS_MACOS: "universal2"
33
33
  CIBW_BUILD_VERBOSITY: 3
34
-
34
+ PIP_ONLY_BINARY: "llvmlite,numba"
35
35
  jobs:
36
36
  make_sdist:
37
37
  name: make-source-distribution
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.6.1b0
3
+ Version: 0.7.2b0
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -236,7 +236,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
236
236
  tree.fit(fps)
237
237
 
238
238
  # Refine the tree (if needed)
239
- tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
239
+ tree.set_merge("tolerance-diameter", tolerance=0.0)
240
240
  tree.refine_inplace(fps)
241
241
 
242
242
  # Visualize the results
@@ -193,7 +193,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
193
193
  tree.fit(fps)
194
194
 
195
195
  # Refine the tree (if needed)
196
- tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
196
+ tree.set_merge("tolerance-diameter", tolerance=0.0)
197
197
  tree.refine_inplace(fps)
198
198
 
199
199
  # Visualize the results
@@ -76,18 +76,10 @@ def jt_compl_isim(
76
76
  warnings.warn(msg, RuntimeWarning, stacklevel=2)
77
77
  return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
78
78
  linear_sum = np.sum(fps, axis=0)
79
- n_objects = len(fps) - 1
80
79
  comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
81
-
82
80
  return np.array(comp_sims, dtype=np.float64)
83
81
 
84
82
 
85
- def _jt_isim_medoid_index(
86
- fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
87
- ) -> int:
88
- return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
89
-
90
-
91
83
  def jt_isim_medoid(
92
84
  fps: NDArray[np.uint8],
93
85
  input_is_packed: bool = True,
@@ -110,7 +102,7 @@ def jt_isim_medoid(
110
102
  if len(fps) < 3:
111
103
  idx = 0 # Medoid undefined for sets of 3 or more fingerprints
112
104
  else:
113
- idx = _jt_isim_medoid_index(fps, input_is_packed=False)
105
+ idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
114
106
  m = fps[idx]
115
107
  if pack:
116
108
  return idx, pack_fingerprints(m)
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.6.1b0'
32
- __version_tuple__ = version_tuple = (0, 6, 1, 'b0')
31
+ __version__ = version = '0.7.2b0'
32
+ __version_tuple__ = version_tuple = (0, 7, 2, 'b0')
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -648,7 +648,7 @@ class BitBirch:
648
648
 
649
649
  @merge_criterion.setter
650
650
  def merge_criterion(self, value: str) -> None:
651
- self.set_merge(criterion=value)
651
+ self.set_merge(merge_criterion=value)
652
652
 
653
653
  @property
654
654
  def tolerance(self) -> float | None:
@@ -673,7 +673,7 @@ class BitBirch:
673
673
 
674
674
  def set_merge(
675
675
  self,
676
- criterion: str | MergeAcceptFunction | None = None,
676
+ merge_criterion: str | MergeAcceptFunction | None = None,
677
677
  *,
678
678
  tolerance: float | None = None,
679
679
  threshold: float | None = None,
@@ -689,10 +689,10 @@ class BitBirch:
689
689
  "the global set_merge() function has *not* been used"
690
690
  )
691
691
  _tolerance = 0.05 if tolerance is None else tolerance
692
- if isinstance(criterion, MergeAcceptFunction):
693
- self._merge_accept_fn = criterion
694
- elif isinstance(criterion, str):
695
- self._merge_accept_fn = get_merge_accept_fn(criterion, _tolerance)
692
+ if isinstance(merge_criterion, MergeAcceptFunction):
693
+ self._merge_accept_fn = merge_criterion
694
+ elif isinstance(merge_criterion, str):
695
+ self._merge_accept_fn = get_merge_accept_fn(merge_criterion, _tolerance)
696
696
  if hasattr(self._merge_accept_fn, "tolerance"):
697
697
  self._merge_accept_fn.tolerance = _tolerance
698
698
  elif tolerance is not None:
@@ -1101,7 +1101,7 @@ def _run(
1101
1101
  console.print("Can't save tree for non-lean variants", style="red")
1102
1102
  else:
1103
1103
  # TODO: Find alternative solution
1104
- tree.save_pickle(out_dir / "bitbirch.pkl")
1104
+ tree.save(out_dir / "bitbirch.pkl")
1105
1105
  if variant == "lean":
1106
1106
  tree.delete_internal_nodes()
1107
1107
  # Dump outputs (peak memory, timings, config, cluster ids)
@@ -1196,6 +1196,14 @@ def _multiround(
1196
1196
  bool,
1197
1197
  Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
1198
1198
  ] = True,
1199
+ sort_fps: Annotated[
1200
+ bool,
1201
+ Option(
1202
+ "--sort-fps/--no-sort-fps",
1203
+ help="Sort the fingerprints by popcount before launching the initial round",
1204
+ rich_help_panel="Advanced",
1205
+ ),
1206
+ ] = False,
1199
1207
  mid_merge_criterion: Annotated[
1200
1208
  str,
1201
1209
  Option(
@@ -1389,6 +1397,7 @@ def _multiround(
1389
1397
  midsection_threshold_change=mid_threshold_change,
1390
1398
  tolerance=tolerance,
1391
1399
  # Advanced
1400
+ sort_fps=sort_fps,
1392
1401
  save_tree=save_tree,
1393
1402
  save_centroids=save_centroids,
1394
1403
  bin_size=bin_size,
@@ -1529,6 +1538,13 @@ def _fps_from_smiles(
1529
1538
  ),
1530
1539
  ),
1531
1540
  ] = False,
1541
+ tab_separated: Annotated[
1542
+ bool,
1543
+ Option(
1544
+ "--tab-sep/--no-tab-sep",
1545
+ help="Whether the smiles file has the format <smiles><tab><field><tab>...",
1546
+ ),
1547
+ ] = False,
1532
1548
  ) -> None:
1533
1549
  r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
1534
1550
 
@@ -1634,7 +1650,9 @@ def _fps_from_smiles(
1634
1650
  with mp_context.Pool(processes=num_ps) as pool:
1635
1651
  pool.map(
1636
1652
  create_fp_file,
1637
- _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
1653
+ _iter_idxs_and_smiles_batches(
1654
+ smiles_paths, num_per_batch, tab_separated
1655
+ ),
1638
1656
  )
1639
1657
  timer.end_timing("total", console, indent=False)
1640
1658
  stem = out_name.split(".")[0]
@@ -1674,7 +1692,9 @@ def _fps_from_smiles(
1674
1692
  with mp_context.Pool(processes=num_ps) as pool:
1675
1693
  pool.starmap(
1676
1694
  fps_array_filler,
1677
- _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
1695
+ _iter_ranges_and_smiles_batches(
1696
+ smiles_paths, num_per_batch, tab_separated
1697
+ ),
1678
1698
  )
1679
1699
  fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
1680
1700
  mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
@@ -1851,3 +1871,33 @@ def _merge_fps(
1851
1871
  return
1852
1872
  np.save(out_dir / stem, np.concatenate(arrays))
1853
1873
  console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
1874
+
1875
+
1876
+ @app.command("fps-sort", rich_help_panel="Fingerprints")
1877
+ def _sort_fps(
1878
+ in_file: Annotated[
1879
+ Path,
1880
+ Argument(help="`*.npy` file with packed fingerprints"),
1881
+ ],
1882
+ out_dir: Annotated[
1883
+ Path | None,
1884
+ Option("-o", "--out-dir", show_default=False),
1885
+ ] = None,
1886
+ seed: Annotated[
1887
+ int | None,
1888
+ Option("--seed", hidden=True, rich_help_panel="Debug"),
1889
+ ] = None,
1890
+ ) -> None:
1891
+ import numpy as np
1892
+ from bblean._py_similarity import _popcount
1893
+
1894
+ fps = np.load(in_file)
1895
+ stem = in_file.stem
1896
+ counts = _popcount(fps)
1897
+ sort_idxs = np.argsort(counts)
1898
+ fps = fps[sort_idxs]
1899
+ if out_dir is None:
1900
+ out_dir = Path.cwd()
1901
+ out_dir.mkdir(exist_ok=True)
1902
+ out_dir = out_dir.resolve()
1903
+ np.save(out_dir / f"sorted-{stem}.npy", fps)
@@ -300,6 +300,75 @@ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
300
300
  return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
301
301
  }
302
302
 
303
+ // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
304
+ // array is uint8_t** if the array is uint64 already, it is slower
305
+ template <typename T>
306
+ py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
307
+ if (arr.ndim() != 2) {
308
+ throw std::runtime_error("Input array must be 2-dimensional");
309
+ }
310
+ auto arr_ptr = arr.data();
311
+ auto out = py::array_t<uint64_t>(arr.shape(1));
312
+ auto out_ptr = out.mutable_data();
313
+ std::memset(out_ptr, 0, out.nbytes());
314
+ py::ssize_t n_samples = arr.shape(0);
315
+ py::ssize_t n_features = arr.shape(1);
316
+ // Check GCC / CLang vectorize this
317
+ for (py::ssize_t i = 0; i < n_samples; ++i) {
318
+ const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
319
+ for (py::ssize_t j = 0; j < n_features; ++j) {
320
+ out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
321
+ }
322
+ }
323
+ return out;
324
+ }
325
+ py::array_t<double> _nochecks_jt_compl_isim_unpacked_u8(
326
+ const py::array_t<uint8_t, py::array::c_style>& fps) {
327
+ py::ssize_t n_objects = fps.shape(0);
328
+ py::ssize_t n_features = fps.shape(1);
329
+ auto out = py::array_t<double>(n_objects);
330
+ auto out_ptr = out.mutable_data();
331
+
332
+ if (n_objects < 3) {
333
+ PyErr_WarnEx(PyExc_RuntimeWarning,
334
+ "Invalid num fps in compl_isim. Expected n_objects >= 3",
335
+ 1);
336
+ for (py::ssize_t i{0}; i != n_objects; ++i) {
337
+ out_ptr[i] = std::numeric_limits<double>::quiet_NaN();
338
+ }
339
+ return out;
340
+ }
341
+
342
+ auto linear_sum = add_rows<uint8_t>(fps);
343
+ auto ls_cptr = linear_sum.data();
344
+
345
+ py::array_t<uint64_t> shifted_linear_sum(n_features);
346
+ auto shifted_ls_ptr = shifted_linear_sum.mutable_data();
347
+
348
+ auto in_cptr = fps.data();
349
+ for (py::ssize_t i{0}; i != n_objects; ++i) {
350
+ for (py::ssize_t j{0}; j != n_features; ++j) {
351
+ shifted_ls_ptr[j] = ls_cptr[j] - in_cptr[i * n_features + j];
352
+ }
353
+ // For all compl isim N is n_objects - 1
354
+ out_ptr[i] = jt_isim_from_sum(shifted_linear_sum, n_objects - 1);
355
+ }
356
+ return out;
357
+ }
358
+
359
+ py::array_t<double> jt_compl_isim(
360
+ const CArrayForcecast<uint8_t>& fps, bool input_is_packed = true,
361
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
362
+ if (fps.ndim() != 2) {
363
+ throw std::runtime_error("fps arr must be 2D");
364
+ }
365
+ if (input_is_packed) {
366
+ return _nochecks_jt_compl_isim_unpacked_u8(
367
+ _nochecks_unpack_fingerprints_2d(fps, n_features_opt));
368
+ }
369
+ return _nochecks_jt_compl_isim_unpacked_u8(fps);
370
+ }
371
+
303
372
  // Contraint: T must be uint64_t or uint8_t
304
373
  template <typename T>
305
374
  void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,
@@ -372,33 +441,10 @@ py::array_t<double> jt_sim_packed_precalc_cardinalities(
372
441
  }
373
442
 
374
443
  py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
375
- const py::array_t<uint8_t>& vec) {
444
+ const py::array_t<uint8_t>& vec) {
376
445
  return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
377
446
  }
378
447
 
379
- // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
380
- // array is uint8_t** if the array is uint64 already, it is slower
381
- template <typename T>
382
- py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
383
- if (arr.ndim() != 2) {
384
- throw std::runtime_error("Input array must be 2-dimensional");
385
- }
386
- auto arr_ptr = arr.data();
387
- auto out = py::array_t<uint64_t>(arr.shape(1));
388
- auto out_ptr = out.mutable_data();
389
- std::memset(out_ptr, 0, out.nbytes());
390
- py::ssize_t n_samples = arr.shape(0);
391
- py::ssize_t n_features = arr.shape(1);
392
- // Check GCC / CLang vectorize this
393
- for (py::ssize_t i = 0; i < n_samples; ++i) {
394
- const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
395
- for (py::ssize_t j = 0; j < n_features; ++j) {
396
- out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
397
- }
398
- }
399
- return out;
400
- }
401
-
402
448
  double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
403
449
  return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
404
450
  }
@@ -406,8 +452,9 @@ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
406
452
  double jt_isim_packed_u8(
407
453
  const CArrayForcecast<uint8_t>& arr,
408
454
  std::optional<py::ssize_t> n_features_opt = std::nullopt) {
409
- return jt_isim_from_sum(add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
410
- arr.shape(0));
455
+ return jt_isim_from_sum(
456
+ add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
457
+ arr.shape(0));
411
458
  }
412
459
 
413
460
  py::tuple jt_most_dissimilar_packed(
@@ -510,6 +557,10 @@ PYBIND11_MODULE(_cpp_similarity, m) {
510
557
  m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
511
558
  "iSIM Tanimoto calculation", py::arg("arr"));
512
559
 
560
+ m.def("jt_compl_isim", &jt_compl_isim, "Complementary iSIM tanimoto",
561
+ py::arg("fps"), py::arg("input_is_packed") = true,
562
+ py::arg("n_features") = std::nullopt);
563
+
513
564
  m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
514
565
  "Tanimoto similarity between a matrix of packed fps and a single "
515
566
  "packed fp",
@@ -115,7 +115,11 @@ def _get_generator(kind: str, n_features: int) -> tp.Any:
115
115
  return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_features)
116
116
  elif kind == "ecfp6":
117
117
  return rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_features)
118
- raise ValueError(f"Unknonw kind {kind}. Should be one of 'rdkit|ecfp4|ecfp6'")
118
+ elif kind == "topological":
119
+ return rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_features)
120
+ elif kind == "ap":
121
+ return rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_features)
122
+ raise ValueError(f"Unknown kind {kind}. Use 'rdkit|ecfp4|ecfp6|topological|ap'")
119
123
 
120
124
 
121
125
  def _get_sanitize_flags(sanitize: str) -> tp.Any:
@@ -65,6 +65,7 @@ from bblean._config import DEFAULTS
65
65
  from bblean.utils import batched
66
66
  from bblean.bitbirch import BitBirch
67
67
  from bblean.fingerprints import _get_fps_file_num
68
+ from bblean._py_similarity import _popcount
68
69
 
69
70
  __all__ = ["run_multiround_bitbirch"]
70
71
 
@@ -157,6 +158,7 @@ class _InitialRound:
157
158
  max_fps: int | None = None,
158
159
  merge_criterion: str = DEFAULTS.merge_criterion,
159
160
  input_is_packed: bool = True,
161
+ sort_fps: bool = False,
160
162
  ) -> None:
161
163
  self.n_features = n_features
162
164
  self.refinement_before_midsection = refinement_before_midsection
@@ -171,6 +173,7 @@ class _InitialRound:
171
173
  self.refine_merge_criterion = refine_merge_criterion
172
174
  self.input_is_packed = input_is_packed
173
175
  self.refine_threshold_change = refine_threshold_change
176
+ self._sort_fps = sort_fps
174
177
 
175
178
  def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
176
179
  file_label, fp_file, start_idx, end_idx = file_info
@@ -182,6 +185,14 @@ class _InitialRound:
182
185
  threshold=self.threshold,
183
186
  merge_criterion=self.merge_criterion,
184
187
  )
188
+ if self._sort_fps:
189
+ fp_input = np.load(fp_file)
190
+ counts = _popcount(fp_input)
191
+ sort_idxs = np.argsort(counts)
192
+ fp_input = fp_input[sort_idxs]
193
+ else:
194
+ fp_input = fp_file
195
+
185
196
  range_ = range(start_idx, end_idx)
186
197
  tree.fit(
187
198
  fp_input,
@@ -201,7 +212,7 @@ class _InitialRound:
201
212
  # Finish the first refinement step internally in this round
202
213
  tree.reset()
203
214
  tree.set_merge(
204
- self.refine_merge_criterion,
215
+ merge_criterion=self.refine_merge_criterion,
205
216
  tolerance=self.tolerance,
206
217
  threshold=self.threshold + self.refine_threshold_change,
207
218
  )
@@ -225,7 +236,7 @@ class _TreeMergingRound:
225
236
  round_idx: int,
226
237
  out_dir: Path | str,
227
238
  split_largest_cluster: bool,
228
- criterion: str,
239
+ merge_criterion: str,
229
240
  all_fp_paths: tp.Sequence[Path] = (),
230
241
  ) -> None:
231
242
  self.all_fp_paths = list(all_fp_paths)
@@ -235,14 +246,14 @@ class _TreeMergingRound:
235
246
  self.round_idx = round_idx
236
247
  self.out_dir = Path(out_dir)
237
248
  self.split_largest_cluster = split_largest_cluster
238
- self.criterion = criterion
249
+ self.merge_criterion = merge_criterion
239
250
 
240
251
  def __call__(self, batch_info: tuple[str, tp.Sequence[tuple[Path, Path]]]) -> None:
241
252
  batch_label, batch_path_pairs = batch_info
242
253
  tree = BitBirch(
243
254
  branching_factor=self.branching_factor,
244
255
  threshold=self.threshold,
245
- merge_criterion=self.criterion,
256
+ merge_criterion=self.merge_criterion,
246
257
  tolerance=self.tolerance,
247
258
  )
248
259
  # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -270,13 +281,20 @@ class _FinalTreeMergingRound(_TreeMergingRound):
270
281
  branching_factor: int,
271
282
  threshold: float,
272
283
  tolerance: float,
273
- criterion: str,
284
+ merge_criterion: str,
274
285
  out_dir: Path | str,
275
286
  save_tree: bool,
276
287
  save_centroids: bool,
277
288
  ) -> None:
278
289
  super().__init__(
279
- branching_factor, threshold, tolerance, -1, out_dir, False, criterion, ()
290
+ branching_factor,
291
+ threshold,
292
+ tolerance,
293
+ -1,
294
+ out_dir,
295
+ False,
296
+ merge_criterion,
297
+ (),
280
298
  )
281
299
  self.save_tree = save_tree
282
300
  self.save_centroids = save_centroids
@@ -286,7 +304,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
286
304
  tree = BitBirch(
287
305
  branching_factor=self.branching_factor,
288
306
  threshold=self.threshold,
289
- merge_criterion=self.criterion,
307
+ merge_criterion=self.merge_criterion,
290
308
  tolerance=self.tolerance,
291
309
  )
292
310
  # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -299,7 +317,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
299
317
  # Save clusters and exit
300
318
  if self.save_tree:
301
319
  # TODO: Find alternative solution
302
- tree.save_pickle(self.out_dir / "bitbirch.pkl")
320
+ tree.save(self.out_dir / "bitbirch.pkl")
303
321
  tree.delete_internal_nodes()
304
322
  if self.save_centroids:
305
323
  output = tree.get_centroids_mol_ids()
@@ -353,6 +371,7 @@ def run_multiround_bitbirch(
353
371
  mp_context: tp.Any = None,
354
372
  save_tree: bool = False,
355
373
  save_centroids: bool = True,
374
+ sort_fps: bool = False,
356
375
  # Debug
357
376
  max_fps: int | None = None,
358
377
  verbose: bool = False,
@@ -399,6 +418,7 @@ def run_multiround_bitbirch(
399
418
  console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
400
419
 
401
420
  initial_fn = _InitialRound(
421
+ sort_fps=sort_fps,
402
422
  n_features=n_features,
403
423
  refinement_before_midsection=refinement_before_midsection,
404
424
  max_fps=max_fps,
@@ -436,7 +456,7 @@ def run_multiround_bitbirch(
436
456
  round_idx=round_idx,
437
457
  all_fp_paths=input_files,
438
458
  split_largest_cluster=split_largest_after_each_midsection_round,
439
- criterion=midsection_merge_criterion,
459
+ merge_criterion=midsection_merge_criterion,
440
460
  threshold=threshold + midsection_threshold_change,
441
461
  **common_kwargs,
442
462
  )
@@ -464,7 +484,7 @@ def run_multiround_bitbirch(
464
484
  final_fn = _FinalTreeMergingRound(
465
485
  save_tree=save_tree,
466
486
  save_centroids=save_centroids,
467
- criterion=final_merge_criterion,
487
+ merge_criterion=final_merge_criterion,
468
488
  threshold=threshold + midsection_threshold_change,
469
489
  **common_kwargs,
470
490
  )
@@ -34,12 +34,8 @@ __all__ = [
34
34
  "jt_sim_matrix_packed",
35
35
  ]
36
36
 
37
- from bblean._py_similarity import (
38
- centroid_from_sum,
39
- centroid,
40
- jt_compl_isim,
41
- jt_isim_medoid,
42
- )
37
+ from bblean._py_similarity import centroid_from_sum, centroid
38
+ from bblean.fingerprints import pack_fingerprints, unpack_fingerprints
43
39
 
44
40
  # jt_isim_packed and jt_isim_unpacked are not exposed, only used within functions for
45
41
  # speed
@@ -49,6 +45,7 @@ if os.getenv("BITBIRCH_NO_EXTENSIONS"):
49
45
  jt_isim_from_sum,
50
46
  jt_isim_unpacked,
51
47
  jt_isim_packed,
48
+ jt_compl_isim,
52
49
  _jt_sim_arr_vec_packed,
53
50
  jt_most_dissimilar_packed,
54
51
  )
@@ -56,11 +53,13 @@ else:
56
53
  try:
57
54
  from bblean._cpp_similarity import ( # type: ignore
58
55
  jt_isim_from_sum,
59
- _jt_sim_arr_vec_packed,
60
56
  jt_isim_unpacked_u8,
61
57
  jt_isim_packed_u8,
58
+ jt_compl_isim, # TODO: Does it need wrappers for non-uint8?
59
+ _jt_sim_arr_vec_packed,
62
60
  jt_most_dissimilar_packed,
63
- unpack_fingerprints,
61
+ # Needed for wrappers
62
+ unpack_fingerprints as _unpack_fingerprints,
64
63
  )
65
64
 
66
65
  # Wrap these two since doing
@@ -80,7 +79,7 @@ else:
80
79
  if arr.dtype == np.uint64:
81
80
  return jt_isim_from_sum(
82
81
  np.sum(
83
- unpack_fingerprints(arr, n_features), # type: ignore
82
+ _unpack_fingerprints(arr, n_features), # type: ignore
84
83
  axis=0,
85
84
  dtype=np.uint64,
86
85
  ),
@@ -93,6 +92,7 @@ else:
93
92
  jt_isim_from_sum,
94
93
  jt_isim_unpacked,
95
94
  jt_isim_packed,
95
+ jt_compl_isim,
96
96
  _jt_sim_arr_vec_packed,
97
97
  jt_most_dissimilar_packed,
98
98
  )
@@ -103,6 +103,35 @@ else:
103
103
  )
104
104
 
105
105
 
106
+ def jt_isim_medoid(
107
+ fps: NDArray[np.uint8],
108
+ input_is_packed: bool = True,
109
+ n_features: int | None = None,
110
+ pack: bool = True,
111
+ ) -> tuple[int, NDArray[np.uint8]]:
112
+ r"""Calculate the (Tanimoto) medoid of a set of fingerprints, using iSIM
113
+
114
+ Returns both the index of the medoid in the input array and the medoid itself
115
+
116
+ .. note::
117
+ Returns the first (or only) fingerprint for arrays of size 2 and 1, respectively.
118
+ Raises ValueError for arrays of size 0
119
+
120
+ """
121
+ if not fps.size:
122
+ raise ValueError("Size of fingerprints set must be > 0")
123
+ if input_is_packed:
124
+ fps = unpack_fingerprints(fps, n_features)
125
+ if len(fps) < 3:
126
+ idx = 0 # Medoid undefined for sets of fewer than 3 fingerprints
127
+ else:
128
+ idx = np.argmin(jt_compl_isim(fps, input_is_packed=False)).item()
129
+ m = fps[idx]
130
+ if pack:
131
+ return idx, pack_fingerprints(m)
132
+ return idx, m
133
+
134
+
106
135
  def jt_isim(
107
136
  fps: NDArray[np.integer],
108
137
  input_is_packed: bool = True,
@@ -149,7 +178,11 @@ def jt_isim_diameter(
149
178
  r"""Calculate the Tanimoto diameter of a set of fingerprints"""
150
179
  return jt_isim_diameter_from_sum(
151
180
  np.sum(
152
- unpack_fingerprints(arr, n_features) if input_is_packed else arr,
181
+ (
182
+ unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
183
+ if input_is_packed
184
+ else arr
185
+ ),
153
186
  axis=0,
154
187
  dtype=np.uint64,
155
188
  ), # type: ignore
@@ -165,7 +198,11 @@ def jt_isim_radius(
165
198
  r"""Calculate the Tanimoto radius of a set of fingerprints"""
166
199
  return jt_isim_radius_from_sum(
167
200
  np.sum(
168
- unpack_fingerprints(arr, n_features) if input_is_packed else arr,
201
+ (
202
+ unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
203
+ if input_is_packed
204
+ else arr
205
+ ),
169
206
  axis=0,
170
207
  dtype=np.uint64,
171
208
  ), # type: ignore
@@ -181,7 +218,11 @@ def jt_isim_radius_compl(
181
218
  r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
182
219
  return jt_isim_radius_compl_from_sum(
183
220
  np.sum(
184
- unpack_fingerprints(arr, n_features) if input_is_packed else arr,
221
+ (
222
+ unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
223
+ if input_is_packed
224
+ else arr
225
+ ),
185
226
  axis=0,
186
227
  dtype=np.uint64,
187
228
  ), # type: ignore
@@ -252,14 +293,28 @@ def estimate_jt_std(
252
293
  n_samples: int | None = None,
253
294
  input_is_packed: bool = True,
254
295
  n_features: int | None = None,
296
+ min_samples: int = 1_000_000,
255
297
  ) -> float:
256
- r"""Estimate std of tanimoto sim using a deterministic sample"""
298
+ r"""Estimate the std of all pairwise Tanimoto.
299
+
300
+ Returns
301
+ -------
302
+ std : float
303
+ The standard deviation of all pairwise Tanimoto among the sampled fingerprints.
304
+ """
257
305
  num_fps = len(fps)
306
+ if num_fps > min_samples:
307
+ np.random.seed(42)
308
+ random_choices = np.random.choice(num_fps, size=min_samples, replace=False)
309
+ fps = fps[random_choices]
310
+ num_fps = len(fps)
258
311
  if n_samples is None:
259
- n_samples = max(num_fps // 1000, 50)
312
+ # Heuristic: use at least 50 samples, or 1 per 10,000 fingerprints,
313
+ # to balance statistical representativeness and computational efficiency
314
+ n_samples = max(num_fps // 10_000, 50)
260
315
  sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
261
316
 
262
- # Work with sample from now on
317
+ # Work with only the sampled fingerprints
263
318
  fps = fps[sample_idxs]
264
319
  num_fps = len(fps)
265
320
  pairs = np.empty(num_fps * (num_fps - 1) // 2, dtype=np.float64)