PyPI - bblean - Versions diffs - 0.8.1__cp313-cp313-win_amd64.whl → 0.8.2__cp313-cp313-win_amd64.whl - Mend

bblean 0.8.1__cp313-cp313-win_amd64.whl → 0.8.2__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

bblean/_cpp_similarity.cp313-win_amd64.pyd +0 -0
bblean/_merges.py +44 -0
bblean/_version.py +3 -3
bblean/bitbirch.py +60 -0
bblean/cli.py +49 -0
bblean/similarity.py +9 -5
{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/METADATA +1 -1
{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/RECORD +12 -12
{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/WHEEL +1 -1
{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/entry_points.txt +0 -0
{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/licenses/LICENSE +0 -0
{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/top_level.txt +0 -0

bblean/_cpp_similarity.cp313-win_amd64.pyd CHANGED Viewed

Binary file

bblean/_merges.py CHANGED Viewed

@@ -69,6 +69,48 @@ class DiameterMerge(MergeAcceptFunction):
         return jt_isim_from_sum(new_ls, new_n) >= threshold
+class FlexibleToleranceDiameterMerge(MergeAcceptFunction):
+    name = "flexible-tolerance-diameter"
+    # NOTE: Equivalent to tolerance-diameter but uses min(old_dc, threshold) as the
+    # criteria
+    def __init__(
+        self,
+        tolerance: float = 0.05,
+        n_max: int = 1000,
+        decay: float = 1e-3,
+        adaptive: bool = True,
+    ) -> None:
+        self.tolerance = tolerance
+        self.decay = decay
+        self.offset = np.exp(-decay * n_max)
+        if not adaptive:
+            self.decay = 0.0
+            self.offset = 0.0
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        new_dc = jt_isim_from_sum(new_ls, new_n)
+        if new_dc < threshold:
+            return False
+        if old_n == 1:
+            return True
+        old_dc = jt_isim_from_sum(old_ls, old_n)
+        tol = max(self.tolerance * (np.exp(-self.decay * old_n) - self.offset), 0.0)
+        return new_dc >= min(old_dc, threshold) - tol
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.tolerance})"
 class ToleranceDiameterMerge(MergeAcceptFunction):
     name = "tolerance-diameter"
     # NOTE: The reliability of the estimate of the cluster should be a function of the
@@ -202,6 +244,8 @@ def get_merge_accept_fn(
         return ToleranceMerge(tolerance)
     elif merge_criterion == "tolerance-diameter":
         return ToleranceDiameterMerge(tolerance)
+    elif merge_criterion == "flexible-tolerance-diameter":
+        return FlexibleToleranceDiameterMerge(tolerance)
     elif merge_criterion == "tolerance-radius":
         return ToleranceRadiusMerge(tolerance)
     elif merge_criterion == "never-merge":

bblean/_version.py CHANGED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.8.1'
-__version_tuple__ = version_tuple = (0, 8, 1)
+__version__ = version = '0.8.2'
+__version_tuple__ = version_tuple = (0, 8, 2)
-__commit_id__ = commit_id = 'g5aa6bced2'
+__commit_id__ = commit_id = 'g83842e3cd'

bblean/bitbirch.py CHANGED Viewed

@@ -75,6 +75,8 @@ from bblean.similarity import (
     jt_most_dissimilar_packed,
     jt_isim_medoid,
     centroid_from_sum,
+    estimate_jt_std,
+    jt_isim,
 )
 if os.getenv("BITBIRCH_NO_EXTENSIONS"):
@@ -90,6 +92,64 @@ else:
 __all__ = ["BitBirch"]
+@tp.overload
+def guess_threshold(
+    fps: NDArray[np.uint8],
+    input_is_packed: bool = True,
+    n_features: int | None = None,
+    max_samples: int = 1_000_000,
+    factor: float = 3.0,
+    return_mean_std: tp.Literal[False] = False,
+) -> float:
+    pass
+@tp.overload
+def guess_threshold(
+    fps: NDArray[np.uint8],
+    input_is_packed: bool = True,
+    n_features: int | None = None,
+    max_samples: int = 1_000_000,
+    factor: float = 3.0,
+    return_mean_std: tp.Literal[True] = True,
+) -> tuple[float, float, float]:
+    pass
+def guess_threshold(
+    fps: NDArray[np.uint8],
+    input_is_packed: bool = True,
+    n_features: int | None = None,
+    max_samples: int = 1_000_000,
+    factor: float = 3.0,
+    return_mean_std: bool = False,
+) -> float | tuple[float, float, float]:
+    r""":meta private:
+    Guess the optimal bitbirch threshold
+    Uses the heuristic mean_tanimoto + 3.0 * std_tanimoto
+    """
+    num_fps = len(fps)
+    if num_fps > max_samples:
+        rng = np.random.default_rng(42)
+        random_choices = rng.choice(num_fps, size=max_samples, replace=False)
+        fps = fps[random_choices]
+        num_fps = len(fps)
+    mean = jt_isim(fps, input_is_packed, n_features)
+    if num_fps <= 50:
+        n_samples = num_fps
+    else:
+        n_samples = max(5 * np.sqrt(num_fps), 50)
+    std = estimate_jt_std(
+        fps, input_is_packed=input_is_packed, n_features=n_features, n_samples=n_samples
+    )
+    thresh = mean + factor * std
+    if return_mean_std:
+        return thresh, mean, std
+    return thresh
 # For backwards compatibility with the global "set_merge", keep weak references to all
 # the BitBirch instances and update them when set_merge is called
 _BITBIRCH_INSTANCES: WeakSet["BitBirch"] = WeakSet()

bblean/cli.py CHANGED Viewed

@@ -1,5 +1,6 @@
 r"""Command line interface entrypoints"""
+import numpy as np
 import warnings
 import random
 import typing as tp
@@ -930,6 +931,54 @@ def _plot_summary(
         )
+@app.command("thresh")
+def _guess_threshold(
+    ctx: Context,
+    input_: Annotated[
+        Path,
+        Argument(help="`*.npy` file with fingerprints"),
+    ],
+    factor: Annotated[
+        float,
+        Option("-f", "--factor"),
+    ] = 3.0,
+    n_features: Annotated[
+        int | None,
+        Option(
+            "--n-features",
+            help="Number of features in the fingerprints."
+            " It must be provided for packed inputs *if it is not a multiple of 8*."
+            " For typical fingerprint sizes (e.g. 2048, 1024), it is not required",
+            rich_help_panel="Advanced",
+        ),
+    ] = None,
+    input_is_packed: Annotated[
+        bool,
+        Option(
+            "--packed-input/--unpacked-input",
+            help="Toggle whether the input consists on packed or unpacked fingerprints",
+            rich_help_panel="Advanced",
+        ),
+    ] = True,
+    max_samples: Annotated[
+        int,
+        Option("-m", "--max-samples"),
+    ] = 1_000_000,
+) -> None:
+    r"""Estimate the optimal BitBirch threshold for a fingerprints file"""
+    from bblean.bitbirch import guess_threshold
+    from bblean._console import get_console
+    console = get_console()
+    fps = np.load(input_)
+    thresh, mean, std = guess_threshold(
+        fps, input_is_packed, n_features, max_samples, factor, return_mean_std=True
+    )
+    console.print(f"Estimated average similarity: {mean:.4f}")
+    console.print(f"Estimated similarity deviation: {std:.4f}")
+    console.print(f"Estimated optimal threshold: {thresh:.4f}")
 @app.command("run")
 def _run(
     ctx: Context,

bblean/similarity.py CHANGED Viewed

@@ -293,7 +293,7 @@ def estimate_jt_std(
     n_samples: int | None = None,
     input_is_packed: bool = True,
     n_features: int | None = None,
-    min_samples: int = 1_000_000,
+    max_samples: int = 1_000_000,
 ) -> float:
     r"""Estimate the std of all pairwise Tanimoto.
@@ -303,15 +303,19 @@ def estimate_jt_std(
         The standard deviation of all pairwise Tanimoto among the sampled fingerprints.
     """
     num_fps = len(fps)
-    if num_fps > min_samples:
-        np.random.seed(42)
-        random_choices = np.random.choice(num_fps, size=min_samples, replace=False)
+    if num_fps > max_samples:
+        rng = np.random.default_rng(42)
+        random_choices = rng.choice(num_fps, size=max_samples, replace=False)
         fps = fps[random_choices]
         num_fps = len(fps)
     if n_samples is None:
         # Heuristic: use at least 50 samples, or 1 per 10,000 fingerprints,
         # to balance statistical representativeness and computational efficiency
-        n_samples = max(num_fps // 10_000, 50)
+        # TODO: This heuristic is broken, too few samples until 500k
+        if num_fps <= 500_000:
+            n_samples = 50
+        else:
+            n_samples = num_fps // 10_000
     sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
     # Work with only the sampled fingerprints

{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bblean
-Version: 0.8.1
+Version: 0.8.2
 Summary: BitBirch-Lean Python package
 Author: The Miranda-Quintana Lab and other BitBirch developers
 Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>

{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/RECORD RENAMED Viewed

@@ -1,20 +1,20 @@
 bblean/__init__.py,sha256=9cudBHEt0H5p0jKEvgrhLZIHPSzwNAx0uJRp-_iM32I,686
 bblean/_config.py,sha256=WaONZilOWCLFdZulqWLKRqNM-ZLhY0YCXfwk-84FYmQ,1813
 bblean/_console.py,sha256=Mk1hi1NdPw2HDmjWj1LLbCuV3vCxL5l6u2gXaEeOFBM,8021
-bblean/_cpp_similarity.cp313-win_amd64.pyd,sha256=5lMeakYQaYsOtx9iqFaVmFY2ihrwAIUyvU5tlzdAYb0,182272
+bblean/_cpp_similarity.cp313-win_amd64.pyd,sha256=aSYT0QK8vUOafpYdGmkznvg0Xg2nslMU5rfSgxgvgnQ,182272
 bblean/_memory.py,sha256=eycXzXV_O_VEyIKpAv3QpbxtpB5WkBLChzm_e2Dqaw0,6892
-bblean/_merges.py,sha256=xwFMJUPJ9VMujf2nSROx0NhsPoQ_R84KIxBF81x2hks,6432
+bblean/_merges.py,sha256=jcukDaE0G-0UtVF_427-17gS8vyjN0F_gx6mwWv0wmo,7831
 bblean/_py_similarity.py,sha256=VYWu7gVCEDjNaRLgxiCxCGjCfmTity86UPC0dfT83Ok,9633
 bblean/_timer.py,sha256=D1-_tTQFJqIQgzl4HSE__-P3Scw72EIVlNDaChJT8Qs,1402
-bblean/_version.py,sha256=htK7xKc7g_IbAV_cXMJZjpQJZZby8nFR-IRwkjsF5YA,746
+bblean/_version.py,sha256=wqA_6I8sCkXGJHVuQ9EfxYbaHviAFhfz9h7Mm_kZQLM,746
 bblean/analysis.py,sha256=apD5OgSoNGbIuBLSJFFzlUkVjZHBtb3fVEeEUJGbyqc,8118
-bblean/bitbirch.py,sha256=0zaClnIn9Pp5h2cpI17zAg1NbEr0aVMnywHI1ZfWcF8,60517
-bblean/cli.py,sha256=S_y0sY5M5fFj--DfvC4I04Bzs7OAxyEoidt6jjgeavQ,74199
+bblean/bitbirch.py,sha256=dgxoMUb_g6eTMLHhTq5n9vxAF8BgCgHRx-ggqTxHuTM,62164
+bblean/cli.py,sha256=AamdYqxzqsiJ7WPK-z5dCSr8CmGSRfNbXq0_LeloU_E,75819
 bblean/fingerprints.py,sha256=Dz_exFq9CzkFbQvaswIqWloA83Ac_ZBahiVbVrlOFtc,20049
 bblean/metrics.py,sha256=4KB-PIQJtFMsNg7lG2uM1HEId_eR5vhqcdLpCVLuI5Y,7280
 bblean/multiround.py,sha256=5VAACXTQfLxgl6UefVpF2tQo0ifFG3ehq1_ELjoMt5k,19862
 bblean/plotting.py,sha256=B2Kpw_HuKx1KxuKXI83IIWPQVsd-uJyDSu47a6mhzwE,15956
-bblean/similarity.py,sha256=O2OTW5Dw64go177jwzF5skvDSJEzDS7UImyIQ2nShig,12192
+bblean/similarity.py,sha256=Ih-DkzERdd5pUKJKsgJ2pBmIPhVo57zSvRfnzqYTzsY,12339
 bblean/sklearn.py,sha256=KK7rbF3gENjlv5-9uOvH-Q0LEW1RUY__xClcnLznuE0,7450
 bblean/smiles.py,sha256=ppCqAbYUElnv5NeLRgU0aaJBBGczH9j9BYEWlzNjb-g,3213
 bblean/utils.py,sha256=K0ttSPf54nxrKD1TwbLFuwDIRlAD0jdr6KnuTqXs-HQ,3836
@@ -23,9 +23,9 @@ bblean/_legacy/bb_int64.py,sha256=Otqxu8NBLrfOMpJoMrLgWtDP_9Hn4joQXZVkU1hjges,45
 bblean/_legacy/bb_uint8.py,sha256=8kbeVAq7MxiR8hS_6lKhSDhVWc6acjLmLzNFCR466iA,41573
 bblean/csrc/README.md,sha256=qOPPK6sTqkYgnlPWtcNu9P3PwuLH8cCNJ1FwJeewsrk,59
 bblean/csrc/similarity.cpp,sha256=q6oMg9Vd0REPmqze8xToTmeXZiEuHTmOfL6QsTRFkDE,23122
-bblean-0.8.1.dist-info/licenses/LICENSE,sha256=Dq9t2XHr5wSrykVuVo8etKsAS35ENnDobU1h7t3H_-k,2598
-bblean-0.8.1.dist-info/METADATA,sha256=VIXmMyP48rQgUQsLPPEqDhpou4S7xspQK9oW9MAehu0,13051
-bblean-0.8.1.dist-info/WHEEL,sha256=qV0EIPljj1XC_vuSatRWjn02nZIz3N1t8jsZz7HBr2U,101
-bblean-0.8.1.dist-info/entry_points.txt,sha256=a0jb2L5JFKioMD6CqbvJiI2unaArGzi-AMZsyY-uyGg,38
-bblean-0.8.1.dist-info/top_level.txt,sha256=ybxTonvTC9zR25yR5B27aEDLl6CiwID093ZyS_--Cq4,7
-bblean-0.8.1.dist-info/RECORD,,
+bblean-0.8.2.dist-info/licenses/LICENSE,sha256=Dq9t2XHr5wSrykVuVo8etKsAS35ENnDobU1h7t3H_-k,2598
+bblean-0.8.2.dist-info/METADATA,sha256=nKe-e0-WKKM2Sg67leuUCTm26Ie_8XP42C9uAf3MnCo,13051
+bblean-0.8.2.dist-info/WHEEL,sha256=-WvvtQtdhM1F5HMi-4hSXLQ_1Tg6qJRWO1HnLNr4mCU,102
+bblean-0.8.2.dist-info/entry_points.txt,sha256=a0jb2L5JFKioMD6CqbvJiI2unaArGzi-AMZsyY-uyGg,38
+bblean-0.8.2.dist-info/top_level.txt,sha256=ybxTonvTC9zR25yR5B27aEDLl6CiwID093ZyS_--Cq4,7
+bblean-0.8.2.dist-info/RECORD,,

{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: false
 Tag: cp313-cp313-win_amd64

{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{bblean-0.8.1.dist-info → bblean-0.8.2.dist-info}/top_level.txt RENAMED Viewed

File without changes