bblean 0.8.0__tar.gz → 0.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {bblean-0.8.0 → bblean-0.8.2}/PKG-INFO +1 -1
  2. {bblean-0.8.0 → bblean-0.8.2}/bblean/_merges.py +44 -0
  3. {bblean-0.8.0 → bblean-0.8.2}/bblean/_version.py +2 -2
  4. {bblean-0.8.0 → bblean-0.8.2}/bblean/bitbirch.py +60 -0
  5. {bblean-0.8.0 → bblean-0.8.2}/bblean/cli.py +49 -0
  6. {bblean-0.8.0 → bblean-0.8.2}/bblean/fingerprints.py +115 -0
  7. {bblean-0.8.0 → bblean-0.8.2}/bblean/similarity.py +9 -5
  8. {bblean-0.8.0 → bblean-0.8.2}/bblean/smiles.py +6 -4
  9. {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/PKG-INFO +1 -1
  10. {bblean-0.8.0 → bblean-0.8.2}/tests/test_simple.py +15 -1
  11. {bblean-0.8.0 → bblean-0.8.2}/.cruft.json +0 -0
  12. {bblean-0.8.0 → bblean-0.8.2}/.flake8 +0 -0
  13. {bblean-0.8.0 → bblean-0.8.2}/.github/CODEOWNERS +0 -0
  14. {bblean-0.8.0 → bblean-0.8.2}/.github/workflows/ci-cpp.yaml +0 -0
  15. {bblean-0.8.0 → bblean-0.8.2}/.github/workflows/ci.yaml +0 -0
  16. {bblean-0.8.0 → bblean-0.8.2}/.github/workflows/upload-to-pypi.yaml +0 -0
  17. {bblean-0.8.0 → bblean-0.8.2}/.gitignore +0 -0
  18. {bblean-0.8.0 → bblean-0.8.2}/.pre-commit-config.yaml +0 -0
  19. {bblean-0.8.0 → bblean-0.8.2}/LICENSE +0 -0
  20. {bblean-0.8.0 → bblean-0.8.2}/LICENSES/BSD-3-Clause.txt +0 -0
  21. {bblean-0.8.0 → bblean-0.8.2}/LICENSES/GPL-3.0-only.txt +0 -0
  22. {bblean-0.8.0 → bblean-0.8.2}/README.md +0 -0
  23. {bblean-0.8.0 → bblean-0.8.2}/bblean/__init__.py +0 -0
  24. {bblean-0.8.0 → bblean-0.8.2}/bblean/_config.py +0 -0
  25. {bblean-0.8.0 → bblean-0.8.2}/bblean/_console.py +0 -0
  26. {bblean-0.8.0 → bblean-0.8.2}/bblean/_legacy/__init__.py +0 -0
  27. {bblean-0.8.0 → bblean-0.8.2}/bblean/_legacy/bb_int64.py +0 -0
  28. {bblean-0.8.0 → bblean-0.8.2}/bblean/_legacy/bb_uint8.py +0 -0
  29. {bblean-0.8.0 → bblean-0.8.2}/bblean/_memory.py +0 -0
  30. {bblean-0.8.0 → bblean-0.8.2}/bblean/_py_similarity.py +0 -0
  31. {bblean-0.8.0 → bblean-0.8.2}/bblean/_timer.py +0 -0
  32. {bblean-0.8.0 → bblean-0.8.2}/bblean/analysis.py +0 -0
  33. {bblean-0.8.0 → bblean-0.8.2}/bblean/csrc/README.md +0 -0
  34. {bblean-0.8.0 → bblean-0.8.2}/bblean/csrc/similarity.cpp +0 -0
  35. {bblean-0.8.0 → bblean-0.8.2}/bblean/metrics.py +0 -0
  36. {bblean-0.8.0 → bblean-0.8.2}/bblean/multiround.py +0 -0
  37. {bblean-0.8.0 → bblean-0.8.2}/bblean/plotting.py +0 -0
  38. {bblean-0.8.0 → bblean-0.8.2}/bblean/sklearn.py +0 -0
  39. {bblean-0.8.0 → bblean-0.8.2}/bblean/utils.py +0 -0
  40. {bblean-0.8.0 → bblean-0.8.2}/bblean-demo-v2.gif +0 -0
  41. {bblean-0.8.0 → bblean-0.8.2}/bblean-demo.cast +0 -0
  42. {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/SOURCES.txt +0 -0
  43. {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/dependency_links.txt +0 -0
  44. {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/entry_points.txt +0 -0
  45. {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/requires.txt +0 -0
  46. {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/top_level.txt +0 -0
  47. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/api.svg +0 -0
  48. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/installing.svg +0 -0
  49. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/logo-dark-bw.svg +0 -0
  50. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/logo-light-bw.svg +0 -0
  51. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/publications.svg +0 -0
  52. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/style.css +0 -0
  53. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/user-guide.svg +0 -0
  54. {bblean-0.8.0 → bblean-0.8.2}/docs/src/_templates/module.rst +0 -0
  55. {bblean-0.8.0 → bblean-0.8.2}/docs/src/api-reference.rst +0 -0
  56. {bblean-0.8.0 → bblean-0.8.2}/docs/src/conf.py +0 -0
  57. {bblean-0.8.0 → bblean-0.8.2}/docs/src/index.rst +0 -0
  58. {bblean-0.8.0 → bblean-0.8.2}/docs/src/installing.rst +0 -0
  59. {bblean-0.8.0 → bblean-0.8.2}/docs/src/publications.rst +0 -0
  60. {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/linux_memory_setup.rst +0 -0
  61. {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -0
  62. {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/notebooks/bitbirch_quickstart.ipynb +0 -0
  63. {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/parameters.rst +0 -0
  64. {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide.rst +0 -0
  65. {bblean-0.8.0 → bblean-0.8.2}/environment.yaml +0 -0
  66. {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/best_practices_functions.py +0 -0
  67. {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/best_practices_plots.py +0 -0
  68. {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/bitbirch_best_practices.ipynb +0 -0
  69. {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +0 -0
  70. {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/bitbirch_parameter.ipynb +0 -0
  71. {bblean-0.8.0 → bblean-0.8.2}/examples/biogen_logS.csv +0 -0
  72. {bblean-0.8.0 → bblean-0.8.2}/examples/bitbirch_best_practices.ipynb +0 -0
  73. {bblean-0.8.0 → bblean-0.8.2}/examples/bitbirch_quickstart.ipynb +0 -0
  74. {bblean-0.8.0 → bblean-0.8.2}/examples/chembl-33-natural-products-subset.smi +0 -0
  75. {bblean-0.8.0 → bblean-0.8.2}/examples/dataset_splitting.ipynb +0 -0
  76. {bblean-0.8.0 → bblean-0.8.2}/pyproject.toml +0 -0
  77. {bblean-0.8.0 → bblean-0.8.2}/setup.cfg +0 -0
  78. {bblean-0.8.0 → bblean-0.8.2}/setup.py +0 -0
  79. {bblean-0.8.0 → bblean-0.8.2}/tests/chembl-sample-3k.smi +0 -0
  80. {bblean-0.8.0 → bblean-0.8.2}/tests/chembl-sample-bad.smi +0 -0
  81. {bblean-0.8.0 → bblean-0.8.2}/tests/legacy_merges.py +0 -0
  82. {bblean-0.8.0 → bblean-0.8.2}/tests/legacy_metrics.py +0 -0
  83. {bblean-0.8.0 → bblean-0.8.2}/tests/test_bb_consistency.py +0 -0
  84. {bblean-0.8.0 → bblean-0.8.2}/tests/test_cli.py +0 -0
  85. {bblean-0.8.0 → bblean-0.8.2}/tests/test_fake_fps.py +0 -0
  86. {bblean-0.8.0 → bblean-0.8.2}/tests/test_fingerprints.py +0 -0
  87. {bblean-0.8.0 → bblean-0.8.2}/tests/test_global_clustering.py +0 -0
  88. {bblean-0.8.0 → bblean-0.8.2}/tests/test_import_bblean.py +0 -0
  89. {bblean-0.8.0 → bblean-0.8.2}/tests/test_merges.py +0 -0
  90. {bblean-0.8.0 → bblean-0.8.2}/tests/test_metrics.py +0 -0
  91. {bblean-0.8.0 → bblean-0.8.2}/tests/test_multiround.py +0 -0
  92. {bblean-0.8.0 → bblean-0.8.2}/tests/test_refine.py +0 -0
  93. {bblean-0.8.0 → bblean-0.8.2}/tests/test_regression.py +0 -0
  94. {bblean-0.8.0 → bblean-0.8.2}/tests/test_sampling.py +0 -0
  95. {bblean-0.8.0 → bblean-0.8.2}/tests/test_similarity.py +0 -0
  96. {bblean-0.8.0 → bblean-0.8.2}/tests/test_sklearn.py +0 -0
  97. {bblean-0.8.0 → bblean-0.8.2}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -69,6 +69,48 @@ class DiameterMerge(MergeAcceptFunction):
69
69
  return jt_isim_from_sum(new_ls, new_n) >= threshold
70
70
 
71
71
 
72
class FlexibleToleranceDiameterMerge(MergeAcceptFunction):
    r"""Diameter-based merge criterion with a size-dependent tolerance.

    The tolerance applied in ``__call__`` is
    ``max(tolerance * (exp(-decay * old_n) - exp(-decay * n_max)), 0)``.
    With ``adaptive=False`` both the decay and the offset are set to zero, so the
    tolerance reduces to ``max(tolerance, 0)`` regardless of the cluster size.
    """

    name = "flexible-tolerance-diameter"
    # NOTE: Equivalent to tolerance-diameter but uses min(old_dc, threshold) as the
    # criterion

    def __init__(
        self,
        tolerance: float = 0.05,
        n_max: int = 1000,
        decay: float = 1e-3,
        adaptive: bool = True,
    ) -> None:
        # The offset is always evaluated first and only discarded afterwards when
        # the criterion is not adaptive
        offset = np.exp(-decay * n_max)
        self.tolerance = tolerance
        self.decay, self.offset = (decay, offset) if adaptive else (0.0, 0.0)

    def __call__(
        self,
        threshold: float,
        new_ls: NDArray[np.integer],
        new_n: int,
        old_ls: NDArray[np.integer],
        nom_ls: NDArray[np.integer],
        old_n: int,
        nom_n: int,
    ) -> bool:
        r"""Decide whether the candidate merge is accepted.

        ``nom_ls`` and ``nom_n`` are part of the common call signature but are not
        used by this criterion.
        """
        merged_dc = jt_isim_from_sum(new_ls, new_n)
        # Hard requirement: the merged cluster must reach the threshold
        if merged_dc < threshold:
            return False
        # A cluster with a single member is always allowed to merge
        if old_n == 1:
            return True
        current_dc = jt_isim_from_sum(old_ls, old_n)
        scale = np.exp(-self.decay * old_n) - self.offset
        slack = max(self.tolerance * scale, 0.0)
        reference = min(current_dc, threshold)
        # NOTE(review): at this point merged_dc >= threshold >= reference and
        # slack >= 0, so for finite values this comparison always holds. Confirm
        # whether that is the intended criterion.
        return merged_dc >= reference - slack

    def __repr__(self) -> str:
        # Only the tolerance is shown in the representation
        return f"{self.__class__.__name__}({self.tolerance})"
112
+
113
+
72
114
  class ToleranceDiameterMerge(MergeAcceptFunction):
73
115
  name = "tolerance-diameter"
74
116
  # NOTE: The reliability of the estimate of the cluster should be a function of the
@@ -202,6 +244,8 @@ def get_merge_accept_fn(
202
244
  return ToleranceMerge(tolerance)
203
245
  elif merge_criterion == "tolerance-diameter":
204
246
  return ToleranceDiameterMerge(tolerance)
247
+ elif merge_criterion == "flexible-tolerance-diameter":
248
+ return FlexibleToleranceDiameterMerge(tolerance)
205
249
  elif merge_criterion == "tolerance-radius":
206
250
  return ToleranceRadiusMerge(tolerance)
207
251
  elif merge_criterion == "never-merge":
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.8.0'
32
- __version_tuple__ = version_tuple = (0, 8, 0)
31
+ __version__ = version = '0.8.2'
32
+ __version_tuple__ = version_tuple = (0, 8, 2)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -75,6 +75,8 @@ from bblean.similarity import (
75
75
  jt_most_dissimilar_packed,
76
76
  jt_isim_medoid,
77
77
  centroid_from_sum,
78
+ estimate_jt_std,
79
+ jt_isim,
78
80
  )
79
81
 
80
82
  if os.getenv("BITBIRCH_NO_EXTENSIONS"):
@@ -90,6 +92,64 @@ else:
90
92
  __all__ = ["BitBirch"]
91
93
 
92
94
 
95
@tp.overload
def guess_threshold(
    fps: NDArray[np.uint8],
    input_is_packed: bool = True,
    n_features: int | None = None,
    max_samples: int = 1_000_000,
    factor: float = 3.0,
    return_mean_std: tp.Literal[False] = False,
) -> float:
    pass


@tp.overload
def guess_threshold(
    fps: NDArray[np.uint8],
    input_is_packed: bool = True,
    n_features: int | None = None,
    max_samples: int = 1_000_000,
    factor: float = 3.0,
    return_mean_std: tp.Literal[True] = True,
) -> tuple[float, float, float]:
    pass


def guess_threshold(
    fps: NDArray[np.uint8],
    input_is_packed: bool = True,
    n_features: int | None = None,
    max_samples: int = 1_000_000,
    factor: float = 3.0,
    return_mean_std: bool = False,
) -> float | tuple[float, float, float]:
    r""":meta private:

    Guess the optimal bitbirch threshold

    Uses the heuristic ``mean_tanimoto + factor * std_tanimoto`` (``factor`` defaults
    to 3.0). The mean is obtained with ``jt_isim`` and the standard deviation with
    ``estimate_jt_std``.

    If more than ``max_samples`` fingerprints are passed, a random subset of
    ``max_samples`` of them (fixed seed, without replacement) is used instead.

    Returns the threshold, or the tuple ``(threshold, mean, std)`` if
    ``return_mean_std`` is ``True``.
    """
    num_fps = len(fps)
    if num_fps > max_samples:
        # Fixed seed so that repeated calls on the same input agree
        rng = np.random.default_rng(42)
        random_choices = rng.choice(num_fps, size=max_samples, replace=False)
        fps = fps[random_choices]
        num_fps = len(fps)
    mean = jt_isim(fps, input_is_packed, n_features)
    if num_fps <= 50:
        n_samples = num_fps
    else:
        # The square root is a float, but n_samples is a count: truncate it to an
        # int before handing it over to estimate_jt_std
        n_samples = max(int(5 * np.sqrt(num_fps)), 50)
    # fps holds at most max_samples rows here; forwarding the same cap prevents
    # estimate_jt_std from sub-sampling a second time with its own default
    std = estimate_jt_std(
        fps,
        input_is_packed=input_is_packed,
        n_features=n_features,
        n_samples=n_samples,
        max_samples=max_samples,
    )
    thresh = mean + factor * std
    if return_mean_std:
        return thresh, mean, std
    return thresh
151
+
152
+
93
153
  # For backwards compatibility with the global "set_merge", keep weak references to all
94
154
  # the BitBirch instances and update them when set_merge is called
95
155
  _BITBIRCH_INSTANCES: WeakSet["BitBirch"] = WeakSet()
@@ -1,5 +1,6 @@
1
1
  r"""Command line interface entrypoints"""
2
2
 
3
+ import numpy as np
3
4
  import warnings
4
5
  import random
5
6
  import typing as tp
@@ -930,6 +931,54 @@ def _plot_summary(
930
931
  )
931
932
 
932
933
 
934
@app.command("thresh")
def _guess_threshold(
    ctx: Context,
    input_: Annotated[
        Path,
        Argument(help="`*.npy` file with fingerprints"),
    ],
    factor: Annotated[
        float,
        Option("-f", "--factor"),
    ] = 3.0,
    n_features: Annotated[
        int | None,
        Option(
            "--n-features",
            help="Number of features in the fingerprints."
            " It must be provided for packed inputs *if it is not a multiple of 8*."
            " For typical fingerprint sizes (e.g. 2048, 1024), it is not required",
            rich_help_panel="Advanced",
        ),
    ] = None,
    input_is_packed: Annotated[
        bool,
        Option(
            "--packed-input/--unpacked-input",
            help="Toggle whether the input consists on packed or unpacked fingerprints",
            rich_help_panel="Advanced",
        ),
    ] = True,
    max_samples: Annotated[
        int,
        Option("-m", "--max-samples"),
    ] = 1_000_000,
) -> None:
    r"""Estimate the optimal BitBirch threshold for a fingerprints file"""
    # Imports are local to the command, they are only resolved when it is invoked
    from bblean.bitbirch import guess_threshold
    from bblean._console import get_console

    out = get_console()
    fingerprints = np.load(input_)
    # Ask for the mean and the deviation too, since all three values are reported
    estimate = guess_threshold(
        fingerprints,
        input_is_packed=input_is_packed,
        n_features=n_features,
        max_samples=max_samples,
        factor=factor,
        return_mean_std=True,
    )
    thresh, mean, std = estimate
    out.print(f"Estimated average similarity: {mean:.4f}")
    out.print(f"Estimated similarity deviation: {std:.4f}")
    out.print(f"Estimated optimal threshold: {thresh:.4f}")
980
+
981
+
933
982
  @app.command("run")
934
983
  def _run(
935
984
  ctx: Context,
@@ -1,11 +1,15 @@
1
1
  r"""Utilites for manipulating fingerprints and fingerprint files"""
2
2
 
3
+ import sys
4
+ import math
5
+ import weakref
3
6
  import warnings
4
7
  import dataclasses
5
8
  from pathlib import Path
6
9
  from numpy.typing import NDArray, DTypeLike
7
10
  import numpy as np
8
11
  import typing as tp
12
+ import multiprocessing as mp
9
13
  import multiprocessing.shared_memory as shmem
10
14
 
11
15
  from rich.console import Console
@@ -13,6 +17,8 @@ from rdkit.Chem import rdFingerprintGenerator, MolFromSmiles, SanitizeFlags, San
13
17
 
14
18
  from bblean._config import DEFAULTS
15
19
  from bblean._console import get_console
20
+ from bblean.smiles import _iter_ranges_and_smiles_batches
21
+ from bblean.utils import _num_avail_cpus
16
22
 
17
23
  __all__ = [
18
24
  "make_fake_fingerprints",
@@ -441,3 +447,112 @@ class _FingerprintArrayFiller:
441
447
  fps[i, :] = fp
442
448
  fps_shmem.close()
443
449
  invalid_mask_shmem.close()
450
+
451
+
452
def _release_shared_memory(*blocks: shmem.SharedMemory) -> None:
    r"""Close and unlink every shared memory block passed in"""
    for block in blocks:
        block.close()
        block.unlink()


@tp.overload
def fps_from_smiles_parallel(
    smiles: tp.Iterable[str],
    kind: str = DEFAULTS.fp_kind,
    n_features: int = DEFAULTS.n_features,
    dtype: DTypeLike = np.uint8,
    sanitize: str = "all",
    skip_invalid: tp.Literal[False] = False,
    pack: bool = True,
    num_ps: int | None = None,
    replace_dummy_atoms: bool = False,
    tab_separated: bool = False,
    mp_context: tp.Any = None,
) -> NDArray[np.uint8]:
    pass


@tp.overload
def fps_from_smiles_parallel(
    smiles: tp.Iterable[str],
    kind: str = DEFAULTS.fp_kind,
    n_features: int = DEFAULTS.n_features,
    dtype: DTypeLike = np.uint8,
    sanitize: str = "all",
    skip_invalid: tp.Literal[True] = True,
    pack: bool = True,
    num_ps: int | None = None,
    replace_dummy_atoms: bool = False,
    tab_separated: bool = False,
    mp_context: tp.Any = None,
) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
    pass


# NOTE: This function is proof of concept and kinda dangerous since, when
# skip_invalid=False, it registers a custom destructor for the returned numpy array
# It is also *only usable if called inside an if __name__ == "__main__" guard*
# For now lets hide it
def fps_from_smiles_parallel(
    smiles: tp.Iterable[str],
    kind: str = DEFAULTS.fp_kind,
    n_features: int = DEFAULTS.n_features,
    dtype: DTypeLike = np.uint8,
    sanitize: str = "all",
    skip_invalid: bool = False,
    pack: bool = True,
    num_ps: int | None = None,
    replace_dummy_atoms: bool = False,
    tab_separated: bool = False,
    mp_context: tp.Any = None,
) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
    r""":meta private:

    Compute fingerprints for a collection of SMILES using a pool of processes.

    The SMILES are split in ``num_ps`` batches, and each batch is handed to a
    ``_FingerprintArrayFiller`` running in a worker process, which writes into
    shared memory blocks created here. ``num_ps=None`` uses ``_num_avail_cpus()``.
    ``mp_context=None`` uses the ``"forkserver"`` context on linux and the platform
    default elsewhere.

    If ``skip_invalid`` is ``False`` the fingerprints array is returned. It is backed
    by shared memory, which is released by finalizers attached to the array.

    If ``skip_invalid`` is ``True`` the tuple ``(fps, mask)`` is returned. ``fps`` has
    the rows flagged in ``mask`` removed, and ``mask`` is a boolean array with one
    entry per input SMILES. Both are plain copies, the shared memory is released
    before returning.

    An empty ``smiles`` input returns empty arrays without spawning any process.
    """
    if mp_context is None:
        mp_context = mp.get_context("forkserver" if sys.platform == "linux" else None)
    if isinstance(smiles, str):
        smiles = [smiles]
    smiles = list(smiles)
    smiles_num = len(smiles)
    if num_ps is None:
        num_ps = _num_avail_cpus()

    if pack:
        out_dim = (n_features + 7) // 8
    else:
        out_dim = n_features

    # Zero-sized shared memory blocks can't be created, so bail out early
    if smiles_num == 0:
        empty_fps = np.empty((0, out_dim), dtype=dtype)
        if skip_invalid:
            return empty_fps, np.empty((0,), dtype=np.bool_)
        return empty_fps

    shmem_size = smiles_num * out_dim * np.dtype(dtype).itemsize
    fps_shmem = shmem.SharedMemory(create=True, size=shmem_size)
    try:
        invalid_mask_shmem = shmem.SharedMemory(create=True, size=smiles_num)
    except BaseException:
        _release_shared_memory(fps_shmem)
        raise

    try:
        fps_array_filler = _FingerprintArrayFiller(
            shmem_name=fps_shmem.name,
            invalid_mask_shmem_name=invalid_mask_shmem.name,
            kind=kind,
            fp_size=n_features,
            num_smiles=smiles_num,
            dtype=np.dtype(dtype).name,
            pack=pack,
            sanitize=sanitize,
            skip_invalid=skip_invalid,
        )
        num_per_batch = math.ceil(smiles_num / num_ps)
        with mp_context.Pool(processes=num_ps) as pool:
            pool.starmap(
                fps_array_filler,
                _iter_ranges_and_smiles_batches(
                    smiles,
                    num_per_batch,
                    tab_separated,
                    replace_dummy_atoms,
                    assume_paths=False,
                ),
            )
    except BaseException:
        # Don't leave the named blocks behind if the workers fail. No array views
        # exist yet in this process, so the blocks can be closed right away
        _release_shared_memory(fps_shmem, invalid_mask_shmem)
        raise

    fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
    # np.bool_ is available in all numpy versions, np.bool is missing from 1.24-1.26
    mask = np.ndarray((smiles_num,), dtype=np.bool_, buffer=invalid_mask_shmem.buf)
    if skip_invalid:
        # np.delete returns a copy, so nothing that is returned references the
        # shared memory and both blocks can be released deterministically
        valid_fps = np.delete(fps, mask, axis=0)
        invalid_mask = mask.copy()
        # The views must be dropped before the blocks are closed
        del fps, mask
        _release_shared_memory(fps_shmem, invalid_mask_shmem)
        return valid_fps, invalid_mask

    del mask
    _release_shared_memory(invalid_mask_shmem)
    # The returned array is a view of the shared memory, which can only be released
    # once the array itself is garbage collected
    weakref.finalize(fps, fps_shmem.close)
    weakref.finalize(fps, fps_shmem.unlink)
    return fps
@@ -293,7 +293,7 @@ def estimate_jt_std(
293
293
  n_samples: int | None = None,
294
294
  input_is_packed: bool = True,
295
295
  n_features: int | None = None,
296
- min_samples: int = 1_000_000,
296
+ max_samples: int = 1_000_000,
297
297
  ) -> float:
298
298
  r"""Estimate the std of all pairwise Tanimoto.
299
299
 
@@ -303,15 +303,19 @@ def estimate_jt_std(
303
303
  The standard deviation of all pairwise Tanimoto among the sampled fingerprints.
304
304
  """
305
305
  num_fps = len(fps)
306
- if num_fps > min_samples:
307
- np.random.seed(42)
308
- random_choices = np.random.choice(num_fps, size=min_samples, replace=False)
306
+ if num_fps > max_samples:
307
+ rng = np.random.default_rng(42)
308
+ random_choices = rng.choice(num_fps, size=max_samples, replace=False)
309
309
  fps = fps[random_choices]
310
310
  num_fps = len(fps)
311
311
  if n_samples is None:
312
312
  # Heuristic: use at least 50 samples, or 1 per 10,000 fingerprints,
313
313
  # to balance statistical representativeness and computational efficiency
314
- n_samples = max(num_fps // 10_000, 50)
314
+ # TODO: This heuristic is broken, too few samples until 500k
315
+ if num_fps <= 500_000:
316
+ n_samples = 50
317
+ else:
318
+ n_samples = num_fps // 10_000
315
319
  sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
316
320
 
317
321
  # Work with only the sampled fingerprints
@@ -71,12 +71,14 @@ def _iter_ranges_and_smiles_batches(
71
71
  num_per_batch: int,
72
72
  tab_separated: bool = False,
73
73
  replace_dummy_atoms: bool = False,
74
+ assume_paths: bool = True,
74
75
  ) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
76
+ if assume_paths:
77
+ it = iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms)
78
+ else:
79
+ it = tp.cast(tp.Iterator[str], smiles_paths)
75
80
  start_idx = 0
76
- for batch in batched(
77
- iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms),
78
- num_per_batch,
79
- ):
81
+ for batch in batched(it, num_per_batch):
80
82
  size = len(batch)
81
83
  end_idx = start_idx + size
82
84
  yield (start_idx, end_idx), batch
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -2,7 +2,7 @@ import itertools
2
2
  import pytest
3
3
  import numpy as np
4
4
 
5
- from bblean.bitbirch import BitBirch
5
+ from bblean.bitbirch import BitBirch, guess_threshold
6
6
  from bblean.fingerprints import pack_fingerprints, make_fake_fingerprints
7
7
 
8
8
  from inline_snapshot import snapshot
@@ -42,6 +42,20 @@ def test_bb_cluster_simple_repeated_fps() -> None:
42
42
  assert ids == [list(range(repeats))]
43
43
 
44
44
 
45
def test_guess_threhsold() -> None:
    # Expected (exclusive) bounds for the guessed threshold, keyed by the number of
    # features of the fake fingerprints
    expected_bounds = {8: (0.9, 1.0), 2048: (0.4, 0.6)}
    for num_features, (lower, upper) in expected_bounds.items():
        fps = make_fake_fingerprints(
            100, n_features=num_features, seed=12620509540149709235, pack=True
        )
        thresh = guess_threshold(fps, return_mean_std=False)
        assert lower < thresh < upper
57
+
58
+
45
59
  def test_bb_cluster_3_fps() -> None:
46
60
  fps = make_fake_fingerprints(3, n_features=8, seed=12620509540149709235, pack=True)
47
61
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes