bblean 0.6.0b2__tar.gz → 0.7.2b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/upload-to-pypi.yaml +6 -4
  2. {bblean-0.6.0b2 → bblean-0.7.2b0}/PKG-INFO +3 -2
  3. {bblean-0.6.0b2 → bblean-0.7.2b0}/README.md +2 -1
  4. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/bb_int64.py +2 -0
  5. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_py_similarity.py +1 -9
  6. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_version.py +2 -2
  7. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/bitbirch.py +42 -6
  8. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/cli.py +68 -15
  9. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/csrc/similarity.cpp +77 -26
  10. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/fingerprints.py +5 -1
  11. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/multiround.py +31 -16
  12. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/plotting.py +7 -0
  13. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/similarity.py +70 -15
  14. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/sklearn.py +1 -2
  15. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/smiles.py +20 -5
  16. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/PKG-INFO +3 -2
  17. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/SOURCES.txt +5 -2
  18. {bblean-0.6.0b2/examples → bblean-0.7.2b0/docs/src/user-guide/notebooks}/bitbirch_quickstart.ipynb +1 -1
  19. bblean-0.7.2b0/examples/best_practices/best_practices_functions.py +188 -0
  20. bblean-0.7.2b0/examples/best_practices/best_practices_plots.py +465 -0
  21. bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices.ipynb +601 -0
  22. bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +571 -0
  23. bblean-0.7.2b0/examples/best_practices/bitbirch_parameter.ipynb +1755 -0
  24. {bblean-0.6.0b2/docs/src/user-guide/notebooks → bblean-0.7.2b0/examples}/bitbirch_quickstart.ipynb +1 -1
  25. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_cli.py +60 -5
  26. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_global_clustering.py +5 -25
  27. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_similarity.py +12 -1
  28. bblean-0.6.0b2/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -526
  29. bblean-0.6.0b2/examples/bitbirch_best_practices.ipynb +0 -526
  30. {bblean-0.6.0b2 → bblean-0.7.2b0}/.cruft.json +0 -0
  31. {bblean-0.6.0b2 → bblean-0.7.2b0}/.flake8 +0 -0
  32. {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/CODEOWNERS +0 -0
  33. {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/ci-cpp.yaml +0 -0
  34. {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/ci.yaml +0 -0
  35. {bblean-0.6.0b2 → bblean-0.7.2b0}/.gitignore +0 -0
  36. {bblean-0.6.0b2 → bblean-0.7.2b0}/.pre-commit-config.yaml +0 -0
  37. {bblean-0.6.0b2 → bblean-0.7.2b0}/LICENSE +0 -0
  38. {bblean-0.6.0b2 → bblean-0.7.2b0}/LICENSES/BSD-3-Clause.txt +0 -0
  39. {bblean-0.6.0b2 → bblean-0.7.2b0}/LICENSES/GPL-3.0-only.txt +0 -0
  40. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/__init__.py +0 -0
  41. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_config.py +0 -0
  42. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_console.py +0 -0
  43. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/__init__.py +0 -0
  44. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/bb_uint8.py +0 -0
  45. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_memory.py +0 -0
  46. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_merges.py +0 -0
  47. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_timer.py +0 -0
  48. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/analysis.py +0 -0
  49. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/csrc/README.md +0 -0
  50. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/metrics.py +0 -0
  51. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/utils.py +0 -0
  52. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean-demo-v2.gif +0 -0
  53. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean-demo.cast +0 -0
  54. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/dependency_links.txt +0 -0
  55. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/entry_points.txt +0 -0
  56. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/requires.txt +0 -0
  57. {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/top_level.txt +0 -0
  58. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/api.svg +0 -0
  59. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/installing.svg +0 -0
  60. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/logo-dark-bw.svg +0 -0
  61. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/logo-light-bw.svg +0 -0
  62. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/publications.svg +0 -0
  63. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/style.css +0 -0
  64. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/user-guide.svg +0 -0
  65. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_templates/module.rst +0 -0
  66. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/api-reference.rst +0 -0
  67. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/conf.py +0 -0
  68. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/index.rst +0 -0
  69. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/installing.rst +0 -0
  70. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/publications.rst +0 -0
  71. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/user-guide/linux_memory_setup.rst +0 -0
  72. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/user-guide/parameters.rst +0 -0
  73. {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/user-guide.rst +0 -0
  74. {bblean-0.6.0b2 → bblean-0.7.2b0}/environment.yaml +0 -0
  75. {bblean-0.6.0b2 → bblean-0.7.2b0}/examples/biogen_logS.csv +0 -0
  76. {bblean-0.6.0b2 → bblean-0.7.2b0}/examples/chembl-33-natural-products-subset.smi +0 -0
  77. {bblean-0.6.0b2 → bblean-0.7.2b0}/examples/dataset_splitting.ipynb +0 -0
  78. {bblean-0.6.0b2 → bblean-0.7.2b0}/pyproject.toml +0 -0
  79. {bblean-0.6.0b2 → bblean-0.7.2b0}/setup.cfg +0 -0
  80. {bblean-0.6.0b2 → bblean-0.7.2b0}/setup.py +0 -0
  81. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/chembl-sample-3k.smi +0 -0
  82. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/chembl-sample-bad.smi +0 -0
  83. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/legacy_merges.py +0 -0
  84. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/legacy_metrics.py +0 -0
  85. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_bb_consistency.py +0 -0
  86. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_fake_fps.py +0 -0
  87. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_fingerprints.py +0 -0
  88. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_import_bblean.py +0 -0
  89. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_merges.py +0 -0
  90. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_metrics.py +0 -0
  91. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_multiround.py +0 -0
  92. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_refine.py +0 -0
  93. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_regression.py +0 -0
  94. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_sampling.py +0 -0
  95. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_simple.py +0 -0
  96. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_sklearn.py +0 -0
  97. {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_utils.py +0 -0
@@ -15,10 +15,12 @@ on:
15
15
  required: false
16
16
  default: false
17
17
  type: boolean
18
+ release:
19
+ types: [published]
18
20
 
19
21
  env:
20
22
  PYTHON_VERSION: '3.11'
21
- SETUPTOOLS_SCM_PRETEND_VERSION: ${{ github.event.inputs.version }}
23
+ SETUPTOOLS_SCM_PRETEND_VERSION: ${{ github.event_name == 'release' && github.event.release.tag_name || github.event.inputs.version }}
22
24
  # cibuildwheel configuration:
23
25
  # Skip py 3.14, 32 bit and musllinux (Alpine) wheels
24
26
  CIBW_SKIP: "cp314-* cp314t-* *-manylinux_i686 *-win32 *-musllinux_*"
@@ -29,7 +31,7 @@ env:
29
31
  # Build wheels that support both aarch64 and x86_64 on macOS
30
32
  CIBW_ARCHS_MACOS: "universal2"
31
33
  CIBW_BUILD_VERBOSITY: 3
32
-
34
+ PIP_ONLY_BINARY: "llvmlite,numba"
33
35
  jobs:
34
36
  make_sdist:
35
37
  name: make-source-distribution
@@ -93,7 +95,7 @@ jobs:
93
95
  publish_to_testpypi:
94
96
  needs: [build_wheels, make_sdist]
95
97
  runs-on: ubuntu-latest
96
- if: ${{ github.event.inputs.upload-testpypi == 'true' }}
98
+ if: ${{ github.event_name != 'release' && github.event.inputs.upload-testpypi }}
97
99
  environment:
98
100
  name: testpypi
99
101
  url: https://test.pypi.org/p/bblean
@@ -115,7 +117,7 @@ jobs:
115
117
  publish_to_pypi:
116
118
  needs: [build_wheels, make_sdist]
117
119
  runs-on: ubuntu-latest
118
- if: ${{ github.event.inputs.upload-pypi == 'true' }}
120
+ if: ${{ github.event_name == 'release' || github.event.inputs.upload-pypi }}
119
121
  environment:
120
122
  name: pypi
121
123
  url: https://pypi.org/p/bblean
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.6.0b2
3
+ Version: 0.7.2b0
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -90,6 +90,7 @@ macOS via pip, which automatically includes C++ extensions:
90
90
 
91
91
  ```bash
92
92
  pip install bblean
93
+ # Alternatively you can use 'uv pip install'
93
94
  bb --help
94
95
  ```
95
96
 
@@ -235,7 +236,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
235
236
  tree.fit(fps)
236
237
 
237
238
  # Refine the tree (if needed)
238
- tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
239
+ tree.set_merge("tolerance-diameter", tolerance=0.0)
239
240
  tree.refine_inplace(fps)
240
241
 
241
242
  # Visualize the results
@@ -47,6 +47,7 @@ macOS via pip, which automatically includes C++ extensions:
47
47
 
48
48
  ```bash
49
49
  pip install bblean
50
+ # Alternatively you can use 'uv pip install'
50
51
  bb --help
51
52
  ```
52
53
 
@@ -192,7 +193,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
192
193
  tree.fit(fps)
193
194
 
194
195
  # Refine the tree (if needed)
195
- tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
196
+ tree.set_merge("tolerance-diameter", tolerance=0.0)
196
197
  tree.refine_inplace(fps)
197
198
 
198
199
  # Visualize the results
@@ -633,6 +633,7 @@ class BitBirch:
633
633
  X = X[:max_fps]
634
634
  threshold = self.threshold
635
635
  branching_factor = self.branching_factor
636
+
636
637
  n_features = _validate_n_features(X, input_is_packed, n_features)
637
638
  d_type = X.dtype
638
639
 
@@ -718,6 +719,7 @@ class BitBirch:
718
719
  """
719
720
  threshold = self.threshold
720
721
  branching_factor = self.branching_factor
722
+
721
723
  n_features = _validate_n_features(X, input_is_packed, n_features)
722
724
  d_type = X.dtype
723
725
 
@@ -76,18 +76,10 @@ def jt_compl_isim(
76
76
  warnings.warn(msg, RuntimeWarning, stacklevel=2)
77
77
  return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
78
78
  linear_sum = np.sum(fps, axis=0)
79
- n_objects = len(fps) - 1
80
79
  comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
81
-
82
80
  return np.array(comp_sims, dtype=np.float64)
83
81
 
84
82
 
85
- def _jt_isim_medoid_index(
86
- fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
87
- ) -> int:
88
- return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
89
-
90
-
91
83
  def jt_isim_medoid(
92
84
  fps: NDArray[np.uint8],
93
85
  input_is_packed: bool = True,
@@ -110,7 +102,7 @@ def jt_isim_medoid(
110
102
  if len(fps) < 3:
111
103
  idx = 0 # Medoid undefined for sets of 3 or more fingerprints
112
104
  else:
113
- idx = _jt_isim_medoid_index(fps, input_is_packed=False)
105
+ idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
114
106
  m = fps[idx]
115
107
  if pack:
116
108
  return idx, pack_fingerprints(m)
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.6.0b2'
32
- __version_tuple__ = version_tuple = (0, 6, 0, 'b2')
31
+ __version__ = version = '0.7.2.b0'
32
+ __version_tuple__ = version_tuple = (0, 7, 2, 'b0')
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -47,6 +47,8 @@
47
47
  # ./LICENSES/GPL-3.0-only.txt. If not, see <http://www.gnu.org/licenses/gpl-3.0.html>.
48
48
  r"""BitBirch 'Lean' class for fast, memory-efficient O(N) clustering"""
49
49
  from __future__ import annotations # Stringize type annotations for no runtime overhead
50
+ import pickle
51
+ import sys
50
52
  import typing_extensions as tpx
51
53
  import os
52
54
  import random
@@ -646,7 +648,7 @@ class BitBirch:
646
648
 
647
649
  @merge_criterion.setter
648
650
  def merge_criterion(self, value: str) -> None:
649
- self.set_merge(criterion=value)
651
+ self.set_merge(merge_criterion=value)
650
652
 
651
653
  @property
652
654
  def tolerance(self) -> float | None:
@@ -671,7 +673,7 @@ class BitBirch:
671
673
 
672
674
  def set_merge(
673
675
  self,
674
- criterion: str | MergeAcceptFunction | None = None,
676
+ merge_criterion: str | MergeAcceptFunction | None = None,
675
677
  *,
676
678
  tolerance: float | None = None,
677
679
  threshold: float | None = None,
@@ -687,10 +689,10 @@ class BitBirch:
687
689
  "the global set_merge() function has *not* been used"
688
690
  )
689
691
  _tolerance = 0.05 if tolerance is None else tolerance
690
- if isinstance(criterion, MergeAcceptFunction):
691
- self._merge_accept_fn = criterion
692
- elif isinstance(criterion, str):
693
- self._merge_accept_fn = get_merge_accept_fn(criterion, _tolerance)
692
+ if isinstance(merge_criterion, MergeAcceptFunction):
693
+ self._merge_accept_fn = merge_criterion
694
+ elif isinstance(merge_criterion, str):
695
+ self._merge_accept_fn = get_merge_accept_fn(merge_criterion, _tolerance)
694
696
  if hasattr(self._merge_accept_fn, "tolerance"):
695
697
  self._merge_accept_fn.tolerance = _tolerance
696
698
  elif tolerance is not None:
@@ -1316,6 +1318,40 @@ class BitBirch:
1316
1318
  parts.append(f"tolerance={self.tolerance}")
1317
1319
  return f"{self.__class__.__name__}({', '.join(parts)})"
1318
1320
 
1321
+ def save(self, path: Path | str) -> None:
1322
+ r""":meta private:"""
1323
+ # TODO: BitBIRCH is highly recursive. pickling may crash python,
1324
+ # an alternative solution would be better
1325
+ msg = (
1326
+ "Saving large BitBIRCH trees may result in large memory peaks."
1327
+ " An alternative serialization method may be implemented in the future"
1328
+ )
1329
+ warnings.warn(msg)
1330
+ _old_limit = sys.getrecursionlimit()
1331
+ sys.setrecursionlimit(1_000_000_000)
1332
+ with open(path, mode="wb") as f:
1333
+ pickle.dump(self, f)
1334
+ sys.setrecursionlimit(_old_limit)
1335
+
1336
+ @classmethod
1337
+ def load(cls, path: Path | str) -> tpx.Self:
1338
+ r""":meta private:"""
1339
+ # TODO: BitBIRCH is highly recursive. pickling may crash python,
1340
+ # an alternative solution would be better
1341
+ msg = (
1342
+ "Loading large BitBIRCH trees may result in large memory peaks."
1343
+ " An alternative serialization method may be implemented in the future"
1344
+ )
1345
+ warnings.warn(msg)
1346
+ _old_limit = sys.getrecursionlimit()
1347
+ sys.setrecursionlimit(1_000_000_000)
1348
+ with open(path, mode="rb") as f:
1349
+ tree = pickle.load(f)
1350
+ sys.setrecursionlimit(_old_limit)
1351
+ if not isinstance(tree, cls):
1352
+ raise ValueError("Path does not contain a bitbirch object")
1353
+ return tree
1354
+
1319
1355
  def global_clustering(
1320
1356
  self,
1321
1357
  n_clusters: int,
@@ -1096,26 +1096,29 @@ def _run(
1096
1096
 
1097
1097
  timer.end_timing("total", console, indent=False)
1098
1098
  console.print_peak_mem(out_dir, indent=False)
1099
+ if save_tree:
1100
+ if variant != "lean":
1101
+ console.print("Can't save tree for non-lean variants", style="red")
1102
+ else:
1103
+ # TODO: Find alternative solution
1104
+ tree.save(out_dir / "bitbirch.pkl")
1099
1105
  if variant == "lean":
1100
- if save_tree:
1101
- # TODO: BitBIRCH is highly recursive. pickling may crash python,
1102
- # an alternative solution would be better
1103
- _old_limit = sys.getrecursionlimit()
1104
- sys.setrecursionlimit(100_000)
1105
- with open(out_dir / "bitbirch.pkl", mode="wb") as f:
1106
- pickle.dump(tree, f)
1107
- sys.setrecursionlimit(_old_limit)
1108
1106
  tree.delete_internal_nodes()
1109
- # Dump outputs (peak memory, timings, config, cluster ids)
1110
- if save_centroids:
1107
+ # Dump outputs (peak memory, timings, config, cluster ids)
1108
+ if save_centroids:
1109
+ if variant != "lean":
1110
+ console.print("Can't save centroids for non-lean variants", style="red")
1111
+ with open(out_dir / "clusters.pkl", mode="wb") as f:
1112
+ pickle.dump(tree.get_cluster_mol_ids(), f)
1113
+ else:
1111
1114
  output = tree.get_centroids_mol_ids()
1112
1115
  with open(out_dir / "clusters.pkl", mode="wb") as f:
1113
1116
  pickle.dump(output["mol_ids"], f)
1114
1117
  with open(out_dir / "cluster-centroids-packed.pkl", mode="wb") as f:
1115
1118
  pickle.dump(output["centroids"], f)
1116
- else:
1117
- with open(out_dir / "clusters.pkl", mode="wb") as f:
1118
- pickle.dump(tree.get_cluster_mol_ids(), f)
1119
+ else:
1120
+ with open(out_dir / "clusters.pkl", mode="wb") as f:
1121
+ pickle.dump(tree.get_cluster_mol_ids(), f)
1119
1122
 
1120
1123
  collect_system_specs_and_dump_config(ctx.params)
1121
1124
  timer.dump(out_dir / "timings.json")
@@ -1193,6 +1196,14 @@ def _multiround(
1193
1196
  bool,
1194
1197
  Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
1195
1198
  ] = True,
1199
+ sort_fps: Annotated[
1200
+ bool,
1201
+ Option(
1202
+ "--sort-fps/--no-sort-fps",
1203
+ help="Sort the fingerprints by popcount before launching the initial round",
1204
+ rich_help_panel="Advanced",
1205
+ ),
1206
+ ] = False,
1196
1207
  mid_merge_criterion: Annotated[
1197
1208
  str,
1198
1209
  Option(
@@ -1386,6 +1397,7 @@ def _multiround(
1386
1397
  midsection_threshold_change=mid_threshold_change,
1387
1398
  tolerance=tolerance,
1388
1399
  # Advanced
1400
+ sort_fps=sort_fps,
1389
1401
  save_tree=save_tree,
1390
1402
  save_centroids=save_centroids,
1391
1403
  bin_size=bin_size,
@@ -1526,6 +1538,13 @@ def _fps_from_smiles(
1526
1538
  ),
1527
1539
  ),
1528
1540
  ] = False,
1541
+ tab_separated: Annotated[
1542
+ bool,
1543
+ Option(
1544
+ "--tab-sep/--no-tab-sep",
1545
+ help="Whether the smiles file has the format <smiles><tab><field><tab>...",
1546
+ ),
1547
+ ] = False,
1529
1548
  ) -> None:
1530
1549
  r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
1531
1550
 
@@ -1631,7 +1650,9 @@ def _fps_from_smiles(
1631
1650
  with mp_context.Pool(processes=num_ps) as pool:
1632
1651
  pool.map(
1633
1652
  create_fp_file,
1634
- _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
1653
+ _iter_idxs_and_smiles_batches(
1654
+ smiles_paths, num_per_batch, tab_separated
1655
+ ),
1635
1656
  )
1636
1657
  timer.end_timing("total", console, indent=False)
1637
1658
  stem = out_name.split(".")[0]
@@ -1671,7 +1692,9 @@ def _fps_from_smiles(
1671
1692
  with mp_context.Pool(processes=num_ps) as pool:
1672
1693
  pool.starmap(
1673
1694
  fps_array_filler,
1674
- _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
1695
+ _iter_ranges_and_smiles_batches(
1696
+ smiles_paths, num_per_batch, tab_separated
1697
+ ),
1675
1698
  )
1676
1699
  fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
1677
1700
  mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
@@ -1848,3 +1871,33 @@ def _merge_fps(
1848
1871
  return
1849
1872
  np.save(out_dir / stem, np.concatenate(arrays))
1850
1873
  console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
1874
+
1875
+
1876
+ @app.command("fps-sort", rich_help_panel="Fingerprints")
1877
+ def _sort_fps(
1878
+ in_file: Annotated[
1879
+ Path,
1880
+ Argument(help="`*.npy` file with packed fingerprints"),
1881
+ ],
1882
+ out_dir: Annotated[
1883
+ Path | None,
1884
+ Option("-o", "--out-dir", show_default=False),
1885
+ ] = None,
1886
+ seed: Annotated[
1887
+ int | None,
1888
+ Option("--seed", hidden=True, rich_help_panel="Debug"),
1889
+ ] = None,
1890
+ ) -> None:
1891
+ import numpy as np
1892
+ from bblean._py_similarity import _popcount
1893
+
1894
+ fps = np.load(in_file)
1895
+ stem = in_file.stem
1896
+ counts = _popcount(fps)
1897
+ sort_idxs = np.argsort(counts)
1898
+ fps = fps[sort_idxs]
1899
+ if out_dir is None:
1900
+ out_dir = Path.cwd()
1901
+ out_dir.mkdir(exist_ok=True)
1902
+ out_dir = out_dir.resolve()
1903
+ np.save(out_dir / f"sorted-{stem}.npy", fps)
@@ -300,6 +300,75 @@ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
300
300
  return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
301
301
  }
302
302
 
303
+ // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
304
+ // array is uint8_t** if the array is uint64 already, it is slower
305
+ template <typename T>
306
+ py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
307
+ if (arr.ndim() != 2) {
308
+ throw std::runtime_error("Input array must be 2-dimensional");
309
+ }
310
+ auto arr_ptr = arr.data();
311
+ auto out = py::array_t<uint64_t>(arr.shape(1));
312
+ auto out_ptr = out.mutable_data();
313
+ std::memset(out_ptr, 0, out.nbytes());
314
+ py::ssize_t n_samples = arr.shape(0);
315
+ py::ssize_t n_features = arr.shape(1);
316
+ // Check GCC / CLang vectorize this
317
+ for (py::ssize_t i = 0; i < n_samples; ++i) {
318
+ const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
319
+ for (py::ssize_t j = 0; j < n_features; ++j) {
320
+ out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
321
+ }
322
+ }
323
+ return out;
324
+ }
325
+ py::array_t<double> _nochecks_jt_compl_isim_unpacked_u8(
326
+ const py::array_t<uint8_t, py::array::c_style>& fps) {
327
+ py::ssize_t n_objects = fps.shape(0);
328
+ py::ssize_t n_features = fps.shape(1);
329
+ auto out = py::array_t<double>(n_objects);
330
+ auto out_ptr = out.mutable_data();
331
+
332
+ if (n_objects < 3) {
333
+ PyErr_WarnEx(PyExc_RuntimeWarning,
334
+ "Invalid num fps in compl_isim. Expected n_objects >= 3",
335
+ 1);
336
+ for (py::ssize_t i{0}; i != n_objects; ++i) {
337
+ out_ptr[i] = std::numeric_limits<double>::quiet_NaN();
338
+ }
339
+ return out;
340
+ }
341
+
342
+ auto linear_sum = add_rows<uint8_t>(fps);
343
+ auto ls_cptr = linear_sum.data();
344
+
345
+ py::array_t<uint64_t> shifted_linear_sum(n_features);
346
+ auto shifted_ls_ptr = shifted_linear_sum.mutable_data();
347
+
348
+ auto in_cptr = fps.data();
349
+ for (py::ssize_t i{0}; i != n_objects; ++i) {
350
+ for (py::ssize_t j{0}; j != n_features; ++j) {
351
+ shifted_ls_ptr[j] = ls_cptr[j] - in_cptr[i * n_features + j];
352
+ }
353
+ // For all compl isim N is n_objects - 1
354
+ out_ptr[i] = jt_isim_from_sum(shifted_linear_sum, n_objects - 1);
355
+ }
356
+ return out;
357
+ }
358
+
359
+ py::array_t<double> jt_compl_isim(
360
+ const CArrayForcecast<uint8_t>& fps, bool input_is_packed = true,
361
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
362
+ if (fps.ndim() != 2) {
363
+ throw std::runtime_error("fps arr must be 2D");
364
+ }
365
+ if (input_is_packed) {
366
+ return _nochecks_jt_compl_isim_unpacked_u8(
367
+ _nochecks_unpack_fingerprints_2d(fps, n_features_opt));
368
+ }
369
+ return _nochecks_jt_compl_isim_unpacked_u8(fps);
370
+ }
371
+
303
372
  // Contraint: T must be uint64_t or uint8_t
304
373
  template <typename T>
305
374
  void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,
@@ -372,33 +441,10 @@ py::array_t<double> jt_sim_packed_precalc_cardinalities(
372
441
  }
373
442
 
374
443
  py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
375
- const py::array_t<uint8_t>& vec) {
444
+ const py::array_t<uint8_t>& vec) {
376
445
  return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
377
446
  }
378
447
 
379
- // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
380
- // array is uint8_t** if the array is uint64 already, it is slower
381
- template <typename T>
382
- py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
383
- if (arr.ndim() != 2) {
384
- throw std::runtime_error("Input array must be 2-dimensional");
385
- }
386
- auto arr_ptr = arr.data();
387
- auto out = py::array_t<uint64_t>(arr.shape(1));
388
- auto out_ptr = out.mutable_data();
389
- std::memset(out_ptr, 0, out.nbytes());
390
- py::ssize_t n_samples = arr.shape(0);
391
- py::ssize_t n_features = arr.shape(1);
392
- // Check GCC / CLang vectorize this
393
- for (py::ssize_t i = 0; i < n_samples; ++i) {
394
- const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
395
- for (py::ssize_t j = 0; j < n_features; ++j) {
396
- out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
397
- }
398
- }
399
- return out;
400
- }
401
-
402
448
  double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
403
449
  return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
404
450
  }
@@ -406,8 +452,9 @@ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
406
452
  double jt_isim_packed_u8(
407
453
  const CArrayForcecast<uint8_t>& arr,
408
454
  std::optional<py::ssize_t> n_features_opt = std::nullopt) {
409
- return jt_isim_from_sum(add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
410
- arr.shape(0));
455
+ return jt_isim_from_sum(
456
+ add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
457
+ arr.shape(0));
411
458
  }
412
459
 
413
460
  py::tuple jt_most_dissimilar_packed(
@@ -510,6 +557,10 @@ PYBIND11_MODULE(_cpp_similarity, m) {
510
557
  m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
511
558
  "iSIM Tanimoto calculation", py::arg("arr"));
512
559
 
560
+ m.def("jt_compl_isim", &jt_compl_isim, "Complementary iSIM tanimoto",
561
+ py::arg("fps"), py::arg("input_is_packed") = true,
562
+ py::arg("n_features") = std::nullopt);
563
+
513
564
  m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
514
565
  "Tanimoto similarity between a matrix of packed fps and a single "
515
566
  "packed fp",
@@ -115,7 +115,11 @@ def _get_generator(kind: str, n_features: int) -> tp.Any:
115
115
  return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_features)
116
116
  elif kind == "ecfp6":
117
117
  return rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_features)
118
- raise ValueError(f"Unknonw kind {kind}. Should be one of 'rdkit|ecfp4|ecfp6'")
118
+ elif kind == "topological":
119
+ return rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_features)
120
+ elif kind == "ap":
121
+ return rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_features)
122
+ raise ValueError(f"Unknown kind {kind}. Use 'rdkit|ecfp4|ecfp6|topological|ap'")
119
123
 
120
124
 
121
125
  def _get_sanitize_flags(sanitize: str) -> tp.Any:
@@ -65,6 +65,7 @@ from bblean._config import DEFAULTS
65
65
  from bblean.utils import batched
66
66
  from bblean.bitbirch import BitBirch
67
67
  from bblean.fingerprints import _get_fps_file_num
68
+ from bblean._py_similarity import _popcount
68
69
 
69
70
  __all__ = ["run_multiround_bitbirch"]
70
71
 
@@ -157,6 +158,7 @@ class _InitialRound:
157
158
  max_fps: int | None = None,
158
159
  merge_criterion: str = DEFAULTS.merge_criterion,
159
160
  input_is_packed: bool = True,
161
+ sort_fps: bool = False,
160
162
  ) -> None:
161
163
  self.n_features = n_features
162
164
  self.refinement_before_midsection = refinement_before_midsection
@@ -171,6 +173,7 @@ class _InitialRound:
171
173
  self.refine_merge_criterion = refine_merge_criterion
172
174
  self.input_is_packed = input_is_packed
173
175
  self.refine_threshold_change = refine_threshold_change
176
+ self._sort_fps = sort_fps
174
177
 
175
178
  def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
176
179
  file_label, fp_file, start_idx, end_idx = file_info
@@ -182,6 +185,14 @@ class _InitialRound:
182
185
  threshold=self.threshold,
183
186
  merge_criterion=self.merge_criterion,
184
187
  )
188
+ if self._sort_fps:
189
+ fp_input = np.load(fp_file)
190
+ counts = _popcount(fp_input)
191
+ sort_idxs = np.argsort(counts)
192
+ fp_input = fp_input[sort_idxs]
193
+ else:
194
+ fp_input = fp_file
195
+
185
196
  range_ = range(start_idx, end_idx)
186
197
  tree.fit(
187
198
  fp_file,
@@ -201,7 +212,7 @@ class _InitialRound:
201
212
  # Finish the first refinement step internally in this round
202
213
  tree.reset()
203
214
  tree.set_merge(
204
- self.refine_merge_criterion,
215
+ merge_criterion=self.refine_merge_criterion,
205
216
  tolerance=self.tolerance,
206
217
  threshold=self.threshold + self.refine_threshold_change,
207
218
  )
@@ -225,7 +236,7 @@ class _TreeMergingRound:
225
236
  round_idx: int,
226
237
  out_dir: Path | str,
227
238
  split_largest_cluster: bool,
228
- criterion: str,
239
+ merge_criterion: str,
229
240
  all_fp_paths: tp.Sequence[Path] = (),
230
241
  ) -> None:
231
242
  self.all_fp_paths = list(all_fp_paths)
@@ -235,14 +246,14 @@ class _TreeMergingRound:
235
246
  self.round_idx = round_idx
236
247
  self.out_dir = Path(out_dir)
237
248
  self.split_largest_cluster = split_largest_cluster
238
- self.criterion = criterion
249
+ self.merge_criterion = merge_criterion
239
250
 
240
251
  def __call__(self, batch_info: tuple[str, tp.Sequence[tuple[Path, Path]]]) -> None:
241
252
  batch_label, batch_path_pairs = batch_info
242
253
  tree = BitBirch(
243
254
  branching_factor=self.branching_factor,
244
255
  threshold=self.threshold,
245
- merge_criterion=self.criterion,
256
+ merge_criterion=self.merge_criterion,
246
257
  tolerance=self.tolerance,
247
258
  )
248
259
  # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -270,13 +281,20 @@ class _FinalTreeMergingRound(_TreeMergingRound):
270
281
  branching_factor: int,
271
282
  threshold: float,
272
283
  tolerance: float,
273
- criterion: str,
284
+ merge_criterion: str,
274
285
  out_dir: Path | str,
275
286
  save_tree: bool,
276
287
  save_centroids: bool,
277
288
  ) -> None:
278
289
  super().__init__(
279
- branching_factor, threshold, tolerance, -1, out_dir, False, criterion, ()
290
+ branching_factor,
291
+ threshold,
292
+ tolerance,
293
+ -1,
294
+ out_dir,
295
+ False,
296
+ merge_criterion,
297
+ (),
280
298
  )
281
299
  self.save_tree = save_tree
282
300
  self.save_centroids = save_centroids
@@ -286,7 +304,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
286
304
  tree = BitBirch(
287
305
  branching_factor=self.branching_factor,
288
306
  threshold=self.threshold,
289
- merge_criterion=self.criterion,
307
+ merge_criterion=self.merge_criterion,
290
308
  tolerance=self.tolerance,
291
309
  )
292
310
  # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -298,13 +316,8 @@ class _FinalTreeMergingRound(_TreeMergingRound):
298
316
 
299
317
  # Save clusters and exit
300
318
  if self.save_tree:
301
- # TODO: BitBIRCH is highly recursive. pickling may crash python,
302
- # an alternative solution would be better
303
- _old_limit = sys.getrecursionlimit()
304
- sys.setrecursionlimit(100_000)
305
- with open(self.out_dir / "bitbirch.pkl", mode="wb") as f:
306
- pickle.dump(tree, f)
307
- sys.setrecursionlimit(_old_limit)
319
+ # TODO: Find alternative solution
320
+ tree.save(self.out_dir / "bitbirch.pkl")
308
321
  tree.delete_internal_nodes()
309
322
  if self.save_centroids:
310
323
  output = tree.get_centroids_mol_ids()
@@ -358,6 +371,7 @@ def run_multiround_bitbirch(
358
371
  mp_context: tp.Any = None,
359
372
  save_tree: bool = False,
360
373
  save_centroids: bool = True,
374
+ sort_fps: bool = False,
361
375
  # Debug
362
376
  max_fps: int | None = None,
363
377
  verbose: bool = False,
@@ -404,6 +418,7 @@ def run_multiround_bitbirch(
404
418
  console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
405
419
 
406
420
  initial_fn = _InitialRound(
421
+ sort_fps=sort_fps,
407
422
  n_features=n_features,
408
423
  refinement_before_midsection=refinement_before_midsection,
409
424
  max_fps=max_fps,
@@ -441,7 +456,7 @@ def run_multiround_bitbirch(
441
456
  round_idx=round_idx,
442
457
  all_fp_paths=input_files,
443
458
  split_largest_cluster=split_largest_after_each_midsection_round,
444
- criterion=midsection_merge_criterion,
459
+ merge_criterion=midsection_merge_criterion,
445
460
  threshold=threshold + midsection_threshold_change,
446
461
  **common_kwargs,
447
462
  )
@@ -469,7 +484,7 @@ def run_multiround_bitbirch(
469
484
  final_fn = _FinalTreeMergingRound(
470
485
  save_tree=save_tree,
471
486
  save_centroids=save_centroids,
472
- criterion=final_merge_criterion,
487
+ merge_criterion=final_merge_criterion,
473
488
  threshold=threshold + midsection_threshold_change,
474
489
  **common_kwargs,
475
490
  )
@@ -399,13 +399,17 @@ def dump_mol_images(
399
399
  clusters: list[list[int]],
400
400
  cluster_idx: int = 0,
401
401
  batch_size: int = 30,
402
+ limit: int = -1,
402
403
  ) -> None:
403
404
  r"""Dump smiles associated with a specific cluster as ``*.png`` image files"""
404
405
  if isinstance(smiles, str):
405
406
  smiles = [smiles]
406
407
  smiles = np.asarray(smiles)
407
408
  idxs = clusters[cluster_idx]
409
+ num = 0
408
410
  for i, idx_seq in enumerate(batched(idxs, batch_size)):
411
+ if num + len(idx_seq) > limit:
412
+ idx_seq = idx_seq[: num + len(idx_seq) - limit]
409
413
  mols = []
410
414
  for smi in smiles[list(idx_seq)]:
411
415
  mol = Chem.MolFromSmiles(smi)
@@ -415,6 +419,9 @@ def dump_mol_images(
415
419
  img = Draw.MolsToGridImage(mols, molsPerRow=5)
416
420
  with open(f"cluster_{cluster_idx}_{i}.png", "wb") as f:
417
421
  f.write(img.data)
422
+ num += len(idx_seq)
423
+ if num >= limit:
424
+ break
418
425
 
419
426
 
420
427
  # For internal use, dispatches a visualization workflow and optionally saves