bblean 0.6.1b0__tar.gz → 0.7.2b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/workflows/upload-to-pypi.yaml +1 -1
  2. {bblean-0.6.1b0 → bblean-0.7.2b0}/PKG-INFO +2 -2
  3. {bblean-0.6.1b0 → bblean-0.7.2b0}/README.md +1 -1
  4. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_py_similarity.py +1 -9
  5. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_version.py +2 -2
  6. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/bitbirch.py +6 -6
  7. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/cli.py +53 -3
  8. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/csrc/similarity.cpp +77 -26
  9. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/fingerprints.py +5 -1
  10. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/multiround.py +30 -10
  11. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/similarity.py +70 -15
  12. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/smiles.py +20 -5
  13. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/PKG-INFO +2 -2
  14. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/SOURCES.txt +5 -2
  15. {bblean-0.6.1b0/examples → bblean-0.7.2b0/docs/src/user-guide/notebooks}/bitbirch_quickstart.ipynb +1 -1
  16. bblean-0.7.2b0/examples/best_practices/best_practices_functions.py +188 -0
  17. bblean-0.7.2b0/examples/best_practices/best_practices_plots.py +465 -0
  18. bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices.ipynb +601 -0
  19. bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +571 -0
  20. bblean-0.7.2b0/examples/best_practices/bitbirch_parameter.ipynb +1755 -0
  21. {bblean-0.6.1b0/docs/src/user-guide/notebooks → bblean-0.7.2b0/examples}/bitbirch_quickstart.ipynb +1 -1
  22. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_cli.py +60 -5
  23. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_global_clustering.py +5 -25
  24. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_similarity.py +12 -1
  25. bblean-0.6.1b0/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -526
  26. bblean-0.6.1b0/examples/bitbirch_best_practices.ipynb +0 -526
  27. {bblean-0.6.1b0 → bblean-0.7.2b0}/.cruft.json +0 -0
  28. {bblean-0.6.1b0 → bblean-0.7.2b0}/.flake8 +0 -0
  29. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/CODEOWNERS +0 -0
  30. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/workflows/ci-cpp.yaml +0 -0
  31. {bblean-0.6.1b0 → bblean-0.7.2b0}/.github/workflows/ci.yaml +0 -0
  32. {bblean-0.6.1b0 → bblean-0.7.2b0}/.gitignore +0 -0
  33. {bblean-0.6.1b0 → bblean-0.7.2b0}/.pre-commit-config.yaml +0 -0
  34. {bblean-0.6.1b0 → bblean-0.7.2b0}/LICENSE +0 -0
  35. {bblean-0.6.1b0 → bblean-0.7.2b0}/LICENSES/BSD-3-Clause.txt +0 -0
  36. {bblean-0.6.1b0 → bblean-0.7.2b0}/LICENSES/GPL-3.0-only.txt +0 -0
  37. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/__init__.py +0 -0
  38. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_config.py +0 -0
  39. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_console.py +0 -0
  40. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_legacy/__init__.py +0 -0
  41. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_legacy/bb_int64.py +0 -0
  42. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_legacy/bb_uint8.py +0 -0
  43. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_memory.py +0 -0
  44. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_merges.py +0 -0
  45. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/_timer.py +0 -0
  46. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/analysis.py +0 -0
  47. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/csrc/README.md +0 -0
  48. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/metrics.py +0 -0
  49. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/plotting.py +0 -0
  50. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/sklearn.py +0 -0
  51. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean/utils.py +0 -0
  52. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean-demo-v2.gif +0 -0
  53. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean-demo.cast +0 -0
  54. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/dependency_links.txt +0 -0
  55. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/entry_points.txt +0 -0
  56. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/requires.txt +0 -0
  57. {bblean-0.6.1b0 → bblean-0.7.2b0}/bblean.egg-info/top_level.txt +0 -0
  58. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/api.svg +0 -0
  59. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/installing.svg +0 -0
  60. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/logo-dark-bw.svg +0 -0
  61. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/logo-light-bw.svg +0 -0
  62. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/publications.svg +0 -0
  63. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/style.css +0 -0
  64. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_static/user-guide.svg +0 -0
  65. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/_templates/module.rst +0 -0
  66. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/api-reference.rst +0 -0
  67. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/conf.py +0 -0
  68. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/index.rst +0 -0
  69. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/installing.rst +0 -0
  70. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/publications.rst +0 -0
  71. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/user-guide/linux_memory_setup.rst +0 -0
  72. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/user-guide/parameters.rst +0 -0
  73. {bblean-0.6.1b0 → bblean-0.7.2b0}/docs/src/user-guide.rst +0 -0
  74. {bblean-0.6.1b0 → bblean-0.7.2b0}/environment.yaml +0 -0
  75. {bblean-0.6.1b0 → bblean-0.7.2b0}/examples/biogen_logS.csv +0 -0
  76. {bblean-0.6.1b0 → bblean-0.7.2b0}/examples/chembl-33-natural-products-subset.smi +0 -0
  77. {bblean-0.6.1b0 → bblean-0.7.2b0}/examples/dataset_splitting.ipynb +0 -0
  78. {bblean-0.6.1b0 → bblean-0.7.2b0}/pyproject.toml +0 -0
  79. {bblean-0.6.1b0 → bblean-0.7.2b0}/setup.cfg +0 -0
  80. {bblean-0.6.1b0 → bblean-0.7.2b0}/setup.py +0 -0
  81. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/chembl-sample-3k.smi +0 -0
  82. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/chembl-sample-bad.smi +0 -0
  83. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/legacy_merges.py +0 -0
  84. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/legacy_metrics.py +0 -0
  85. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_bb_consistency.py +0 -0
  86. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_fake_fps.py +0 -0
  87. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_fingerprints.py +0 -0
  88. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_import_bblean.py +0 -0
  89. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_merges.py +0 -0
  90. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_metrics.py +0 -0
  91. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_multiround.py +0 -0
  92. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_refine.py +0 -0
  93. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_regression.py +0 -0
  94. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_sampling.py +0 -0
  95. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_simple.py +0 -0
  96. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_sklearn.py +0 -0
  97. {bblean-0.6.1b0 → bblean-0.7.2b0}/tests/test_utils.py +0 -0
@@ -31,7 +31,7 @@ env:
31
31
  # Build wheels that support both aarch64 and x86_64 on macOS
32
32
  CIBW_ARCHS_MACOS: "universal2"
33
33
  CIBW_BUILD_VERBOSITY: 3
34
-
34
+ PIP_ONLY_BINARY: "llvmlite,numba"
35
35
  jobs:
36
36
  make_sdist:
37
37
  name: make-source-distribution
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.6.1b0
3
+ Version: 0.7.2b0
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -236,7 +236,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
236
236
  tree.fit(fps)
237
237
 
238
238
  # Refine the tree (if needed)
239
- tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
239
+ tree.set_merge("tolerance-diameter", tolerance=0.0)
240
240
  tree.refine_inplace(fps)
241
241
 
242
242
  # Visualize the results
@@ -193,7 +193,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
193
193
  tree.fit(fps)
194
194
 
195
195
  # Refine the tree (if needed)
196
- tree.set_merge(merge_criterion="tolerance-diameter", tolerance=0.0)
196
+ tree.set_merge("tolerance-diameter", tolerance=0.0)
197
197
  tree.refine_inplace(fps)
198
198
 
199
199
  # Visualize the results
@@ -76,18 +76,10 @@ def jt_compl_isim(
76
76
  warnings.warn(msg, RuntimeWarning, stacklevel=2)
77
77
  return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
78
78
  linear_sum = np.sum(fps, axis=0)
79
- n_objects = len(fps) - 1
80
79
  comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
81
-
82
80
  return np.array(comp_sims, dtype=np.float64)
83
81
 
84
82
 
85
- def _jt_isim_medoid_index(
86
- fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
87
- ) -> int:
88
- return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
89
-
90
-
91
83
  def jt_isim_medoid(
92
84
  fps: NDArray[np.uint8],
93
85
  input_is_packed: bool = True,
@@ -110,7 +102,7 @@ def jt_isim_medoid(
110
102
  if len(fps) < 3:
111
103
  idx = 0 # Medoid undefined for sets of 3 or more fingerprints
112
104
  else:
113
- idx = _jt_isim_medoid_index(fps, input_is_packed=False)
105
+ idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
114
106
  m = fps[idx]
115
107
  if pack:
116
108
  return idx, pack_fingerprints(m)
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.6.1b0'
32
- __version_tuple__ = version_tuple = (0, 6, 1, 'b0')
31
+ __version__ = version = '0.7.2b0'
32
+ __version_tuple__ = version_tuple = (0, 7, 2, 'b0')
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -648,7 +648,7 @@ class BitBirch:
648
648
 
649
649
  @merge_criterion.setter
650
650
  def merge_criterion(self, value: str) -> None:
651
- self.set_merge(criterion=value)
651
+ self.set_merge(merge_criterion=value)
652
652
 
653
653
  @property
654
654
  def tolerance(self) -> float | None:
@@ -673,7 +673,7 @@ class BitBirch:
673
673
 
674
674
  def set_merge(
675
675
  self,
676
- criterion: str | MergeAcceptFunction | None = None,
676
+ merge_criterion: str | MergeAcceptFunction | None = None,
677
677
  *,
678
678
  tolerance: float | None = None,
679
679
  threshold: float | None = None,
@@ -689,10 +689,10 @@ class BitBirch:
689
689
  "the global set_merge() function has *not* been used"
690
690
  )
691
691
  _tolerance = 0.05 if tolerance is None else tolerance
692
- if isinstance(criterion, MergeAcceptFunction):
693
- self._merge_accept_fn = criterion
694
- elif isinstance(criterion, str):
695
- self._merge_accept_fn = get_merge_accept_fn(criterion, _tolerance)
692
+ if isinstance(merge_criterion, MergeAcceptFunction):
693
+ self._merge_accept_fn = merge_criterion
694
+ elif isinstance(merge_criterion, str):
695
+ self._merge_accept_fn = get_merge_accept_fn(merge_criterion, _tolerance)
696
696
  if hasattr(self._merge_accept_fn, "tolerance"):
697
697
  self._merge_accept_fn.tolerance = _tolerance
698
698
  elif tolerance is not None:
@@ -1101,7 +1101,7 @@ def _run(
1101
1101
  console.print("Can't save tree for non-lean variants", style="red")
1102
1102
  else:
1103
1103
  # TODO: Find alternative solution
1104
- tree.save_pickle(out_dir / "bitbirch.pkl")
1104
+ tree.save(out_dir / "bitbirch.pkl")
1105
1105
  if variant == "lean":
1106
1106
  tree.delete_internal_nodes()
1107
1107
  # Dump outputs (peak memory, timings, config, cluster ids)
@@ -1196,6 +1196,14 @@ def _multiround(
1196
1196
  bool,
1197
1197
  Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
1198
1198
  ] = True,
1199
+ sort_fps: Annotated[
1200
+ bool,
1201
+ Option(
1202
+ "--sort-fps/--no-sort-fps",
1203
+ help="Sort the fingerprints by popcount before launching the initial round",
1204
+ rich_help_panel="Advanced",
1205
+ ),
1206
+ ] = False,
1199
1207
  mid_merge_criterion: Annotated[
1200
1208
  str,
1201
1209
  Option(
@@ -1389,6 +1397,7 @@ def _multiround(
1389
1397
  midsection_threshold_change=mid_threshold_change,
1390
1398
  tolerance=tolerance,
1391
1399
  # Advanced
1400
+ sort_fps=sort_fps,
1392
1401
  save_tree=save_tree,
1393
1402
  save_centroids=save_centroids,
1394
1403
  bin_size=bin_size,
@@ -1529,6 +1538,13 @@ def _fps_from_smiles(
1529
1538
  ),
1530
1539
  ),
1531
1540
  ] = False,
1541
+ tab_separated: Annotated[
1542
+ bool,
1543
+ Option(
1544
+ "--tab-sep/--no-tab-sep",
1545
+ help="Whether the smiles file has the format <smiles><tab><field><tab>...",
1546
+ ),
1547
+ ] = False,
1532
1548
  ) -> None:
1533
1549
  r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
1534
1550
 
@@ -1634,7 +1650,9 @@ def _fps_from_smiles(
1634
1650
  with mp_context.Pool(processes=num_ps) as pool:
1635
1651
  pool.map(
1636
1652
  create_fp_file,
1637
- _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
1653
+ _iter_idxs_and_smiles_batches(
1654
+ smiles_paths, num_per_batch, tab_separated
1655
+ ),
1638
1656
  )
1639
1657
  timer.end_timing("total", console, indent=False)
1640
1658
  stem = out_name.split(".")[0]
@@ -1674,7 +1692,9 @@ def _fps_from_smiles(
1674
1692
  with mp_context.Pool(processes=num_ps) as pool:
1675
1693
  pool.starmap(
1676
1694
  fps_array_filler,
1677
- _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
1695
+ _iter_ranges_and_smiles_batches(
1696
+ smiles_paths, num_per_batch, tab_separated
1697
+ ),
1678
1698
  )
1679
1699
  fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
1680
1700
  mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
@@ -1851,3 +1871,33 @@ def _merge_fps(
1851
1871
  return
1852
1872
  np.save(out_dir / stem, np.concatenate(arrays))
1853
1873
  console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
1874
+
1875
+
1876
+ @app.command("fps-sort", rich_help_panel="Fingerprints")
1877
+ def _sort_fps(
1878
+ in_file: Annotated[
1879
+ Path,
1880
+ Argument(help="`*.npy` file with packed fingerprints"),
1881
+ ],
1882
+ out_dir: Annotated[
1883
+ Path | None,
1884
+ Option("-o", "--out-dir", show_default=False),
1885
+ ] = None,
1886
+ seed: Annotated[
1887
+ int | None,
1888
+ Option("--seed", hidden=True, rich_help_panel="Debug"),
1889
+ ] = None,
1890
+ ) -> None:
1891
+ import numpy as np
1892
+ from bblean._py_similarity import _popcount
1893
+
1894
+ fps = np.load(in_file)
1895
+ stem = in_file.stem
1896
+ counts = _popcount(fps)
1897
+ sort_idxs = np.argsort(counts)
1898
+ fps = fps[sort_idxs]
1899
+ if out_dir is None:
1900
+ out_dir = Path.cwd()
1901
+ out_dir.mkdir(exist_ok=True)
1902
+ out_dir = out_dir.resolve()
1903
+ np.save(out_dir / f"sorted-{stem}.npy", fps)
@@ -300,6 +300,75 @@ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
300
300
  return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
301
301
  }
302
302
 
303
+ // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
304
+ // array is uint8_t** if the array is uint64 already, it is slower
305
+ template <typename T>
306
+ py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
307
+ if (arr.ndim() != 2) {
308
+ throw std::runtime_error("Input array must be 2-dimensional");
309
+ }
310
+ auto arr_ptr = arr.data();
311
+ auto out = py::array_t<uint64_t>(arr.shape(1));
312
+ auto out_ptr = out.mutable_data();
313
+ std::memset(out_ptr, 0, out.nbytes());
314
+ py::ssize_t n_samples = arr.shape(0);
315
+ py::ssize_t n_features = arr.shape(1);
316
+ // Check GCC / CLang vectorize this
317
+ for (py::ssize_t i = 0; i < n_samples; ++i) {
318
+ const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
319
+ for (py::ssize_t j = 0; j < n_features; ++j) {
320
+ out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
321
+ }
322
+ }
323
+ return out;
324
+ }
325
+ py::array_t<double> _nochecks_jt_compl_isim_unpacked_u8(
326
+ const py::array_t<uint8_t, py::array::c_style>& fps) {
327
+ py::ssize_t n_objects = fps.shape(0);
328
+ py::ssize_t n_features = fps.shape(1);
329
+ auto out = py::array_t<double>(n_objects);
330
+ auto out_ptr = out.mutable_data();
331
+
332
+ if (n_objects < 3) {
333
+ PyErr_WarnEx(PyExc_RuntimeWarning,
334
+ "Invalid num fps in compl_isim. Expected n_objects >= 3",
335
+ 1);
336
+ for (py::ssize_t i{0}; i != n_objects; ++i) {
337
+ out_ptr[i] = std::numeric_limits<double>::quiet_NaN();
338
+ }
339
+ return out;
340
+ }
341
+
342
+ auto linear_sum = add_rows<uint8_t>(fps);
343
+ auto ls_cptr = linear_sum.data();
344
+
345
+ py::array_t<uint64_t> shifted_linear_sum(n_features);
346
+ auto shifted_ls_ptr = shifted_linear_sum.mutable_data();
347
+
348
+ auto in_cptr = fps.data();
349
+ for (py::ssize_t i{0}; i != n_objects; ++i) {
350
+ for (py::ssize_t j{0}; j != n_features; ++j) {
351
+ shifted_ls_ptr[j] = ls_cptr[j] - in_cptr[i * n_features + j];
352
+ }
353
+ // For all compl isim N is n_objects - 1
354
+ out_ptr[i] = jt_isim_from_sum(shifted_linear_sum, n_objects - 1);
355
+ }
356
+ return out;
357
+ }
358
+
359
+ py::array_t<double> jt_compl_isim(
360
+ const CArrayForcecast<uint8_t>& fps, bool input_is_packed = true,
361
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
362
+ if (fps.ndim() != 2) {
363
+ throw std::runtime_error("fps arr must be 2D");
364
+ }
365
+ if (input_is_packed) {
366
+ return _nochecks_jt_compl_isim_unpacked_u8(
367
+ _nochecks_unpack_fingerprints_2d(fps, n_features_opt));
368
+ }
369
+ return _nochecks_jt_compl_isim_unpacked_u8(fps);
370
+ }
371
+
303
372
  // Contraint: T must be uint64_t or uint8_t
304
373
  template <typename T>
305
374
  void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,
@@ -372,33 +441,10 @@ py::array_t<double> jt_sim_packed_precalc_cardinalities(
372
441
  }
373
442
 
374
443
  py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
375
- const py::array_t<uint8_t>& vec) {
444
+ const py::array_t<uint8_t>& vec) {
376
445
  return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
377
446
  }
378
447
 
379
- // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
380
- // array is uint8_t** if the array is uint64 already, it is slower
381
- template <typename T>
382
- py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
383
- if (arr.ndim() != 2) {
384
- throw std::runtime_error("Input array must be 2-dimensional");
385
- }
386
- auto arr_ptr = arr.data();
387
- auto out = py::array_t<uint64_t>(arr.shape(1));
388
- auto out_ptr = out.mutable_data();
389
- std::memset(out_ptr, 0, out.nbytes());
390
- py::ssize_t n_samples = arr.shape(0);
391
- py::ssize_t n_features = arr.shape(1);
392
- // Check GCC / CLang vectorize this
393
- for (py::ssize_t i = 0; i < n_samples; ++i) {
394
- const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
395
- for (py::ssize_t j = 0; j < n_features; ++j) {
396
- out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
397
- }
398
- }
399
- return out;
400
- }
401
-
402
448
  double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
403
449
  return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
404
450
  }
@@ -406,8 +452,9 @@ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
406
452
  double jt_isim_packed_u8(
407
453
  const CArrayForcecast<uint8_t>& arr,
408
454
  std::optional<py::ssize_t> n_features_opt = std::nullopt) {
409
- return jt_isim_from_sum(add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
410
- arr.shape(0));
455
+ return jt_isim_from_sum(
456
+ add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
457
+ arr.shape(0));
411
458
  }
412
459
 
413
460
  py::tuple jt_most_dissimilar_packed(
@@ -510,6 +557,10 @@ PYBIND11_MODULE(_cpp_similarity, m) {
510
557
  m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
511
558
  "iSIM Tanimoto calculation", py::arg("arr"));
512
559
 
560
+ m.def("jt_compl_isim", &jt_compl_isim, "Complementary iSIM tanimoto",
561
+ py::arg("fps"), py::arg("input_is_packed") = true,
562
+ py::arg("n_features") = std::nullopt);
563
+
513
564
  m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
514
565
  "Tanimoto similarity between a matrix of packed fps and a single "
515
566
  "packed fp",
@@ -115,7 +115,11 @@ def _get_generator(kind: str, n_features: int) -> tp.Any:
115
115
  return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_features)
116
116
  elif kind == "ecfp6":
117
117
  return rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_features)
118
- raise ValueError(f"Unknonw kind {kind}. Should be one of 'rdkit|ecfp4|ecfp6'")
118
+ elif kind == "topological":
119
+ return rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_features)
120
+ elif kind == "ap":
121
+ return rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_features)
122
+ raise ValueError(f"Unknown kind {kind}. Use 'rdkit|ecfp4|ecfp6|topological|ap'")
119
123
 
120
124
 
121
125
  def _get_sanitize_flags(sanitize: str) -> tp.Any:
@@ -65,6 +65,7 @@ from bblean._config import DEFAULTS
65
65
  from bblean.utils import batched
66
66
  from bblean.bitbirch import BitBirch
67
67
  from bblean.fingerprints import _get_fps_file_num
68
+ from bblean._py_similarity import _popcount
68
69
 
69
70
  __all__ = ["run_multiround_bitbirch"]
70
71
 
@@ -157,6 +158,7 @@ class _InitialRound:
157
158
  max_fps: int | None = None,
158
159
  merge_criterion: str = DEFAULTS.merge_criterion,
159
160
  input_is_packed: bool = True,
161
+ sort_fps: bool = False,
160
162
  ) -> None:
161
163
  self.n_features = n_features
162
164
  self.refinement_before_midsection = refinement_before_midsection
@@ -171,6 +173,7 @@ class _InitialRound:
171
173
  self.refine_merge_criterion = refine_merge_criterion
172
174
  self.input_is_packed = input_is_packed
173
175
  self.refine_threshold_change = refine_threshold_change
176
+ self._sort_fps = sort_fps
174
177
 
175
178
  def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
176
179
  file_label, fp_file, start_idx, end_idx = file_info
@@ -182,6 +185,14 @@ class _InitialRound:
182
185
  threshold=self.threshold,
183
186
  merge_criterion=self.merge_criterion,
184
187
  )
188
+ if self._sort_fps:
189
+ fp_input = np.load(fp_file)
190
+ counts = _popcount(fp_input)
191
+ sort_idxs = np.argsort(counts)
192
+ fp_input = fp_input[sort_idxs]
193
+ else:
194
+ fp_input = fp_file
195
+
185
196
  range_ = range(start_idx, end_idx)
186
197
  tree.fit(
187
198
  fp_input,
@@ -201,7 +212,7 @@ class _InitialRound:
201
212
  # Finish the first refinement step internally in this round
202
213
  tree.reset()
203
214
  tree.set_merge(
204
- self.refine_merge_criterion,
215
+ merge_criterion=self.refine_merge_criterion,
205
216
  tolerance=self.tolerance,
206
217
  threshold=self.threshold + self.refine_threshold_change,
207
218
  )
@@ -225,7 +236,7 @@ class _TreeMergingRound:
225
236
  round_idx: int,
226
237
  out_dir: Path | str,
227
238
  split_largest_cluster: bool,
228
- criterion: str,
239
+ merge_criterion: str,
229
240
  all_fp_paths: tp.Sequence[Path] = (),
230
241
  ) -> None:
231
242
  self.all_fp_paths = list(all_fp_paths)
@@ -235,14 +246,14 @@ class _TreeMergingRound:
235
246
  self.round_idx = round_idx
236
247
  self.out_dir = Path(out_dir)
237
248
  self.split_largest_cluster = split_largest_cluster
238
- self.criterion = criterion
249
+ self.merge_criterion = merge_criterion
239
250
 
240
251
  def __call__(self, batch_info: tuple[str, tp.Sequence[tuple[Path, Path]]]) -> None:
241
252
  batch_label, batch_path_pairs = batch_info
242
253
  tree = BitBirch(
243
254
  branching_factor=self.branching_factor,
244
255
  threshold=self.threshold,
245
- merge_criterion=self.criterion,
256
+ merge_criterion=self.merge_criterion,
246
257
  tolerance=self.tolerance,
247
258
  )
248
259
  # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -270,13 +281,20 @@ class _FinalTreeMergingRound(_TreeMergingRound):
270
281
  branching_factor: int,
271
282
  threshold: float,
272
283
  tolerance: float,
273
- criterion: str,
284
+ merge_criterion: str,
274
285
  out_dir: Path | str,
275
286
  save_tree: bool,
276
287
  save_centroids: bool,
277
288
  ) -> None:
278
289
  super().__init__(
279
- branching_factor, threshold, tolerance, -1, out_dir, False, criterion, ()
290
+ branching_factor,
291
+ threshold,
292
+ tolerance,
293
+ -1,
294
+ out_dir,
295
+ False,
296
+ merge_criterion,
297
+ (),
280
298
  )
281
299
  self.save_tree = save_tree
282
300
  self.save_centroids = save_centroids
@@ -286,7 +304,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
286
304
  tree = BitBirch(
287
305
  branching_factor=self.branching_factor,
288
306
  threshold=self.threshold,
289
- merge_criterion=self.criterion,
307
+ merge_criterion=self.merge_criterion,
290
308
  tolerance=self.tolerance,
291
309
  )
292
310
  # Rebuild a tree, inserting all BitFeatures from the corresponding batch
@@ -299,7 +317,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
299
317
  # Save clusters and exit
300
318
  if self.save_tree:
301
319
  # TODO: Find alternative solution
302
- tree.save_pickle(self.out_dir / "bitbirch.pkl")
320
+ tree.save(self.out_dir / "bitbirch.pkl")
303
321
  tree.delete_internal_nodes()
304
322
  if self.save_centroids:
305
323
  output = tree.get_centroids_mol_ids()
@@ -353,6 +371,7 @@ def run_multiround_bitbirch(
353
371
  mp_context: tp.Any = None,
354
372
  save_tree: bool = False,
355
373
  save_centroids: bool = True,
374
+ sort_fps: bool = False,
356
375
  # Debug
357
376
  max_fps: int | None = None,
358
377
  verbose: bool = False,
@@ -399,6 +418,7 @@ def run_multiround_bitbirch(
399
418
  console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
400
419
 
401
420
  initial_fn = _InitialRound(
421
+ sort_fps=sort_fps,
402
422
  n_features=n_features,
403
423
  refinement_before_midsection=refinement_before_midsection,
404
424
  max_fps=max_fps,
@@ -436,7 +456,7 @@ def run_multiround_bitbirch(
436
456
  round_idx=round_idx,
437
457
  all_fp_paths=input_files,
438
458
  split_largest_cluster=split_largest_after_each_midsection_round,
439
- criterion=midsection_merge_criterion,
459
+ merge_criterion=midsection_merge_criterion,
440
460
  threshold=threshold + midsection_threshold_change,
441
461
  **common_kwargs,
442
462
  )
@@ -464,7 +484,7 @@ def run_multiround_bitbirch(
464
484
  final_fn = _FinalTreeMergingRound(
465
485
  save_tree=save_tree,
466
486
  save_centroids=save_centroids,
467
- criterion=final_merge_criterion,
487
+ merge_criterion=final_merge_criterion,
468
488
  threshold=threshold + midsection_threshold_change,
469
489
  **common_kwargs,
470
490
  )
@@ -34,12 +34,8 @@ __all__ = [
34
34
  "jt_sim_matrix_packed",
35
35
  ]
36
36
 
37
- from bblean._py_similarity import (
38
- centroid_from_sum,
39
- centroid,
40
- jt_compl_isim,
41
- jt_isim_medoid,
42
- )
37
+ from bblean._py_similarity import centroid_from_sum, centroid
38
+ from bblean.fingerprints import pack_fingerprints, unpack_fingerprints
43
39
 
44
40
  # jt_isim_packed and jt_isim_unpacked are not exposed, only used within functions for
45
41
  # speed
@@ -49,6 +45,7 @@ if os.getenv("BITBIRCH_NO_EXTENSIONS"):
49
45
  jt_isim_from_sum,
50
46
  jt_isim_unpacked,
51
47
  jt_isim_packed,
48
+ jt_compl_isim,
52
49
  _jt_sim_arr_vec_packed,
53
50
  jt_most_dissimilar_packed,
54
51
  )
@@ -56,11 +53,13 @@ else:
56
53
  try:
57
54
  from bblean._cpp_similarity import ( # type: ignore
58
55
  jt_isim_from_sum,
59
- _jt_sim_arr_vec_packed,
60
56
  jt_isim_unpacked_u8,
61
57
  jt_isim_packed_u8,
58
+ jt_compl_isim, # TODO: Does it need wrappers for non-uint8?
59
+ _jt_sim_arr_vec_packed,
62
60
  jt_most_dissimilar_packed,
63
- unpack_fingerprints,
61
+ # Needed for wrappers
62
+ unpack_fingerprints as _unpack_fingerprints,
64
63
  )
65
64
 
66
65
  # Wrap these two since doing
@@ -80,7 +79,7 @@ else:
80
79
  if arr.dtype == np.uint64:
81
80
  return jt_isim_from_sum(
82
81
  np.sum(
83
- unpack_fingerprints(arr, n_features), # type: ignore
82
+ _unpack_fingerprints(arr, n_features), # type: ignore
84
83
  axis=0,
85
84
  dtype=np.uint64,
86
85
  ),
@@ -93,6 +92,7 @@ else:
93
92
  jt_isim_from_sum,
94
93
  jt_isim_unpacked,
95
94
  jt_isim_packed,
95
+ jt_compl_isim,
96
96
  _jt_sim_arr_vec_packed,
97
97
  jt_most_dissimilar_packed,
98
98
  )
@@ -103,6 +103,35 @@ else:
103
103
  )
104
104
 
105
105
 
106
+ def jt_isim_medoid(
107
+ fps: NDArray[np.uint8],
108
+ input_is_packed: bool = True,
109
+ n_features: int | None = None,
110
+ pack: bool = True,
111
+ ) -> tuple[int, NDArray[np.uint8]]:
112
+ r"""Calculate the (Tanimoto) medoid of a set of fingerprints, using iSIM
113
+
114
+ Returns both the index of the medoid in the input array and the medoid itself
115
+
116
+ .. note::
117
+ Returns the first (or only) fingerprint for arrays of size 2 and 1, respectively.
118
+ Raises ValueError for arrays of size 0
119
+
120
+ """
121
+ if not fps.size:
122
+ raise ValueError("Size of fingerprints set must be > 0")
123
+ if input_is_packed:
124
+ fps = unpack_fingerprints(fps, n_features)
125
+ if len(fps) < 3:
126
+ idx = 0 # Medoid undefined for sets of fewer than 3 fingerprints
127
+ else:
128
+ idx = np.argmin(jt_compl_isim(fps, input_is_packed=False)).item()
129
+ m = fps[idx]
130
+ if pack:
131
+ return idx, pack_fingerprints(m)
132
+ return idx, m
133
+
134
+
106
135
  def jt_isim(
107
136
  fps: NDArray[np.integer],
108
137
  input_is_packed: bool = True,
@@ -149,7 +178,11 @@ def jt_isim_diameter(
149
178
  r"""Calculate the Tanimoto diameter of a set of fingerprints"""
150
179
  return jt_isim_diameter_from_sum(
151
180
  np.sum(
152
- unpack_fingerprints(arr, n_features) if input_is_packed else arr,
181
+ (
182
+ unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
183
+ if input_is_packed
184
+ else arr
185
+ ),
153
186
  axis=0,
154
187
  dtype=np.uint64,
155
188
  ), # type: ignore
@@ -165,7 +198,11 @@ def jt_isim_radius(
165
198
  r"""Calculate the Tanimoto radius of a set of fingerprints"""
166
199
  return jt_isim_radius_from_sum(
167
200
  np.sum(
168
- unpack_fingerprints(arr, n_features) if input_is_packed else arr,
201
+ (
202
+ unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
203
+ if input_is_packed
204
+ else arr
205
+ ),
169
206
  axis=0,
170
207
  dtype=np.uint64,
171
208
  ), # type: ignore
@@ -181,7 +218,11 @@ def jt_isim_radius_compl(
181
218
  r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
182
219
  return jt_isim_radius_compl_from_sum(
183
220
  np.sum(
184
- unpack_fingerprints(arr, n_features) if input_is_packed else arr,
221
+ (
222
+ unpack_fingerprints(arr.astype(np.uint8, copy=False), n_features)
223
+ if input_is_packed
224
+ else arr
225
+ ),
185
226
  axis=0,
186
227
  dtype=np.uint64,
187
228
  ), # type: ignore
@@ -252,14 +293,28 @@ def estimate_jt_std(
252
293
  n_samples: int | None = None,
253
294
  input_is_packed: bool = True,
254
295
  n_features: int | None = None,
296
+ min_samples: int = 1_000_000,
255
297
  ) -> float:
256
- r"""Estimate std of tanimoto sim using a deterministic sample"""
298
+ r"""Estimate the std of all pairwise Tanimoto.
299
+
300
+ Returns
301
+ -------
302
+ std : float
303
+ The standard deviation of all pairwise Tanimoto among the sampled fingerprints.
304
+ """
257
305
  num_fps = len(fps)
306
+ if num_fps > min_samples:
307
+ np.random.seed(42)
308
+ random_choices = np.random.choice(num_fps, size=min_samples, replace=False)
309
+ fps = fps[random_choices]
310
+ num_fps = len(fps)
258
311
  if n_samples is None:
259
- n_samples = max(num_fps // 1000, 50)
312
+ # Heuristic: use at least 50 samples, or 1 per 10,000 fingerprints,
313
+ # to balance statistical representativeness and computational efficiency
314
+ n_samples = max(num_fps // 10_000, 50)
260
315
  sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
261
316
 
262
- # Work with sample from now on
317
+ # Work with only the sampled fingerprints
263
318
  fps = fps[sample_idxs]
264
319
  num_fps = len(fps)
265
320
  pairs = np.empty(num_fps * (num_fps - 1) // 2, dtype=np.float64)