bblean 0.7.8__tar.gz → 0.8.1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (97)
  1. {bblean-0.7.8 → bblean-0.8.1}/PKG-INFO +1 -1
  2. {bblean-0.7.8 → bblean-0.8.1}/bblean/_version.py +2 -2
  3. {bblean-0.7.8 → bblean-0.8.1}/bblean/bitbirch.py +59 -58
  4. {bblean-0.7.8 → bblean-0.8.1}/bblean/fingerprints.py +115 -0
  5. {bblean-0.7.8 → bblean-0.8.1}/bblean/smiles.py +6 -4
  6. {bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/PKG-INFO +1 -1
  7. {bblean-0.7.8 → bblean-0.8.1}/tests/test_simple.py +18 -2
  8. {bblean-0.7.8 → bblean-0.8.1}/.cruft.json +0 -0
  9. {bblean-0.7.8 → bblean-0.8.1}/.flake8 +0 -0
  10. {bblean-0.7.8 → bblean-0.8.1}/.github/CODEOWNERS +0 -0
  11. {bblean-0.7.8 → bblean-0.8.1}/.github/workflows/ci-cpp.yaml +0 -0
  12. {bblean-0.7.8 → bblean-0.8.1}/.github/workflows/ci.yaml +0 -0
  13. {bblean-0.7.8 → bblean-0.8.1}/.github/workflows/upload-to-pypi.yaml +0 -0
  14. {bblean-0.7.8 → bblean-0.8.1}/.gitignore +0 -0
  15. {bblean-0.7.8 → bblean-0.8.1}/.pre-commit-config.yaml +0 -0
  16. {bblean-0.7.8 → bblean-0.8.1}/LICENSE +0 -0
  17. {bblean-0.7.8 → bblean-0.8.1}/LICENSES/BSD-3-Clause.txt +0 -0
  18. {bblean-0.7.8 → bblean-0.8.1}/LICENSES/GPL-3.0-only.txt +0 -0
  19. {bblean-0.7.8 → bblean-0.8.1}/README.md +0 -0
  20. {bblean-0.7.8 → bblean-0.8.1}/bblean/__init__.py +0 -0
  21. {bblean-0.7.8 → bblean-0.8.1}/bblean/_config.py +0 -0
  22. {bblean-0.7.8 → bblean-0.8.1}/bblean/_console.py +0 -0
  23. {bblean-0.7.8 → bblean-0.8.1}/bblean/_legacy/__init__.py +0 -0
  24. {bblean-0.7.8 → bblean-0.8.1}/bblean/_legacy/bb_int64.py +0 -0
  25. {bblean-0.7.8 → bblean-0.8.1}/bblean/_legacy/bb_uint8.py +0 -0
  26. {bblean-0.7.8 → bblean-0.8.1}/bblean/_memory.py +0 -0
  27. {bblean-0.7.8 → bblean-0.8.1}/bblean/_merges.py +0 -0
  28. {bblean-0.7.8 → bblean-0.8.1}/bblean/_py_similarity.py +0 -0
  29. {bblean-0.7.8 → bblean-0.8.1}/bblean/_timer.py +0 -0
  30. {bblean-0.7.8 → bblean-0.8.1}/bblean/analysis.py +0 -0
  31. {bblean-0.7.8 → bblean-0.8.1}/bblean/cli.py +0 -0
  32. {bblean-0.7.8 → bblean-0.8.1}/bblean/csrc/README.md +0 -0
  33. {bblean-0.7.8 → bblean-0.8.1}/bblean/csrc/similarity.cpp +0 -0
  34. {bblean-0.7.8 → bblean-0.8.1}/bblean/metrics.py +0 -0
  35. {bblean-0.7.8 → bblean-0.8.1}/bblean/multiround.py +0 -0
  36. {bblean-0.7.8 → bblean-0.8.1}/bblean/plotting.py +0 -0
  37. {bblean-0.7.8 → bblean-0.8.1}/bblean/similarity.py +0 -0
  38. {bblean-0.7.8 → bblean-0.8.1}/bblean/sklearn.py +0 -0
  39. {bblean-0.7.8 → bblean-0.8.1}/bblean/utils.py +0 -0
  40. {bblean-0.7.8 → bblean-0.8.1}/bblean-demo-v2.gif +0 -0
  41. {bblean-0.7.8 → bblean-0.8.1}/bblean-demo.cast +0 -0
  42. {bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/SOURCES.txt +0 -0
  43. {bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/dependency_links.txt +0 -0
  44. {bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/entry_points.txt +0 -0
  45. {bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/requires.txt +0 -0
  46. {bblean-0.7.8 → bblean-0.8.1}/bblean.egg-info/top_level.txt +0 -0
  47. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/api.svg +0 -0
  48. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/installing.svg +0 -0
  49. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/logo-dark-bw.svg +0 -0
  50. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/logo-light-bw.svg +0 -0
  51. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/publications.svg +0 -0
  52. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/style.css +0 -0
  53. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_static/user-guide.svg +0 -0
  54. {bblean-0.7.8 → bblean-0.8.1}/docs/src/_templates/module.rst +0 -0
  55. {bblean-0.7.8 → bblean-0.8.1}/docs/src/api-reference.rst +0 -0
  56. {bblean-0.7.8 → bblean-0.8.1}/docs/src/conf.py +0 -0
  57. {bblean-0.7.8 → bblean-0.8.1}/docs/src/index.rst +0 -0
  58. {bblean-0.7.8 → bblean-0.8.1}/docs/src/installing.rst +0 -0
  59. {bblean-0.7.8 → bblean-0.8.1}/docs/src/publications.rst +0 -0
  60. {bblean-0.7.8 → bblean-0.8.1}/docs/src/user-guide/linux_memory_setup.rst +0 -0
  61. {bblean-0.7.8 → bblean-0.8.1}/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -0
  62. {bblean-0.7.8 → bblean-0.8.1}/docs/src/user-guide/notebooks/bitbirch_quickstart.ipynb +0 -0
  63. {bblean-0.7.8 → bblean-0.8.1}/docs/src/user-guide/parameters.rst +0 -0
  64. {bblean-0.7.8 → bblean-0.8.1}/docs/src/user-guide.rst +0 -0
  65. {bblean-0.7.8 → bblean-0.8.1}/environment.yaml +0 -0
  66. {bblean-0.7.8 → bblean-0.8.1}/examples/best_practices/best_practices_functions.py +0 -0
  67. {bblean-0.7.8 → bblean-0.8.1}/examples/best_practices/best_practices_plots.py +0 -0
  68. {bblean-0.7.8 → bblean-0.8.1}/examples/best_practices/bitbirch_best_practices.ipynb +0 -0
  69. {bblean-0.7.8 → bblean-0.8.1}/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +0 -0
  70. {bblean-0.7.8 → bblean-0.8.1}/examples/best_practices/bitbirch_parameter.ipynb +0 -0
  71. {bblean-0.7.8 → bblean-0.8.1}/examples/biogen_logS.csv +0 -0
  72. {bblean-0.7.8 → bblean-0.8.1}/examples/bitbirch_best_practices.ipynb +0 -0
  73. {bblean-0.7.8 → bblean-0.8.1}/examples/bitbirch_quickstart.ipynb +0 -0
  74. {bblean-0.7.8 → bblean-0.8.1}/examples/chembl-33-natural-products-subset.smi +0 -0
  75. {bblean-0.7.8 → bblean-0.8.1}/examples/dataset_splitting.ipynb +0 -0
  76. {bblean-0.7.8 → bblean-0.8.1}/pyproject.toml +0 -0
  77. {bblean-0.7.8 → bblean-0.8.1}/setup.cfg +0 -0
  78. {bblean-0.7.8 → bblean-0.8.1}/setup.py +0 -0
  79. {bblean-0.7.8 → bblean-0.8.1}/tests/chembl-sample-3k.smi +0 -0
  80. {bblean-0.7.8 → bblean-0.8.1}/tests/chembl-sample-bad.smi +0 -0
  81. {bblean-0.7.8 → bblean-0.8.1}/tests/legacy_merges.py +0 -0
  82. {bblean-0.7.8 → bblean-0.8.1}/tests/legacy_metrics.py +0 -0
  83. {bblean-0.7.8 → bblean-0.8.1}/tests/test_bb_consistency.py +0 -0
  84. {bblean-0.7.8 → bblean-0.8.1}/tests/test_cli.py +0 -0
  85. {bblean-0.7.8 → bblean-0.8.1}/tests/test_fake_fps.py +0 -0
  86. {bblean-0.7.8 → bblean-0.8.1}/tests/test_fingerprints.py +0 -0
  87. {bblean-0.7.8 → bblean-0.8.1}/tests/test_global_clustering.py +0 -0
  88. {bblean-0.7.8 → bblean-0.8.1}/tests/test_import_bblean.py +0 -0
  89. {bblean-0.7.8 → bblean-0.8.1}/tests/test_merges.py +0 -0
  90. {bblean-0.7.8 → bblean-0.8.1}/tests/test_metrics.py +0 -0
  91. {bblean-0.7.8 → bblean-0.8.1}/tests/test_multiround.py +0 -0
  92. {bblean-0.7.8 → bblean-0.8.1}/tests/test_refine.py +0 -0
  93. {bblean-0.7.8 → bblean-0.8.1}/tests/test_regression.py +0 -0
  94. {bblean-0.7.8 → bblean-0.8.1}/tests/test_sampling.py +0 -0
  95. {bblean-0.7.8 → bblean-0.8.1}/tests/test_similarity.py +0 -0
  96. {bblean-0.7.8 → bblean-0.8.1}/tests/test_sklearn.py +0 -0
  97. {bblean-0.7.8 → bblean-0.8.1}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.7.8
3
+ Version: 0.8.1
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.7.8'
32
- __version_tuple__ = version_tuple = (0, 7, 8)
31
+ __version__ = version = '0.8.1'
32
+ __version_tuple__ = version_tuple = (0, 8, 1)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -47,6 +47,7 @@
47
47
  # ./LICENSES/GPL-3.0-only.txt. If not, see <http://www.gnu.org/licenses/gpl-3.0.html>.
48
48
  r"""BitBirch 'Lean' class for fast, memory-efficient O(N) clustering"""
49
49
  from __future__ import annotations # Stringize type annotations for no runtime overhead
50
+ import itertools
50
51
  import pickle
51
52
  import sys
52
53
  import typing_extensions as tpx
@@ -171,8 +172,8 @@ def _split_node(node: "_BFNode") -> tuple["_BFSubcluster", "_BFSubcluster"]:
171
172
  """
172
173
  n_features = node.n_features
173
174
  branching_factor = node.branching_factor
174
- new_subcluster1 = _BFSubcluster(n_features=n_features)
175
- new_subcluster2 = _BFSubcluster(n_features=n_features)
175
+ new_subcluster1 = _BFSubcluster.empty(n_features)
176
+ new_subcluster2 = _BFSubcluster.empty(n_features)
176
177
 
177
178
  node1 = _BFNode(branching_factor, n_features)
178
179
  node2 = node # Rename for clarity
@@ -394,13 +395,15 @@ class _BFSubcluster:
394
395
 
395
396
  def __init__(
396
397
  self,
397
- *,
398
- linear_sum: NDArray[np.integer] | None = None,
399
- mol_indices: tp.Sequence[int] = (),
400
- n_features: int = 2048,
401
- buffer: NDArray[np.integer] | None = None,
398
+ buffer: NDArray[np.integer],
399
+ mol_indices: tp.Sequence[int],
400
+ packed_centroid: NDArray[np.uint8] | None = None,
402
401
  check_indices: bool = True,
403
- ):
402
+ ) -> None:
403
+ # If packed centroid is passed, it must be equal to the packed centroid
404
+ # of the linear sum (this is not checked)
405
+ if mol_indices and check_indices and buffer[-1] != len(mol_indices):
406
+ raise ValueError("len mol_indices must be equal to buffer[-1] if specified")
404
407
  # NOTE: Internally, _buffer holds both "linear_sum" and "n_samples" It is
405
408
  # guaranteed to always have the minimum required uint dtype It should not be
406
409
  # accessed by external classes, only used internally. The individual parts can
@@ -409,44 +412,40 @@ class _BFSubcluster:
409
412
  #
410
413
  # IMPORTANT: To mutate instances of this class, *always* use the public API
411
414
  # given by replace|add_to_n_samples_and_linear_sum(...)
412
- if buffer is not None:
413
- if linear_sum is not None:
414
- raise ValueError("'linear_sum' and 'buffer' are mutually exclusive")
415
- if check_indices and len(mol_indices) != buffer[-1]:
416
- raise ValueError(
417
- "Expected len(mol_indices) == buffer[-1],"
418
- f" but found {len(mol_indices)} != {buffer[-1]}"
419
- )
420
- self._buffer = buffer
421
- self.packed_centroid = centroid_from_sum(buffer[:-1], buffer[-1], pack=True)
422
- else:
423
- if linear_sum is not None:
424
- if check_indices and len(mol_indices) != 1:
425
- raise ValueError(
426
- "Expected len(mol_indices) == 1,"
427
- f" but found {len(mol_indices)} != 1"
428
- )
429
- buffer = np.empty((len(linear_sum) + 1,), dtype=np.uint8)
430
- buffer[:-1] = linear_sum
431
- buffer[-1] = 1
432
- self._buffer = buffer
433
- self.packed_centroid = pack_fingerprints(
434
- linear_sum.astype(np.uint8, copy=False)
435
- )
436
- else:
437
- # Empty subcluster
438
- if check_indices and len(mol_indices) != 0:
439
- raise ValueError(
440
- "Expected len(mol_indices) == 0 for empty subcluster,"
441
- f" but found {len(mol_indices)} != 0"
442
- )
443
- self._buffer = np.zeros((n_features + 1,), dtype=np.uint8)
444
- self.packed_centroid = np.empty(
445
- 0, dtype=np.uint8
446
- ) # Will be overwritten
415
+ self._buffer = buffer
447
416
  self.mol_indices = list(mol_indices)
417
+ if packed_centroid is not None:
418
+ self.packed_centroid = packed_centroid
419
+ else:
420
+ self.packed_centroid = centroid_from_sum(buffer[:-1], buffer[-1], pack=True)
448
421
  self.child: tp.Optional["_BFNode"] = None
449
422
 
423
+ @classmethod
424
+ def empty(cls, n_features: int) -> tpx.Self:
425
+ packed_centroid = np.empty(0, dtype=np.uint8) # Will be overwritten
426
+ return cls(
427
+ np.zeros((n_features + 1,), dtype=np.uint8),
428
+ [],
429
+ packed_centroid,
430
+ check_indices=False,
431
+ )
432
+
433
+ @classmethod
434
+ def from_fingerprint(
435
+ cls, fp: NDArray[np.uint8], index: int, weight: int | None = None
436
+ ) -> tpx.Self:
437
+ if weight is not None:
438
+ buffer = np.empty((len(fp) + 1,), dtype=min_safe_uint(weight))
439
+ buffer[:-1] = fp
440
+ buffer[-1] = 1
441
+ buffer *= weight
442
+ else:
443
+ buffer = np.empty((len(fp) + 1,), dtype=np.uint8)
444
+ buffer[:-1] = fp
445
+ buffer[-1] = 1
446
+ packed_centroid = pack_fingerprints(fp)
447
+ return cls(buffer, [index], packed_centroid, check_indices=False)
448
+
450
449
  @property
451
450
  def unpacked_centroid(self) -> NDArray[np.uint8]:
452
451
  return _unpack_fingerprints(self.packed_centroid, self.n_features)
@@ -711,6 +710,7 @@ class BitBirch:
711
710
  input_is_packed: bool = True,
712
711
  n_features: int | None = None,
713
712
  max_fps: int | None = None,
713
+ weights: tp.Iterable[int] | None = None,
714
714
  ) -> tpx.Self:
715
715
  r"""Build a BF Tree for the input data.
716
716
 
@@ -763,15 +763,19 @@ class BitBirch:
763
763
  else:
764
764
  iterable = zip(reinsert_indices, arr_iterable)
765
765
 
766
+ it_weights: tp.Iterator[int | None]
767
+ if weights is None:
768
+ it_weights = itertools.repeat(None)
769
+ else:
770
+ it_weights = iter(weights)
771
+
766
772
  threshold = self.threshold
767
773
  branching_factor = self.branching_factor
768
774
  merge_accept_fn = self._merge_accept_fn
769
775
 
770
776
  arr_idx = 0
771
777
  for idx, fp in iterable:
772
- subcluster = _BFSubcluster(
773
- linear_sum=fp, mol_indices=[idx], n_features=n_features
774
- )
778
+ subcluster = _BFSubcluster.from_fingerprint(fp, idx, next(it_weights))
775
779
  split = self._root.insert_bf_subcluster(
776
780
  subcluster, merge_accept_fn, threshold
777
781
  )
@@ -791,22 +795,22 @@ class BitBirch:
791
795
  def _fit_buffers(
792
796
  self,
793
797
  X: _Input | Path | str,
794
- reinsert_index_seqs: (
795
- tp.Iterable[tp.Sequence[int]] | tp.Literal["omit"]
796
- ) = "omit",
798
+ reinsert_index_seqs: tp.Iterable[tp.Sequence[int]] | None,
799
+ check_indices: bool = True,
797
800
  ) -> tpx.Self:
798
801
  r"""Build a BF Tree starting from buffers
799
802
 
800
803
  Buffers are arrays of the form:
801
804
  - buffer[0:-1] = linear_sum
802
805
  - buffer[-1] = n_samples
803
- And X is either an array or a list of such buffers
806
+ X is either an array or a list of such buffers
804
807
 
805
808
  If `reinsert_index_seqs` is passed, X corresponds only to the buffers to be
806
809
  reinserted into the tree, and `reinsert_index_seqs` are the sequences
807
810
  of indices associated with such buffers.
808
811
 
809
- If `reinsert_index_seqs` is "omit", then no indices are collected in the tree.
812
+ If `reinsert_index_seqs` is None, then no indices are collected in the tree.
813
+ Num samples is mutually exclusive with reinsert_index_seqs.
810
814
 
811
815
  Parameters
812
816
  ----------
@@ -840,16 +844,13 @@ class BitBirch:
840
844
  branching_factor = self.branching_factor
841
845
  idx_provider: tp.Iterable[tp.Sequence[int]]
842
846
  arr_idx = 0
843
- if reinsert_index_seqs == "omit":
844
- idx_provider = (() for idx in range(self.num_fitted_fps))
845
- check = False
847
+ if reinsert_index_seqs is None:
848
+ idx_provider = itertools.repeat(())
846
849
  else:
847
850
  idx_provider = reinsert_index_seqs
848
- check = True
851
+
849
852
  for idxs, buf in zip(idx_provider, arr_iterable):
850
- subcluster = _BFSubcluster(
851
- buffer=buf, mol_indices=idxs, n_features=n_features, check_indices=check
852
- )
853
+ subcluster = _BFSubcluster(buf, idxs, check_indices=check_indices)
853
854
  split = self._root.insert_bf_subcluster(
854
855
  subcluster, merge_accept_fn, threshold
855
856
  )
@@ -1,11 +1,15 @@
1
1
  r"""Utilites for manipulating fingerprints and fingerprint files"""
2
2
 
3
+ import sys
4
+ import math
5
+ import weakref
3
6
  import warnings
4
7
  import dataclasses
5
8
  from pathlib import Path
6
9
  from numpy.typing import NDArray, DTypeLike
7
10
  import numpy as np
8
11
  import typing as tp
12
+ import multiprocessing as mp
9
13
  import multiprocessing.shared_memory as shmem
10
14
 
11
15
  from rich.console import Console
@@ -13,6 +17,8 @@ from rdkit.Chem import rdFingerprintGenerator, MolFromSmiles, SanitizeFlags, San
13
17
 
14
18
  from bblean._config import DEFAULTS
15
19
  from bblean._console import get_console
20
+ from bblean.smiles import _iter_ranges_and_smiles_batches
21
+ from bblean.utils import _num_avail_cpus
16
22
 
17
23
  __all__ = [
18
24
  "make_fake_fingerprints",
@@ -441,3 +447,112 @@ class _FingerprintArrayFiller:
441
447
  fps[i, :] = fp
442
448
  fps_shmem.close()
443
449
  invalid_mask_shmem.close()
450
+
451
+
452
+ @tp.overload
453
+ def fps_from_smiles_parallel(
454
+ smiles: tp.Iterable[str],
455
+ kind: str = DEFAULTS.fp_kind,
456
+ n_features: int = DEFAULTS.n_features,
457
+ dtype: DTypeLike = np.uint8,
458
+ sanitize: str = "all",
459
+ skip_invalid: tp.Literal[False] = False,
460
+ pack: bool = True,
461
+ num_ps: int = 1,
462
+ replace_dummy_atoms: bool = False,
463
+ tab_separated: bool = False,
464
+ mp_context: tp.Any = None,
465
+ ) -> NDArray[np.uint8]:
466
+ pass
467
+
468
+
469
+ @tp.overload
470
+ def fps_from_smiles_parallel(
471
+ smiles: tp.Iterable[str],
472
+ kind: str = DEFAULTS.fp_kind,
473
+ n_features: int = DEFAULTS.n_features,
474
+ dtype: DTypeLike = np.uint8,
475
+ sanitize: str = "all",
476
+ skip_invalid: tp.Literal[True] = True,
477
+ pack: bool = True,
478
+ num_ps: int = 1,
479
+ replace_dummy_atoms: bool = False,
480
+ tab_separated: bool = False,
481
+ mp_context: tp.Any = None,
482
+ ) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
483
+ pass
484
+
485
+
486
+ # NOTE: This function is proof of concept and kinda dangerous since it registers
487
+ # a custom destructor for the numpy array
488
+ # It is also *only usable if called inside an if __name__ == "__main__" guard*
489
+ # For now lets hide it
490
+ def fps_from_smiles_parallel(
491
+ smiles: tp.Iterable[str],
492
+ kind: str = DEFAULTS.fp_kind,
493
+ n_features: int = DEFAULTS.n_features,
494
+ dtype: DTypeLike = np.uint8,
495
+ sanitize: str = "all",
496
+ skip_invalid: bool = False,
497
+ pack: bool = True,
498
+ num_ps: int | None = None,
499
+ replace_dummy_atoms: bool = False,
500
+ tab_separated: bool = False,
501
+ mp_context: tp.Any = None,
502
+ ) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
503
+ r""":meta private:"""
504
+ if mp_context is None:
505
+ mp_context = mp.get_context("forkserver" if sys.platform == "linux" else None)
506
+ if isinstance(smiles, str):
507
+ smiles = [smiles]
508
+ smiles = list(smiles)
509
+ smiles_num = len(smiles)
510
+ if num_ps is None:
511
+ num_ps = _num_avail_cpus()
512
+
513
+ if pack:
514
+ out_dim = (n_features + 7) // 8
515
+ else:
516
+ out_dim = n_features
517
+ shmem_size = smiles_num * out_dim * np.dtype(dtype).itemsize
518
+ fps_shmem = shmem.SharedMemory(create=True, size=shmem_size)
519
+ invalid_mask_shmem = shmem.SharedMemory(create=True, size=smiles_num)
520
+ fps_array_filler = _FingerprintArrayFiller(
521
+ shmem_name=fps_shmem.name,
522
+ invalid_mask_shmem_name=invalid_mask_shmem.name,
523
+ kind=kind,
524
+ fp_size=n_features,
525
+ num_smiles=smiles_num,
526
+ dtype=np.dtype(dtype).name,
527
+ pack=pack,
528
+ sanitize=sanitize,
529
+ skip_invalid=skip_invalid,
530
+ )
531
+ num_per_batch = math.ceil(smiles_num / num_ps)
532
+ with mp_context.Pool(processes=num_ps) as pool:
533
+ pool.starmap(
534
+ fps_array_filler,
535
+ _iter_ranges_and_smiles_batches(
536
+ smiles,
537
+ num_per_batch,
538
+ tab_separated,
539
+ replace_dummy_atoms,
540
+ assume_paths=False,
541
+ ),
542
+ )
543
+ fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
544
+ mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
545
+ if skip_invalid:
546
+ fps = np.delete(fps, mask, axis=0)
547
+ weakref.finalize(mask, invalid_mask_shmem.close)
548
+ weakref.finalize(mask, invalid_mask_shmem.unlink)
549
+ weakref.finalize(fps, fps_shmem.close)
550
+ weakref.finalize(fps, fps_shmem.unlink)
551
+ return fps, mask
552
+
553
+ del mask
554
+ invalid_mask_shmem.close()
555
+ invalid_mask_shmem.unlink()
556
+ weakref.finalize(fps, fps_shmem.close)
557
+ weakref.finalize(fps, fps_shmem.unlink)
558
+ return fps
@@ -71,12 +71,14 @@ def _iter_ranges_and_smiles_batches(
71
71
  num_per_batch: int,
72
72
  tab_separated: bool = False,
73
73
  replace_dummy_atoms: bool = False,
74
+ assume_paths: bool = True,
74
75
  ) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
76
+ if assume_paths:
77
+ it = iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms)
78
+ else:
79
+ it = tp.cast(tp.Iterator[str], smiles_paths)
75
80
  start_idx = 0
76
- for batch in batched(
77
- iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms),
78
- num_per_batch,
79
- ):
81
+ for batch in batched(it, num_per_batch):
80
82
  size = len(batch)
81
83
  end_idx = start_idx + size
82
84
  yield (start_idx, end_idx), batch
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bblean
3
- Version: 0.7.8
3
+ Version: 0.8.1
4
4
  Summary: BitBirch-Lean Python package
5
5
  Author: The Miranda-Quintana Lab and other BitBirch developers
6
6
  Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
@@ -1,8 +1,11 @@
1
+ import itertools
1
2
  import pytest
2
3
  import numpy as np
3
4
 
4
- from bblean.bitbirch import BitBirch # type: ignore
5
- from bblean.fingerprints import pack_fingerprints
5
+ from bblean.bitbirch import BitBirch
6
+ from bblean.fingerprints import pack_fingerprints, make_fake_fingerprints
7
+
8
+ from inline_snapshot import snapshot
6
9
 
7
10
  # NOTE: Results on this file don't depend on branching factor / threshold
8
11
 
@@ -37,3 +40,16 @@ def test_bb_cluster_simple_repeated_fps() -> None:
37
40
  )
38
41
  ids = BitBirch().fit(mixed_fp, n_features=2048).get_cluster_mol_ids()
39
42
  assert ids == [list(range(repeats))]
43
+
44
+
45
+ def test_bb_cluster_3_fps() -> None:
46
+ fps = make_fake_fingerprints(3, n_features=8, seed=12620509540149709235, pack=True)
47
+
48
+ data = BitBirch().fit(fps).get_cluster_mol_ids()
49
+ assert data == snapshot([[0], [1], [2]])
50
+ data = BitBirch().fit(fps, weights=itertools.repeat(5)).get_cluster_mol_ids()
51
+ assert data == snapshot([[1, 2], [0]])
52
+ data = BitBirch().fit(fps, weights=itertools.repeat(10000)).get_cluster_mol_ids()
53
+ assert data == snapshot([[1, 2], [0]])
54
+ data = BitBirch().fit(fps, weights=itertools.repeat(1000000)).get_cluster_mol_ids()
55
+ assert data == snapshot([[1, 2], [0]])
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes