chemap 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chemap-0.3.0 → chemap-0.3.2}/PKG-INFO +1 -1
- chemap-0.3.2/chemap/benchmarking/__init__.py +15 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_computation.py +86 -16
- chemap-0.3.2/chemap/fingerprints/__init__.py +14 -0
- chemap-0.3.2/chemap/fingerprints/chemap_base_fingerprint.py +76 -0
- chemap-0.3.2/chemap/fingerprints/lingo.py +154 -0
- chemap-0.3.2/chemap/fingerprints/map4.py +349 -0
- chemap-0.3.2/chemap/fingerprints/mhfp.py +100 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/benchmark_duplicates.py +7 -4
- {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/chem_space_umap.py +10 -13
- {chemap-0.3.0 → chemap-0.3.2}/chemap/types.py +3 -0
- {chemap-0.3.0 → chemap-0.3.2}/pyproject.toml +1 -1
- chemap-0.3.0/chemap/additional_fingerprints/__init__.py +0 -6
- chemap-0.3.0/chemap/benchmarking/__init__.py +0 -7
- {chemap-0.3.0 → chemap-0.3.2}/LICENSE +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/README.md +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/__init__.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/approx_nn.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/benchmarking/fingerprint_duplicates.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/benchmarking/utils.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/data_loader.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_conversions.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_statistics.py +0 -0
- {chemap-0.3.0/chemap/additional_fingerprints → chemap-0.3.2/chemap/fingerprints}/element_count_fp.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/mbp.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/metrics.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/__init__.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/colormap_handling.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/colormaps.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/scatter_plots.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/utils.py +0 -0
- {chemap-0.3.0 → chemap-0.3.2}/chemap/visualizations.py +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .fingerprint_duplicates import (
|
|
2
|
+
load_duplicates_npz,
|
|
3
|
+
load_precomputed_duplicates_folder,
|
|
4
|
+
save_duplicates_npz,
|
|
5
|
+
)
|
|
6
|
+
from .utils import compute_compound_max_mass_differences, compute_duplicate_max_mass_differences
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"compute_compound_max_mass_differences",
|
|
11
|
+
"compute_duplicate_max_mass_differences",
|
|
12
|
+
"load_duplicates_npz",
|
|
13
|
+
"load_precomputed_duplicates_folder",
|
|
14
|
+
"save_duplicates_npz",
|
|
15
|
+
]
|
|
@@ -7,6 +7,7 @@ from joblib import Parallel, delayed
|
|
|
7
7
|
from rdkit import Chem
|
|
8
8
|
from sklearn.base import BaseEstimator, TransformerMixin
|
|
9
9
|
from tqdm import tqdm
|
|
10
|
+
from chemap.types import UnfoldedBinary, UnfoldedCount
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
# -----------------------------
|
|
@@ -16,9 +17,6 @@ from tqdm import tqdm
|
|
|
16
17
|
InvalidPolicy = Literal["drop", "keep", "raise"]
|
|
17
18
|
Scaling = Optional[Literal["log"]]
|
|
18
19
|
|
|
19
|
-
UnfoldedBinary = List[np.ndarray] # list of int64 feature IDs per molecule
|
|
20
|
-
UnfoldedCount = List[Tuple[np.ndarray, np.ndarray]] # list of (int64 feature IDs, float32 values)
|
|
21
|
-
|
|
22
20
|
FingerprintResult = Union[np.ndarray, sp.csr_matrix, UnfoldedBinary, UnfoldedCount]
|
|
23
21
|
|
|
24
22
|
|
|
@@ -532,9 +530,13 @@ def _skfp_configure_output(
|
|
|
532
530
|
"""
|
|
533
531
|
Configure scikit-fingerprints/sklearn transformer to match (folded, return_csr).
|
|
534
532
|
|
|
535
|
-
|
|
536
|
-
-
|
|
537
|
-
|
|
533
|
+
Supports two modes:
|
|
534
|
+
1) scikit-fingerprints classic:
|
|
535
|
+
- unfolded via variant='raw_bits' (if available)
|
|
536
|
+
- folded via variant='folded' (if variant exists)
|
|
537
|
+
2) ChemapBaseFingerprint style:
|
|
538
|
+
- unfolded via folded=False (no variant)
|
|
539
|
+
- folded via folded=True
|
|
538
540
|
"""
|
|
539
541
|
params = fpgen.get_params(deep=False)
|
|
540
542
|
updates: Dict[str, Any] = {}
|
|
@@ -545,22 +547,42 @@ def _skfp_configure_output(
|
|
|
545
547
|
if "n_jobs" in params:
|
|
546
548
|
updates["n_jobs"] = n_jobs
|
|
547
549
|
|
|
550
|
+
chemap_style = "folded" in params # ChemapBaseFingerprint exposes folded param
|
|
551
|
+
|
|
552
|
+
# -------------------------
|
|
553
|
+
# UNFOLDED (cfg.folded=False)
|
|
554
|
+
# -------------------------
|
|
548
555
|
if not cfg.folded:
|
|
556
|
+
if chemap_style:
|
|
557
|
+
if params.get("folded") is not False:
|
|
558
|
+
updates["folded"] = False
|
|
559
|
+
# We don't force sparse; unfolded returns lists anyway.
|
|
560
|
+
return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
|
|
561
|
+
|
|
562
|
+
# classic scikit-fingerprints route: needs variant='raw_bits'
|
|
549
563
|
if "variant" not in params:
|
|
550
564
|
raise NotImplementedError(
|
|
551
|
-
"Requested folded=False (unfolded), but this transformer does not
|
|
552
|
-
"
|
|
565
|
+
"Requested folded=False (unfolded), but this transformer does not support "
|
|
566
|
+
"either chemap-style `folded` switching or an skfp-style `variant='raw_bits'`."
|
|
553
567
|
)
|
|
568
|
+
|
|
554
569
|
if params.get("variant") != "raw_bits":
|
|
555
570
|
updates["variant"] = "raw_bits"
|
|
556
571
|
|
|
557
|
-
# For unfolded conversion we can accept either dense or CSR outputs, so we do not force "sparse".
|
|
558
572
|
return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
|
|
559
573
|
|
|
560
|
-
#
|
|
574
|
+
# ------------------------
|
|
575
|
+
# FOLDED (cfg.folded=True)
|
|
576
|
+
# ------------------------
|
|
577
|
+
if chemap_style:
|
|
578
|
+
if params.get("folded") is not True:
|
|
579
|
+
updates["folded"] = True
|
|
580
|
+
|
|
581
|
+
# If it's classic skfp and currently set to raw_bits, restore folded variant
|
|
561
582
|
if "variant" in params and params.get("variant") == "raw_bits":
|
|
562
583
|
updates["variant"] = "folded"
|
|
563
584
|
|
|
585
|
+
# Prefer CSR if requested and supported
|
|
564
586
|
if "sparse" in params:
|
|
565
587
|
desired = bool(cfg.return_csr)
|
|
566
588
|
if params.get("sparse") != desired:
|
|
@@ -577,35 +599,83 @@ def _compute_sklearn(
|
|
|
577
599
|
show_progress: bool = False,
|
|
578
600
|
n_jobs: int,
|
|
579
601
|
) -> FingerprintResult:
|
|
602
|
+
"""
|
|
603
|
+
Compute fingerprints using sklearn/scikit-fingerprints style transformers.
|
|
604
|
+
|
|
605
|
+
Supports two kinds of transformers for unfolded output (cfg.folded=False):
|
|
606
|
+
1) Classic scikit-fingerprints: unfolded via variant='raw_bits' (matrix output)
|
|
607
|
+
2) ChemapBaseFingerprint style: unfolded via folded=False (list output)
|
|
608
|
+
|
|
609
|
+
Invalid-policy behavior:
|
|
610
|
+
- drop: returns only valid rows (shorter output)
|
|
611
|
+
- keep: aligns output to input, inserting empty rows for invalid smiles
|
|
612
|
+
- raise: raises on first invalid smiles
|
|
613
|
+
"""
|
|
580
614
|
fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs)
|
|
615
|
+
|
|
616
|
+
# Parse molecules with robust handling (None for invalid SMILES)
|
|
581
617
|
mol_transformer = RobustMolTransformer(n_jobs=n_jobs)
|
|
582
618
|
mols = mol_transformer.transform(smiles)
|
|
583
619
|
|
|
584
|
-
# Determine valid/invalid molecules and handle invalid according to policy.
|
|
585
620
|
valid_idx = [i for i, m in enumerate(mols) if m is not None]
|
|
586
621
|
invalid_idx = [i for i, m in enumerate(mols) if m is None]
|
|
587
622
|
|
|
588
623
|
if invalid_idx and cfg.invalid_policy == "raise":
|
|
589
624
|
raise ValueError(f"Invalid SMILES: {smiles[invalid_idx[0]]}")
|
|
590
625
|
|
|
591
|
-
# Fit/transform only valid mols (safe for most transformers)
|
|
592
626
|
valid_mols = [mols[i] for i in valid_idx]
|
|
593
627
|
|
|
628
|
+
# Most skfp transformers are "fit-less" but expose fit; keep consistent behavior.
|
|
594
629
|
fp.fit(valid_mols)
|
|
595
630
|
X_valid = fp.transform(valid_mols)
|
|
596
631
|
|
|
597
|
-
#
|
|
632
|
+
# -----------------------------
|
|
633
|
+
# Case A: transformer returns chemap-unfolded formats directly (list output)
|
|
634
|
+
# -----------------------------
|
|
635
|
+
is_list_unfolded = isinstance(X_valid, list) and (
|
|
636
|
+
len(X_valid) == 0
|
|
637
|
+
or isinstance(X_valid[0], np.ndarray)
|
|
638
|
+
or (isinstance(X_valid[0], tuple) and len(X_valid[0]) == 2)
|
|
639
|
+
)
|
|
640
|
+
|
|
641
|
+
if is_list_unfolded:
|
|
642
|
+
# In this case, we assume we are already in unfolded mode (cfg.folded=False).
|
|
643
|
+
# If cfg.folded=True but the transformer returns lists, that's an API mismatch.
|
|
644
|
+
if cfg.folded:
|
|
645
|
+
raise TypeError(
|
|
646
|
+
"Transformer returned chemap-unfolded list output while cfg.folded=True. "
|
|
647
|
+
"This likely indicates a misconfigured transformer."
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
if cfg.invalid_policy == "drop":
|
|
651
|
+
return X_valid
|
|
652
|
+
|
|
653
|
+
# keep alignment: reinsert empty rows
|
|
654
|
+
N = len(smiles)
|
|
655
|
+
if cfg.count:
|
|
656
|
+
X_full: UnfoldedCount = [_empty_unfolded_count() for _ in range(N)]
|
|
657
|
+
else:
|
|
658
|
+
X_full: UnfoldedBinary = [_empty_unfolded_binary() for _ in range(N)]
|
|
659
|
+
|
|
660
|
+
for out_i, orig_i in enumerate(valid_idx):
|
|
661
|
+
X_full[orig_i] = X_valid[out_i] # type: ignore[index]
|
|
662
|
+
|
|
663
|
+
return X_full
|
|
664
|
+
|
|
665
|
+
# -----------------------------
|
|
666
|
+
# Case B: transformer returns a matrix (dense or sparse)
|
|
667
|
+
# -----------------------------
|
|
668
|
+
|
|
669
|
+
# Handle invalid-policy re-insertion for matrix outputs
|
|
598
670
|
if cfg.invalid_policy == "drop":
|
|
599
671
|
X = X_valid
|
|
600
672
|
else:
|
|
601
|
-
# policy keep: reinsert empty rows to match input length
|
|
602
673
|
N = len(smiles)
|
|
603
674
|
|
|
604
675
|
if sp.issparse(X_valid):
|
|
605
676
|
X_valid = X_valid.tocsr().astype(np.float32)
|
|
606
677
|
D = X_valid.shape[1]
|
|
607
678
|
X = sp.csr_matrix((N, D), dtype=np.float32)
|
|
608
|
-
# place valid rows
|
|
609
679
|
X[valid_idx, :] = X_valid
|
|
610
680
|
else:
|
|
611
681
|
X_valid = np.asarray(X_valid, dtype=np.float32)
|
|
@@ -613,8 +683,8 @@ def _compute_sklearn(
|
|
|
613
683
|
X = np.zeros((N, D), dtype=np.float32)
|
|
614
684
|
X[valid_idx, :] = X_valid
|
|
615
685
|
|
|
686
|
+
# If unfolded requested, convert matrix -> chemap unfolded formats
|
|
616
687
|
if not cfg.folded:
|
|
617
|
-
# unfolded output
|
|
618
688
|
if sp.issparse(X):
|
|
619
689
|
return _csr_matrix_to_unfolded(X.tocsr().astype(np.float32), cfg)
|
|
620
690
|
return _dense_matrix_to_unfolded(np.asarray(X, dtype=np.float32), cfg)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .chemap_base_fingerprint import ChemapBaseFingerprint
|
|
2
|
+
from .element_count_fp import ElementCountFingerprint
|
|
3
|
+
from .lingo import LingoFingerprint
|
|
4
|
+
from .map4 import MAP4FPGen
|
|
5
|
+
from .mhfp import MHFPEncoderLite
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"ChemapBaseFingerprint",
|
|
10
|
+
"ElementCountFingerprint",
|
|
11
|
+
"LingoFingerprint",
|
|
12
|
+
"MAP4FPGen",
|
|
13
|
+
"MHFPEncoderLite",
|
|
14
|
+
]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from typing import Any
|
|
3
|
+
from joblib import Parallel, delayed
|
|
4
|
+
from rdkit.Chem import Mol
|
|
5
|
+
from skfp.bases import BaseFingerprintTransformer
|
|
6
|
+
from skfp.utils import ensure_smiles
|
|
7
|
+
from chemap.types import UnfoldedBinary, UnfoldedCount
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ChemapBaseFingerprint(BaseFingerprintTransformer):
    """
    Extension of scikit-fingerprints BaseFingerprintTransformer that adds `folded`.

    - folded=True: behaves like scikit-fingerprints (returns dense ndarray or sparse csr_array)
    - folded=False: returns chemap unfolded formats (lists of feature IDs / (IDs, values))

    Important: this class intentionally subclasses scikit-fingerprints' base class
    to preserve their behavior (validation, parallelization patterns, etc.) where possible.

    Subclasses provide the unfolded route by overriding ``_calculate_unfolded``;
    the folded route is delegated unchanged to the parent ``transform``.
    """

    # `folded` is a constructor parameter like the others, so declare it next to
    # the inherited constraints; sklearn-style parameter validation (whenever the
    # parent runs it) then also rejects non-boolean values for it.
    _parameter_constraints: dict = {
        **BaseFingerprintTransformer._parameter_constraints,
        "folded": ["boolean"],
    }

    def __init__(
        self,
        *,
        n_features_out: int,
        count: bool = False,
        sparse: bool = False,
        folded: bool = True,
        n_jobs: int | None = None,
        batch_size: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_features_out=n_features_out,
            count=count,
            sparse=sparse,
            n_jobs=n_jobs,
            batch_size=batch_size,
            verbose=verbose,
        )
        self.folded = folded

    def transform(self, X: Sequence[str | Mol], copy: bool = False) -> Any:
        """
        Compute fingerprints for ``X``.

        If folded=True: defer to BaseFingerprintTransformer.transform (matrix output).
        If folded=False: return chemap unfolded formats from ``_calculate_unfolded``.

        Note that ``copy`` is only forwarded on the folded route; the unfolded
        route does not use it.
        """
        if self.folded:
            return super().transform(X, copy=copy)

        # unfolded route: we accept SMILES or Mol, but Lingo-like methods want SMILES
        smiles = ensure_smiles(X)
        return self._calculate_unfolded(smiles)

    # ---- hooks for subclasses ----

    def _calculate_unfolded(self, X_smiles: Sequence[str]) -> UnfoldedBinary | UnfoldedCount:
        """
        Subclasses must implement when folded=False.

        Must return chemap unfolded formats:
          - count=False: List[np.ndarray[int64]]
          - count=True : List[Tuple[np.ndarray[int64], np.ndarray[float32]]]

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement `_calculate_unfolded` to support folded=False."
        )

    # ---- helpers ----

    def _parallel_map(self, fn, items):
        """
        Apply ``fn`` to every element of ``items`` and return the results as a list.

        Runs a plain in-process loop when ``n_jobs`` is None or 1; otherwise
        dispatches through joblib, using ``batch_size`` when set and joblib's
        "auto" batching when it is None.
        """
        n_jobs = self.n_jobs if self.n_jobs is not None else 1

        if n_jobs == 1:
            return [fn(x) for x in items]

        batch_size = self.batch_size if self.batch_size is not None else "auto"
        return Parallel(n_jobs=n_jobs, batch_size=batch_size)(
            delayed(fn)(x) for x in items
        )
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import re
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from numbers import Integral
|
|
6
|
+
import numpy as np
|
|
7
|
+
from scipy.sparse import csr_array
|
|
8
|
+
from skfp.utils import ensure_smiles
|
|
9
|
+
from sklearn.utils._param_validation import Interval
|
|
10
|
+
from chemap.fingerprints import ChemapBaseFingerprint
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class LingoFingerprint(ChemapBaseFingerprint):
    """
    Lingo fingerprint with chemap unfolded support.

    folded=True:
        behaves like scikit-fingerprints: fixed-size hashed vector (dense or CSR)
    folded=False:
        returns chemap unfolded formats with stable feature IDs taken from the
        first 4 bytes (32 bits, big-endian, unsigned) of each substring's SHA-1
        digest and stored in int64 arrays:
          - count=False: List[np.ndarray[int64]] (feature IDs)
          - count=True : List[Tuple[np.ndarray[int64], np.ndarray[float32]]] (IDs + counts)
    """

    _parameter_constraints: dict = {
        **ChemapBaseFingerprint._parameter_constraints,
        "fp_size": [Interval(Integral, 1, None, closed="left")],
        "substring_length": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(
        self,
        fp_size: int = 4096,
        substring_length: int = 4,
        count: bool = False,
        sparse: bool = False,
        folded: bool = True,
        n_jobs: int | None = None,
        batch_size: int | None = None,
        verbose: int | dict = 0,
    ):
        super().__init__(
            n_features_out=fp_size,
            count=count,
            sparse=sparse,
            folded=folded,
            n_jobs=n_jobs,
            batch_size=batch_size,
            verbose=verbose,
        )
        self.fp_size = fp_size
        self.substring_length = substring_length

    # --------------------
    # Shared preprocessing
    # --------------------

    def smiles_to_dicts(self, X: Sequence[str]) -> list[dict[str, int]]:
        """
        Convert SMILES to dicts of substring counts (original Lingo raw features).

        Each SMILES is normalized (digits 1-9 -> "0", "Cl" -> "L", "Br" -> "R")
        and then split into all overlapping substrings of ``substring_length``
        characters. A string shorter than ``substring_length`` yields an empty dict.
        """
        window = self.substring_length
        result: list[dict[str, int]] = []

        for smi in ensure_smiles(X):
            # same canonicalization as skfp, applied in one pass per string
            smi = re.sub(r"[123456789]", "0", smi)
            smi = smi.replace("Cl", "L").replace("Br", "R")

            counts: defaultdict[str, int] = defaultdict(int)
            # overlapping substrings
            for start in range(len(smi) - window + 1):
                counts[smi[start : start + window]] += 1
            result.append(dict(counts))

        return result

    # --------------------
    # Folded (matrix) path
    # --------------------

    def _calculate_fingerprint(self, X: Sequence[str]) -> np.ndarray | csr_array:
        """
        Called by BaseFingerprintTransformer when folded=True.

        Returns a ``csr_array`` when ``self.sparse`` is set, otherwise the dense array.
        """
        dicts = self.smiles_to_dicts(X)
        arr = self._dicts_to_folded_array(dicts)
        return csr_array(arr) if self.sparse else arr

    def _dicts_to_folded_array(self, dicts: list[dict[str, int]]) -> np.ndarray:
        """
        Hash and fold into [0..fp_size-1], identical to skfp folding rule.

        The bin of a substring is its full SHA-1 digest, read as a big-endian
        integer, modulo ``fp_size``. Output dtype is uint32 (summed counts) when
        ``count`` is set and uint8 (0/1 flags) otherwise.
        """
        dtype = np.uint32 if self.count else np.uint8
        out = np.zeros((len(dicts), self.fp_size), dtype=dtype)

        for i, d in enumerate(dicts):
            for token, c in d.items():
                digest = hashlib.sha1(token.encode("utf-8"), usedforsecurity=False).digest()
                hash_index = int.from_bytes(digest, byteorder="big") % self.fp_size

                if self.count:
                    out[i, hash_index] += c
                else:
                    out[i, hash_index] = 1

        return out

    # -----------------------
    # Unfolded (chemap) path
    # -----------------------

    def _calculate_unfolded(self, X_smiles: Sequence[str]):
        """
        Return chemap unfolded formats.

        Feature IDs are stable integers derived from the SHA-1 digest:
            id32 = int.from_bytes(digest[:4], "big")
        i.e. an unsigned 32-bit value, which always fits the int64 arrays it is
        stored in. IDs are returned sorted ascending and de-duplicated per molecule;
        with count=True, counts of substrings that share an ID are summed.
        """
        dicts = self.smiles_to_dicts(X_smiles)

        def token_to_id32(token: str) -> int:
            digest = hashlib.sha1(token.encode("utf-8"), usedforsecurity=False).digest()
            return int.from_bytes(digest[:4], byteorder="big", signed=False)

        if self.count:
            def one(d: dict[str, int]) -> tuple[np.ndarray, np.ndarray]:
                if not d:
                    return (np.array([], dtype=np.int64), np.array([], dtype=np.float32))

                # Sum counts per feature ID so colliding substrings are merged.
                agg: dict[int, float] = {}
                for token, c in d.items():
                    fid = token_to_id32(token)
                    agg[fid] = agg.get(fid, 0.0) + float(c)

                ordered_ids = sorted(agg)
                keys = np.array(ordered_ids, dtype=np.int64)
                vals = np.array([agg[fid] for fid in ordered_ids], dtype=np.float32)
                return keys, vals

            return self._parallel_map(one, dicts)

        def one_bin(d: dict[str, int]) -> np.ndarray:
            if not d:
                return np.array([], dtype=np.int64)

            ids = np.fromiter((token_to_id32(t) for t in d), dtype=np.int64)
            # np.unique sorts ascending
            return np.unique(ids).astype(np.int64, copy=False)

        return self._parallel_map(one_bin, dicts)
|
|
154
|
+
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""
|
|
2
|
+
chemap-compatible MAP4 FP generator, in parts based on Luca Cappelletti's implementation of MAP4:
|
|
3
|
+
https://github.com/LucaCappelletti94/map4/blob/master/map4/map4.py
|
|
4
|
+
Which is based on the original MAP4 implementation
|
|
5
|
+
`Alice Capecchi, Daniel Probst, Jean-Louis Reymond
|
|
6
|
+
"One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome"
|
|
7
|
+
J Cheminform 12, 43 (2020)
|
|
8
|
+
<https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00445-4>`_
|
|
9
|
+
|
|
10
|
+
There are a few particular aspects about this implementation though:
|
|
11
|
+
- Folded output:
|
|
12
|
+
* binary (uint8) uses MHFP-style MinHash folding (chemap.fingerprints.mhfp)
|
|
13
|
+
* count (float32) accumulates true shingle multiplicities into folded bins (not a MinHash signature,
|
|
14
|
+
so different from the original implementation!)
|
|
15
|
+
- Unfolded output:
|
|
16
|
+
* count=True -> true counts per raw feature id
|
|
17
|
+
* count=False -> keys only (chemap will read keys from GetSparseCountFingerprint)
|
|
18
|
+
* feature ids are SHA1 by default, unless minhash_for_unfolded=True
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import itertools
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from hashlib import sha1
|
|
25
|
+
from typing import Dict, List, Optional, Set
|
|
26
|
+
import numpy as np
|
|
27
|
+
from rdkit.Chem import Mol, MolToSmiles, PathToSubmol
|
|
28
|
+
from rdkit.Chem.rdmolops import FindAtomEnvironmentOfRadiusN, GetDistanceMatrix
|
|
29
|
+
from chemap.fingerprints.mhfp import MHFPEncoderLite
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# -----------------------------
|
|
33
|
+
# Minimal RDKit-like return types
|
|
34
|
+
# -----------------------------
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
|
|
37
|
+
class _SparseCountFingerprint:
|
|
38
|
+
"""RDKit SparseIntVect-like shim for chemap."""
|
|
39
|
+
nz: Dict[int, int]
|
|
40
|
+
def GetNonzeroElements(self) -> Dict[int, int]:
|
|
41
|
+
return self.nz
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True)
|
|
45
|
+
class _BitFingerprint:
|
|
46
|
+
"""RDKit ExplicitBitVect-like shim for chemap size inference."""
|
|
47
|
+
n_bits: int
|
|
48
|
+
def GetNumBits(self) -> int:
|
|
49
|
+
return self.n_bits
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
|
|
53
|
+
class _CountFingerprint:
|
|
54
|
+
"""RDKit IntSparseIntVect-like shim for chemap size inference."""
|
|
55
|
+
length: int
|
|
56
|
+
def GetLength(self) -> int:
|
|
57
|
+
return self.length
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# -----------------------------
|
|
61
|
+
# MAP4 shingling core
|
|
62
|
+
# -----------------------------
|
|
63
|
+
|
|
64
|
+
class _MAP4Shingler:
|
|
65
|
+
"""
|
|
66
|
+
Generates MAP4 shingles as bytes:
|
|
67
|
+
- envs for radii 1..R
|
|
68
|
+
- for each atom pair (i<j) and each radius index k in [0..R-1]:
|
|
69
|
+
shingle = f"{smaller_env}|{dist}|{larger_env}"
|
|
70
|
+
where smaller/larger chosen by length comparison (ties go to env_b as larger)
|
|
71
|
+
- optional include_duplicated_shingles "suffix trick" is available, but for chemap counts
|
|
72
|
+
we SHOULD NOT use it (we want true multiplicities).
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
def __init__(
|
|
76
|
+
self,
|
|
77
|
+
radius: int = 2,
|
|
78
|
+
*,
|
|
79
|
+
include_duplicated_shingles: bool = False,
|
|
80
|
+
max_dist: Optional[int] = None,
|
|
81
|
+
dist_binning: Optional[np.ndarray] = None,
|
|
82
|
+
):
|
|
83
|
+
if radius <= 0:
|
|
84
|
+
raise ValueError("radius must be > 0.")
|
|
85
|
+
self.radius = int(radius)
|
|
86
|
+
self.include_duplicated_shingles = bool(include_duplicated_shingles)
|
|
87
|
+
self.max_dist = max_dist
|
|
88
|
+
self.dist_binning = dist_binning
|
|
89
|
+
|
|
90
|
+
def shingles_unique(self, mol: Mol) -> Set[bytes]:
|
|
91
|
+
return set(self._all_pairs(mol, self._get_atom_envs(mol)))
|
|
92
|
+
|
|
93
|
+
def shingles_with_counts_true(self, mol: Mol) -> Dict[bytes, int]:
|
|
94
|
+
"""
|
|
95
|
+
True multiplicities (counts) WITHOUT suffix trick, regardless of include_duplicated_shingles.
|
|
96
|
+
"""
|
|
97
|
+
counts: Dict[bytes, int] = defaultdict(int)
|
|
98
|
+
for sh in self._all_pairs(mol, self._get_atom_envs(mol), force_no_suffix=True):
|
|
99
|
+
counts[sh] += 1
|
|
100
|
+
return dict(counts)
|
|
101
|
+
|
|
102
|
+
def _convert_dist(self, dist: float) -> int:
|
|
103
|
+
if self.dist_binning is None:
|
|
104
|
+
return int(dist)
|
|
105
|
+
return int(np.digitize(dist, self.dist_binning, right=True))
|
|
106
|
+
|
|
107
|
+
def _get_atom_envs(self, mol: Mol) -> Dict[int, List[Optional[str]]]:
|
|
108
|
+
atoms_env: Dict[int, List[Optional[str]]] = {}
|
|
109
|
+
for atom in mol.GetAtoms():
|
|
110
|
+
atom_identifier = atom.GetIdx()
|
|
111
|
+
for r in range(1, self.radius + 1):
|
|
112
|
+
atoms_env.setdefault(atom_identifier, []).append(
|
|
113
|
+
self._find_env(mol, atom_identifier, r)
|
|
114
|
+
)
|
|
115
|
+
return atoms_env
|
|
116
|
+
|
|
117
|
+
@staticmethod
|
|
118
|
+
def _find_env(mol: Mol, atom_identifier: int, radius: int) -> Optional[str]:
|
|
119
|
+
atom_identifiers_within_radius: List[int] = FindAtomEnvironmentOfRadiusN(
|
|
120
|
+
mol=mol, radius=radius, rootedAtAtom=atom_identifier
|
|
121
|
+
)
|
|
122
|
+
atom_map: Dict[int, int] = {}
|
|
123
|
+
sub_molecule: Mol = PathToSubmol(mol, atom_identifiers_within_radius, atomMap=atom_map)
|
|
124
|
+
|
|
125
|
+
if atom_identifier not in atom_map:
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
return MolToSmiles(
|
|
129
|
+
sub_molecule,
|
|
130
|
+
rootedAtAtom=atom_map[atom_identifier],
|
|
131
|
+
canonical=True,
|
|
132
|
+
isomericSmiles=False,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
def _all_pairs(
|
|
136
|
+
self,
|
|
137
|
+
mol: Mol,
|
|
138
|
+
atoms_env: Dict[int, List[Optional[str]]],
|
|
139
|
+
*,
|
|
140
|
+
force_no_suffix: bool = False,
|
|
141
|
+
) -> List[bytes]:
|
|
142
|
+
"""
|
|
143
|
+
Return shingles as bytes. If include_duplicated_shingles is enabled and not forced off,
|
|
144
|
+
suffix trick is applied to make duplicates unique (MAP4C-style behavior).
|
|
145
|
+
"""
|
|
146
|
+
out: List[bytes] = []
|
|
147
|
+
dm = GetDistanceMatrix(mol)
|
|
148
|
+
n = mol.GetNumAtoms()
|
|
149
|
+
shingle_dict: Dict[str, int] = defaultdict(int)
|
|
150
|
+
|
|
151
|
+
for i, j in itertools.combinations(range(n), 2):
|
|
152
|
+
dist_val = float(dm[i][j])
|
|
153
|
+
if self.max_dist is not None and dist_val > self.max_dist:
|
|
154
|
+
continue
|
|
155
|
+
dist = str(self._convert_dist(dist_val))
|
|
156
|
+
|
|
157
|
+
for k in range(self.radius):
|
|
158
|
+
env_a = atoms_env[i][k] or ""
|
|
159
|
+
env_b = atoms_env[j][k] or ""
|
|
160
|
+
|
|
161
|
+
# compare by length, not lexicographic
|
|
162
|
+
if len(env_a) > len(env_b):
|
|
163
|
+
larger_env, smaller_env = env_a, env_b
|
|
164
|
+
else:
|
|
165
|
+
larger_env, smaller_env = env_b, env_a
|
|
166
|
+
|
|
167
|
+
shingle = f"{smaller_env}|{dist}|{larger_env}"
|
|
168
|
+
|
|
169
|
+
if self.include_duplicated_shingles and not force_no_suffix:
|
|
170
|
+
shingle_dict[shingle] += 1
|
|
171
|
+
shingle = f"{shingle}|{shingle_dict[shingle]}"
|
|
172
|
+
|
|
173
|
+
out.append(shingle.encode("utf-8"))
|
|
174
|
+
|
|
175
|
+
return out
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# -----------------------------
|
|
179
|
+
# MAP4 fpgen for chemap
|
|
180
|
+
# -----------------------------
|
|
181
|
+
|
|
182
|
+
class MAP4FPGen:
|
|
183
|
+
"""
|
|
184
|
+
chemap-compatible MAP4 fingerprint generator.
|
|
185
|
+
|
|
186
|
+
Folded outputs (fixed length):
|
|
187
|
+
- GetFingerprintAsNumPy: uint8[D] binary
|
|
188
|
+
computed by minhash signature (MHFPEncoderLite) folded to bits (mod D)
|
|
189
|
+
- GetCountFingerprintAsNumPy: float32[D] counts
|
|
190
|
+
computed by hashing each shingle (token hash32) -> bin (mod D) and summing true counts
|
|
191
|
+
|
|
192
|
+
Unfolded outputs (raw feature ids):
|
|
193
|
+
- GetSparseCountFingerprint returns {feature_id: count}
|
|
194
|
+
feature_id:
|
|
195
|
+
* sha1 truncation (default) OR
|
|
196
|
+
* token-hash32 (sha1 first 4 bytes) if minhash_for_unfolded=True
|
|
197
|
+
|
|
198
|
+
Parameters
|
|
199
|
+
----------
|
|
200
|
+
folded:
|
|
201
|
+
Whether folded functions are meaningful (chemap controls this, but we keep for safety).
|
|
202
|
+
minhash_for_unfolded:
|
|
203
|
+
If True, unfolded uses MHFP-style token hash32 rather than sha1 truncation.
|
|
204
|
+
unfolded_bits:
|
|
205
|
+
32 or 64 (only used when minhash_for_unfolded=False).
|
|
206
|
+
include_duplicated_shingles:
|
|
207
|
+
For MAP4C-like behavior in *set shingles* (folded binary). For true counts we ignore suffix.
|
|
208
|
+
"""
|
|
209
|
+
|
|
210
|
+
def __init__(
|
|
211
|
+
self,
|
|
212
|
+
dimensions: int = 1024,
|
|
213
|
+
radius: int = 2,
|
|
214
|
+
*,
|
|
215
|
+
seed: int = 75434278,
|
|
216
|
+
folded: bool = True,
|
|
217
|
+
# counts/dup behavior
|
|
218
|
+
include_duplicated_shingles: bool = False,
|
|
219
|
+
# unfolded hashing behavior
|
|
220
|
+
minhash_for_unfolded: bool = False,
|
|
221
|
+
unfolded_bits: int = 32, # 32 or 64, only if minhash_for_unfolded=False
|
|
222
|
+
# optional distance handling
|
|
223
|
+
max_dist: Optional[int] = None,
|
|
224
|
+
dist_binning: Optional[np.ndarray] = None,
|
|
225
|
+
):
|
|
226
|
+
self.dimensions = int(dimensions)
|
|
227
|
+
self.radius = int(radius)
|
|
228
|
+
self.seed = int(seed)
|
|
229
|
+
self.folded = bool(folded)
|
|
230
|
+
|
|
231
|
+
self.include_duplicated_shingles = bool(include_duplicated_shingles)
|
|
232
|
+
self.minhash_for_unfolded = bool(minhash_for_unfolded)
|
|
233
|
+
self.unfolded_bits = int(unfolded_bits)
|
|
234
|
+
|
|
235
|
+
if self.dimensions <= 0:
|
|
236
|
+
raise ValueError("dimensions must be > 0.")
|
|
237
|
+
if self.radius <= 0:
|
|
238
|
+
raise ValueError("radius must be > 0.")
|
|
239
|
+
if self.unfolded_bits not in (32, 64):
|
|
240
|
+
raise ValueError("unfolded_bits must be 32 or 64.")
|
|
241
|
+
|
|
242
|
+
self._shingler = _MAP4Shingler(
|
|
243
|
+
radius=self.radius,
|
|
244
|
+
include_duplicated_shingles=self.include_duplicated_shingles,
|
|
245
|
+
max_dist=max_dist,
|
|
246
|
+
dist_binning=dist_binning,
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
# Folded uses MHFPEncoderLite
|
|
250
|
+
self._mhfp = MHFPEncoderLite(
|
|
251
|
+
n_permutations=self.dimensions,
|
|
252
|
+
seed=self.seed,
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# --------- chemap size inference ---------
|
|
256
|
+
|
|
257
|
+
def GetFingerprint(self, mol: Mol) -> _BitFingerprint:
|
|
258
|
+
return _BitFingerprint(self.dimensions)
|
|
259
|
+
|
|
260
|
+
def GetCountFingerprint(self, mol: Mol) -> _CountFingerprint:
|
|
261
|
+
return _CountFingerprint(self.dimensions)
|
|
262
|
+
|
|
263
|
+
# --------- unfolded API ---------
|
|
264
|
+
|
|
265
|
+
def GetSparseCountFingerprint(self, mol: Mol) -> _SparseCountFingerprint:
    """
    Return the unfolded fingerprint as a ``{feature_id: count}`` mapping.

    Shingles and their multiplicities come from
    ``self._shingler.shingles_with_counts_true(mol)``. Each shingle is hashed
    to an integer feature id and the multiplicities of shingles that share an
    id are summed.

    Feature-id hashing:
    - ``minhash_for_unfolded=True``: first 4 bytes of the SHA-1 digest,
      little endian (the MHFP token-hash domain).
    - otherwise: ``self._sha1_to_int(shingle, bits=self.unfolded_bits)``.

    Per the original author's note, chemap uses keys and values when
    ``count=True`` and only the keys when ``count=False``.

    An empty/falsy shingle collection yields ``_SparseCountFingerprint({})``.
    """
    shingle_counts = self._shingler.shingles_with_counts_true(mol)
    if not shingle_counts:
        return _SparseCountFingerprint({})

    # Pick the hashing strategy once; the accumulation loop is shared.
    if self.minhash_for_unfolded:
        def feature_id(token) -> int:
            digest = sha1(token).digest()
            return int(int.from_bytes(digest[:4], "little", signed=False))
    else:
        def feature_id(token) -> int:
            return int(self._sha1_to_int(token, bits=self.unfolded_bits))

    totals = {}
    for token, multiplicity in shingle_counts.items():
        key = feature_id(token)
        totals[key] = totals.get(key, 0) + int(multiplicity)

    return _SparseCountFingerprint(totals)
|
|
289
|
+
|
|
290
|
+
# --------- folded API ---------
|
|
291
|
+
|
|
292
|
+
def GetFingerprintAsNumPy(self, mol: Mol) -> np.ndarray:
    """
    Return the folded binary fingerprint as a ``uint8`` vector of length D.

    Every unique shingle from ``self._shingler.shingles_unique(mol)`` is
    hashed to 32 bits (first 4 bytes of its SHA-1 digest, little endian) and
    the bit at ``hash % D`` is set, where ``D = self.dimensions``.

    An all-zero vector is returned when ``self.folded`` is false or when the
    shingler yields nothing.
    """
    def _blank() -> np.ndarray:
        return np.zeros(self.dimensions, dtype=np.uint8)

    if not self.folded:
        return _blank()

    tokens = self._shingler.shingles_unique(mol)  # set[bytes]
    if not tokens:
        return _blank()

    # One 32-bit token hash per unique shingle.
    token_hashes = np.array(
        [int.from_bytes(sha1(tok).digest()[:4], "little", signed=False) for tok in tokens],
        dtype=np.uint32,
    )

    positions = token_hashes % np.uint32(self.dimensions)
    bits = _blank()
    bits[positions.astype(np.int64, copy=False)] = 1
    return bits
|
|
317
|
+
|
|
318
|
+
def GetCountFingerprintAsNumPy(self, mol: Mol) -> np.ndarray:
    """
    Return the folded count fingerprint as a ``float32`` vector of length D.

    This is a plain count-fold, not a MinHash: for each shingle and its
    multiplicity from ``self._shingler.shingles_with_counts_true(mol)``

        bin = token_hash32(shingle) % D
        fp[bin] += count

    where ``token_hash32`` is the first 4 bytes of the SHA-1 digest read as a
    little-endian unsigned integer and ``D = self.dimensions``.

    An all-zero vector is returned when ``self.folded`` is false or when the
    shingler yields nothing.
    """
    size = self.dimensions
    if not self.folded:
        return np.zeros(size, dtype=np.float32)

    multiplicities = self._shingler.shingles_with_counts_true(mol)
    folded_counts = np.zeros(size, dtype=np.float32)
    if not multiplicities:
        return folded_counts

    for token, multiplicity in multiplicities.items():
        digest = sha1(token).digest()
        slot = int.from_bytes(digest[:4], "little", signed=False) % size
        folded_counts[slot] += float(multiplicity)
    return folded_counts
|
|
339
|
+
|
|
340
|
+
# -----------------------------
|
|
341
|
+
# Hash utilities
|
|
342
|
+
# -----------------------------
|
|
343
|
+
|
|
344
|
+
@staticmethod
|
|
345
|
+
def _sha1_to_int(data: bytes, *, bits: int = 64) -> np.uint64:
|
|
346
|
+
d = sha1(data).digest()
|
|
347
|
+
if bits == 32:
|
|
348
|
+
return np.uint64(int.from_bytes(d[:4], byteorder="little", signed=False))
|
|
349
|
+
return np.uint64(int.from_bytes(d[:8], byteorder="little", signed=False))
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import struct
|
|
2
|
+
from hashlib import sha1
|
|
3
|
+
from typing import Iterable, Sequence, Union
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
BytesLike = Union[bytes, bytearray, memoryview]


class MHFPEncoderLite:
    """
    Compatibility-focused reimplementation of the original mhfp.encoder.MHFPEncoder.
    (Original is from the Reymond group: https://github.com/reymond-group/mhfp)

    Notes
    -----
    - The original uses:
        prime    = 2^61 - 1
        max_hash = 2^32 - 1
      and outputs uint32 signatures.
    - Token hash is:
        struct.unpack("<I", sha1(token).digest()[:4])[0]
    - ``prime`` and the permutation coefficients ``_a`` / ``_b`` are set up
      here but are not referenced by the helper methods defined in this class.
    """

    prime: int = (1 << 61) - 1
    max_hash: int = (1 << 32) - 1

    def __init__(self, n_permutations: int = 2048, seed: int = 42):
        """Create the encoder and draw its permutation coefficients.

        Parameters
        ----------
        n_permutations:
            Number of (a, b) coefficient pairs to draw; must be > 0 after
            integer coercion.
        seed:
            Seed for ``np.random.RandomState``; the same seed always yields
            the same coefficients.

        Raises
        ------
        ValueError
            If ``int(n_permutations)`` is not strictly positive.
        """
        # Coerce first, then validate the coerced value. Checking the raw
        # argument would let e.g. 0.5 pass the "> 0" test and then truncate
        # to 0, silently building an encoder with no permutations.
        self.n_permutations = int(n_permutations)
        if self.n_permutations <= 0:
            raise ValueError("n_permutations must be > 0.")
        self.seed = int(seed)

        # Match original: generate uint32 a,b with uniqueness constraints
        rand = np.random.RandomState(self.seed)

        a = np.zeros(self.n_permutations, dtype=np.uint32)
        b = np.zeros(self.n_permutations, dtype=np.uint32)

        # Original code used `while a in self.permutations_a` checks (O(n)),
        # but that behavior means "no duplicates". We'll enforce the same
        # with sets. The draw order (a, b, retries for a, retries for b) is
        # kept so that a given seed keeps producing the same coefficients.
        used_a = set()
        used_b = set()

        for i in range(self.n_permutations):
            ai = int(rand.randint(1, MHFPEncoderLite.max_hash, dtype=np.uint32))
            bi = int(rand.randint(0, MHFPEncoderLite.max_hash, dtype=np.uint32))

            while ai in used_a:
                ai = int(rand.randint(1, MHFPEncoderLite.max_hash, dtype=np.uint32))
            while bi in used_b:
                bi = int(rand.randint(0, MHFPEncoderLite.max_hash, dtype=np.uint32))

            used_a.add(ai)
            used_b.add(bi)
            a[i] = np.uint32(ai)
            b[i] = np.uint32(bi)

        # Match original: reshape to column vectors (n_perm, 1)
        self._a = a.reshape((self.n_permutations, 1)).astype(np.uint64, copy=False)
        self._b = b.reshape((self.n_permutations, 1)).astype(np.uint64, copy=False)

    # -----------------------------
    # Token hashing (exact)
    # -----------------------------

    @staticmethod
    def _token_hash32(token: BytesLike) -> np.uint32:
        """Return the 32-bit token hash: first 4 SHA-1 digest bytes, little endian."""
        # EXACT original semantics: struct.unpack("<I", sha1(t).digest()[:4])[0]
        return np.uint32(struct.unpack("<I", sha1(bytes(token)).digest()[:4])[0])

    # -----------------------------
    # Original helper API: hash / fold / merge / distance
    # -----------------------------

    @staticmethod
    def hash(tokens: Iterable[BytesLike]) -> np.ndarray:
        """
        For compatibility with original MHFPEncoder.hash(shingling):
        returns per-token uint32 hash values (NOT minhash signature).

        The result is a 1-D ``uint32`` array with one entry per token, in
        iteration order.
        """
        return np.fromiter(
            (MHFPEncoderLite._token_hash32(t) for t in tokens),
            dtype=np.uint32,
        )

    @staticmethod
    def fold(hash_values: Sequence[int], length: int = 2048) -> np.ndarray:
        """
        Compatibility with original fold(): binary uint8 vector with bits set at hash % length.

        Parameters
        ----------
        hash_values:
            Sized collection of non-negative integer hashes (``len()`` is
            called on it, so a bare generator is not accepted).
        length:
            Size of the folded vector; must be > 0 after integer coercion.

        Returns
        -------
        np.ndarray
            ``uint8`` vector of size ``length``; all zeros for empty input.

        Raises
        ------
        ValueError
            If ``int(length)`` is not strictly positive.
        """
        length = int(length)
        if length <= 0:
            raise ValueError("length must be > 0.")
        folded = np.zeros(length, dtype=np.uint8)
        if len(hash_values) == 0:
            return folded
        hv = np.asarray(hash_values, dtype=np.uint64)
        folded[(hv % np.uint64(length)).astype(np.int64, copy=False)] = 1
        return folded
|
|
@@ -114,6 +114,7 @@ def plot_duplicate_bins(
|
|
|
114
114
|
xlabel: str = "Compounds with Fingerprint Duplicates",
|
|
115
115
|
title: str = "Duplicate Statistics by Experiment",
|
|
116
116
|
legend_title: str = "Maximum mass difference\n(for identical fingerprints)",
|
|
117
|
+
ax: Optional[plt.Axes] = None,
|
|
117
118
|
) -> Tuple[plt.Figure, plt.Axes]:
|
|
118
119
|
"""Plot stacked horizontal bars of duplicate counts across bins.
|
|
119
120
|
|
|
@@ -135,9 +136,6 @@ def plot_duplicate_bins(
|
|
|
135
136
|
xlabel, title, legend_title:
|
|
136
137
|
Plot labels.
|
|
137
138
|
|
|
138
|
-
Returns
|
|
139
|
-
-------
|
|
140
|
-
(fig, ax)
|
|
141
139
|
"""
|
|
142
140
|
if len(results) == 0:
|
|
143
141
|
raise ValueError("results must be non-empty")
|
|
@@ -157,7 +155,10 @@ def plot_duplicate_bins(
|
|
|
157
155
|
bin_labels = res[0].bin_labels
|
|
158
156
|
colors = n_colors_from_cmap(n_bins, cmap)
|
|
159
157
|
|
|
160
|
-
|
|
158
|
+
if ax is None:
|
|
159
|
+
fig, ax = plt.subplots(figsize=figsize)
|
|
160
|
+
else:
|
|
161
|
+
fig = ax.figure
|
|
161
162
|
|
|
162
163
|
y_positions = np.arange(len(res))
|
|
163
164
|
left_stack = np.zeros(len(res), dtype=float)
|
|
@@ -207,6 +208,7 @@ def plot_duplicates_by_experiment(
|
|
|
207
208
|
cmap = green_yellow_red,
|
|
208
209
|
title: str = "Duplicate fingerprints plot",
|
|
209
210
|
figsize: Tuple[float, float] = (10, 6),
|
|
211
|
+
ax: Optional[plt.Axes] = None,
|
|
210
212
|
sort_by_total: bool = True,
|
|
211
213
|
) -> Tuple[plt.Figure, plt.Axes, List[DuplicateBinResult]]:
|
|
212
214
|
"""Compute binned duplicate stats per experiment and plot them.
|
|
@@ -238,5 +240,6 @@ def plot_duplicates_by_experiment(
|
|
|
238
240
|
cmap=cmap,
|
|
239
241
|
sort_by_total=sort_by_total,
|
|
240
242
|
title=title,
|
|
243
|
+
ax=ax,
|
|
241
244
|
)
|
|
242
245
|
return fig, ax, results
|
|
@@ -6,8 +6,7 @@ from chemap import FingerprintConfig, compute_fingerprints
|
|
|
6
6
|
from chemap.fingerprint_conversions import fingerprints_to_csr
|
|
7
7
|
from chemap.metrics import (
|
|
8
8
|
tanimoto_distance_dense,
|
|
9
|
-
|
|
10
|
-
tanimoto_distance_unfolded_count,
|
|
9
|
+
tanimoto_distance_sparse,
|
|
11
10
|
)
|
|
12
11
|
|
|
13
12
|
|
|
@@ -25,18 +24,17 @@ def _choose_cpu_metric(config: FingerprintConfig, distance_function: str) -> Any
|
|
|
25
24
|
- unfolded + binary => tanimoto_distance_unfolded_binary
|
|
26
25
|
- folded (usually dense/packed) => tanimoto_distance_dense
|
|
27
26
|
"""
|
|
27
|
+
if distance_function.lower() == "cosine":
|
|
28
|
+
return "cosine"
|
|
28
29
|
if distance_function.lower() != "tanimoto":
|
|
29
30
|
raise ValueError(
|
|
30
31
|
f"Unsupported distance_function={distance_function!r}. "
|
|
31
|
-
"Currently only 'tanimoto' is supported here."
|
|
32
|
+
"Currently only 'tanimoto' and 'cosine' is supported here."
|
|
32
33
|
)
|
|
33
34
|
|
|
34
|
-
|
|
35
35
|
if getattr(config, "folded", False):
|
|
36
36
|
return tanimoto_distance_dense
|
|
37
|
-
|
|
38
|
-
return tanimoto_distance_unfolded_count
|
|
39
|
-
return tanimoto_distance_unfolded_binary
|
|
37
|
+
return tanimoto_distance_sparse
|
|
40
38
|
|
|
41
39
|
|
|
42
40
|
def _log1p_csr_inplace(X) -> Any:
|
|
@@ -61,7 +59,7 @@ def create_chem_space_umap(
|
|
|
61
59
|
n_neighbors: int = 15,
|
|
62
60
|
min_dist: float = 0.25,
|
|
63
61
|
n_jobs: int = -1,
|
|
64
|
-
umap_random_state: Optional[int] =
|
|
62
|
+
umap_random_state: Optional[int] = None,
|
|
65
63
|
distance_function: str = "tanimoto",
|
|
66
64
|
) -> pd.DataFrame:
|
|
67
65
|
"""Compute fingerprints (CPU) and create 2D UMAP coordinates (CPU).
|
|
@@ -220,17 +218,16 @@ def create_chem_space_umap_gpu(
|
|
|
220
218
|
show_progress=show_progress,
|
|
221
219
|
)
|
|
222
220
|
|
|
223
|
-
# Convert to
|
|
224
|
-
fps_csr = fingerprints_to_csr(fingerprints).X
|
|
225
|
-
fps = fps_csr.toarray()
|
|
221
|
+
# Convert to sparse array
|
|
222
|
+
# fps_csr = fingerprints_to_csr(fingerprints).X
|
|
226
223
|
|
|
227
224
|
# Reduce memory footprint (works well for count fingerprints)
|
|
228
225
|
if not log_count:
|
|
229
226
|
# stays integer-like
|
|
230
|
-
fps =
|
|
227
|
+
fps = fingerprints.astype(np.int8, copy=False)
|
|
231
228
|
else:
|
|
232
229
|
# log1p returns float
|
|
233
|
-
fps = np.log1p(
|
|
230
|
+
fps = np.log1p(fingerprints).astype(np.float32, copy=False)
|
|
234
231
|
|
|
235
232
|
umap_model = cuUMAP(
|
|
236
233
|
n_neighbors=int(n_neighbors),
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
from typing import Mapping, Sequence, Tuple, Union
|
|
2
|
+
import numpy as np
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
Bins = Sequence[Tuple[float, float]]
|
|
5
6
|
Color = Tuple[float, float, float] # RGB
|
|
6
7
|
ColorA = Tuple[float, float, float, float] # RGBA
|
|
7
8
|
Palette = Mapping[str, Union[Color, ColorA]]
|
|
9
|
+
UnfoldedBinary = list[np.ndarray] # list of int64 feature IDs per molecule
|
|
10
|
+
UnfoldedCount = list[tuple[np.ndarray, np.ndarray]] # (int64 feature IDs, float32 values)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "chemap"
|
|
3
|
-
version = "0.3.
|
|
3
|
+
version = "0.3.2"
|
|
4
4
|
description = "Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations. "
|
|
5
5
|
authors = [
|
|
6
6
|
{ name="Florian Huber", email="florian.huber@hs-duesseldorf.de" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{chemap-0.3.0/chemap/additional_fingerprints → chemap-0.3.2/chemap/fingerprints}/element_count_fp.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|