chemap 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {chemap-0.3.0 → chemap-0.3.2}/PKG-INFO +1 -1
  2. chemap-0.3.2/chemap/benchmarking/__init__.py +15 -0
  3. {chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_computation.py +86 -16
  4. chemap-0.3.2/chemap/fingerprints/__init__.py +14 -0
  5. chemap-0.3.2/chemap/fingerprints/chemap_base_fingerprint.py +76 -0
  6. chemap-0.3.2/chemap/fingerprints/lingo.py +154 -0
  7. chemap-0.3.2/chemap/fingerprints/map4.py +349 -0
  8. chemap-0.3.2/chemap/fingerprints/mhfp.py +100 -0
  9. {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/benchmark_duplicates.py +7 -4
  10. {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/chem_space_umap.py +10 -13
  11. {chemap-0.3.0 → chemap-0.3.2}/chemap/types.py +3 -0
  12. {chemap-0.3.0 → chemap-0.3.2}/pyproject.toml +1 -1
  13. chemap-0.3.0/chemap/additional_fingerprints/__init__.py +0 -6
  14. chemap-0.3.0/chemap/benchmarking/__init__.py +0 -7
  15. {chemap-0.3.0 → chemap-0.3.2}/LICENSE +0 -0
  16. {chemap-0.3.0 → chemap-0.3.2}/README.md +0 -0
  17. {chemap-0.3.0 → chemap-0.3.2}/chemap/__init__.py +0 -0
  18. {chemap-0.3.0 → chemap-0.3.2}/chemap/approx_nn.py +0 -0
  19. {chemap-0.3.0 → chemap-0.3.2}/chemap/benchmarking/fingerprint_duplicates.py +0 -0
  20. {chemap-0.3.0 → chemap-0.3.2}/chemap/benchmarking/utils.py +0 -0
  21. {chemap-0.3.0 → chemap-0.3.2}/chemap/data_loader.py +0 -0
  22. {chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_conversions.py +0 -0
  23. {chemap-0.3.0 → chemap-0.3.2}/chemap/fingerprint_statistics.py +0 -0
  24. {chemap-0.3.0/chemap/additional_fingerprints → chemap-0.3.2/chemap/fingerprints}/element_count_fp.py +0 -0
  25. {chemap-0.3.0 → chemap-0.3.2}/chemap/mbp.py +0 -0
  26. {chemap-0.3.0 → chemap-0.3.2}/chemap/metrics.py +0 -0
  27. {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/__init__.py +0 -0
  28. {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/colormap_handling.py +0 -0
  29. {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/colormaps.py +0 -0
  30. {chemap-0.3.0 → chemap-0.3.2}/chemap/plotting/scatter_plots.py +0 -0
  31. {chemap-0.3.0 → chemap-0.3.2}/chemap/utils.py +0 -0
  32. {chemap-0.3.0 → chemap-0.3.2}/chemap/visualizations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chemap
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations.
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
@@ -0,0 +1,15 @@
1
+ from .fingerprint_duplicates import (
2
+ load_duplicates_npz,
3
+ load_precomputed_duplicates_folder,
4
+ save_duplicates_npz,
5
+ )
6
+ from .utils import compute_compound_max_mass_differences, compute_duplicate_max_mass_differences
7
+
8
+
9
+ __all__ = [
10
+ "compute_compound_max_mass_differences",
11
+ "compute_duplicate_max_mass_differences",
12
+ "load_duplicates_npz",
13
+ "load_precomputed_duplicates_folder",
14
+ "save_duplicates_npz",
15
+ ]
@@ -7,6 +7,7 @@ from joblib import Parallel, delayed
7
7
  from rdkit import Chem
8
8
  from sklearn.base import BaseEstimator, TransformerMixin
9
9
  from tqdm import tqdm
10
+ from chemap.types import UnfoldedBinary, UnfoldedCount
10
11
 
11
12
 
12
13
  # -----------------------------
@@ -16,9 +17,6 @@ from tqdm import tqdm
16
17
  InvalidPolicy = Literal["drop", "keep", "raise"]
17
18
  Scaling = Optional[Literal["log"]]
18
19
 
19
- UnfoldedBinary = List[np.ndarray] # list of int64 feature IDs per molecule
20
- UnfoldedCount = List[Tuple[np.ndarray, np.ndarray]] # list of (int64 feature IDs, float32 values)
21
-
22
20
  FingerprintResult = Union[np.ndarray, sp.csr_matrix, UnfoldedBinary, UnfoldedCount]
23
21
 
24
22
 
@@ -532,9 +530,13 @@ def _skfp_configure_output(
532
530
  """
533
531
  Configure scikit-fingerprints/sklearn transformer to match (folded, return_csr).
534
532
 
535
- - folded=True : use the transformer's folded output
536
- - folded=False: require variant='raw_bits' if supported
537
- - return_csr=True (only when folded=True): prefer transformer sparse CSR if supported
533
+ Supports two modes:
534
+ 1) scikit-fingerprints classic:
535
+ - unfolded via variant='raw_bits' (if available)
536
+ - folded via variant='folded' (if variant exists)
537
+ 2) ChemapBaseFingerprint style:
538
+ - unfolded via folded=False (no variant)
539
+ - folded via folded=True
538
540
  """
539
541
  params = fpgen.get_params(deep=False)
540
542
  updates: Dict[str, Any] = {}
@@ -545,22 +547,42 @@ def _skfp_configure_output(
545
547
  if "n_jobs" in params:
546
548
  updates["n_jobs"] = n_jobs
547
549
 
550
+ chemap_style = "folded" in params # ChemapBaseFingerprint exposes folded param
551
+
552
+ # -------------------------
553
+ # UNFOLDED (cfg.folded=False)
554
+ # -------------------------
548
555
  if not cfg.folded:
556
+ if chemap_style:
557
+ if params.get("folded") is not False:
558
+ updates["folded"] = False
559
+ # We don't force sparse; unfolded returns lists anyway.
560
+ return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
561
+
562
+ # classic scikit-fingerprints route: needs variant='raw_bits'
549
563
  if "variant" not in params:
550
564
  raise NotImplementedError(
551
- "Requested folded=False (unfolded), but this transformer does not expose a `variant` parameter "
552
- "for an unfolded feature space (e.g., variant='raw_bits')."
565
+ "Requested folded=False (unfolded), but this transformer does not support "
566
+ "either chemap-style `folded` switching or an skfp-style `variant='raw_bits'`."
553
567
  )
568
+
554
569
  if params.get("variant") != "raw_bits":
555
570
  updates["variant"] = "raw_bits"
556
571
 
557
- # For unfolded conversion we can accept either dense or CSR outputs, so we do not force "sparse".
558
572
  return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
559
573
 
560
- # folded=True
574
+ # ------------------------
575
+ # FOLDED (cfg.folded=True)
576
+ # ------------------------
577
+ if chemap_style:
578
+ if params.get("folded") is not True:
579
+ updates["folded"] = True
580
+
581
+ # If it's classic skfp and currently set to raw_bits, restore folded variant
561
582
  if "variant" in params and params.get("variant") == "raw_bits":
562
583
  updates["variant"] = "folded"
563
584
 
585
+ # Prefer CSR if requested and supported
564
586
  if "sparse" in params:
565
587
  desired = bool(cfg.return_csr)
566
588
  if params.get("sparse") != desired:
@@ -577,35 +599,83 @@ def _compute_sklearn(
577
599
  show_progress: bool = False,
578
600
  n_jobs: int,
579
601
  ) -> FingerprintResult:
602
+ """
603
+ Compute fingerprints using sklearn/scikit-fingerprints style transformers.
604
+
605
+ Supports two kinds of transformers for unfolded output (cfg.folded=False):
606
+ 1) Classic scikit-fingerprints: unfolded via variant='raw_bits' (matrix output)
607
+ 2) ChemapBaseFingerprint style: unfolded via folded=False (list output)
608
+
609
+ Invalid-policy behavior:
610
+ - drop: returns only valid rows (shorter output)
611
+ - keep: aligns output to input, inserting empty rows for invalid smiles
612
+ - raise: raises on first invalid smiles
613
+ """
580
614
  fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs)
615
+
616
+ # Parse molecules with robust handling (None for invalid SMILES)
581
617
  mol_transformer = RobustMolTransformer(n_jobs=n_jobs)
582
618
  mols = mol_transformer.transform(smiles)
583
619
 
584
- # Determine valid/invalid molecules and handle invalid according to policy.
585
620
  valid_idx = [i for i, m in enumerate(mols) if m is not None]
586
621
  invalid_idx = [i for i, m in enumerate(mols) if m is None]
587
622
 
588
623
  if invalid_idx and cfg.invalid_policy == "raise":
589
624
  raise ValueError(f"Invalid SMILES: {smiles[invalid_idx[0]]}")
590
625
 
591
- # Fit/transform only valid mols (safe for most transformers)
592
626
  valid_mols = [mols[i] for i in valid_idx]
593
627
 
628
+ # Most skfp transformers are "fit-less" but expose fit; keep consistent behavior.
594
629
  fp.fit(valid_mols)
595
630
  X_valid = fp.transform(valid_mols)
596
631
 
597
- # If policy is drop: just return X_valid (current behavior)
632
+ # -----------------------------
633
+ # Case A: transformer returns chemap-unfolded formats directly (list output)
634
+ # -----------------------------
635
+ is_list_unfolded = isinstance(X_valid, list) and (
636
+ len(X_valid) == 0
637
+ or isinstance(X_valid[0], np.ndarray)
638
+ or (isinstance(X_valid[0], tuple) and len(X_valid[0]) == 2)
639
+ )
640
+
641
+ if is_list_unfolded:
642
+ # In this case, we assume we are already in unfolded mode (cfg.folded=False).
643
+ # If cfg.folded=True but the transformer returns lists, that's an API mismatch.
644
+ if cfg.folded:
645
+ raise TypeError(
646
+ "Transformer returned chemap-unfolded list output while cfg.folded=True. "
647
+ "This likely indicates a misconfigured transformer."
648
+ )
649
+
650
+ if cfg.invalid_policy == "drop":
651
+ return X_valid
652
+
653
+ # keep alignment: reinsert empty rows
654
+ N = len(smiles)
655
+ if cfg.count:
656
+ X_full: UnfoldedCount = [_empty_unfolded_count() for _ in range(N)]
657
+ else:
658
+ X_full: UnfoldedBinary = [_empty_unfolded_binary() for _ in range(N)]
659
+
660
+ for out_i, orig_i in enumerate(valid_idx):
661
+ X_full[orig_i] = X_valid[out_i] # type: ignore[index]
662
+
663
+ return X_full
664
+
665
+ # -----------------------------
666
+ # Case B: transformer returns a matrix (dense or sparse)
667
+ # -----------------------------
668
+
669
+ # Handle invalid-policy re-insertion for matrix outputs
598
670
  if cfg.invalid_policy == "drop":
599
671
  X = X_valid
600
672
  else:
601
- # policy keep: reinsert empty rows to match input length
602
673
  N = len(smiles)
603
674
 
604
675
  if sp.issparse(X_valid):
605
676
  X_valid = X_valid.tocsr().astype(np.float32)
606
677
  D = X_valid.shape[1]
607
678
  X = sp.csr_matrix((N, D), dtype=np.float32)
608
- # place valid rows
609
679
  X[valid_idx, :] = X_valid
610
680
  else:
611
681
  X_valid = np.asarray(X_valid, dtype=np.float32)
@@ -613,8 +683,8 @@ def _compute_sklearn(
613
683
  X = np.zeros((N, D), dtype=np.float32)
614
684
  X[valid_idx, :] = X_valid
615
685
 
686
+ # If unfolded requested, convert matrix -> chemap unfolded formats
616
687
  if not cfg.folded:
617
- # unfolded output
618
688
  if sp.issparse(X):
619
689
  return _csr_matrix_to_unfolded(X.tocsr().astype(np.float32), cfg)
620
690
  return _dense_matrix_to_unfolded(np.asarray(X, dtype=np.float32), cfg)
@@ -0,0 +1,14 @@
1
+ from .chemap_base_fingerprint import ChemapBaseFingerprint
2
+ from .element_count_fp import ElementCountFingerprint
3
+ from .lingo import LingoFingerprint
4
+ from .map4 import MAP4FPGen
5
+ from .mhfp import MHFPEncoderLite
6
+
7
+
8
+ __all__ = [
9
+ "ChemapBaseFingerprint",
10
+ "ElementCountFingerprint",
11
+ "LingoFingerprint",
12
+ "MAP4FPGen",
13
+ "MHFPEncoderLite",
14
+ ]
@@ -0,0 +1,76 @@
1
+ from collections.abc import Sequence
2
+ from typing import Any
3
+ from joblib import Parallel, delayed
4
+ from rdkit.Chem import Mol
5
+ from skfp.bases import BaseFingerprintTransformer
6
+ from skfp.utils import ensure_smiles
7
+ from chemap.types import UnfoldedBinary, UnfoldedCount
8
+
9
+
10
+ class ChemapBaseFingerprint(BaseFingerprintTransformer):
11
+ """
12
+ Extension of scikit-fingerprints BaseFingerprintTransformer that adds `folded`.
13
+
14
+ - folded=True: behaves like scikit-fingerprints (returns dense ndarray or sparse csr_array)
15
+ - folded=False: returns chemap unfolded formats (lists of feature IDs / (IDs, values))
16
+
17
+ Important: this class intentionally subclasses scikit-fingerprints' base class
18
+ to preserve their behavior (validation, parallelization patterns, etc.) where possible.
19
+ """
20
+
21
+ def __init__(
22
+ self,
23
+ *,
24
+ n_features_out: int,
25
+ count: bool = False,
26
+ sparse: bool = False,
27
+ folded: bool = True,
28
+ n_jobs: int | None = None,
29
+ batch_size: int | None = None,
30
+ verbose: int | dict = 0,
31
+ ):
32
+ super().__init__(
33
+ n_features_out=n_features_out,
34
+ count=count,
35
+ sparse=sparse,
36
+ n_jobs=n_jobs,
37
+ batch_size=batch_size,
38
+ verbose=verbose,
39
+ )
40
+ self.folded = folded
41
+
42
+ def transform(self, X: Sequence[str | Mol], copy: bool = False) -> Any:
43
+ """
44
+ If folded=True: defer to BaseFingerprintTransformer.transform (matrix output).
45
+ If folded=False: return chemap unfolded formats.
46
+ """
47
+ if self.folded:
48
+ return super().transform(X, copy=copy)
49
+
50
+ # unfolded route: we accept SMILES or Mol, but Lingo-like methods want SMILES
51
+ smiles = ensure_smiles(X)
52
+ return self._calculate_unfolded(smiles)
53
+
54
+ # ---- hooks for subclasses ----
55
+
56
+ def _calculate_unfolded(self, X_smiles: Sequence[str]) -> UnfoldedBinary | UnfoldedCount:
57
+ """
58
+ Subclasses must implement when folded=False.
59
+ Must return chemap unfolded formats:
60
+ - count=False: List[np.ndarray[int64]]
61
+ - count=True : List[Tuple[np.ndarray[int64], np.ndarray[float32]]]
62
+ """
63
+ raise NotImplementedError
64
+
65
+ # ---- helpers ----
66
+
67
+ def _parallel_map(self, fn, items):
68
+ n_jobs = self.n_jobs if self.n_jobs is not None else 1
69
+
70
+ if n_jobs == 1:
71
+ return [fn(x) for x in items]
72
+
73
+ batch_size = self.batch_size if self.batch_size is not None else "auto"
74
+ return Parallel(n_jobs=n_jobs, batch_size=batch_size)(
75
+ delayed(fn)(x) for x in items
76
+ )
@@ -0,0 +1,154 @@
1
+ import hashlib
2
+ import re
3
+ from collections import defaultdict
4
+ from collections.abc import Sequence
5
+ from numbers import Integral
6
+ import numpy as np
7
+ from scipy.sparse import csr_array
8
+ from skfp.utils import ensure_smiles
9
+ from sklearn.utils._param_validation import Interval
10
+ from chemap.fingerprints import ChemapBaseFingerprint
11
+
12
+
13
+ class LingoFingerprint(ChemapBaseFingerprint):
14
+ """
15
+ Lingo fingerprint with chemap unfolded support.
16
+
17
+ folded=True:
18
+ behaves like scikit-fingerprints: fixed-size hashed vector (dense or CSR)
19
+ folded=False:
20
+ returns chemap unfolded formats with stable feature IDs (first 32 bits of the SHA-1 digest, stored as int64):
21
+ - count=False: List[np.ndarray[int64]] (feature IDs)
22
+ - count=True : List[Tuple[np.ndarray[int64], np.ndarray[float32]]] (IDs + counts)
23
+ """
24
+
25
+ _parameter_constraints: dict = {
26
+ **ChemapBaseFingerprint._parameter_constraints,
27
+ "fp_size": [Interval(Integral, 1, None, closed="left")],
28
+ "substring_length": [Interval(Integral, 1, None, closed="left")],
29
+ }
30
+
31
+ def __init__(
32
+ self,
33
+ fp_size: int = 4096,
34
+ substring_length: int = 4,
35
+ count: bool = False,
36
+ sparse: bool = False,
37
+ folded: bool = True,
38
+ n_jobs: int | None = None,
39
+ batch_size: int | None = None,
40
+ verbose: int | dict = 0,
41
+ ):
42
+ super().__init__(
43
+ n_features_out=fp_size,
44
+ count=count,
45
+ sparse=sparse,
46
+ folded=folded,
47
+ n_jobs=n_jobs,
48
+ batch_size=batch_size,
49
+ verbose=verbose,
50
+ )
51
+ self.fp_size = fp_size
52
+ self.substring_length = substring_length
53
+
54
+ # --------------------
55
+ # Shared preprocessing
56
+ # --------------------
57
+
58
+ def smiles_to_dicts(self, X: Sequence[str]) -> list[dict[str, int]]:
59
+ """
60
+ Convert SMILES to dicts of substring counts (original Lingo raw features).
61
+ """
62
+ X = ensure_smiles(X)
63
+
64
+ # same canonicalization as skfp
65
+ X = [re.sub(r"[123456789]", "0", smi) for smi in X]
66
+ X = [re.sub(r"Cl", "L", smi) for smi in X]
67
+ X = [re.sub(r"Br", "R", smi) for smi in X]
68
+
69
+ result: list[dict[str, int]] = []
70
+ L = self.substring_length
71
+
72
+ for smi in X:
73
+ d: defaultdict[str, int] = defaultdict(int)
74
+ # overlapping substrings
75
+ for i in range(len(smi) - L + 1):
76
+ d[smi[i : i + L]] += 1
77
+ result.append(dict(d))
78
+
79
+ return result
80
+
81
+ # --------------------
82
+ # Folded (matrix) path
83
+ # --------------------
84
+
85
+ def _calculate_fingerprint(self, X: Sequence[str]) -> np.ndarray | csr_array:
86
+ """
87
+ Called by BaseFingerprintTransformer when folded=True.
88
+ """
89
+ dicts = self.smiles_to_dicts(X)
90
+ arr = self._dicts_to_folded_array(dicts)
91
+ return csr_array(arr) if self.sparse else arr
92
+
93
+ def _dicts_to_folded_array(self, dicts: list[dict[str, int]]) -> np.ndarray:
94
+ """
95
+ Hash and fold into [0..fp_size-1], identical to skfp folding rule.
96
+ """
97
+ dtype = np.uint32 if self.count else np.uint8
98
+ out = np.zeros((len(dicts), self.fp_size), dtype=dtype)
99
+
100
+ for i, d in enumerate(dicts):
101
+ for token, c in d.items():
102
+ digest = hashlib.sha1(token.encode("utf-8"), usedforsecurity=False).digest()
103
+ hash_index = int.from_bytes(digest, byteorder="big") % self.fp_size
104
+
105
+ if self.count:
106
+ out[i, hash_index] += c
107
+ else:
108
+ out[i, hash_index] = 1
109
+
110
+ return out
111
+
112
+ # -----------------------
113
+ # Unfolded (chemap) path
114
+ # -----------------------
115
+
116
+ def _calculate_unfolded(self, X_smiles: Sequence[str]):
117
+ """
118
+ Return chemap unfolded formats.
119
+
120
+ Feature IDs are stable 32-bit values derived from the SHA-1 digest and stored as int64:
121
+ id32 = int.from_bytes(digest[:4], "big") (unsigned 32-bit, so it always fits into int64 without overflow)
122
+ """
123
+ dicts = self.smiles_to_dicts(X_smiles)
124
+
125
+ def token_to_id32(token: str) -> int:
126
+ digest = hashlib.sha1(token.encode("utf-8"), usedforsecurity=False).digest()
127
+ return int.from_bytes(digest[:4], byteorder="big", signed=False)
128
+
129
+ if self.count:
130
+ def one(d: dict[str, int]) -> tuple[np.ndarray, np.ndarray]:
131
+ if not d:
132
+ return (np.array([], dtype=np.int64), np.array([], dtype=np.float32))
133
+
134
+ agg: dict[int, float] = {}
135
+ for token, c in d.items():
136
+ fid = token_to_id32(token)
137
+ agg[fid] = agg.get(fid, 0.0) + float(c)
138
+
139
+ keys = np.array(sorted(agg.keys()), dtype=np.int64)
140
+ vals = np.array([agg[int(k)] for k in keys], dtype=np.float32)
141
+ return keys, vals
142
+
143
+ return self._parallel_map(one, dicts)
144
+
145
+ def one_bin(d: dict[str, int]) -> np.ndarray:
146
+ if not d:
147
+ return np.array([], dtype=np.int64)
148
+
149
+ ids = np.fromiter((token_to_id32(t) for t in d.keys()), dtype=np.int64)
150
+ # np.unique sorts ascending
151
+ return np.unique(ids).astype(np.int64, copy=False)
152
+
153
+ return self._parallel_map(one_bin, dicts)
154
+
@@ -0,0 +1,349 @@
1
+ """
2
+ chemap-compatible MAP4 FP generator (in parts based on Luca Cappelletti's implementation of MAP4:
3
+ https://github.com/LucaCappelletti94/map4/blob/master/map4/map4.py
4
+ Which is based on the original MAP4 implementation
5
+ `Alice Capecchi, Daniel Probst, Jean-Louis Reymond
6
+ "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome"
7
+ J Cheminform 12, 43 (2020)
8
+ <https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00445-4>`_
9
+
10
+ There are a few particular aspects of this implementation, though:
11
+ - Folded output:
12
+ * binary (uint8) uses MHFP-style hash folding: each unique shingle is hashed (SHA-1, first 4 bytes) and folded mod D; no MinHash signature is computed
13
+ * count (float32) accumulates true shingle multiplicities into folded bins (not a MinHash signature,
14
+ so different from the original implementation!)
15
+ - Unfolded output:
16
+ * count=True -> true counts per raw feature id
17
+ * count=False -> keys only (chemap will read keys from GetSparseCountFingerprint)
18
+ * feature ids are SHA1 by default, unless minhash_for_unfolded=True
19
+ """
20
+
21
+ import itertools
22
+ from collections import defaultdict
23
+ from dataclasses import dataclass
24
+ from hashlib import sha1
25
+ from typing import Dict, List, Optional, Set
26
+ import numpy as np
27
+ from rdkit.Chem import Mol, MolToSmiles, PathToSubmol
28
+ from rdkit.Chem.rdmolops import FindAtomEnvironmentOfRadiusN, GetDistanceMatrix
29
+ from chemap.fingerprints.mhfp import MHFPEncoderLite
30
+
31
+
32
+ # -----------------------------
33
+ # Minimal RDKit-like return types
34
+ # -----------------------------
35
+
36
+ @dataclass(frozen=True)
37
+ class _SparseCountFingerprint:
38
+ """RDKit SparseIntVect-like shim for chemap."""
39
+ nz: Dict[int, int]
40
+ def GetNonzeroElements(self) -> Dict[int, int]:
41
+ return self.nz
42
+
43
+
44
+ @dataclass(frozen=True)
45
+ class _BitFingerprint:
46
+ """RDKit ExplicitBitVect-like shim for chemap size inference."""
47
+ n_bits: int
48
+ def GetNumBits(self) -> int:
49
+ return self.n_bits
50
+
51
+
52
+ @dataclass(frozen=True)
53
+ class _CountFingerprint:
54
+ """RDKit IntSparseIntVect-like shim for chemap size inference."""
55
+ length: int
56
+ def GetLength(self) -> int:
57
+ return self.length
58
+
59
+
60
+ # -----------------------------
61
+ # MAP4 shingling core
62
+ # -----------------------------
63
+
64
+ class _MAP4Shingler:
65
+ """
66
+ Generates MAP4 shingles as bytes:
67
+ - envs for radii 1..R
68
+ - for each atom pair (i<j) and each radius index k in [0..R-1]:
69
+ shingle = f"{smaller_env}|{dist}|{larger_env}"
70
+ where smaller/larger chosen by length comparison (ties go to env_b as larger)
71
+ - optional include_duplicated_shingles "suffix trick" is available, but for chemap counts
72
+ we SHOULD NOT use it (we want true multiplicities).
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ radius: int = 2,
78
+ *,
79
+ include_duplicated_shingles: bool = False,
80
+ max_dist: Optional[int] = None,
81
+ dist_binning: Optional[np.ndarray] = None,
82
+ ):
83
+ if radius <= 0:
84
+ raise ValueError("radius must be > 0.")
85
+ self.radius = int(radius)
86
+ self.include_duplicated_shingles = bool(include_duplicated_shingles)
87
+ self.max_dist = max_dist
88
+ self.dist_binning = dist_binning
89
+
90
+ def shingles_unique(self, mol: Mol) -> Set[bytes]:
91
+ return set(self._all_pairs(mol, self._get_atom_envs(mol)))
92
+
93
+ def shingles_with_counts_true(self, mol: Mol) -> Dict[bytes, int]:
94
+ """
95
+ True multiplicities (counts) WITHOUT suffix trick, regardless of include_duplicated_shingles.
96
+ """
97
+ counts: Dict[bytes, int] = defaultdict(int)
98
+ for sh in self._all_pairs(mol, self._get_atom_envs(mol), force_no_suffix=True):
99
+ counts[sh] += 1
100
+ return dict(counts)
101
+
102
+ def _convert_dist(self, dist: float) -> int:
103
+ if self.dist_binning is None:
104
+ return int(dist)
105
+ return int(np.digitize(dist, self.dist_binning, right=True))
106
+
107
+ def _get_atom_envs(self, mol: Mol) -> Dict[int, List[Optional[str]]]:
108
+ atoms_env: Dict[int, List[Optional[str]]] = {}
109
+ for atom in mol.GetAtoms():
110
+ atom_identifier = atom.GetIdx()
111
+ for r in range(1, self.radius + 1):
112
+ atoms_env.setdefault(atom_identifier, []).append(
113
+ self._find_env(mol, atom_identifier, r)
114
+ )
115
+ return atoms_env
116
+
117
+ @staticmethod
118
+ def _find_env(mol: Mol, atom_identifier: int, radius: int) -> Optional[str]:
119
+ atom_identifiers_within_radius: List[int] = FindAtomEnvironmentOfRadiusN(
120
+ mol=mol, radius=radius, rootedAtAtom=atom_identifier
121
+ )
122
+ atom_map: Dict[int, int] = {}
123
+ sub_molecule: Mol = PathToSubmol(mol, atom_identifiers_within_radius, atomMap=atom_map)
124
+
125
+ if atom_identifier not in atom_map:
126
+ return None
127
+
128
+ return MolToSmiles(
129
+ sub_molecule,
130
+ rootedAtAtom=atom_map[atom_identifier],
131
+ canonical=True,
132
+ isomericSmiles=False,
133
+ )
134
+
135
+ def _all_pairs(
136
+ self,
137
+ mol: Mol,
138
+ atoms_env: Dict[int, List[Optional[str]]],
139
+ *,
140
+ force_no_suffix: bool = False,
141
+ ) -> List[bytes]:
142
+ """
143
+ Return shingles as bytes. If include_duplicated_shingles is enabled and not forced off,
144
+ suffix trick is applied to make duplicates unique (MAP4C-style behavior).
145
+ """
146
+ out: List[bytes] = []
147
+ dm = GetDistanceMatrix(mol)
148
+ n = mol.GetNumAtoms()
149
+ shingle_dict: Dict[str, int] = defaultdict(int)
150
+
151
+ for i, j in itertools.combinations(range(n), 2):
152
+ dist_val = float(dm[i][j])
153
+ if self.max_dist is not None and dist_val > self.max_dist:
154
+ continue
155
+ dist = str(self._convert_dist(dist_val))
156
+
157
+ for k in range(self.radius):
158
+ env_a = atoms_env[i][k] or ""
159
+ env_b = atoms_env[j][k] or ""
160
+
161
+ # compare by length, not lexicographic
162
+ if len(env_a) > len(env_b):
163
+ larger_env, smaller_env = env_a, env_b
164
+ else:
165
+ larger_env, smaller_env = env_b, env_a
166
+
167
+ shingle = f"{smaller_env}|{dist}|{larger_env}"
168
+
169
+ if self.include_duplicated_shingles and not force_no_suffix:
170
+ shingle_dict[shingle] += 1
171
+ shingle = f"{shingle}|{shingle_dict[shingle]}"
172
+
173
+ out.append(shingle.encode("utf-8"))
174
+
175
+ return out
176
+
177
+
178
+ # -----------------------------
179
+ # MAP4 fpgen for chemap
180
+ # -----------------------------
181
+
182
+ class MAP4FPGen:
183
+ """
184
+ chemap-compatible MAP4 fingerprint generator.
185
+
186
+ Folded outputs (fixed length):
187
+ - GetFingerprintAsNumPy: uint8[D] binary
188
+ computed by hashing each unique shingle (SHA-1, first 4 bytes, little endian) and setting bit (hash mod D)
189
+ - GetCountFingerprintAsNumPy: float32[D] counts
190
+ computed by hashing each shingle (token hash32) -> bin (mod D) and summing true counts
191
+
192
+ Unfolded outputs (raw feature ids):
193
+ - GetSparseCountFingerprint returns {feature_id: count}
194
+ feature_id:
195
+ * sha1 truncation (default) OR
196
+ * token-hash32 (sha1 first 4 bytes) if minhash_for_unfolded=True
197
+
198
+ Parameters
199
+ ----------
200
+ folded:
201
+ Whether folded functions are meaningful (chemap controls this, but we keep for safety).
202
+ minhash_for_unfolded:
203
+ If True, unfolded uses MHFP-style token hash32 rather than sha1 truncation.
204
+ unfolded_bits:
205
+ 32 or 64 (only used when minhash_for_unfolded=False).
206
+ include_duplicated_shingles:
207
+ For MAP4C-like behavior in *set shingles* (folded binary). For true counts we ignore suffix.
208
+ """
209
+
210
+ def __init__(
211
+ self,
212
+ dimensions: int = 1024,
213
+ radius: int = 2,
214
+ *,
215
+ seed: int = 75434278,
216
+ folded: bool = True,
217
+ # counts/dup behavior
218
+ include_duplicated_shingles: bool = False,
219
+ # unfolded hashing behavior
220
+ minhash_for_unfolded: bool = False,
221
+ unfolded_bits: int = 32, # 32 or 64, only if minhash_for_unfolded=False
222
+ # optional distance handling
223
+ max_dist: Optional[int] = None,
224
+ dist_binning: Optional[np.ndarray] = None,
225
+ ):
226
+ self.dimensions = int(dimensions)
227
+ self.radius = int(radius)
228
+ self.seed = int(seed)
229
+ self.folded = bool(folded)
230
+
231
+ self.include_duplicated_shingles = bool(include_duplicated_shingles)
232
+ self.minhash_for_unfolded = bool(minhash_for_unfolded)
233
+ self.unfolded_bits = int(unfolded_bits)
234
+
235
+ if self.dimensions <= 0:
236
+ raise ValueError("dimensions must be > 0.")
237
+ if self.radius <= 0:
238
+ raise ValueError("radius must be > 0.")
239
+ if self.unfolded_bits not in (32, 64):
240
+ raise ValueError("unfolded_bits must be 32 or 64.")
241
+
242
+ self._shingler = _MAP4Shingler(
243
+ radius=self.radius,
244
+ include_duplicated_shingles=self.include_duplicated_shingles,
245
+ max_dist=max_dist,
246
+ dist_binning=dist_binning,
247
+ )
248
+
249
+ # Folded uses MHFPEncoderLite
250
+ self._mhfp = MHFPEncoderLite(
251
+ n_permutations=self.dimensions,
252
+ seed=self.seed,
253
+ )
254
+
255
+ # --------- chemap size inference ---------
256
+
257
+ def GetFingerprint(self, mol: Mol) -> _BitFingerprint:
258
+ return _BitFingerprint(self.dimensions)
259
+
260
+ def GetCountFingerprint(self, mol: Mol) -> _CountFingerprint:
261
+ return _CountFingerprint(self.dimensions)
262
+
263
+ # --------- unfolded API ---------
264
+
265
+ def GetSparseCountFingerprint(self, mol: Mol) -> _SparseCountFingerprint:
266
+ """
267
+ Returns {feature_id: count} for unfolded outputs.
268
+
269
+ - count=True in chemap: keys+values used
270
+ - count=False in chemap: keys used, values ignored
271
+ """
272
+ counts = self._shingler.shingles_with_counts_true(mol)
273
+ if not counts:
274
+ return _SparseCountFingerprint({})
275
+
276
+ nz: Dict[int, int] = defaultdict(int)
277
+
278
+ if self.minhash_for_unfolded:
279
+ # MHFP token-hash domain: sha1 first 4 bytes (little endian)
280
+ for sh, c in counts.items():
281
+ fid32 = int.from_bytes(sha1(sh).digest()[:4], "little", signed=False)
282
+ nz[int(fid32)] += int(c)
283
+ else:
284
+ for sh, c in counts.items():
285
+ fid = int(self._sha1_to_int(sh, bits=self.unfolded_bits))
286
+ nz[fid] += int(c)
287
+
288
+ return _SparseCountFingerprint(dict(nz))
289
+
290
+ # --------- folded API ---------
291
+
292
+ def GetFingerprintAsNumPy(self, mol: Mol) -> np.ndarray:
293
+ """
294
+ Folded binary vector uint8[D], matching original MAP4Calculator folded path:
295
+
296
+ folded = fold(hash(set(shingles)), D)
297
+
298
+ i.e. hash each unique shingle token -> set bit at (hash % D).
299
+ """
300
+ if not self.folded:
301
+ return np.zeros(self.dimensions, dtype=np.uint8)
302
+
303
+ shingles = self._shingler.shingles_unique(mol) # set[bytes]
304
+ if not shingles:
305
+ return np.zeros(self.dimensions, dtype=np.uint8)
306
+
307
+ # Per-shingle 32-bit hash (matches the common MAP4/scikit-fingerprints style: sha1/sha256 truncated)
308
+ hashed = np.fromiter(
309
+ (int.from_bytes(sha1(sh).digest()[:4], "little", signed=False) for sh in shingles),
310
+ dtype=np.uint32,
311
+ count=len(shingles),
312
+ )
313
+
314
+ fp = np.zeros(self.dimensions, dtype=np.uint8)
315
+ fp[(hashed % np.uint32(self.dimensions)).astype(np.int64, copy=False)] = 1
316
+ return fp
317
+
318
+ def GetCountFingerprintAsNumPy(self, mol: Mol) -> np.ndarray:
319
+ """
320
+ Folded counts float32[D] using TRUE multiplicities.
321
+
322
+ This is *not* a MinHash (classic MAP4 is set-based). We instead provide a stable
323
+ count-fold baseline:
324
+ bin = token_hash32(shingle) % D
325
+ fp[bin] += count
326
+ """
327
+ if not self.folded:
328
+ return np.zeros(self.dimensions, dtype=np.float32)
329
+
330
+ counts = self._shingler.shingles_with_counts_true(mol)
331
+ if not counts:
332
+ return np.zeros(self.dimensions, dtype=np.float32)
333
+
334
+ fp = np.zeros(self.dimensions, dtype=np.float32)
335
+ for sh, c in counts.items():
336
+ h32 = int.from_bytes(sha1(sh).digest()[:4], "little", signed=False)
337
+ fp[h32 % self.dimensions] += float(c)
338
+ return fp
339
+
340
+ # -----------------------------
341
+ # Hash utilities
342
+ # -----------------------------
343
+
344
+ @staticmethod
345
+ def _sha1_to_int(data: bytes, *, bits: int = 64) -> np.uint64:
346
+ d = sha1(data).digest()
347
+ if bits == 32:
348
+ return np.uint64(int.from_bytes(d[:4], byteorder="little", signed=False))
349
+ return np.uint64(int.from_bytes(d[:8], byteorder="little", signed=False))
@@ -0,0 +1,100 @@
1
+ import struct
2
+ from hashlib import sha1
3
+ from typing import Iterable, Sequence, Union
4
+ import numpy as np
5
+
6
+
7
+ BytesLike = Union[bytes, bytearray, memoryview]
8
+
9
+
10
class MHFPEncoderLite:
    """
    Compatibility-focused reimplementation of the original mhfp.encoder.MHFPEncoder.
    (Original is from the Reymond group: https://github.com/reymond-group/mhfp)

    Notes
    -----
    - The original uses:
        prime = 2^61 - 1
        max_hash = 2^32 - 1
      and outputs uint32 signatures.
    - Token hash is:
        struct.unpack("<I", sha1(token).digest()[:4])[0]

    Parameters
    ----------
    n_permutations:
        Number of (a, b) coefficient pairs to draw. Must be > 0 after
        conversion to ``int``.
    seed:
        Seed for the ``np.random.RandomState`` used to draw the coefficients.

    Raises
    ------
    ValueError
        If ``n_permutations`` is not positive once converted to ``int``.
    """

    prime: int = (1 << 61) - 1
    max_hash: int = (1 << 32) - 1

    def __init__(self, n_permutations: int = 2048, seed: int = 42):
        # Coerce first, then validate: otherwise a fractional value such as
        # 0.5 passes the "> 0" check and truncates to zero permutations.
        n_permutations = int(n_permutations)
        if n_permutations <= 0:
            raise ValueError("n_permutations must be > 0.")
        self.n_permutations = n_permutations
        self.seed = int(seed)

        # Match original: generate uint32 a,b with uniqueness constraints
        rand = np.random.RandomState(self.seed)
        low_a, low_b, high = 1, 0, MHFPEncoderLite.max_hash

        a = np.zeros(self.n_permutations, dtype=np.uint32)
        b = np.zeros(self.n_permutations, dtype=np.uint32)

        # Sets give O(1) duplicate checks while still enforcing
        # "no duplicates" within a and within b.
        used_a = set()
        used_b = set()

        for i in range(self.n_permutations):
            # The draw order (a, b, then any re-draws) determines the RNG
            # stream and therefore the resulting coefficients; keep it fixed.
            ai = int(rand.randint(low_a, high, dtype=np.uint32))
            bi = int(rand.randint(low_b, high, dtype=np.uint32))

            while ai in used_a:
                ai = int(rand.randint(low_a, high, dtype=np.uint32))
            while bi in used_b:
                bi = int(rand.randint(low_b, high, dtype=np.uint32))

            used_a.add(ai)
            used_b.add(bi)
            a[i] = np.uint32(ai)
            b[i] = np.uint32(bi)

        # Column vectors of shape (n_perm, 1), widened to uint64.
        self._a = a.reshape((self.n_permutations, 1)).astype(np.uint64, copy=False)
        self._b = b.reshape((self.n_permutations, 1)).astype(np.uint64, copy=False)

    # -----------------------------
    # Token hashing (exact)
    # -----------------------------

    @staticmethod
    def _token_hash32(token: BytesLike) -> np.uint32:
        """Return the first 4 SHA-1 digest bytes of ``token`` as a little-endian uint32."""
        # EXACT original semantics: struct.unpack("<I", sha1(t).digest()[:4])[0]
        return np.uint32(struct.unpack("<I", sha1(bytes(token)).digest()[:4])[0])

    # -----------------------------
    # Original helper API: hash / fold
    # -----------------------------

    @staticmethod
    def hash(tokens: Iterable[BytesLike]) -> np.ndarray:
        """
        For compatibility with original MHFPEncoder.hash(shingling):
        returns per-token uint32 hash values (NOT minhash signature).
        """
        return np.fromiter(
            (MHFPEncoderLite._token_hash32(t) for t in tokens),
            dtype=np.uint32,
        )

    @staticmethod
    def fold(hash_values: Iterable[int], length: int = 2048) -> np.ndarray:
        """
        Compatibility with original fold(): binary uint8 vector with bits set at hash % length.

        ``hash_values`` may be any iterable of non-negative integers (list,
        tuple, NumPy array or generator). An empty input yields an all-zero
        vector.

        Raises
        ------
        ValueError
            If ``length`` is not positive.
        """
        length = int(length)
        if length <= 0:
            raise ValueError("length must be > 0.")
        folded = np.zeros(length, dtype=np.uint8)

        # Generators have no len(); materialise them so they are accepted
        # just like sequences and arrays.
        if not hasattr(hash_values, "__len__"):
            hash_values = list(hash_values)
        hv = np.asarray(hash_values, dtype=np.uint64)
        if hv.size == 0:
            return folded
        folded[(hv % np.uint64(length)).astype(np.int64, copy=False)] = 1
        return folded
@@ -114,6 +114,7 @@ def plot_duplicate_bins(
114
114
  xlabel: str = "Compounds with Fingerprint Duplicates",
115
115
  title: str = "Duplicate Statistics by Experiment",
116
116
  legend_title: str = "Maximum mass difference\n(for identical fingerprints)",
117
+ ax: Optional[plt.Axes] = None,
117
118
  ) -> Tuple[plt.Figure, plt.Axes]:
118
119
  """Plot stacked horizontal bars of duplicate counts across bins.
119
120
 
@@ -135,9 +136,6 @@ def plot_duplicate_bins(
135
136
  xlabel, title, legend_title:
136
137
  Plot labels.
137
138
 
138
- Returns
139
- -------
140
- (fig, ax)
141
139
  """
142
140
  if len(results) == 0:
143
141
  raise ValueError("results must be non-empty")
@@ -157,7 +155,10 @@ def plot_duplicate_bins(
157
155
  bin_labels = res[0].bin_labels
158
156
  colors = n_colors_from_cmap(n_bins, cmap)
159
157
 
160
- fig, ax = plt.subplots(figsize=figsize)
158
+ if ax is None:
159
+ fig, ax = plt.subplots(figsize=figsize)
160
+ else:
161
+ fig = ax.figure
161
162
 
162
163
  y_positions = np.arange(len(res))
163
164
  left_stack = np.zeros(len(res), dtype=float)
@@ -207,6 +208,7 @@ def plot_duplicates_by_experiment(
207
208
  cmap = green_yellow_red,
208
209
  title: str = "Duplicate fingerprints plot",
209
210
  figsize: Tuple[float, float] = (10, 6),
211
+ ax: Optional[plt.Axes] = None,
210
212
  sort_by_total: bool = True,
211
213
  ) -> Tuple[plt.Figure, plt.Axes, List[DuplicateBinResult]]:
212
214
  """Compute binned duplicate stats per experiment and plot them.
@@ -238,5 +240,6 @@ def plot_duplicates_by_experiment(
238
240
  cmap=cmap,
239
241
  sort_by_total=sort_by_total,
240
242
  title=title,
243
+ ax=ax,
241
244
  )
242
245
  return fig, ax, results
@@ -6,8 +6,7 @@ from chemap import FingerprintConfig, compute_fingerprints
6
6
  from chemap.fingerprint_conversions import fingerprints_to_csr
7
7
  from chemap.metrics import (
8
8
  tanimoto_distance_dense,
9
- tanimoto_distance_unfolded_binary,
10
- tanimoto_distance_unfolded_count,
9
+ tanimoto_distance_sparse,
11
10
  )
12
11
 
13
12
 
@@ -25,18 +24,17 @@ def _choose_cpu_metric(config: FingerprintConfig, distance_function: str) -> Any
25
24
  - unfolded (binary or count) => tanimoto_distance_sparse
26
25
  - folded (usually dense/packed) => tanimoto_distance_dense
27
26
  """
27
+ if distance_function.lower() == "cosine":
28
+ return "cosine"
28
29
  if distance_function.lower() != "tanimoto":
29
30
  raise ValueError(
30
31
  f"Unsupported distance_function={distance_function!r}. "
31
- "Currently only 'tanimoto' is supported here."
32
+ "Currently only 'tanimoto' and 'cosine' is supported here."
32
33
  )
33
34
 
34
-
35
35
  if getattr(config, "folded", False):
36
36
  return tanimoto_distance_dense
37
- if getattr(config, "count", False):
38
- return tanimoto_distance_unfolded_count
39
- return tanimoto_distance_unfolded_binary
37
+ return tanimoto_distance_sparse
40
38
 
41
39
 
42
40
  def _log1p_csr_inplace(X) -> Any:
@@ -61,7 +59,7 @@ def create_chem_space_umap(
61
59
  n_neighbors: int = 15,
62
60
  min_dist: float = 0.25,
63
61
  n_jobs: int = -1,
64
- umap_random_state: Optional[int] = 40476,
62
+ umap_random_state: Optional[int] = None,
65
63
  distance_function: str = "tanimoto",
66
64
  ) -> pd.DataFrame:
67
65
  """Compute fingerprints (CPU) and create 2D UMAP coordinates (CPU).
@@ -220,17 +218,16 @@ def create_chem_space_umap_gpu(
220
218
  show_progress=show_progress,
221
219
  )
222
220
 
223
- # Convert to numeric matrix.
224
- fps_csr = fingerprints_to_csr(fingerprints).X
225
- fps = fps_csr.toarray()
221
+ # Convert to sparse array
222
+ # fps_csr = fingerprints_to_csr(fingerprints).X
226
223
 
227
224
  # Reduce memory footprint (works well for count fingerprints)
228
225
  if not log_count:
229
226
  # stays integer-like
230
- fps = fps.astype(np.int8, copy=False)
227
+ fps = fingerprints.astype(np.int8, copy=False)
231
228
  else:
232
229
  # log1p returns float
233
- fps = np.log1p(fps).astype(np.float32, copy=False)
230
+ fps = np.log1p(fingerprints).astype(np.float32, copy=False)
234
231
 
235
232
  umap_model = cuUMAP(
236
233
  n_neighbors=int(n_neighbors),
@@ -1,7 +1,10 @@
1
1
  from typing import Mapping, Sequence, Tuple, Union
2
+ import numpy as np
2
3
 
3
4
 
4
5
  Bins = Sequence[Tuple[float, float]]
5
6
  Color = Tuple[float, float, float] # RGB
6
7
  ColorA = Tuple[float, float, float, float] # RGBA
7
8
  Palette = Mapping[str, Union[Color, ColorA]]
9
+ UnfoldedBinary = list[np.ndarray] # list of int64 feature IDs per molecule
10
+ UnfoldedCount = list[tuple[np.ndarray, np.ndarray]] # (int64 feature IDs, float32 values)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "chemap"
3
- version = "0.3.0"
3
+ version = "0.3.2"
4
4
  description = "Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations. "
5
5
  authors = [
6
6
  { name="Florian Huber", email="florian.huber@hs-duesseldorf.de" },
@@ -1,6 +0,0 @@
1
- from .element_count_fp import ElementCountFingerprint
2
-
3
-
4
- __all__ = [
5
- "ElementCountFingerprint",
6
- ]
@@ -1,7 +0,0 @@
1
- from .utils import compute_compound_max_mass_differences, compute_duplicate_max_mass_differences
2
-
3
-
4
- __all__ = [
5
- "compute_compound_max_mass_differences",
6
- "compute_duplicate_max_mass_differences",
7
- ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes