chemap 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chemap-0.3.1 → chemap-0.3.2}/PKG-INFO +1 -1
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprint_computation.py +86 -16
- {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/chem_space_umap.py +10 -13
- {chemap-0.3.1 → chemap-0.3.2}/pyproject.toml +1 -1
- {chemap-0.3.1 → chemap-0.3.2}/LICENSE +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/README.md +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/__init__.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/approx_nn.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/benchmarking/__init__.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/benchmarking/fingerprint_duplicates.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/benchmarking/utils.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/data_loader.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprint_conversions.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprint_statistics.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/__init__.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/chemap_base_fingerprint.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/element_count_fp.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/lingo.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/map4.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/mhfp.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/mbp.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/metrics.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/__init__.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/benchmark_duplicates.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/colormap_handling.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/colormaps.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/scatter_plots.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/types.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/utils.py +0 -0
- {chemap-0.3.1 → chemap-0.3.2}/chemap/visualizations.py +0 -0
|
@@ -7,6 +7,7 @@ from joblib import Parallel, delayed
|
|
|
7
7
|
from rdkit import Chem
|
|
8
8
|
from sklearn.base import BaseEstimator, TransformerMixin
|
|
9
9
|
from tqdm import tqdm
|
|
10
|
+
from chemap.types import UnfoldedBinary, UnfoldedCount
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
# -----------------------------
|
|
@@ -16,9 +17,6 @@ from tqdm import tqdm
|
|
|
16
17
|
InvalidPolicy = Literal["drop", "keep", "raise"]
|
|
17
18
|
Scaling = Optional[Literal["log"]]
|
|
18
19
|
|
|
19
|
-
UnfoldedBinary = List[np.ndarray] # list of int64 feature IDs per molecule
|
|
20
|
-
UnfoldedCount = List[Tuple[np.ndarray, np.ndarray]] # list of (int64 feature IDs, float32 values)
|
|
21
|
-
|
|
22
20
|
FingerprintResult = Union[np.ndarray, sp.csr_matrix, UnfoldedBinary, UnfoldedCount]
|
|
23
21
|
|
|
24
22
|
|
|
@@ -532,9 +530,13 @@ def _skfp_configure_output(
|
|
|
532
530
|
"""
|
|
533
531
|
Configure scikit-fingerprints/sklearn transformer to match (folded, return_csr).
|
|
534
532
|
|
|
535
|
-
|
|
536
|
-
-
|
|
537
|
-
|
|
533
|
+
Supports two modes:
|
|
534
|
+
1) scikit-fingerprints classic:
|
|
535
|
+
- unfolded via variant='raw_bits' (if available)
|
|
536
|
+
- folded via variant='folded' (if variant exists)
|
|
537
|
+
2) ChemapBaseFingerprint style:
|
|
538
|
+
- unfolded via folded=False (no variant)
|
|
539
|
+
- folded via folded=True
|
|
538
540
|
"""
|
|
539
541
|
params = fpgen.get_params(deep=False)
|
|
540
542
|
updates: Dict[str, Any] = {}
|
|
@@ -545,22 +547,42 @@ def _skfp_configure_output(
|
|
|
545
547
|
if "n_jobs" in params:
|
|
546
548
|
updates["n_jobs"] = n_jobs
|
|
547
549
|
|
|
550
|
+
chemap_style = "folded" in params # ChemapBaseFingerprint exposes folded param
|
|
551
|
+
|
|
552
|
+
# -------------------------
|
|
553
|
+
# UNFOLDED (cfg.folded=False)
|
|
554
|
+
# -------------------------
|
|
548
555
|
if not cfg.folded:
|
|
556
|
+
if chemap_style:
|
|
557
|
+
if params.get("folded") is not False:
|
|
558
|
+
updates["folded"] = False
|
|
559
|
+
# We don't force sparse; unfolded returns lists anyway.
|
|
560
|
+
return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
|
|
561
|
+
|
|
562
|
+
# classic scikit-fingerprints route: needs variant='raw_bits'
|
|
549
563
|
if "variant" not in params:
|
|
550
564
|
raise NotImplementedError(
|
|
551
|
-
"Requested folded=False (unfolded), but this transformer does not
|
|
552
|
-
"
|
|
565
|
+
"Requested folded=False (unfolded), but this transformer does not support "
|
|
566
|
+
"either chemap-style `folded` switching or an skfp-style `variant='raw_bits'`."
|
|
553
567
|
)
|
|
568
|
+
|
|
554
569
|
if params.get("variant") != "raw_bits":
|
|
555
570
|
updates["variant"] = "raw_bits"
|
|
556
571
|
|
|
557
|
-
# For unfolded conversion we can accept either dense or CSR outputs, so we do not force "sparse".
|
|
558
572
|
return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
|
|
559
573
|
|
|
560
|
-
#
|
|
574
|
+
# ------------------------
|
|
575
|
+
# FOLDED (cfg.folded=True)
|
|
576
|
+
# ------------------------
|
|
577
|
+
if chemap_style:
|
|
578
|
+
if params.get("folded") is not True:
|
|
579
|
+
updates["folded"] = True
|
|
580
|
+
|
|
581
|
+
# If it's classic skfp and currently set to raw_bits, restore folded variant
|
|
561
582
|
if "variant" in params and params.get("variant") == "raw_bits":
|
|
562
583
|
updates["variant"] = "folded"
|
|
563
584
|
|
|
585
|
+
# Prefer CSR if requested and supported
|
|
564
586
|
if "sparse" in params:
|
|
565
587
|
desired = bool(cfg.return_csr)
|
|
566
588
|
if params.get("sparse") != desired:
|
|
@@ -577,35 +599,83 @@ def _compute_sklearn(
|
|
|
577
599
|
show_progress: bool = False,
|
|
578
600
|
n_jobs: int,
|
|
579
601
|
) -> FingerprintResult:
|
|
602
|
+
"""
|
|
603
|
+
Compute fingerprints using sklearn/scikit-fingerprints style transformers.
|
|
604
|
+
|
|
605
|
+
Supports two kinds of transformers for unfolded output (cfg.folded=False):
|
|
606
|
+
1) Classic scikit-fingerprints: unfolded via variant='raw_bits' (matrix output)
|
|
607
|
+
2) ChemapBaseFingerprint style: unfolded via folded=False (list output)
|
|
608
|
+
|
|
609
|
+
Invalid-policy behavior:
|
|
610
|
+
- drop: returns only valid rows (shorter output)
|
|
611
|
+
- keep: aligns output to input, inserting empty rows for invalid smiles
|
|
612
|
+
- raise: raises on first invalid smiles
|
|
613
|
+
"""
|
|
580
614
|
fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs)
|
|
615
|
+
|
|
616
|
+
# Parse molecules with robust handling (None for invalid SMILES)
|
|
581
617
|
mol_transformer = RobustMolTransformer(n_jobs=n_jobs)
|
|
582
618
|
mols = mol_transformer.transform(smiles)
|
|
583
619
|
|
|
584
|
-
# Determine valid/invalid molecules and handle invalid according to policy.
|
|
585
620
|
valid_idx = [i for i, m in enumerate(mols) if m is not None]
|
|
586
621
|
invalid_idx = [i for i, m in enumerate(mols) if m is None]
|
|
587
622
|
|
|
588
623
|
if invalid_idx and cfg.invalid_policy == "raise":
|
|
589
624
|
raise ValueError(f"Invalid SMILES: {smiles[invalid_idx[0]]}")
|
|
590
625
|
|
|
591
|
-
# Fit/transform only valid mols (safe for most transformers)
|
|
592
626
|
valid_mols = [mols[i] for i in valid_idx]
|
|
593
627
|
|
|
628
|
+
# Most skfp transformers are "fit-less" but expose fit; keep consistent behavior.
|
|
594
629
|
fp.fit(valid_mols)
|
|
595
630
|
X_valid = fp.transform(valid_mols)
|
|
596
631
|
|
|
597
|
-
#
|
|
632
|
+
# -----------------------------
|
|
633
|
+
# Case A: transformer returns chemap-unfolded formats directly (list output)
|
|
634
|
+
# -----------------------------
|
|
635
|
+
is_list_unfolded = isinstance(X_valid, list) and (
|
|
636
|
+
len(X_valid) == 0
|
|
637
|
+
or isinstance(X_valid[0], np.ndarray)
|
|
638
|
+
or (isinstance(X_valid[0], tuple) and len(X_valid[0]) == 2)
|
|
639
|
+
)
|
|
640
|
+
|
|
641
|
+
if is_list_unfolded:
|
|
642
|
+
# In this case, we assume we are already in unfolded mode (cfg.folded=False).
|
|
643
|
+
# If cfg.folded=True but the transformer returns lists, that's an API mismatch.
|
|
644
|
+
if cfg.folded:
|
|
645
|
+
raise TypeError(
|
|
646
|
+
"Transformer returned chemap-unfolded list output while cfg.folded=True. "
|
|
647
|
+
"This likely indicates a misconfigured transformer."
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
if cfg.invalid_policy == "drop":
|
|
651
|
+
return X_valid
|
|
652
|
+
|
|
653
|
+
# keep alignment: reinsert empty rows
|
|
654
|
+
N = len(smiles)
|
|
655
|
+
if cfg.count:
|
|
656
|
+
X_full: UnfoldedCount = [_empty_unfolded_count() for _ in range(N)]
|
|
657
|
+
else:
|
|
658
|
+
X_full: UnfoldedBinary = [_empty_unfolded_binary() for _ in range(N)]
|
|
659
|
+
|
|
660
|
+
for out_i, orig_i in enumerate(valid_idx):
|
|
661
|
+
X_full[orig_i] = X_valid[out_i] # type: ignore[index]
|
|
662
|
+
|
|
663
|
+
return X_full
|
|
664
|
+
|
|
665
|
+
# -----------------------------
|
|
666
|
+
# Case B: transformer returns a matrix (dense or sparse)
|
|
667
|
+
# -----------------------------
|
|
668
|
+
|
|
669
|
+
# Handle invalid-policy re-insertion for matrix outputs
|
|
598
670
|
if cfg.invalid_policy == "drop":
|
|
599
671
|
X = X_valid
|
|
600
672
|
else:
|
|
601
|
-
# policy keep: reinsert empty rows to match input length
|
|
602
673
|
N = len(smiles)
|
|
603
674
|
|
|
604
675
|
if sp.issparse(X_valid):
|
|
605
676
|
X_valid = X_valid.tocsr().astype(np.float32)
|
|
606
677
|
D = X_valid.shape[1]
|
|
607
678
|
X = sp.csr_matrix((N, D), dtype=np.float32)
|
|
608
|
-
# place valid rows
|
|
609
679
|
X[valid_idx, :] = X_valid
|
|
610
680
|
else:
|
|
611
681
|
X_valid = np.asarray(X_valid, dtype=np.float32)
|
|
@@ -613,8 +683,8 @@ def _compute_sklearn(
|
|
|
613
683
|
X = np.zeros((N, D), dtype=np.float32)
|
|
614
684
|
X[valid_idx, :] = X_valid
|
|
615
685
|
|
|
686
|
+
# If unfolded requested, convert matrix -> chemap unfolded formats
|
|
616
687
|
if not cfg.folded:
|
|
617
|
-
# unfolded output
|
|
618
688
|
if sp.issparse(X):
|
|
619
689
|
return _csr_matrix_to_unfolded(X.tocsr().astype(np.float32), cfg)
|
|
620
690
|
return _dense_matrix_to_unfolded(np.asarray(X, dtype=np.float32), cfg)
|
|
@@ -6,8 +6,7 @@ from chemap import FingerprintConfig, compute_fingerprints
|
|
|
6
6
|
from chemap.fingerprint_conversions import fingerprints_to_csr
|
|
7
7
|
from chemap.metrics import (
|
|
8
8
|
tanimoto_distance_dense,
|
|
9
|
-
|
|
10
|
-
tanimoto_distance_unfolded_count,
|
|
9
|
+
tanimoto_distance_sparse,
|
|
11
10
|
)
|
|
12
11
|
|
|
13
12
|
|
|
@@ -25,18 +24,17 @@ def _choose_cpu_metric(config: FingerprintConfig, distance_function: str) -> Any
|
|
|
25
24
|
- unfolded + binary => tanimoto_distance_unfolded_binary
|
|
26
25
|
- folded (usually dense/packed) => tanimoto_distance_dense
|
|
27
26
|
"""
|
|
27
|
+
if distance_function.lower() == "cosine":
|
|
28
|
+
return "cosine"
|
|
28
29
|
if distance_function.lower() != "tanimoto":
|
|
29
30
|
raise ValueError(
|
|
30
31
|
f"Unsupported distance_function={distance_function!r}. "
|
|
31
|
-
"Currently only 'tanimoto' is supported here."
|
|
32
|
+
"Currently only 'tanimoto' and 'cosine' is supported here."
|
|
32
33
|
)
|
|
33
34
|
|
|
34
|
-
|
|
35
35
|
if getattr(config, "folded", False):
|
|
36
36
|
return tanimoto_distance_dense
|
|
37
|
-
|
|
38
|
-
return tanimoto_distance_unfolded_count
|
|
39
|
-
return tanimoto_distance_unfolded_binary
|
|
37
|
+
return tanimoto_distance_sparse
|
|
40
38
|
|
|
41
39
|
|
|
42
40
|
def _log1p_csr_inplace(X) -> Any:
|
|
@@ -61,7 +59,7 @@ def create_chem_space_umap(
|
|
|
61
59
|
n_neighbors: int = 15,
|
|
62
60
|
min_dist: float = 0.25,
|
|
63
61
|
n_jobs: int = -1,
|
|
64
|
-
umap_random_state: Optional[int] =
|
|
62
|
+
umap_random_state: Optional[int] = None,
|
|
65
63
|
distance_function: str = "tanimoto",
|
|
66
64
|
) -> pd.DataFrame:
|
|
67
65
|
"""Compute fingerprints (CPU) and create 2D UMAP coordinates (CPU).
|
|
@@ -220,17 +218,16 @@ def create_chem_space_umap_gpu(
|
|
|
220
218
|
show_progress=show_progress,
|
|
221
219
|
)
|
|
222
220
|
|
|
223
|
-
# Convert to
|
|
224
|
-
fps_csr = fingerprints_to_csr(fingerprints).X
|
|
225
|
-
fps = fps_csr.toarray()
|
|
221
|
+
# Convert to sparse array
|
|
222
|
+
# fps_csr = fingerprints_to_csr(fingerprints).X
|
|
226
223
|
|
|
227
224
|
# Reduce memory footprint (works well for count fingerprints)
|
|
228
225
|
if not log_count:
|
|
229
226
|
# stays integer-like
|
|
230
|
-
fps =
|
|
227
|
+
fps = fingerprints.astype(np.int8, copy=False)
|
|
231
228
|
else:
|
|
232
229
|
# log1p returns float
|
|
233
|
-
fps = np.log1p(
|
|
230
|
+
fps = np.log1p(fingerprints).astype(np.float32, copy=False)
|
|
234
231
|
|
|
235
232
|
umap_model = cuUMAP(
|
|
236
233
|
n_neighbors=int(n_neighbors),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "chemap"
|
|
3
|
-
version = "0.3.1"
|
|
3
|
+
version = "0.3.2"
|
|
4
4
|
description = "Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations. "
|
|
5
5
|
authors = [
|
|
6
6
|
{ name="Florian Huber", email="florian.huber@hs-duesseldorf.de" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|