chemap 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {chemap-0.3.1 → chemap-0.3.2}/PKG-INFO +1 -1
  2. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprint_computation.py +86 -16
  3. {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/chem_space_umap.py +10 -13
  4. {chemap-0.3.1 → chemap-0.3.2}/pyproject.toml +1 -1
  5. {chemap-0.3.1 → chemap-0.3.2}/LICENSE +0 -0
  6. {chemap-0.3.1 → chemap-0.3.2}/README.md +0 -0
  7. {chemap-0.3.1 → chemap-0.3.2}/chemap/__init__.py +0 -0
  8. {chemap-0.3.1 → chemap-0.3.2}/chemap/approx_nn.py +0 -0
  9. {chemap-0.3.1 → chemap-0.3.2}/chemap/benchmarking/__init__.py +0 -0
  10. {chemap-0.3.1 → chemap-0.3.2}/chemap/benchmarking/fingerprint_duplicates.py +0 -0
  11. {chemap-0.3.1 → chemap-0.3.2}/chemap/benchmarking/utils.py +0 -0
  12. {chemap-0.3.1 → chemap-0.3.2}/chemap/data_loader.py +0 -0
  13. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprint_conversions.py +0 -0
  14. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprint_statistics.py +0 -0
  15. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/__init__.py +0 -0
  16. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/chemap_base_fingerprint.py +0 -0
  17. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/element_count_fp.py +0 -0
  18. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/lingo.py +0 -0
  19. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/map4.py +0 -0
  20. {chemap-0.3.1 → chemap-0.3.2}/chemap/fingerprints/mhfp.py +0 -0
  21. {chemap-0.3.1 → chemap-0.3.2}/chemap/mbp.py +0 -0
  22. {chemap-0.3.1 → chemap-0.3.2}/chemap/metrics.py +0 -0
  23. {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/__init__.py +0 -0
  24. {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/benchmark_duplicates.py +0 -0
  25. {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/colormap_handling.py +0 -0
  26. {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/colormaps.py +0 -0
  27. {chemap-0.3.1 → chemap-0.3.2}/chemap/plotting/scatter_plots.py +0 -0
  28. {chemap-0.3.1 → chemap-0.3.2}/chemap/types.py +0 -0
  29. {chemap-0.3.1 → chemap-0.3.2}/chemap/utils.py +0 -0
  30. {chemap-0.3.1 → chemap-0.3.2}/chemap/visualizations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chemap
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations.
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
@@ -7,6 +7,7 @@ from joblib import Parallel, delayed
7
7
  from rdkit import Chem
8
8
  from sklearn.base import BaseEstimator, TransformerMixin
9
9
  from tqdm import tqdm
10
+ from chemap.types import UnfoldedBinary, UnfoldedCount
10
11
 
11
12
 
12
13
  # -----------------------------
@@ -16,9 +17,6 @@ from tqdm import tqdm
16
17
  InvalidPolicy = Literal["drop", "keep", "raise"]
17
18
  Scaling = Optional[Literal["log"]]
18
19
 
19
- UnfoldedBinary = List[np.ndarray] # list of int64 feature IDs per molecule
20
- UnfoldedCount = List[Tuple[np.ndarray, np.ndarray]] # list of (int64 feature IDs, float32 values)
21
-
22
20
  FingerprintResult = Union[np.ndarray, sp.csr_matrix, UnfoldedBinary, UnfoldedCount]
23
21
 
24
22
 
@@ -532,9 +530,13 @@ def _skfp_configure_output(
532
530
  """
533
531
  Configure scikit-fingerprints/sklearn transformer to match (folded, return_csr).
534
532
 
535
- - folded=True : use the transformer's folded output
536
- - folded=False: require variant='raw_bits' if supported
537
- - return_csr=True (only when folded=True): prefer transformer sparse CSR if supported
533
+ Supports two modes:
534
+ 1) scikit-fingerprints classic:
535
+ - unfolded via variant='raw_bits' (if available)
536
+ - folded via variant='folded' (if variant exists)
537
+ 2) ChemapBaseFingerprint style:
538
+ - unfolded via folded=False (no variant)
539
+ - folded via folded=True
538
540
  """
539
541
  params = fpgen.get_params(deep=False)
540
542
  updates: Dict[str, Any] = {}
@@ -545,22 +547,42 @@ def _skfp_configure_output(
545
547
  if "n_jobs" in params:
546
548
  updates["n_jobs"] = n_jobs
547
549
 
550
+ chemap_style = "folded" in params # ChemapBaseFingerprint exposes folded param
551
+
552
+ # -------------------------
553
+ # UNFOLDED (cfg.folded=False)
554
+ # -------------------------
548
555
  if not cfg.folded:
556
+ if chemap_style:
557
+ if params.get("folded") is not False:
558
+ updates["folded"] = False
559
+ # We don't force sparse; unfolded returns lists anyway.
560
+ return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
561
+
562
+ # classic scikit-fingerprints route: needs variant='raw_bits'
549
563
  if "variant" not in params:
550
564
  raise NotImplementedError(
551
- "Requested folded=False (unfolded), but this transformer does not expose a `variant` parameter "
552
- "for an unfolded feature space (e.g., variant='raw_bits')."
565
+ "Requested folded=False (unfolded), but this transformer does not support "
566
+ "either chemap-style `folded` switching or an skfp-style `variant='raw_bits'`."
553
567
  )
568
+
554
569
  if params.get("variant") != "raw_bits":
555
570
  updates["variant"] = "raw_bits"
556
571
 
557
- # For unfolded conversion we can accept either dense or CSR outputs, so we do not force "sparse".
558
572
  return _clone_transformer_with_params(fpgen, updates) if updates else fpgen
559
573
 
560
- # folded=True
574
+ # ------------------------
575
+ # FOLDED (cfg.folded=True)
576
+ # ------------------------
577
+ if chemap_style:
578
+ if params.get("folded") is not True:
579
+ updates["folded"] = True
580
+
581
+ # If it's classic skfp and currently set to raw_bits, restore folded variant
561
582
  if "variant" in params and params.get("variant") == "raw_bits":
562
583
  updates["variant"] = "folded"
563
584
 
585
+ # Prefer CSR if requested and supported
564
586
  if "sparse" in params:
565
587
  desired = bool(cfg.return_csr)
566
588
  if params.get("sparse") != desired:
@@ -577,35 +599,83 @@ def _compute_sklearn(
577
599
  show_progress: bool = False,
578
600
  n_jobs: int,
579
601
  ) -> FingerprintResult:
602
+ """
603
+ Compute fingerprints using sklearn/scikit-fingerprints style transformers.
604
+
605
+ Supports two kinds of transformers for unfolded output (cfg.folded=False):
606
+ 1) Classic scikit-fingerprints: unfolded via variant='raw_bits' (matrix output)
607
+ 2) ChemapBaseFingerprint style: unfolded via folded=False (list output)
608
+
609
+ Invalid-policy behavior:
610
+ - drop: returns only valid rows (shorter output)
611
+ - keep: aligns output to input, inserting empty rows for invalid smiles
612
+ - raise: raises on first invalid smiles
613
+ """
580
614
  fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs)
615
+
616
+ # Parse molecules with robust handling (None for invalid SMILES)
581
617
  mol_transformer = RobustMolTransformer(n_jobs=n_jobs)
582
618
  mols = mol_transformer.transform(smiles)
583
619
 
584
- # Determine valid/invalid molecules and handle invalid according to policy.
585
620
  valid_idx = [i for i, m in enumerate(mols) if m is not None]
586
621
  invalid_idx = [i for i, m in enumerate(mols) if m is None]
587
622
 
588
623
  if invalid_idx and cfg.invalid_policy == "raise":
589
624
  raise ValueError(f"Invalid SMILES: {smiles[invalid_idx[0]]}")
590
625
 
591
- # Fit/transform only valid mols (safe for most transformers)
592
626
  valid_mols = [mols[i] for i in valid_idx]
593
627
 
628
+ # Most skfp transformers are "fit-less" but expose fit; keep consistent behavior.
594
629
  fp.fit(valid_mols)
595
630
  X_valid = fp.transform(valid_mols)
596
631
 
597
- # If policy is drop: just return X_valid (current behavior)
632
+ # -----------------------------
633
+ # Case A: transformer returns chemap-unfolded formats directly (list output)
634
+ # -----------------------------
635
+ is_list_unfolded = isinstance(X_valid, list) and (
636
+ len(X_valid) == 0
637
+ or isinstance(X_valid[0], np.ndarray)
638
+ or (isinstance(X_valid[0], tuple) and len(X_valid[0]) == 2)
639
+ )
640
+
641
+ if is_list_unfolded:
642
+ # In this case, we assume we are already in unfolded mode (cfg.folded=False).
643
+ # If cfg.folded=True but the transformer returns lists, that's an API mismatch.
644
+ if cfg.folded:
645
+ raise TypeError(
646
+ "Transformer returned chemap-unfolded list output while cfg.folded=True. "
647
+ "This likely indicates a misconfigured transformer."
648
+ )
649
+
650
+ if cfg.invalid_policy == "drop":
651
+ return X_valid
652
+
653
+ # keep alignment: reinsert empty rows
654
+ N = len(smiles)
655
+ if cfg.count:
656
+ X_full: UnfoldedCount = [_empty_unfolded_count() for _ in range(N)]
657
+ else:
658
+ X_full: UnfoldedBinary = [_empty_unfolded_binary() for _ in range(N)]
659
+
660
+ for out_i, orig_i in enumerate(valid_idx):
661
+ X_full[orig_i] = X_valid[out_i] # type: ignore[index]
662
+
663
+ return X_full
664
+
665
+ # -----------------------------
666
+ # Case B: transformer returns a matrix (dense or sparse)
667
+ # -----------------------------
668
+
669
+ # Handle invalid-policy re-insertion for matrix outputs
598
670
  if cfg.invalid_policy == "drop":
599
671
  X = X_valid
600
672
  else:
601
- # policy keep: reinsert empty rows to match input length
602
673
  N = len(smiles)
603
674
 
604
675
  if sp.issparse(X_valid):
605
676
  X_valid = X_valid.tocsr().astype(np.float32)
606
677
  D = X_valid.shape[1]
607
678
  X = sp.csr_matrix((N, D), dtype=np.float32)
608
- # place valid rows
609
679
  X[valid_idx, :] = X_valid
610
680
  else:
611
681
  X_valid = np.asarray(X_valid, dtype=np.float32)
@@ -613,8 +683,8 @@ def _compute_sklearn(
613
683
  X = np.zeros((N, D), dtype=np.float32)
614
684
  X[valid_idx, :] = X_valid
615
685
 
686
+ # If unfolded requested, convert matrix -> chemap unfolded formats
616
687
  if not cfg.folded:
617
- # unfolded output
618
688
  if sp.issparse(X):
619
689
  return _csr_matrix_to_unfolded(X.tocsr().astype(np.float32), cfg)
620
690
  return _dense_matrix_to_unfolded(np.asarray(X, dtype=np.float32), cfg)
@@ -6,8 +6,7 @@ from chemap import FingerprintConfig, compute_fingerprints
6
6
  from chemap.fingerprint_conversions import fingerprints_to_csr
7
7
  from chemap.metrics import (
8
8
  tanimoto_distance_dense,
9
- tanimoto_distance_unfolded_binary,
10
- tanimoto_distance_unfolded_count,
9
+ tanimoto_distance_sparse,
11
10
  )
12
11
 
13
12
 
@@ -25,18 +24,17 @@ def _choose_cpu_metric(config: FingerprintConfig, distance_function: str) -> Any
25
24
  - unfolded + binary => tanimoto_distance_unfolded_binary
26
25
  - folded (usually dense/packed) => tanimoto_distance_dense
27
26
  """
27
+ if distance_function.lower() == "cosine":
28
+ return "cosine"
28
29
  if distance_function.lower() != "tanimoto":
29
30
  raise ValueError(
30
31
  f"Unsupported distance_function={distance_function!r}. "
31
- "Currently only 'tanimoto' is supported here."
32
+ "Currently only 'tanimoto' and 'cosine' is supported here."
32
33
  )
33
34
 
34
-
35
35
  if getattr(config, "folded", False):
36
36
  return tanimoto_distance_dense
37
- if getattr(config, "count", False):
38
- return tanimoto_distance_unfolded_count
39
- return tanimoto_distance_unfolded_binary
37
+ return tanimoto_distance_sparse
40
38
 
41
39
 
42
40
  def _log1p_csr_inplace(X) -> Any:
@@ -61,7 +59,7 @@ def create_chem_space_umap(
61
59
  n_neighbors: int = 15,
62
60
  min_dist: float = 0.25,
63
61
  n_jobs: int = -1,
64
- umap_random_state: Optional[int] = 40476,
62
+ umap_random_state: Optional[int] = None,
65
63
  distance_function: str = "tanimoto",
66
64
  ) -> pd.DataFrame:
67
65
  """Compute fingerprints (CPU) and create 2D UMAP coordinates (CPU).
@@ -220,17 +218,16 @@ def create_chem_space_umap_gpu(
220
218
  show_progress=show_progress,
221
219
  )
222
220
 
223
- # Convert to numeric matrix.
224
- fps_csr = fingerprints_to_csr(fingerprints).X
225
- fps = fps_csr.toarray()
221
+ # Convert to sparse array
222
+ # fps_csr = fingerprints_to_csr(fingerprints).X
226
223
 
227
224
  # Reduce memory footprint (works well for count fingerprints)
228
225
  if not log_count:
229
226
  # stays integer-like
230
- fps = fps.astype(np.int8, copy=False)
227
+ fps = fingerprints.astype(np.int8, copy=False)
231
228
  else:
232
229
  # log1p returns float
233
- fps = np.log1p(fps).astype(np.float32, copy=False)
230
+ fps = np.log1p(fingerprints).astype(np.float32, copy=False)
234
231
 
235
232
  umap_model = cuUMAP(
236
233
  n_neighbors=int(n_neighbors),
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "chemap"
3
- version = "0.3.1"
3
+ version = "0.3.2"
4
4
  description = "Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations. "
5
5
  authors = [
6
6
  { name="Florian Huber", email="florian.huber@hs-duesseldorf.de" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes