chemap 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chemap-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 matchms
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
chemap-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.4
2
+ Name: chemap
3
+ Version: 0.1.0
4
+ Summary: Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations.
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Author: Florian Huber
8
+ Author-email: florian.huber@hs-duesseldorf.de
9
+ Requires-Python: >=3.11,<3.14
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Dist: map4 (>=1.1.3)
13
+ Requires-Dist: matplotlib (>=3.10.1)
14
+ Requires-Dist: numba (>=0.61.2)
15
+ Requires-Dist: numpy (>=2.1.0)
16
+ Requires-Dist: pandas (>=2.2.1)
17
+ Requires-Dist: pooch (>=1.8.2)
18
+ Requires-Dist: rdkit (>=2024.9.6)
19
+ Requires-Dist: scikit-fingerprints (>=1.15.0)
20
+ Requires-Dist: scipy (>=1.14.2)
21
+ Requires-Dist: tqdm (>=4.67.1)
22
+ Description-Content-Type: text/markdown
23
+
24
+
25
+ <img src="./materials/chemap_logo_green_pink.svg" width="400">
26
+
27
+ ![GitHub License](https://img.shields.io/github/license/matchms/chemap?color=#00B050)
28
+ [![PyPI](https://img.shields.io/pypi/v/chemap?color=#00B050)](https://pypi.org/project/chemap/)
29
+ ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/matchms/chemap/CI_build_and_matrix_tests.yml?color=#00B050)
30
+ [![Powered by RDKit](https://img.shields.io/badge/Powered%20by-RDKit-3838ff.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQBAMAAADt3eJSAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAAFVBMVEXc3NwUFP8UPP9kZP+MjP+0tP////9ZXZotAAAAAXRSTlMAQObYZgAAAAFiS0dEBmFmuH0AAAAHdElNRQfmAwsPGi+MyC9RAAAAQElEQVQI12NgQABGQUEBMENISUkRLKBsbGwEEhIyBgJFsICLC0iIUdnExcUZwnANQWfApKCK4doRBsKtQFgKAQC5Ww1JEHSEkAAAACV0RVh0ZGF0ZTpjcmVhdGUAMjAyMi0wMy0xMVQxNToyNjo0NyswMDowMDzr2J4AAAAldEVYdGRhdGU6bW9kaWZ5ADIwMjItMDMtMTFUMTU6MjY6NDcrMDA6MDBNtmAiAAAAAElFTkSuQmCC)](https://www.rdkit.org/)
31
+
32
+ # chemap - Chemical Mapping
33
+ Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations.
34
+
35
+
36
+ ## Fingerprint computations
37
+ Fingerprints can be computed using generators from `RDKit` or `scikit-fingerprints`. Here is a code example:
38
+
39
+ ```python
40
+ import numpy as np
41
+ import scipy.sparse as sp
42
+ from rdkit.Chem import rdFingerprintGenerator
43
+ from skfp.fingerprints import MAPFingerprint, AtomPairFingerprint
44
+
45
+ from chemap import compute_fingerprints, DatasetLoader, FingerprintConfig
46
+
47
+
48
+ ds_loader = DatasetLoader()
49
+ smiles = ds_loader.load("tests/data/smiles.csv")
50
+
51
+ # ----------------------------
52
+ # RDKit: Morgan (folded, dense)
53
+ # ----------------------------
54
+ morgan = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)
55
+ X_morgan = compute_fingerprints(
56
+ smiles,
57
+ morgan,
58
+ config=FingerprintConfig(
59
+ count=False,
60
+ folded=True,
61
+ return_csr=False, # dense numpy
62
+ invalid_policy="raise",
63
+ ),
64
+ )
65
+ print("RDKit Morgan:", X_morgan.shape, X_morgan.dtype)
66
+
67
+ # -----------------------------------
68
+ # RDKit: RDKitFP (folded, CSR sparse)
69
+ # -----------------------------------
70
+ rdkitfp = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=4096)
71
+ X_rdkitfp_csr = compute_fingerprints(
72
+ smiles,
73
+ rdkitfp,
74
+ config=FingerprintConfig(
75
+ count=False,
76
+ folded=True,
77
+ return_csr=True, # SciPy CSR
78
+ invalid_policy="raise",
79
+ ),
80
+ )
81
+ assert sp.issparse(X_rdkitfp_csr)
82
+ print("RDKit RDKitFP (CSR):", X_rdkitfp_csr.shape, X_rdkitfp_csr.dtype, "nnz=", X_rdkitfp_csr.nnz)
83
+
84
+ # --------------------------------------------------
85
+ # scikit-fingerprints: MAPFingerprint (folded, dense)
86
+ # --------------------------------------------------
87
+ # MAPFingerprint is a MinHash-like fingerprint (different from MAP4 lib).
88
+ map_fp = MAPFingerprint(fp_size=4096, count=False, sparse=False)
89
+ X_map = compute_fingerprints(
90
+ smiles,
91
+ map_fp,
92
+ config=FingerprintConfig(
93
+ count=False,
94
+ folded=True,
95
+ return_csr=False,
96
+ invalid_policy="raise",
97
+ ),
98
+ )
99
+ print("skfp MAPFingerprint:", X_map.shape, X_map.dtype)
100
+
101
+ # ----------------------------------------------------
102
+ # scikit-fingerprints: AtomPairFingerprint (folded, CSR)
103
+ # ----------------------------------------------------
104
+ atom_pair = AtomPairFingerprint(fp_size=4096, count=False, sparse=False, use_3D=False)
105
+ X_ap_csr = compute_fingerprints(
106
+ smiles,
107
+ atom_pair,
108
+ config=FingerprintConfig(
109
+ count=False,
110
+ folded=True,
111
+ return_csr=True,
112
+ invalid_policy="raise",
113
+ ),
114
+ )
115
+ assert sp.issparse(X_ap_csr)
116
+ print("skfp AtomPair (CSR):", X_ap_csr.shape, X_ap_csr.dtype, "nnz=", X_ap_csr.nnz)
117
+
118
+ # (Optional) convert CSR -> dense if you need a NumPy array downstream:
119
+ X_ap = X_ap_csr.toarray().astype(np.float32, copy=False)
120
+
121
+
122
+ ```
123
+
chemap-0.1.0/README.md ADDED
@@ -0,0 +1,99 @@
1
+
2
+ <img src="./materials/chemap_logo_green_pink.svg" width="400">
3
+
4
+ ![GitHub License](https://img.shields.io/github/license/matchms/chemap?color=#00B050)
5
+ [![PyPI](https://img.shields.io/pypi/v/chemap?color=#00B050)](https://pypi.org/project/chemap/)
6
+ ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/matchms/chemap/CI_build_and_matrix_tests.yml?color=#00B050)
7
+ [![Powered by RDKit](https://img.shields.io/badge/Powered%20by-RDKit-3838ff.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQBAMAAADt3eJSAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAAFVBMVEXc3NwUFP8UPP9kZP+MjP+0tP////9ZXZotAAAAAXRSTlMAQObYZgAAAAFiS0dEBmFmuH0AAAAHdElNRQfmAwsPGi+MyC9RAAAAQElEQVQI12NgQABGQUEBMENISUkRLKBsbGwEEhIyBgJFsICLC0iIUdnExcUZwnANQWfApKCK4doRBsKtQFgKAQC5Ww1JEHSEkAAAACV0RVh0ZGF0ZTpjcmVhdGUAMjAyMi0wMy0xMVQxNToyNjo0NyswMDowMDzr2J4AAAAldEVYdGRhdGU6bW9kaWZ5ADIwMjItMDMtMTFUMTU6MjY6NDcrMDA6MDBNtmAiAAAAAElFTkSuQmCC)](https://www.rdkit.org/)
8
+
9
+ # chemap - Chemical Mapping
10
+ Library for computing molecular fingerprint based similarities as well as dimensionality reduction based chemical space visualizations.
11
+
12
+
13
+ ## Fingerprint computations
14
+ Fingerprints can be computed using generators from `RDKit` or `scikit-fingerprints`. Here is a code example:
15
+
16
+ ```python
17
+ import numpy as np
18
+ import scipy.sparse as sp
19
+ from rdkit.Chem import rdFingerprintGenerator
20
+ from skfp.fingerprints import MAPFingerprint, AtomPairFingerprint
21
+
22
+ from chemap import compute_fingerprints, DatasetLoader, FingerprintConfig
23
+
24
+
25
+ ds_loader = DatasetLoader()
26
+ smiles = ds_loader.load("tests/data/smiles.csv")
27
+
28
+ # ----------------------------
29
+ # RDKit: Morgan (folded, dense)
30
+ # ----------------------------
31
+ morgan = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)
32
+ X_morgan = compute_fingerprints(
33
+ smiles,
34
+ morgan,
35
+ config=FingerprintConfig(
36
+ count=False,
37
+ folded=True,
38
+ return_csr=False, # dense numpy
39
+ invalid_policy="raise",
40
+ ),
41
+ )
42
+ print("RDKit Morgan:", X_morgan.shape, X_morgan.dtype)
43
+
44
+ # -----------------------------------
45
+ # RDKit: RDKitFP (folded, CSR sparse)
46
+ # -----------------------------------
47
+ rdkitfp = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=4096)
48
+ X_rdkitfp_csr = compute_fingerprints(
49
+ smiles,
50
+ rdkitfp,
51
+ config=FingerprintConfig(
52
+ count=False,
53
+ folded=True,
54
+ return_csr=True, # SciPy CSR
55
+ invalid_policy="raise",
56
+ ),
57
+ )
58
+ assert sp.issparse(X_rdkitfp_csr)
59
+ print("RDKit RDKitFP (CSR):", X_rdkitfp_csr.shape, X_rdkitfp_csr.dtype, "nnz=", X_rdkitfp_csr.nnz)
60
+
61
+ # --------------------------------------------------
62
+ # scikit-fingerprints: MAPFingerprint (folded, dense)
63
+ # --------------------------------------------------
64
+ # MAPFingerprint is a MinHash-like fingerprint (different from MAP4 lib).
65
+ map_fp = MAPFingerprint(fp_size=4096, count=False, sparse=False)
66
+ X_map = compute_fingerprints(
67
+ smiles,
68
+ map_fp,
69
+ config=FingerprintConfig(
70
+ count=False,
71
+ folded=True,
72
+ return_csr=False,
73
+ invalid_policy="raise",
74
+ ),
75
+ )
76
+ print("skfp MAPFingerprint:", X_map.shape, X_map.dtype)
77
+
78
+ # ----------------------------------------------------
79
+ # scikit-fingerprints: AtomPairFingerprint (folded, CSR)
80
+ # ----------------------------------------------------
81
+ atom_pair = AtomPairFingerprint(fp_size=4096, count=False, sparse=False, use_3D=False)
82
+ X_ap_csr = compute_fingerprints(
83
+ smiles,
84
+ atom_pair,
85
+ config=FingerprintConfig(
86
+ count=False,
87
+ folded=True,
88
+ return_csr=True,
89
+ invalid_policy="raise",
90
+ ),
91
+ )
92
+ assert sp.issparse(X_ap_csr)
93
+ print("skfp AtomPair (CSR):", X_ap_csr.shape, X_ap_csr.dtype, "nnz=", X_ap_csr.nnz)
94
+
95
+ # (Optional) convert CSR -> dense if you need a NumPy array downstream:
96
+ X_ap = X_ap_csr.toarray().astype(np.float32, copy=False)
97
+
98
+
99
+ ```
@@ -0,0 +1,9 @@
1
"""Public package interface for chemap."""

from .data_loader import DatasetLoader
from .fingerprint_computation import FingerprintConfig, compute_fingerprints

# Names exported via ``from chemap import *``.
__all__ = [
    "FingerprintConfig",
    "compute_fingerprints",
    "DatasetLoader",
]
@@ -0,0 +1,162 @@
1
+ """
2
+ Module: approx_nn
3
+ -----------------
4
+
5
+ This module defines a function for computing approximate nearest neighbors
6
+ from a list of SMILES strings. It uses two different fingerprints:
7
+ - A dense (1024-bit) fingerprint for dimensionality reduction via PCA.
8
+ - A sparse (4096-bit) fingerprint for a refined nearest neighbor search
9
+ based on a Ruzicka similarity.
10
+
11
+ The general steps are:
12
+ 1. Compute fingerprints from SMILES using RDKit's Morgan generator.
13
+ 2. Scale and reduce the dense fingerprints with PCA.
14
+ 3. Build an approximate NN graph on the PCA vectors.
15
+ 4. Refine the neighbor search using a Ruzicka-based candidate search.
16
+ """
17
+
18
+ import time
19
+ from typing import Any, List, Tuple
20
+ import numba
21
+ import numpy as np
22
+ from fingerprint_computation import compute_fingerprints_from_smiles
23
+ from metrics import ruzicka_similarity_sparse_numba
24
+ from numba import prange
25
+ from pynndescent import NNDescent
26
+ from rdkit.Chem import rdFingerprintGenerator
27
+ from sklearn.decomposition import PCA
28
+ from sklearn.pipeline import Pipeline
29
+ from sklearn.preprocessing import StandardScaler
30
+
31
+
32
def compound_nearest_neighbors(
    smiles: List[str], k_pca: int = 500, k_morgan: int = 100
) -> Tuple[Any, Any]:
    """
    Compute approximate nearest neighbors for a list of SMILES strings.

    Two Morgan fingerprint variants are computed: a dense 1024-bit count
    fingerprint that feeds a PCA-based coarse search, and a sparse 4096-bit
    count fingerprint used for the refined Ruzicka-based neighbor search.

    Parameters:
        smiles (List[str]): List of SMILES strings representing molecules.
        k_pca (int): Number of neighbors used for the initial approximate NN
            search (on PCA vectors). Must be larger than k_morgan. Default: 500.
        k_morgan (int): Number of neighbors kept by the refined Ruzicka-based
            search. Default: 100.

    Returns:
        Tuple[Any, Any]: (order, scores) where ``order`` holds neighbor
        indices and ``scores`` the corresponding similarity scores.
    """
    # The coarse search must return more candidates than the refined search keeps.
    assert k_pca > k_morgan, "Expected k_pca to be larger than k_morgan"

    t_start = time.time()
    print(">" * 20, "Compute fingerprints")
    # Dense 1024-bit count fingerprints for the PCA stage.
    generator_dense = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=1024)
    fingerprints_dense_1024 = compute_fingerprints_from_smiles(
        smiles, generator_dense, count=True, sparse=False, progress_bar=True,
    )

    # Sparse 4096-bit count fingerprints for the refined Ruzicka search.
    generator_sparse = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)
    fingerprints_sparse = compute_fingerprints_from_smiles(
        smiles, generator_sparse, count=True, sparse=True, progress_bar=True,
    )
    print(f"Took: {(time.time() - t_start):.4f} s.")

    order, scores = compute_approx_nearest_neighbors(
        fingerprints_dense_1024, fingerprints_sparse, k_pca, k_morgan
    )
    return order, scores
77
+
78
+
79
def compute_approx_nearest_neighbors(
    fingerprints_coarse, fingerprints_fine, k_pca: int = 500, k_morgan: int = 100
) -> Tuple[Any, Any]:
    """
    Build a coarse approximate NN graph, then refine it with Ruzicka scoring.

    Parameters
    ----------
    fingerprints_coarse:
        Dense fingerprint matrix; standardized and PCA-reduced before the
        approximate (cosine) neighbor search.
    fingerprints_fine:
        Sparse fingerprints used for the refined Ruzicka-based search.
    k_pca:
        Number of neighbors in the coarse NNDescent graph. Default: 500.
    k_morgan:
        Number of neighbors kept after Ruzicka refinement. Default: 100.

    Returns
    -------
    Tuple[Any, Any]: (order, scores) from the refined candidate search.
    """
    t_start = time.time()
    print(">" * 20, "Compute PCA vectors")
    # Standardize features, then project onto the first 100 principal components.
    pipeline = Pipeline(
        steps=[("scaler", StandardScaler()), ("pca", PCA(n_components=100))]
    )
    pca_vectors = pipeline.fit_transform(fingerprints_coarse)
    print(f"Took: {(time.time() - t_start):.4f} s.")

    t_start = time.time()
    print(">" * 20, f"Build NN-graph ({k_pca} neighbors)")
    ann_graph = NNDescent(pca_vectors, metric="cosine", n_neighbors=k_pca)
    print(f"Took: {(time.time() - t_start):.4f} s.")

    t_start = time.time()
    print(">" * 20, f"Build Ruzicka based NN-graph ({k_morgan} neighbors)")
    # neighbor_graph[0] is the index matrix of the approximate neighbor graph.
    order, scores = ruzicka_candidate_search(
        fingerprints_fine, fingerprints_fine,
        ann_graph.neighbor_graph[0],
        k_morgan,
    )
    print(f"Took: {(time.time() - t_start):.4f} s.")

    return order, scores
106
+
107
+
108
@numba.jit(nopython=True)
def ruzicka_candidate_search(
    references: list, queries: list,
    knn_indices_approx: list,
    k
) -> Tuple[np.ndarray, np.ndarray]:
    """Refine approximate neighbor candidates with exact Ruzicka scoring.

    For each query, its approximate neighbors are re-scored with the
    Ruzicka distance and the ``k`` best (lowest distance) are kept.

    Parameters
    ----------
    references:
        List of sparse fingerprints (tuple of two arrays: keys and counts).
    queries:
        List of sparse fingerprints (tuple of two arrays: keys and counts).
    knn_indices_approx:
        Per-query arrays of approximate neighbor indices (e.g. from an
        NNDescent neighbor graph) to be re-scored.
    k:
        Number of refined neighbors to keep per query.

    Returns
    -------
    candidate_idx:
        (n_queries, k) integer array of refined neighbor indices.
    candidate_scores:
        (n_queries, k) array of Ruzicka distances (1 - similarity),
        sorted ascending per row.
    """
    size = len(queries)
    # Bug fix: the dtype keyword was misspelled ("dtyoe") and commented out,
    # so neighbor *indices* were stored in a float64 array.
    candidate_idx = np.zeros((size, k), dtype=np.int64)
    candidate_scores = np.zeros((size, k), dtype=np.float64)
    for i, knn_indices in enumerate(knn_indices_approx):
        # Use a distinct comprehension variable ``j`` so it cannot be
        # confused with the outer query index ``i``.
        order, scores = ruzicka_similarity_query_search(
            [references[j] for j in knn_indices],
            queries[i], k
        )
        candidate_idx[i, :] = knn_indices[order]
        candidate_scores[i, :] = scores
    return candidate_idx, candidate_scores
135
+
136
+
137
@numba.jit(nopython=True, fastmath=True, parallel=True)
def ruzicka_similarity_query_search(
    candidates: list, query, k) -> Tuple[np.ndarray, np.ndarray]:
    """Return the ``k`` candidates closest to ``query`` by Ruzicka distance.

    (Docstring fixed: the previous version claimed an all-vs-all similarity
    matrix and documented nonexistent ``references``/``queries`` parameters.)

    Parameters
    ----------
    candidates:
        List of sparse fingerprints (tuple of two arrays: keys and counts).
    query:
        Sparse fingerprint (tuple of two arrays: keys and counts).
    k:
        Number of best-matching candidates to return.

    Returns
    -------
    order:
        Indices into ``candidates`` of the k lowest-distance entries,
        sorted ascending by distance.
    distances:
        The corresponding Ruzicka distances (1 - similarity).
    """
    size1 = len(candidates)
    distances = np.zeros(size1)
    # Candidate distances are computed in parallel (numba prange).
    for i in prange(size1):
        distances[i] = 1 - ruzicka_similarity_sparse_numba(
            candidates[i][0], candidates[i][1],
            query[0], query[1])
    order = np.argsort(distances)[:k]
    return order, distances[order]
@@ -0,0 +1,95 @@
1
+ import os
2
+ import pathlib
3
+ import pandas as pd
4
+ import pooch
5
+
6
+
7
class DatasetLoader:
    """Load SMILES datasets from local files or remote URLs."""

    def __init__(self, cache_dir="./data_cache"):
        # Directory where pooch caches files downloaded by _from_web.
        self.cache_dir = cache_dir

    def load(self, source: str, **kwargs) -> list:
        """
        Loads a dataset from local file or web.

        Parameters
        -------------
        source:
            Either a local file path or a hyperlink pointing to a remote file.
            Supported filetypes: .csv, .json, .parquet, .pq, .xls, .xlsx.
        **kwargs:
            ``smiles_column`` selects the SMILES column (default "smiles");
            ``hash`` (remote sources only) is used for download verification.

        Returns
        -------------
        list of smiles strings.

        Raises
        -------------
        ValueError if neither local file nor http/ftp/sftp.
        """
        if os.path.exists(source):
            return self._from_local_file(source, **kwargs)
        elif source.startswith(("http", "ftp", "sftp")):
            return self._from_web(source, **kwargs)
        else:
            raise ValueError(f"Source {source} unknown.")

    def _from_local_file(self, path, smiles_column: str = "smiles") -> list:
        """
        Loads a dataset from local file.

        Parameters
        -------------
        path:
            string of local file path.

        smiles_column:
            Name of column containing smiles. Defaults to "smiles".

        Returns
        -------------
        list of smiles strings.

        Raises
        -------------
        ValueError if file type unsupported.
        ValueError if smiles column not present.
        """
        suffix = pathlib.Path(path).suffix.lower()

        # Dispatch table: file suffix -> pandas reader.
        readers = {
            ".csv": pd.read_csv,
            ".json": pd.read_json,
            ".parquet": pd.read_parquet,
            ".pq": pd.read_parquet,
            ".xlsx": pd.read_excel,
            ".xls": pd.read_excel,
        }
        reader = readers.get(suffix)
        if reader is None:
            raise ValueError(f"Fileformat {suffix} not supported.")
        df = reader(path)

        if smiles_column not in df.columns:
            raise ValueError(f"Smiles column {smiles_column} not in dataframe.")

        return df[smiles_column].tolist()

    def _from_web(self, url: str, **kwargs) -> list:
        """
        Loads a dataset from web.

        Parameters
        -------------
        url:
            string of url.

        Returns
        -------------
        list of smiles strings.
        """
        # Bug fix: pop the download hash so it is not forwarded to
        # _from_local_file, which would raise TypeError on the unexpected
        # keyword whenever a hash was supplied.
        known_hash = kwargs.pop("hash", None)
        file_path = pooch.retrieve(
            url=url,
            known_hash=known_hash,
            path=self.cache_dir,
            progressbar=True,
        )

        return self._from_local_file(file_path, **kwargs)