chemap-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemap-0.1.0/LICENSE +21 -0
- chemap-0.1.0/PKG-INFO +123 -0
- chemap-0.1.0/README.md +99 -0
- chemap-0.1.0/chemap/__init__.py +9 -0
- chemap-0.1.0/chemap/approx_nn.py +162 -0
- chemap-0.1.0/chemap/data_loader.py +95 -0
- chemap-0.1.0/chemap/fingerprint_computation.py +604 -0
- chemap-0.1.0/chemap/fingerprint_statistics.py +70 -0
- chemap-0.1.0/chemap/mbp.py +338 -0
- chemap-0.1.0/chemap/metrics.py +460 -0
- chemap-0.1.0/chemap/utils.py +127 -0
- chemap-0.1.0/chemap/visualizations.py +301 -0
- chemap-0.1.0/pyproject.toml +93 -0
chemap-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 matchms
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
chemap-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,123 @@
+Metadata-Version: 2.4
+Name: chemap
+Version: 0.1.0
+Summary: Library for computing molecular fingerprint-based similarities as well as dimensionality-reduction-based chemical space visualizations.
+License-Expression: MIT
+License-File: LICENSE
+Author: Florian Huber
+Author-email: florian.huber@hs-duesseldorf.de
+Requires-Python: >=3.11,<3.14
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Requires-Dist: map4 (>=1.1.3)
+Requires-Dist: matplotlib (>=3.10.1)
+Requires-Dist: numba (>=0.61.2)
+Requires-Dist: numpy (>=2.1.0)
+Requires-Dist: pandas (>=2.2.1)
+Requires-Dist: pooch (>=1.8.2)
+Requires-Dist: rdkit (>=2024.9.6)
+Requires-Dist: scikit-fingerprints (>=1.15.0)
+Requires-Dist: scipy (>=1.14.2)
+Requires-Dist: tqdm (>=4.67.1)
+Description-Content-Type: text/markdown
+
+
+<img src="./materials/chemap_logo_green_pink.svg" width="400">
+
+
+[](https://pypi.org/project/chemap/)
+
+[](https://www.rdkit.org/)
+
+# chemap - Chemical Mapping
+Library for computing molecular fingerprint-based similarities as well as dimensionality-reduction-based chemical space visualizations.
+
+
+## Fingerprint computations
+Fingerprints can be computed using generators from `RDKit` or `scikit-fingerprints`. Here is a code example:
+
+```python
+import numpy as np
+import scipy.sparse as sp
+from rdkit.Chem import rdFingerprintGenerator
+from skfp.fingerprints import MAPFingerprint, AtomPairFingerprint
+
+from chemap import compute_fingerprints, DatasetLoader, FingerprintConfig
+
+
+ds_loader = DatasetLoader()
+smiles = ds_loader.load("tests/data/smiles.csv")
+
+# ----------------------------
+# RDKit: Morgan (folded, dense)
+# ----------------------------
+morgan = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)
+X_morgan = compute_fingerprints(
+    smiles,
+    morgan,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=False,  # dense numpy
+        invalid_policy="raise",
+    ),
+)
+print("RDKit Morgan:", X_morgan.shape, X_morgan.dtype)
+
+# -----------------------------------
+# RDKit: RDKitFP (folded, CSR sparse)
+# -----------------------------------
+rdkitfp = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=4096)
+X_rdkitfp_csr = compute_fingerprints(
+    smiles,
+    rdkitfp,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=True,  # SciPy CSR
+        invalid_policy="raise",
+    ),
+)
+assert sp.issparse(X_rdkitfp_csr)
+print("RDKit RDKitFP (CSR):", X_rdkitfp_csr.shape, X_rdkitfp_csr.dtype, "nnz=", X_rdkitfp_csr.nnz)
+
+# --------------------------------------------------
+# scikit-fingerprints: MAPFingerprint (folded, dense)
+# --------------------------------------------------
+# MAPFingerprint is a MinHash-like fingerprint (different from the MAP4 library).
+map_fp = MAPFingerprint(fp_size=4096, count=False, sparse=False)
+X_map = compute_fingerprints(
+    smiles,
+    map_fp,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=False,
+        invalid_policy="raise",
+    ),
+)
+print("skfp MAPFingerprint:", X_map.shape, X_map.dtype)
+
+# ----------------------------------------------------
+# scikit-fingerprints: AtomPairFingerprint (folded, CSR)
+# ----------------------------------------------------
+atom_pair = AtomPairFingerprint(fp_size=4096, count=False, sparse=False, use_3D=False)
+X_ap_csr = compute_fingerprints(
+    smiles,
+    atom_pair,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=True,
+        invalid_policy="raise",
+    ),
+)
+assert sp.issparse(X_ap_csr)
+print("skfp AtomPair (CSR):", X_ap_csr.shape, X_ap_csr.dtype, "nnz=", X_ap_csr.nnz)
+
+# (Optional) convert CSR -> dense if you need a NumPy array downstream:
+X_ap = X_ap_csr.toarray().astype(np.float32, copy=False)
+
+
+```
+
chemap-0.1.0/README.md
ADDED
@@ -0,0 +1,99 @@
+
+<img src="./materials/chemap_logo_green_pink.svg" width="400">
+
+
+[](https://pypi.org/project/chemap/)
+
+[](https://www.rdkit.org/)
+
+# chemap - Chemical Mapping
+Library for computing molecular fingerprint-based similarities as well as dimensionality-reduction-based chemical space visualizations.
+
+
+## Fingerprint computations
+Fingerprints can be computed using generators from `RDKit` or `scikit-fingerprints`. Here is a code example:
+
+```python
+import numpy as np
+import scipy.sparse as sp
+from rdkit.Chem import rdFingerprintGenerator
+from skfp.fingerprints import MAPFingerprint, AtomPairFingerprint
+
+from chemap import compute_fingerprints, DatasetLoader, FingerprintConfig
+
+
+ds_loader = DatasetLoader()
+smiles = ds_loader.load("tests/data/smiles.csv")
+
+# ----------------------------
+# RDKit: Morgan (folded, dense)
+# ----------------------------
+morgan = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)
+X_morgan = compute_fingerprints(
+    smiles,
+    morgan,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=False,  # dense numpy
+        invalid_policy="raise",
+    ),
+)
+print("RDKit Morgan:", X_morgan.shape, X_morgan.dtype)
+
+# -----------------------------------
+# RDKit: RDKitFP (folded, CSR sparse)
+# -----------------------------------
+rdkitfp = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=4096)
+X_rdkitfp_csr = compute_fingerprints(
+    smiles,
+    rdkitfp,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=True,  # SciPy CSR
+        invalid_policy="raise",
+    ),
+)
+assert sp.issparse(X_rdkitfp_csr)
+print("RDKit RDKitFP (CSR):", X_rdkitfp_csr.shape, X_rdkitfp_csr.dtype, "nnz=", X_rdkitfp_csr.nnz)
+
+# --------------------------------------------------
+# scikit-fingerprints: MAPFingerprint (folded, dense)
+# --------------------------------------------------
+# MAPFingerprint is a MinHash-like fingerprint (different from the MAP4 library).
+map_fp = MAPFingerprint(fp_size=4096, count=False, sparse=False)
+X_map = compute_fingerprints(
+    smiles,
+    map_fp,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=False,
+        invalid_policy="raise",
+    ),
+)
+print("skfp MAPFingerprint:", X_map.shape, X_map.dtype)
+
+# ----------------------------------------------------
+# scikit-fingerprints: AtomPairFingerprint (folded, CSR)
+# ----------------------------------------------------
+atom_pair = AtomPairFingerprint(fp_size=4096, count=False, sparse=False, use_3D=False)
+X_ap_csr = compute_fingerprints(
+    smiles,
+    atom_pair,
+    config=FingerprintConfig(
+        count=False,
+        folded=True,
+        return_csr=True,
+        invalid_policy="raise",
+    ),
+)
+assert sp.issparse(X_ap_csr)
+print("skfp AtomPair (CSR):", X_ap_csr.shape, X_ap_csr.dtype, "nnz=", X_ap_csr.nnz)
+
+# (Optional) convert CSR -> dense if you need a NumPy array downstream:
+X_ap = X_ap_csr.toarray().astype(np.float32, copy=False)
+
+
+```
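The README stops at fingerprint computation; the dimensionality-reduction side mentioned in the description lives in `chemap/visualizations.py` (listed in the index above) but is not shown here. As a rough, hypothetical illustration of what such a chemical-space map involves, using only RDKit, scikit-learn, and matplotlib rather than chemap's own API:

```python
# Hypothetical sketch, not chemap's API: a bare-bones chemical-space map
# built from Morgan fingerprints and a 2D PCA projection.
import matplotlib.pyplot as plt
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.decomposition import PCA

smiles = ["CCO", "CC(=O)O", "c1ccccc1", "CCN(CC)CC", "CC(C)O", "c1ccncc1"]
gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fps = np.array([gen.GetFingerprintAsNumPy(Chem.MolFromSmiles(s)) for s in smiles])

# Project the high-dimensional bit vectors onto two principal components.
coords = PCA(n_components=2).fit_transform(fps)

plt.scatter(coords[:, 0], coords[:, 1])
for (x, y), s in zip(coords, smiles):
    plt.annotate(s, (x, y))
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()
```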
chemap-0.1.0/chemap/approx_nn.py
ADDED
@@ -0,0 +1,162 @@
+"""
+Module: approx_nn
+-----------------
+
+This module defines a function for computing approximate nearest neighbors
+from a list of SMILES strings. It uses two different fingerprints:
+- A dense (1024-bit) fingerprint for dimensionality reduction via PCA.
+- A sparse (4096-bit) fingerprint for a refined nearest neighbor search
+  based on a Ruzicka similarity.
+
+The general steps are:
+1. Compute fingerprints from SMILES using RDKit's Morgan generator.
+2. Scale and reduce the dense fingerprints with PCA.
+3. Build an approximate NN graph on the PCA vectors.
+4. Refine the neighbor search using a Ruzicka-based candidate search.
+"""
+
+import time
+from typing import Any, List, Tuple
+import numba
+import numpy as np
+from fingerprint_computation import compute_fingerprints_from_smiles
+from metrics import ruzicka_similarity_sparse_numba
+from numba import prange
+from pynndescent import NNDescent
+from rdkit.Chem import rdFingerprintGenerator
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+
+def compound_nearest_neighbors(
+    smiles: List[str], k_pca: int = 500, k_morgan: int = 100
+) -> Tuple[Any, Any]:
+    """
+    Compute approximate nearest neighbors for a list of SMILES strings.
+
+    This function computes two sets of molecular fingerprints using RDKit's Morgan
+    generator (one dense and one sparse), reduces the dimensionality of the dense
+    fingerprints via PCA, builds an approximate nearest neighbor graph using NNDescent,
+    and finally refines the search with a Ruzicka-based candidate search.
+
+    Parameters:
+        smiles (List[str]): List of SMILES strings representing molecules.
+        k_pca (int): Number of neighbors used for the initial approximate NN search
+            (using PCA vectors). Must be larger than k_morgan. Default: 500.
+        k_morgan (int): Number of neighbors used for the refined Ruzicka-based search.
+            Default: 100.
+
+    Returns:
+        Tuple[Any, Any]: A tuple (order, scores) where:
+            - order: An array-like structure of neighbor indices.
+            - scores: An array-like structure of Ruzicka distances (1 - similarity) for those neighbors.
+    """
+    # Validate input: ensure that the PCA search uses more neighbors than the refined search.
+    assert k_pca > k_morgan, "Expected k_pca to be larger than k_morgan"
+
+    t_start = time.time()
+    print(">" * 20, "Compute fingerprints")
+    # Compute dense fingerprints (1024 bits) for PCA.
+    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=1024)
+    fingerprints_morgan3_count_1024 = compute_fingerprints_from_smiles(
+        smiles, fpgen, count=True, sparse=False, progress_bar=True,
+    )
+
+    # Compute sparse fingerprints for refined neighbor search.
+    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)
+    fingerprints_morgan3_count_sparse = compute_fingerprints_from_smiles(
+        smiles, fpgen, count=True, sparse=True, progress_bar=True,
+    )
+    print(f"Took: {(time.time() - t_start):.4f} s.")
+
+    order, scores = compute_approx_nearest_neighbors(
+        fingerprints_morgan3_count_1024, fingerprints_morgan3_count_sparse, k_pca, k_morgan
+    )
+    return order, scores
+
+
+def compute_approx_nearest_neighbors(
+    fingerprints_coarse, fingerprints_fine, k_pca: int = 500, k_morgan: int = 100
+) -> Tuple[Any, Any]:
+
+    t_start = time.time()
+    print(">" * 20, "Compute PCA vectors")
+    pca = PCA(n_components=100)
+    scaler = StandardScaler()
+    pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca)])
+    pca_vectors = pipe.fit_transform(fingerprints_coarse)
+    print(f"Took: {(time.time() - t_start):.4f} s.")
+
+    t_start = time.time()
+    print(">" * 20, f"Build NN-graph ({k_pca} neighbors)")
+    ann_graph = NNDescent(pca_vectors, metric="cosine", n_neighbors=k_pca)
+    print(f"Took: {(time.time() - t_start):.4f} s.")
+
+    t_start = time.time()
+    print(">" * 20, f"Build Ruzicka based NN-graph ({k_morgan} neighbors)")
+    order, scores = ruzicka_candidate_search(
+        fingerprints_fine, fingerprints_fine,
+        ann_graph.neighbor_graph[0],
+        k_morgan,
+    )
+    print(f"Took: {(time.time() - t_start):.4f} s.")
+
+    return order, scores
+
+
+@numba.jit(nopython=True)
+def ruzicka_candidate_search(
+    references: list, queries: list,
+    knn_indices_approx: list,
+    k
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Search the approximate-neighbor candidates of each query and keep the k best.
+
+    Parameters
+    ----------
+    references:
+        List of sparse fingerprints (tuple of two arrays: keys and counts).
+    queries:
+        List of sparse fingerprints (tuple of two arrays: keys and counts).
+    """
+    size = len(queries)
+    candidate_idx = np.zeros((size, k))  # , dtype=np.int32)
+    candidate_scores = np.zeros((size, k), dtype=np.float64)
+    for i, knn_indices in enumerate(knn_indices_approx):
+
+        order, scores = ruzicka_similarity_query_search(
+            [references[j] for j in knn_indices],
+            queries[i], k
+        )
+        candidate_idx[i, :] = knn_indices[order]
+        candidate_scores[i, :] = scores
+    return candidate_idx, candidate_scores
+
+
+@numba.jit(nopython=True, fastmath=True, parallel=True)
+def ruzicka_similarity_query_search(
+    candidates: list, query, k) -> Tuple[np.ndarray, np.ndarray]:
+    """Return the k best candidates for a single query based on Ruzicka similarity.
+
+    Parameters
+    ----------
+    candidates:
+        List of sparse fingerprints (tuple of two arrays: keys and counts).
+    query:
+        Sparse fingerprint (tuple of two arrays: keys and counts).
+
+    Returns
+    -------
+    order, distances:
+        Indices of the k candidates closest to the query, and the corresponding
+        Ruzicka distances (1 - similarity), sorted in ascending order.
+    """
+    size1 = len(candidates)
+    distances = np.zeros(size1)
+    for i in prange(size1):
+        distances[i] = 1 - ruzicka_similarity_sparse_numba(
+            candidates[i][0], candidates[i][1],
+            query[0], query[1])
+    order = np.argsort(distances)[:k]
+    return order, distances[order]
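`approx_nn.py` is not covered in the README. For orientation, here is a hypothetical usage sketch of `compound_nearest_neighbors`; it assumes `pynndescent` and `scikit-learn` are installed alongside the listed dependencies and that the module's sibling imports (`fingerprint_computation`, `metrics`) resolve, and the data path simply mirrors the README. The Ruzicka similarity used in the refinement step is the weighted Jaccard similarity of two count fingerprints, sum(min(x, y)) / sum(max(x, y)).

```python
# Hypothetical usage sketch, not shipped with chemap 0.1.0.
from chemap import DatasetLoader
from chemap.approx_nn import compound_nearest_neighbors

# Needs a reasonably large dataset: the PCA step is hard-coded to 100 components
# and the coarse search asks for k_pca neighbors, so len(smiles) should exceed k_pca.
smiles = DatasetLoader().load("tests/data/smiles.csv")

order, scores = compound_nearest_neighbors(smiles, k_pca=500, k_morgan=100)

# order[i]: indices of the 100 refined nearest neighbors of compound i.
# scores[i]: the corresponding Ruzicka distances (1 - similarity), ascending.
print(order[0][:5], scores[0][:5])
```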
chemap-0.1.0/chemap/data_loader.py
ADDED
@@ -0,0 +1,95 @@
+import os
+import pathlib
+import pandas as pd
+import pooch
+
+
+class DatasetLoader:
+    def __init__(self, cache_dir="./data_cache"):
+        self.cache_dir = cache_dir
+
+    def load(self, source: str, **kwargs) -> list:
+        """
+        Loads a dataset from a local file or the web.
+
+        Parameters
+        ----------
+        source:
+            Either a local file path or a hyperlink pointing to a remote file.
+            Supported filetypes: .csv, .json, .parquet, .pq, .xls, .xlsx.
+
+        Returns
+        -------
+        list of smiles strings.
+
+        Raises
+        ------
+        ValueError if the source is neither a local file nor an http/ftp/sftp URL.
+        """
+        if os.path.exists(source):
+            return self._from_local_file(source, **kwargs)
+        elif source.startswith(("http", "ftp", "sftp")):
+            return self._from_web(source, **kwargs)
+        else:
+            raise ValueError(f"Source {source} unknown.")
+
+    def _from_local_file(self, path, smiles_column: str = "smiles") -> list:
+        """
+        Loads a dataset from a local file.
+
+        Parameters
+        ----------
+        path:
+            String of local file path.
+
+        smiles_column:
+            Name of the column containing smiles. Defaults to "smiles".
+
+        Returns
+        -------
+        list of smiles strings.
+
+        Raises
+        ------
+        ValueError if file type unsupported.
+        ValueError if smiles column not present.
+        """
+        suffix = pathlib.Path(path).suffix.lower()
+
+        if suffix == ".csv":
+            df = pd.read_csv(path)
+        elif suffix == ".json":
+            df = pd.read_json(path)
+        elif suffix in [".parquet", ".pq"]:
+            df = pd.read_parquet(path)
+        elif suffix in [".xlsx", ".xls"]:
+            df = pd.read_excel(path)
+        else:
+            raise ValueError(f"File format {suffix} not supported.")
+
+        if smiles_column not in df.columns:
+            raise ValueError(f"Smiles column {smiles_column} not in dataframe.")
+
+        return df[smiles_column].tolist()
+
+    def _from_web(self, url: str, **kwargs) -> list:
+        """
+        Loads a dataset from the web.
+
+        Parameters
+        ----------
+        url:
+            String of URL.
+
+        Returns
+        -------
+        list of smiles strings.
+        """
+        file_path = pooch.retrieve(
+            url=url,
+            known_hash=kwargs.pop("hash", None),  # pop so it is not forwarded to the file reader
+            path=self.cache_dir,
+            progressbar=True,
+        )
+
+        return self._from_local_file(file_path, **kwargs)
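To round off the data_loader module, a short hypothetical usage sketch of `DatasetLoader` (the local path mirrors the README; the URL is a placeholder, not a real dataset location):

```python
from chemap import DatasetLoader

loader = DatasetLoader(cache_dir="./data_cache")

# Local file with a "smiles" column (.csv, .json, .parquet/.pq, .xls, .xlsx).
smiles = loader.load("tests/data/smiles.csv")

# Remote file: fetched once via pooch and cached under cache_dir.
smiles_remote = loader.load("https://example.org/compounds.csv")
```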