leishmania-screen 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leishmania_screen-1.0.0/LICENSE +21 -0
- leishmania_screen-1.0.0/PKG-INFO +145 -0
- leishmania_screen-1.0.0/README.md +114 -0
- leishmania_screen-1.0.0/leishmania_screen/__init__.py +20 -0
- leishmania_screen-1.0.0/leishmania_screen/_features.py +133 -0
- leishmania_screen-1.0.0/leishmania_screen/_model.py +29 -0
- leishmania_screen-1.0.0/leishmania_screen/_predict.py +166 -0
- leishmania_screen-1.0.0/leishmania_screen/cli.py +136 -0
- leishmania_screen-1.0.0/leishmania_screen/data/NN_model.pth +0 -0
- leishmania_screen-1.0.0/leishmania_screen/data/__init__.py +0 -0
- leishmania_screen-1.0.0/leishmania_screen/data/pca_100_LD.pkl +0 -0
- leishmania_screen-1.0.0/leishmania_screen/data/scalers_LD.pkl +0 -0
- leishmania_screen-1.0.0/leishmania_screen/data/selected_features_LD.pkl +0 -0
- leishmania_screen-1.0.0/leishmania_screen/data/train_columns_LD.pkl +0 -0
- leishmania_screen-1.0.0/leishmania_screen.egg-info/PKG-INFO +145 -0
- leishmania_screen-1.0.0/leishmania_screen.egg-info/SOURCES.txt +22 -0
- leishmania_screen-1.0.0/leishmania_screen.egg-info/dependency_links.txt +1 -0
- leishmania_screen-1.0.0/leishmania_screen.egg-info/entry_points.txt +2 -0
- leishmania_screen-1.0.0/leishmania_screen.egg-info/requires.txt +6 -0
- leishmania_screen-1.0.0/leishmania_screen.egg-info/top_level.txt +2 -0
- leishmania_screen-1.0.0/pyproject.toml +62 -0
- leishmania_screen-1.0.0/setup.cfg +4 -0
- leishmania_screen-1.0.0/tests/__init__.py +0 -0
- leishmania_screen-1.0.0/tests/test_predict.py +111 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Belaguppa Manjunath Ashwin Desai (CEO, AikyaNova), Pronama Biswas, and Madhavi Bhatt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: leishmania-screen
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Neural network virtual screening for antileishmanial activity against Leishmania donovani
|
|
5
|
+
Author: Pronama Biswas, Madhavi Bhatt
|
|
6
|
+
Author-email: Belaguppa Manjunath Ashwin Desai <ceo@aikyanova.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Source Code, https://github.com/AikyaNova-Pvt-Ltd/leishmania-screen
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/AikyaNova-Pvt-Ltd/leishmania-screen/issues
|
|
10
|
+
Keywords: leishmania,drug discovery,virtual screening,cheminformatics,deep learning,QSAR
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: torch>=2.0
|
|
25
|
+
Requires-Dist: scikit-learn>=1.7
|
|
26
|
+
Requires-Dist: numpy>=1.24
|
|
27
|
+
Requires-Dist: pandas>=1.5
|
|
28
|
+
Requires-Dist: joblib>=1.2
|
|
29
|
+
Requires-Dist: rdkit>=2023.3
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# leishmania-screen
|
|
33
|
+
|
|
34
|
+
Neural network–based virtual screening for antileishmanial activity against *Leishmania donovani*.
|
|
35
|
+
|
|
36
|
+
Trained on a curated dataset of 6,699 compounds. Achieves **ROC-AUC 0.884** on the held-out test set.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install leishmania-screen
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
> **RDKit note:** RDKit is listed as a dependency (`rdkit>=2023.3`). If your environment manages RDKit via conda, install it there first:
|
|
47
|
+
> ```bash
|
|
48
|
+
> conda install -c conda-forge rdkit
|
|
49
|
+
> pip install leishmania-screen
|
|
50
|
+
> ```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Quick start — Python API
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from leishmania_screen import predict
|
|
58
|
+
|
|
59
|
+
# Single compound
|
|
60
|
+
result = predict("CCO")
|
|
61
|
+
print(result.label) # "Active" or "Inactive"
|
|
62
|
+
print(result.probability) # float in [0, 1]
|
|
63
|
+
|
|
64
|
+
# Batch
|
|
65
|
+
results = predict(["CCO", "CC(=O)Oc1ccccc1C(=O)O", "not_valid"])
|
|
66
|
+
for r in results:
|
|
67
|
+
print(r.smiles, r.label, r.probability, r.error)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### `PredictionResult` fields
|
|
71
|
+
|
|
72
|
+
| Field | Type | Description |
|
|
73
|
+
|---|---|---|
|
|
74
|
+
| `smiles` | `str` | The input SMILES string |
|
|
75
|
+
| `label` | `str` | `"Active"`, `"Inactive"`, or `"Invalid"` |
|
|
76
|
+
| `probability` | `float \| None` | Sigmoid output of the model (0–1); `None` for invalid inputs |
|
|
77
|
+
| `error` | `str \| None` | Reason for invalidity; `None` for valid inputs |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Quick start — Command line
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# Single SMILES
|
|
85
|
+
leishscreen --smiles "CC(=O)Oc1ccccc1C(=O)O"
|
|
86
|
+
|
|
87
|
+
# Batch from CSV (must have a column named 'smiles')
|
|
88
|
+
leishscreen --file compounds.csv --output results.csv
|
|
89
|
+
|
|
90
|
+
# Batch from plain text (one SMILES per line)
|
|
91
|
+
leishscreen --file smiles.txt --output results.csv
|
|
92
|
+
|
|
93
|
+
# Custom column name
|
|
94
|
+
leishscreen --file library.csv --smiles-col SMILES --output results.csv
|
|
95
|
+
|
|
96
|
+
# Version
|
|
97
|
+
leishscreen --version
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Model details
|
|
103
|
+
|
|
104
|
+
| Item | Value |
|
|
105
|
+
|---|---|
|
|
106
|
+
| Target | *Leishmania donovani* (binary: Active / Inactive) |
|
|
107
|
+
| Training dataset | 6,699 compounds (2,574 active, 4,125 inactive) |
|
|
108
|
+
| Feature pipeline | 218 RDKit descriptors + 2,727 fingerprint bits (5 × 512 + 167 MACCS) → 900-feature MI selection → StandardScaler → PCA (100 components) |
|
|
109
|
+
| Architecture | Linear(100→512)→BN→GELU→Drop(0.30) → Linear(512→256)→BN→GELU→Drop(0.25) → Linear(256→128)→GELU→Drop(0.15) → Linear(128→1) |
|
|
110
|
+
| Loss | `BCEWithLogitsLoss` with class-imbalance `pos_weight` |
|
|
111
|
+
| Optimizer | AdamW (lr=8e-4, weight_decay=2e-4) + gradient clipping |
|
|
112
|
+
| Scheduler | ReduceLROnPlateau (factor=0.5, patience=5) |
|
|
113
|
+
| Training strategy | Multi-seed ensemble (seeds 42, 52, 62); early stopping (patience=18) |
|
|
114
|
+
| Classification threshold | **0.60** (optimised on validation set: precision ≥ 0.70, max F1) |
|
|
115
|
+
| Test ROC-AUC | **0.884** |
|
|
116
|
+
| Test PR-AUC | **0.828** |
|
|
117
|
+
| Test Accuracy | **0.815** |
|
|
118
|
+
| Test Balanced Accuracy | **0.810** |
|
|
119
|
+
|
|
120
|
+
### Fingerprints used
|
|
121
|
+
|
|
122
|
+
| Type | Parameters | Bits |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| Morgan (ECFP-like) | radius=2 | 512 |
|
|
125
|
+
| Avalon | — | 512 |
|
|
126
|
+
| Topological Torsion | — | 512 |
|
|
127
|
+
| Atom Pair | — | 512 |
|
|
128
|
+
| MACCS Keys | — | 167 |
|
|
129
|
+
| RDKit Path-Based | minPath=5, maxPath=7 | 512 |
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Citation
|
|
134
|
+
|
|
135
|
+
If you use this package in your research, please cite:
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
[Citation to be added upon publication]
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
MIT
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# leishmania-screen
|
|
2
|
+
|
|
3
|
+
Neural network–based virtual screening for antileishmanial activity against *Leishmania donovani*.
|
|
4
|
+
|
|
5
|
+
Trained on a curated dataset of 6,699 compounds. Achieves **ROC-AUC 0.884** on the held-out test set.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install leishmania-screen
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
> **RDKit note:** RDKit is listed as a dependency (`rdkit>=2023.3`). If your environment manages RDKit via conda, install it there first:
|
|
16
|
+
> ```bash
|
|
17
|
+
> conda install -c conda-forge rdkit
|
|
18
|
+
> pip install leishmania-screen
|
|
19
|
+
> ```
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick start — Python API
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from leishmania_screen import predict
|
|
27
|
+
|
|
28
|
+
# Single compound
|
|
29
|
+
result = predict("CCO")
|
|
30
|
+
print(result.label) # "Active" or "Inactive"
|
|
31
|
+
print(result.probability) # float in [0, 1]
|
|
32
|
+
|
|
33
|
+
# Batch
|
|
34
|
+
results = predict(["CCO", "CC(=O)Oc1ccccc1C(=O)O", "not_valid"])
|
|
35
|
+
for r in results:
|
|
36
|
+
print(r.smiles, r.label, r.probability, r.error)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### `PredictionResult` fields
|
|
40
|
+
|
|
41
|
+
| Field | Type | Description |
|
|
42
|
+
|---|---|---|
|
|
43
|
+
| `smiles` | `str` | The input SMILES string |
|
|
44
|
+
| `label` | `str` | `"Active"`, `"Inactive"`, or `"Invalid"` |
|
|
45
|
+
| `probability` | `float \| None` | Sigmoid output of the model (0–1); `None` for invalid inputs |
|
|
46
|
+
| `error` | `str \| None` | Reason for invalidity; `None` for valid inputs |
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Quick start — Command line
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Single SMILES
|
|
54
|
+
leishscreen --smiles "CC(=O)Oc1ccccc1C(=O)O"
|
|
55
|
+
|
|
56
|
+
# Batch from CSV (must have a column named 'smiles')
|
|
57
|
+
leishscreen --file compounds.csv --output results.csv
|
|
58
|
+
|
|
59
|
+
# Batch from plain text (one SMILES per line)
|
|
60
|
+
leishscreen --file smiles.txt --output results.csv
|
|
61
|
+
|
|
62
|
+
# Custom column name
|
|
63
|
+
leishscreen --file library.csv --smiles-col SMILES --output results.csv
|
|
64
|
+
|
|
65
|
+
# Version
|
|
66
|
+
leishscreen --version
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Model details
|
|
72
|
+
|
|
73
|
+
| Item | Value |
|
|
74
|
+
|---|---|
|
|
75
|
+
| Target | *Leishmania donovani* (binary: Active / Inactive) |
|
|
76
|
+
| Training dataset | 6,699 compounds (2,574 active, 4,125 inactive) |
|
|
77
|
+
| Feature pipeline | 218 RDKit descriptors + 2,727 fingerprint bits (5 × 512 + 167 MACCS) → 900-feature MI selection → StandardScaler → PCA (100 components) |
|
|
78
|
+
| Architecture | Linear(100→512)→BN→GELU→Drop(0.30) → Linear(512→256)→BN→GELU→Drop(0.25) → Linear(256→128)→GELU→Drop(0.15) → Linear(128→1) |
|
|
79
|
+
| Loss | `BCEWithLogitsLoss` with class-imbalance `pos_weight` |
|
|
80
|
+
| Optimizer | AdamW (lr=8e-4, weight_decay=2e-4) + gradient clipping |
|
|
81
|
+
| Scheduler | ReduceLROnPlateau (factor=0.5, patience=5) |
|
|
82
|
+
| Training strategy | Multi-seed ensemble (seeds 42, 52, 62); early stopping (patience=18) |
|
|
83
|
+
| Classification threshold | **0.60** (optimised on validation set: precision ≥ 0.70, max F1) |
|
|
84
|
+
| Test ROC-AUC | **0.884** |
|
|
85
|
+
| Test PR-AUC | **0.828** |
|
|
86
|
+
| Test Accuracy | **0.815** |
|
|
87
|
+
| Test Balanced Accuracy | **0.810** |
|
|
88
|
+
|
|
89
|
+
### Fingerprints used
|
|
90
|
+
|
|
91
|
+
| Type | Parameters | Bits |
|
|
92
|
+
|---|---|---|
|
|
93
|
+
| Morgan (ECFP-like) | radius=2 | 512 |
|
|
94
|
+
| Avalon | — | 512 |
|
|
95
|
+
| Topological Torsion | — | 512 |
|
|
96
|
+
| Atom Pair | — | 512 |
|
|
97
|
+
| MACCS Keys | — | 167 |
|
|
98
|
+
| RDKit Path-Based | minPath=5, maxPath=7 | 512 |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Citation
|
|
103
|
+
|
|
104
|
+
If you use this package in your research, please cite:
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
[Citation to be added upon publication]
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
MIT
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
leishmania-screen
|
|
3
|
+
=================
|
|
4
|
+
Neural network–based virtual screening for antileishmanial activity
|
|
5
|
+
against *Leishmania donovani*.
|
|
6
|
+
|
|
7
|
+
Quick start
|
|
8
|
+
-----------
|
|
9
|
+
>>> from leishmania_screen import predict
|
|
10
|
+
>>> result = predict("CCO")
|
|
11
|
+
>>> print(result.label, result.probability)
|
|
12
|
+
|
|
13
|
+
>>> results = predict(["CCO", "c1ccccc1"])
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from ._predict import predict, PredictionResult, THRESHOLD
|
|
17
|
+
|
|
18
|
+
__version__ = "1.0.0"
|
|
19
|
+
__author__ = "Madhavi et al."
|
|
20
|
+
__all__ = ["predict", "PredictionResult", "THRESHOLD"]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Molecular feature generation — replicates the training pipeline exactly.
|
|
3
|
+
|
|
4
|
+
Feature vector layout (2,946 columns total):
|
|
5
|
+
[0:218] RDKit descriptors (218)
|
|
6
|
+
[218:730] Morgan r=2 512-bit (Mfpt_) (512)
|
|
7
|
+
[730:1242] Avalon 512-bit (512)
|
|
8
|
+
[1242:1754] Topological Torsion 512-bit (512)
|
|
9
|
+
[1754:2266] Atom Pair 512-bit (512)
|
|
10
|
+
[2266:2433] MACCS keys (167)
|
|
11
|
+
[2433:2945] RDKit path-based 512-bit (512)
|
|
12
|
+
[2945:2946] ID column (dropped before model; not emitted by compute_features())
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import warnings
|
|
18
|
+
from typing import List, Tuple
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
23
|
+
with warnings.catch_warnings():
|
|
24
|
+
warnings.simplefilter("ignore")
|
|
25
|
+
from rdkit import Chem
|
|
26
|
+
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors, DataStructs
|
|
27
|
+
from rdkit.Chem import rdFingerprintGenerator
|
|
28
|
+
try:
|
|
29
|
+
from rdkit.Avalon import pyAvalonTools
|
|
30
|
+
_AVALON_OK = True
|
|
31
|
+
except ImportError:
|
|
32
|
+
_AVALON_OK = False
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
_DESCRIPTOR_NAMES: list[str] | None = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _descriptor_names() -> list[str]:
    """Return the RDKit descriptor names, building the cached list on first use.

    The names are taken from ``Descriptors.descList`` (in that order) and kept
    in the module-level ``_DESCRIPTOR_NAMES`` so later calls reuse the same list.
    """
    global _DESCRIPTOR_NAMES
    if _DESCRIPTOR_NAMES is not None:
        return _DESCRIPTOR_NAMES
    _DESCRIPTOR_NAMES = [name for name, _func in Descriptors.descList]
    return _DESCRIPTOR_NAMES
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def validate_smiles(smiles: str) -> Tuple[Chem.Mol | None, str | None]:
    """
    Parse and sanity-check a SMILES string.

    Parameters
    ----------
    smiles : str
        Candidate SMILES; surrounding whitespace is ignored.

    Returns
    -------
    (mol, None) on success, or (None, error_message) on failure.
    Failure covers non-string input, empty / whitespace-only input,
    strings RDKit cannot parse, and molecules that fail sanitization.
    """
    if not isinstance(smiles, str):
        return None, "Input is empty or not a string."

    # Check emptiness AFTER stripping: RDKit parses an empty SMILES into a
    # zero-atom molecule rather than None, so "   " would otherwise be
    # accepted as a valid compound.
    cleaned = smiles.strip()
    if not cleaned:
        return None, "Input is empty or not a string."

    mol = Chem.MolFromSmiles(cleaned)
    if mol is None:
        return None, f"RDKit could not parse SMILES: '{smiles}'"

    try:
        Chem.SanitizeMol(mol)
    except Exception as exc:
        # Reported to the caller as an invalid input rather than raised.
        return None, f"Sanitization failed: {exc}"
    return mol, None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _rdkit_descriptors(mol: Chem.Mol) -> np.ndarray:
    """Compute the RDKit descriptors of *mol* as a float64 vector.

    Values are ordered by ``_descriptor_names()`` so positions line up with
    the descriptor column names.
    """
    computed = Descriptors.CalcMolDescriptors(mol)
    ordered = [computed[name] for name in _descriptor_names()]
    return np.array(ordered, dtype=np.float64)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _bitvect_to_array(bv) -> np.ndarray:
    """Copy an RDKit bit vector into a uint8 NumPy array with one entry per bit."""
    n_bits = bv.GetNumBits()
    out = np.zeros(n_bits, dtype=np.uint8)
    # ConvertToNumpyArray fills the pre-allocated array in place.
    DataStructs.ConvertToNumpyArray(bv, out)
    return out
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _morgan(mol: Chem.Mol) -> np.ndarray:
    """Return the 512-bit Morgan (radius 2) fingerprint of *mol* as a uint8 array."""
    # NOTE(review): newer RDKit releases steer users towards
    # rdFingerprintGenerator for Morgan fingerprints; this call is kept as-is
    # because the feature pipeline is stated to mirror training.
    return _bitvect_to_array(
        AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512)
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _avalon(smiles: str) -> np.ndarray:
    """Return the 512-bit Avalon fingerprint computed from the SMILES text.

    When the Avalon toolkit could not be imported (``_AVALON_OK`` is False),
    an all-zero 512-element vector is returned instead so the feature layout
    keeps its width.
    """
    if not _AVALON_OK:
        # Fallback: zero vector if Avalon not available
        return np.zeros(512, dtype=np.uint8)
    fingerprint = pyAvalonTools.GetAvalonFP(smiles, isSmiles=True, nBits=512)
    return _bitvect_to_array(fingerprint)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _topo_torsion(mol: Chem.Mol) -> np.ndarray:
    """Return the 512-bit hashed topological-torsion fingerprint of *mol*."""
    return _bitvect_to_array(
        rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=512)
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _atom_pair(mol: Chem.Mol) -> np.ndarray:
    """Return the 512-bit hashed atom-pair fingerprint of *mol*."""
    return _bitvect_to_array(
        rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=512)
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _maccs(mol: Chem.Mol) -> np.ndarray:
    """Return the MACCS keys fingerprint of *mol* as a uint8 array."""
    keys = rdMolDescriptors.GetMACCSKeysFingerprint(mol)
    return _bitvect_to_array(keys)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _rdkit_path(mol: Chem.Mol) -> np.ndarray:
    """Return the 512-bit RDKit path-based fingerprint (paths of length 5 to 7)."""
    return _bitvect_to_array(
        AllChem.RDKFingerprint(mol, minPath=5, maxPath=7, fpSize=512)
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _column_names() -> List[str]:
    """Return the feature column names in the order the feature row is built.

    Descriptor names come first, followed by one ``<prefix><index>`` name per
    fingerprint bit for each fingerprint family.
    """
    fingerprint_layout = (
        ("Mfpt_", 512),    # Morgan
        ("Avfpt_", 512),   # Avalon
        ("TT_fpt_", 512),  # topological torsion
        ("AP_fpt_", 512),  # atom pair
        ("MC_fpt_", 167),  # MACCS keys
        ("RD_fpt_", 512),  # RDKit path-based
    )
    # Copy so the cached descriptor-name list is never mutated.
    names = list(_descriptor_names())
    for prefix, width in fingerprint_layout:
        names.extend(f"{prefix}{i}" for i in range(width))
    return names
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def compute_features(smiles: str, mol: Chem.Mol) -> pd.DataFrame:
    """
    Build the feature row for one molecule.

    The row is the RDKit descriptor vector followed by the Morgan, Avalon,
    topological-torsion, atom-pair, MACCS and RDKit path fingerprints, in
    that order. The Avalon fingerprint is computed from *smiles*; everything
    else is computed from *mol*.

    Returns a single-row DataFrame whose columns are ``_column_names()``.
    """
    # Warnings raised while computing the individual feature blocks are
    # silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        blocks = [
            _rdkit_descriptors(mol),
            _morgan(mol),
            _avalon(smiles),
            _topo_torsion(mol),
            _atom_pair(mol),
            _maccs(mol),
            _rdkit_path(mol),
        ]

    feature_vector = np.concatenate(blocks)
    return pd.DataFrame([feature_vector], columns=_column_names())
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.nn as nn
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class _LeishNet(nn.Module):
    """Feed-forward classifier producing one logit per input row.

    The layer sequence (and therefore the ``net.<index>`` parameter names)
    must match training exactly so that saved weights load.
    """

    def __init__(self, d: int = 100):
        super().__init__()
        # Order matters: the position of every module in the Sequential
        # determines its key in the state dict.
        layers = [
            nn.Linear(d, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.30),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.25),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(128, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (pre-sigmoid) output of the network for *x*."""
        return self.net(x)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core inference engine. Loads all artefacts once (lazy, thread-safe singleton)
|
|
3
|
+
and exposes a predict() function that accepts one or many SMILES strings.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import importlib.resources
|
|
9
|
+
import threading
|
|
10
|
+
import warnings
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import List, Union
|
|
13
|
+
|
|
14
|
+
import joblib
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import torch
|
|
18
|
+
|
|
19
|
+
from ._model import _LeishNet
|
|
20
|
+
from ._features import validate_smiles, compute_features
|
|
21
|
+
|
|
22
|
+
THRESHOLD: float = 0.6
|
|
23
|
+
_LOCK = threading.Lock()
|
|
24
|
+
_ARTEFACTS: "_Artefacts | None" = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class PredictionResult:
    """Outcome of screening a single SMILES string."""

    smiles: str
    label: str  # "Active" | "Inactive" | "Invalid"
    probability: float | None
    error: str | None = None

    def to_dict(self) -> dict:
        """Return the result as a plain dict, with the probability rounded to 4 decimals."""
        rounded = self.probability
        if rounded is not None:
            rounded = round(rounded, 4)
        return {
            "smiles": self.smiles,
            "label": self.label,
            "probability": rounded,
            "error": self.error,
        }
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class _Artefacts:
    """Holds all loaded model artefacts. Instantiated once."""

    def __init__(self):
        """Load the feature lists, scalers, PCA and network weights from package data."""
        data_dir = importlib.resources.files("leishmania_screen") / "data"

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.selected_features: list[str] = list(
                self._load_pickle(data_dir, "selected_features_LD.pkl")
            )
            self.train_columns: list[str] = self._load_pickle(
                data_dir, "train_columns_LD.pkl"
            )
            # Per-feature scalers: dict[feature_name -> StandardScaler]
            # Only the 129 features that were scaled during training are included.
            self.scalers: dict = self._load_pickle(data_dir, "scalers_LD.pkl")
            self.pca = self._load_pickle(data_dir, "pca_100_LD.pkl")

            self.model = _LeishNet(d=100)
            # as_file() yields a real filesystem path even when the package
            # data is not stored as plain files (e.g. a zipped install).
            with importlib.resources.as_file(data_dir / "NN_model.pth") as weights_path:
                state = torch.load(
                    str(weights_path),
                    map_location="cpu",
                    weights_only=True,
                )
            self.model.load_state_dict(state)
            # Inference only: switch the network to evaluation mode.
            self.model.eval()

    @staticmethod
    def _load_pickle(data_dir, filename: str):
        """Load one joblib pickle shipped in the package data directory."""
        # joblib.load unpickles arbitrary objects; this is only acceptable
        # because the files are bundled with the package itself.
        with importlib.resources.as_file(data_dir / filename) as path:
            return joblib.load(str(path))
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _get_artefacts() -> _Artefacts:
    """Return the shared ``_Artefacts`` instance, creating it on first use.

    The global is checked once without the lock and again while holding
    ``_LOCK`` before the instance is constructed.
    """
    global _ARTEFACTS
    if _ARTEFACTS is not None:
        return _ARTEFACTS
    with _LOCK:
        # Re-check under the lock: another thread may have finished loading
        # between the first check and acquiring the lock.
        if _ARTEFACTS is None:
            _ARTEFACTS = _Artefacts()
    return _ARTEFACTS
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _transform(raw_df: pd.DataFrame, art: _Artefacts) -> np.ndarray:
    """Apply feature selection → per-feature scaling → PCA.

    Parameters
    ----------
    raw_df : DataFrame with one row per molecule and named feature columns.
    art : object providing ``train_columns``, ``scalers`` and ``pca``.

    Returns
    -------
    float32 array holding the output of ``art.pca.transform``.
    """
    # Align to exactly the columns that are fed to the PCA, in that order.
    # Columns absent from raw_df are created and filled with 0. Aligning on
    # train_columns (the list actually indexed) avoids a KeyError if it ever
    # differs from selected_features.
    X_df = raw_df.reindex(columns=art.train_columns, fill_value=0.0)

    # Apply per-feature scalers only to features that were scaled during training
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for feature, scaler in art.scalers.items():
            if feature in X_df.columns:
                # transform() returns an (n, 1) array; flatten it so the
                # assignment is an unambiguous 1-D column.
                X_df[feature] = np.asarray(scaler.transform(X_df[[feature]])).ravel()
        X_pca = art.pca.transform(X_df.to_numpy(dtype=np.float64))

    return X_pca.astype(np.float32)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _run_model(X_pca: np.ndarray, art: _Artefacts) -> np.ndarray:
    """Run ``art.model`` on *X_pca* and return the sigmoid outputs as a flat array."""
    batch = torch.tensor(X_pca, dtype=torch.float32)
    # Gradients are not needed for inference.
    with torch.no_grad():
        probabilities = torch.sigmoid(art.model(batch))
    return probabilities.cpu().numpy().ravel()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def predict(
    smiles: Union[str, List[str]],
) -> Union[PredictionResult, List[PredictionResult]]:
    """
    Predict antileishmanial activity for one or more SMILES strings.

    Parameters
    ----------
    smiles : str or list of str
        SMILES string(s) to screen.

    Returns
    -------
    PredictionResult or list of PredictionResult
        Each result contains .smiles, .label, .probability, .error.
        A single string yields a single result; any other iterable yields a
        list in input order. Inputs that fail validation get the label
        "Invalid", probability None, and the validation message in .error.
    """
    single = isinstance(smiles, str)
    inputs: list[str] = [smiles] if single else list(smiles)

    # --- validate all SMILES ---
    valid_idx: list[int] = []
    valid_mols: list = []
    valid_smi: list[str] = []
    results: list[PredictionResult | None] = [None] * len(inputs)

    for i, smi in enumerate(inputs):
        mol, err = validate_smiles(smi)
        if err:
            results[i] = PredictionResult(
                smiles=smi, label="Invalid", probability=None, error=err
            )
        else:
            valid_idx.append(i)
            valid_mols.append(mol)
            valid_smi.append(smi)

    # --- feature generation + inference for valid molecules ---
    if valid_mols:
        # Load the artefacts only when there is something to score, so empty
        # or all-invalid input never triggers the model load.
        art = _get_artefacts()

        feature_rows = [
            compute_features(smi, mol)
            for smi, mol in zip(valid_smi, valid_mols)
        ]
        raw_df = pd.concat(feature_rows, ignore_index=True)
        X_pca = _transform(raw_df, art)
        probs = _run_model(X_pca, art)

        # probs is in valid-molecule order; map each back to its input slot.
        for local_j, global_i in enumerate(valid_idx):
            p = float(probs[local_j])
            results[global_i] = PredictionResult(
                smiles=valid_smi[local_j],
                label="Active" if p >= THRESHOLD else "Inactive",
                probability=p,
            )

    return results[0] if single else results
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for leishmania-screen.
|
|
3
|
+
|
|
4
|
+
Usage examples
|
|
5
|
+
--------------
|
|
6
|
+
Single SMILES:
|
|
7
|
+
leishscreen --smiles "CCO"
|
|
8
|
+
|
|
9
|
+
Batch from CSV (must have a 'smiles' column):
|
|
10
|
+
leishscreen --file compounds.csv --output results.csv
|
|
11
|
+
|
|
12
|
+
Batch from plain text (one SMILES per line):
|
|
13
|
+
leishscreen --file smiles.txt --output results.csv
|
|
14
|
+
|
|
15
|
+
Print version:
|
|
16
|
+
leishscreen --version
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import csv
|
|
23
|
+
import sys
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``leishscreen`` argument parser.

    ``--smiles`` and ``--file`` are mutually exclusive input sources;
    ``--output`` and ``--smiles-col`` tune batch handling; ``--version`` is a
    plain flag handled by the caller.
    """
    description = (
        "Virtual screening for antileishmanial activity against "
        "Leishmania donovani using a trained neural network."
    )
    parser = argparse.ArgumentParser(prog="leishscreen", description=description)
    parser.add_argument("--version", action="store_true", help="Print version and exit.")

    # Exactly one input source may be given at a time.
    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        "--smiles",
        type=str,
        metavar="SMILES",
        help="A single SMILES string to predict.",
    )
    source.add_argument(
        "--file",
        type=Path,
        metavar="FILE",
        help="Path to a CSV (with 'smiles' column) or plain-text file (one SMILES per line).",
    )

    parser.add_argument(
        "--output",
        type=Path,
        metavar="FILE",
        default=None,
        help="Output CSV path. If omitted, results are printed to stdout.",
    )
    parser.add_argument(
        "--smiles-col",
        type=str,
        default="smiles",
        metavar="COL",
        help="Column name to read SMILES from in a CSV input (default: 'smiles').",
    )
    return parser
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _read_smiles_from_file(path: Path, smiles_col: str) -> list[str]:
    """Read SMILES strings from a CSV file or a plain-text file.

    Parameters
    ----------
    path : Path
        Input file. A ``.csv`` suffix (case-insensitive) selects CSV parsing;
        anything else is read as one SMILES per line.
    smiles_col : str
        Column holding the SMILES in CSV input.

    Returns
    -------
    list of str
        Non-empty SMILES with surrounding whitespace removed. Blank entries
        are skipped; in plain-text input, lines whose first non-blank
        character is ``#`` are skipped as comments.

    Exits the process (``sys.exit``) when *smiles_col* is not a CSV column.
    """
    if path.suffix.lower() == ".csv":
        # utf-8-sig drops a leading byte-order mark, which would otherwise
        # become part of the first column name.
        with open(path, newline="", encoding="utf-8-sig") as fh:
            reader = csv.DictReader(fh)
            if smiles_col not in (reader.fieldnames or []):
                sys.exit(
                    f"Error: column '{smiles_col}' not found in {path}.\n"
                    f"Available columns: {reader.fieldnames}\n"
                    f"Use --smiles-col to specify the correct column name."
                )
            # A row shorter than the header yields None for the missing
            # cells, so guard before stripping.
            cells = (row[smiles_col] for row in reader)
            return [cell.strip() for cell in cells if cell and cell.strip()]

    # Plain text: one SMILES per line
    stripped = (ln.strip() for ln in path.read_text(encoding="utf-8-sig").splitlines())
    return [ln for ln in stripped if ln and not ln.startswith("#")]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _write_results(results, output: Path | None) -> None:
    """Write prediction results as CSV and print a summary.

    Parameters
    ----------
    results : sequence of objects exposing ``to_dict()`` and ``.label``.
    output : Path or None
        Destination CSV file. When None the CSV is written to stdout.

    When the CSV goes to stdout the summary is sent to stderr, so that
    redirected or piped stdout contains nothing but the CSV. When the CSV
    goes to a file, the confirmation line and summary are printed to stdout.
    """
    rows = [r.to_dict() for r in results]
    fieldnames = ["smiles", "label", "probability", "error"]

    if output is None:
        writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
        _print_summary(results, file=sys.stderr)
        return

    with open(output, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Results saved to: {output}")
    _print_summary(results)


def _print_summary(results, file=None) -> None:
    """Print a one-line count of Active / Inactive / Invalid results.

    Parameters
    ----------
    results : sequence of objects with a ``.label`` attribute.
    file : text stream, optional
        Where to print; None (the default) means standard output.
    """
    total = len(results)
    active = sum(1 for r in results if r.label == "Active")
    inactive = sum(1 for r in results if r.label == "Inactive")
    invalid = sum(1 for r in results if r.label == "Invalid")
    print(
        f"\nSummary: {total} compounds — {active} Active | {inactive} Inactive | {invalid} Invalid",
        file=file,
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main() -> None:
    """Entry point of the ``leishscreen`` command.

    Handles, in order: ``--version``; no input given (prints help and exits
    with status 0); a single ``--smiles`` value; batch screening of ``--file``.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.version:
        from leishmania_screen import __version__
        print(f"leishmania-screen {__version__}")
        return

    if args.smiles is None and args.file is None:
        parser.print_help()
        sys.exit(0)

    # Imported here rather than at module top.
    # NOTE(review): the package __init__ imports ._predict eagerly, so the
    # torch/rdkit import cost has probably been paid already — confirm.
    from leishmania_screen import predict

    # Compare against None, not truthiness: `--smiles ""` must be handled
    # here (and reported as invalid) instead of falling through to batch
    # mode, where args.file is None.
    if args.smiles is not None:
        result = predict(args.smiles)
        if args.output:
            _write_results([result], args.output)
        else:
            print(f"SMILES      : {result.smiles}")
            print(f"Label       : {result.label}")
            if result.probability is not None:
                print(f"Probability : {result.probability:.4f}")
            if result.error:
                print(f"Error       : {result.error}")
        return

    # Batch mode
    if not args.file.exists():
        sys.exit(f"Error: file not found: {args.file}")

    smiles_list = _read_smiles_from_file(args.file, args.smiles_col)
    if not smiles_list:
        sys.exit("Error: no SMILES found in the input file.")

    # Progress goes to stderr so it never mixes with CSV written to stdout.
    print(f"Screening {len(smiles_list)} compounds …", file=sys.stderr)
    results = predict(smiles_list)
    _write_results(results, args.output)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
|
|
136
|
+
main()
|
|
Binary file
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: leishmania-screen
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Neural network virtual screening for antileishmanial activity against Leishmania donovani
|
|
5
|
+
Author: Pronama Biswas, Madhavi Bhatt
|
|
6
|
+
Author-email: Belaguppa Manjunath Ashwin Desai <ceo@aikyanova.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Source Code, https://github.com/AikyaNova-Pvt-Ltd/leishmania-screen
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/AikyaNova-Pvt-Ltd/leishmania-screen/issues
|
|
10
|
+
Keywords: leishmania,drug discovery,virtual screening,cheminformatics,deep learning,QSAR
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: torch>=2.0
|
|
25
|
+
Requires-Dist: scikit-learn>=1.7
|
|
26
|
+
Requires-Dist: numpy>=1.24
|
|
27
|
+
Requires-Dist: pandas>=1.5
|
|
28
|
+
Requires-Dist: joblib>=1.2
|
|
29
|
+
Requires-Dist: rdkit>=2023.3
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# leishmania-screen
|
|
33
|
+
|
|
34
|
+
Neural network–based virtual screening for antileishmanial activity against *Leishmania donovani*.
|
|
35
|
+
|
|
36
|
+
Trained on a curated dataset of 6,699 compounds. Achieves **ROC-AUC 0.884** on the held-out test set.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install leishmania-screen
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
> **RDKit note:** RDKit is listed as a dependency (`rdkit>=2023.3`). If your environment manages RDKit via conda, install it there first:
|
|
47
|
+
> ```bash
|
|
48
|
+
> conda install -c conda-forge rdkit
|
|
49
|
+
> pip install leishmania-screen
|
|
50
|
+
> ```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Quick start — Python API
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from leishmania_screen import predict
|
|
58
|
+
|
|
59
|
+
# Single compound
|
|
60
|
+
result = predict("CCO")
|
|
61
|
+
print(result.label)        # "Active", "Inactive", or "Invalid"
|
|
62
|
+
print(result.probability)  # float in [0, 1], or None for an invalid SMILES
|
|
63
|
+
|
|
64
|
+
# Batch
|
|
65
|
+
results = predict(["CCO", "CC(=O)Oc1ccccc1C(=O)O", "not_valid"])
|
|
66
|
+
for r in results:
|
|
67
|
+
print(r.smiles, r.label, r.probability, r.error)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### `PredictionResult` fields
|
|
71
|
+
|
|
72
|
+
| Field | Type | Description |
|
|
73
|
+
|---|---|---|
|
|
74
|
+
| `smiles` | `str` | The input SMILES string |
|
|
75
|
+
| `label` | `str` | `"Active"`, `"Inactive"`, or `"Invalid"` |
|
|
76
|
+
| `probability` | `float \| None` | Sigmoid output of the model (0–1); `None` for invalid inputs |
|
|
77
|
+
| `error` | `str \| None` | Reason for invalidity; `None` for valid inputs |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Quick start — Command line
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# Single SMILES
|
|
85
|
+
leishscreen --smiles "CC(=O)Oc1ccccc1C(=O)O"
|
|
86
|
+
|
|
87
|
+
# Batch from CSV (reads the 'smiles' column unless --smiles-col is given)
|
|
88
|
+
leishscreen --file compounds.csv --output results.csv
|
|
89
|
+
|
|
90
|
+
# Batch from plain text (one SMILES per line)
|
|
91
|
+
leishscreen --file smiles.txt --output results.csv
|
|
92
|
+
|
|
93
|
+
# Custom column name
|
|
94
|
+
leishscreen --file library.csv --smiles-col SMILES --output results.csv
|
|
95
|
+
|
|
96
|
+
# Version
|
|
97
|
+
leishscreen --version
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Model details
|
|
103
|
+
|
|
104
|
+
| Item | Value |
|
|
105
|
+
|---|---|
|
|
106
|
+
| Target | *Leishmania donovani* (binary: Active / Inactive) |
|
|
107
|
+
| Training dataset | 6,699 compounds (2,574 active, 4,125 inactive) |
|
|
108
|
+
| Feature pipeline | 218 RDKit descriptors + 2,728 fingerprint bits → 900-feature MI selection → StandardScaler → PCA (100 components) |
|
|
109
|
+
| Architecture | Linear(100→512)→BN→GELU→Drop(0.30) → Linear(512→256)→BN→GELU→Drop(0.25) → Linear(256→128)→GELU→Drop(0.15) → Linear(128→1) |
|
|
110
|
+
| Loss | `BCEWithLogitsLoss` with class-imbalance `pos_weight` |
|
|
111
|
+
| Optimizer | AdamW (lr=8e-4, weight_decay=2e-4) + gradient clipping |
|
|
112
|
+
| Scheduler | ReduceLROnPlateau (factor=0.5, patience=5) |
|
|
113
|
+
| Training strategy | Multi-seed ensemble (seeds 42, 52, 62); early stopping (patience=18) |
|
|
114
|
+
| Classification threshold | **0.60** (optimised on validation set: precision ≥ 0.70, max F1) |
|
|
115
|
+
| Test ROC-AUC | **0.884** |
|
|
116
|
+
| Test PR-AUC | **0.828** |
|
|
117
|
+
| Test Accuracy | **0.815** |
|
|
118
|
+
| Test Balanced Accuracy | **0.810** |
|
|
119
|
+
|
|
120
|
+
### Fingerprints used
|
|
121
|
+
|
|
122
|
+
| Type | Parameters | Bits |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| Morgan (ECFP-like) | radius=2 | 512 |
|
|
125
|
+
| Avalon | — | 512 |
|
|
126
|
+
| Topological Torsion | — | 512 |
|
|
127
|
+
| Atom Pair | — | 512 |
|
|
128
|
+
| MACCS Keys | — | 167 |
|
|
129
|
+
| RDKit Path-Based | minPath=5, maxPath=7 | 512 |
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Citation
|
|
134
|
+
|
|
135
|
+
If you use this package in your research, please cite:
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
[Citation to be added upon publication]
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
MIT
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
leishmania_screen/__init__.py
|
|
5
|
+
leishmania_screen/_features.py
|
|
6
|
+
leishmania_screen/_model.py
|
|
7
|
+
leishmania_screen/_predict.py
|
|
8
|
+
leishmania_screen/cli.py
|
|
9
|
+
leishmania_screen.egg-info/PKG-INFO
|
|
10
|
+
leishmania_screen.egg-info/SOURCES.txt
|
|
11
|
+
leishmania_screen.egg-info/dependency_links.txt
|
|
12
|
+
leishmania_screen.egg-info/entry_points.txt
|
|
13
|
+
leishmania_screen.egg-info/requires.txt
|
|
14
|
+
leishmania_screen.egg-info/top_level.txt
|
|
15
|
+
leishmania_screen/data/NN_model.pth
|
|
16
|
+
leishmania_screen/data/__init__.py
|
|
17
|
+
leishmania_screen/data/pca_100_LD.pkl
|
|
18
|
+
leishmania_screen/data/scalers_LD.pkl
|
|
19
|
+
leishmania_screen/data/selected_features_LD.pkl
|
|
20
|
+
leishmania_screen/data/train_columns_LD.pkl
|
|
21
|
+
tests/__init__.py
|
|
22
|
+
tests/test_predict.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "leishmania-screen"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Neural network virtual screening for antileishmanial activity against Leishmania donovani"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Belaguppa Manjunath Ashwin Desai", email = "ceo@aikyanova.com" },
|
|
15
|
+
{ name = "Pronama Biswas" },
|
|
16
|
+
{ name = "Madhavi Bhatt" },
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
keywords = [
|
|
20
|
+
"leishmania",
|
|
21
|
+
"drug discovery",
|
|
22
|
+
"virtual screening",
|
|
23
|
+
"cheminformatics",
|
|
24
|
+
"deep learning",
|
|
25
|
+
"QSAR",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
classifiers = [
|
|
29
|
+
"Development Status :: 5 - Production/Stable",
|
|
30
|
+
"Intended Audience :: Science/Research",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Chemistry",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
|
+
"License :: OSI Approved :: MIT License",
|
|
34
|
+
"Programming Language :: Python :: 3",
|
|
35
|
+
"Programming Language :: Python :: 3.9",
|
|
36
|
+
"Programming Language :: Python :: 3.10",
|
|
37
|
+
"Programming Language :: Python :: 3.11",
|
|
38
|
+
"Programming Language :: Python :: 3.12",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
dependencies = [
|
|
42
|
+
"torch>=2.0",
|
|
43
|
+
"scikit-learn>=1.7",
|
|
44
|
+
"numpy>=1.24",
|
|
45
|
+
"pandas>=1.5",
|
|
46
|
+
"joblib>=1.2",
|
|
47
|
+
"rdkit>=2023.3",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
leishscreen = "leishmania_screen.cli:main"
|
|
52
|
+
|
|
53
|
+
[project.urls]
|
|
54
|
+
"Source Code" = "https://github.com/AikyaNova-Pvt-Ltd/leishmania-screen"
|
|
55
|
+
"Bug Tracker" = "https://github.com/AikyaNova-Pvt-Ltd/leishmania-screen/issues"
|
|
56
|
+
|
|
57
|
+
[tool.setuptools.packages.find]
|
|
58
|
+
where = ["."]
|
|
59
|
+
exclude = ["tests*"]
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.package-data]
|
|
62
|
+
leishmania_screen = ["data/*.pkl", "data/*.pth", "data/*.py"]
|
|
File without changes
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for leishmania-screen.
|
|
3
|
+
|
|
4
|
+
Run with: pytest tests/ -v
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
from leishmania_screen import predict, PredictionResult, THRESHOLD
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
# Known SMILES used across tests
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
ASPIRIN = "CC(=O)Oc1ccccc1C(=O)O" # aspirin
|
|
15
|
+
ETHANOL = "CCO" # ethanol
|
|
16
|
+
INVALID_1 = "not_a_smiles"
|
|
17
|
+
INVALID_2 = ""
|
|
18
|
+
INVALID_3 = "C(C)(C)(C)(C)(C)(C)" # over-valenced carbon
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestSinglePrediction:
    """Checks on ``predict`` when it is given a single SMILES string."""

    def test_returns_prediction_result(self):
        """A scalar input produces a ``PredictionResult`` instance."""
        outcome = predict(ASPIRIN)
        assert isinstance(outcome, PredictionResult)

    def test_label_is_active_or_inactive(self):
        """A parseable molecule is labelled Active or Inactive."""
        outcome = predict(ASPIRIN)
        assert outcome.label in ("Active", "Inactive")

    def test_probability_is_float_in_range(self):
        """The probability is present and lies within [0, 1]."""
        prob = predict(ASPIRIN).probability
        assert prob is not None
        assert 0.0 <= prob <= 1.0

    def test_probability_consistent_with_label(self):
        """An Active label implies probability >= THRESHOLD; otherwise below it."""
        outcome = predict(ASPIRIN)
        if outcome.label != "Active":
            assert outcome.probability < THRESHOLD
        else:
            assert outcome.probability >= THRESHOLD

    def test_smiles_preserved_in_output(self):
        """The result carries the input SMILES unchanged."""
        assert predict(ASPIRIN).smiles == ASPIRIN

    def test_deterministic(self):
        """Two calls on the same input agree on label and probability."""
        first = predict(ETHANOL)
        second = predict(ETHANOL)
        assert first.label == second.label
        assert first.probability == second.probability
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class TestInvalidSmiles:
    """Checks that unparseable inputs are reported as ``"Invalid"``."""

    def test_nonsense_string(self):
        """A non-SMILES string gets no probability and an error message."""
        outcome = predict(INVALID_1)
        assert outcome.label == "Invalid"
        assert outcome.probability is None
        assert outcome.error is not None

    def test_empty_string(self):
        """The empty string is labelled Invalid."""
        assert predict(INVALID_2).label == "Invalid"

    def test_over_valenced(self):
        """A carbon with too many bonds is labelled Invalid."""
        assert predict(INVALID_3).label == "Invalid"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class TestBatchPrediction:
    """Checks on ``predict`` when it is given a list of SMILES strings."""

    def test_returns_list(self):
        """A list input yields a list with one entry per SMILES."""
        batch = predict([ASPIRIN, ETHANOL])
        assert isinstance(batch, list)
        assert len(batch) == 2

    def test_each_element_is_prediction_result(self):
        """Every batch entry is a ``PredictionResult``."""
        batch = predict([ASPIRIN, ETHANOL])
        assert all(isinstance(item, PredictionResult) for item in batch)

    def test_mixed_valid_invalid(self):
        """An invalid entry does not disturb its valid neighbours."""
        valid_labels = ("Active", "Inactive")
        batch = predict([ASPIRIN, INVALID_1, ETHANOL])
        assert batch[0].label in valid_labels
        assert batch[1].label == "Invalid"
        assert batch[2].label in valid_labels

    def test_single_element_list_matches_scalar(self):
        """A one-item batch agrees with the scalar call on the same input."""
        alone = predict(ASPIRIN)
        wrapped = predict([ASPIRIN])[0]
        assert wrapped.label == alone.label
        assert wrapped.probability == alone.probability

    def test_all_invalid_batch(self):
        """A batch made only of bad inputs is Invalid throughout."""
        for item in predict([INVALID_1, INVALID_2]):
            assert item.label == "Invalid"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class TestToDict:
    """Checks on ``PredictionResult.to_dict``."""

    def test_to_dict_keys(self):
        """The dict exposes exactly the four documented fields."""
        expected_keys = {"smiles", "label", "probability", "error"}
        payload = predict(ASPIRIN).to_dict()
        assert set(payload.keys()) == expected_keys

    def test_to_dict_probability_rounded(self):
        """A non-None probability in the dict is rounded to four decimals."""
        outcome = predict(ASPIRIN)
        payload = outcome.to_dict()
        if payload["probability"] is None:
            return
        assert payload["probability"] == round(outcome.probability, 4)
|