kawow 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kawow-0.1.1/LICENSE +21 -0
- kawow-0.1.1/PKG-INFO +238 -0
- kawow-0.1.1/README.md +209 -0
- kawow-0.1.1/kawow/__init__.py +43 -0
- kawow-0.1.1/kawow/atom_types.py +95 -0
- kawow-0.1.1/kawow/data/logkoa_model.json +88 -0
- kawow-0.1.1/kawow/data/logkoa_mqg_model.pkl +0 -0
- kawow-0.1.1/kawow/data/logkoa_ols_model.json +89 -0
- kawow-0.1.1/kawow/data/logkow_model.json +88 -0
- kawow-0.1.1/kawow/data/logkow_mqg_model.pkl +0 -0
- kawow-0.1.1/kawow/data/logkow_ols_model.json +89 -0
- kawow-0.1.1/kawow/features.py +174 -0
- kawow-0.1.1/kawow/io.py +135 -0
- kawow-0.1.1/kawow/model.py +756 -0
- kawow-0.1.1/kawow/smarts_model.py +492 -0
- kawow-0.1.1/kawow.egg-info/PKG-INFO +238 -0
- kawow-0.1.1/kawow.egg-info/SOURCES.txt +23 -0
- kawow-0.1.1/kawow.egg-info/dependency_links.txt +1 -0
- kawow-0.1.1/kawow.egg-info/requires.txt +19 -0
- kawow-0.1.1/kawow.egg-info/top_level.txt +1 -0
- kawow-0.1.1/pyproject.toml +82 -0
- kawow-0.1.1/setup.cfg +4 -0
- kawow-0.1.1/tests/test_kawow.py +314 -0
- kawow-0.1.1/tests/test_performance.py +94 -0
- kawow-0.1.1/tests/test_smarts_model_accuracy.py +89 -0
kawow-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Luc Miaz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
kawow-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kawow
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Models for partitioning coefficients (logKow, logKoa and logKaw) from molecular structure, including model based on [Naef & Acree 2024](https://doi.org/10.3390/liquids4010011)
|
|
5
|
+
Author: Luc T. Miaz
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/LucMiaz/kawow
|
|
8
|
+
Keywords: cheminformatics,logKow,logKoa,logKaw,partition coefficient,QSPR
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=1.24
|
|
13
|
+
Requires-Dist: scikit-learn>=1.3
|
|
14
|
+
Requires-Dist: rdkit>=2022.9
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
18
|
+
Requires-Dist: matplotlib; extra == "dev"
|
|
19
|
+
Requires-Dist: pandas; extra == "dev"
|
|
20
|
+
Provides-Extra: lint
|
|
21
|
+
Requires-Dist: pylint>=3.0; extra == "lint"
|
|
22
|
+
Provides-Extra: docs
|
|
23
|
+
Requires-Dist: sphinx>=7.0; extra == "docs"
|
|
24
|
+
Requires-Dist: sphinx-rtd-theme>=2.0; extra == "docs"
|
|
25
|
+
Requires-Dist: myst-parser>=2.0; extra == "docs"
|
|
26
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.24; extra == "docs"
|
|
27
|
+
Requires-Dist: numpydoc>=1.6; extra == "docs"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# Kawow (under development)
|
|
31
|
+
|
|
32
|
+
[](https://github.com/LucMiaz/kawow/actions/workflows/ci.yml)
|
|
33
|
+
[](https://www.python.org/)
|
|
34
|
+
[](LICENSE)
|
|
35
|
+
|
|
36
|
+

|
|
37
|
+
|
|
38
|
+
Group-additivity prediction of **log*K*ow**, **log*K*oa**, and **log*K*aw** from molecular structure.
|
|
39
|
+
|
|
40
|
+
*Kawow* implements some models to predict partitioning coefficients (`logKoa`, `logKow` and `logKaw`), in particular the Naef & Acree (2024) group-additivity scheme using RDKit SMARTS pattern matching. Two model families are available depending on how much transparency or accuracy is required.
|
|
41
|
+
|
|
42
|
+
### Flagging criteria used in outputs
|
|
43
|
+
|
|
44
|
+
`run_models(...)` returns B/vB and M/vM flags derived from predicted partition values:
|
|
45
|
+
|
|
46
|
+
- `B`: `logKoa >= 6` and `logKow >= 2`
|
|
47
|
+
- `vB`: `logKoa >= 6` and `logKow >= 5`
|
|
48
|
+
|
|
49
|
+
based on `doi:10.1126/science.1138275`.
|
|
50
|
+
|
|
51
|
+
Mobility is computed via an estimated sorption relation:
|
|
52
|
+
|
|
53
|
+
- `logKoc_est = logKow - 0.4`
|
|
54
|
+
- `M`: `logKoc_est <= 4.5`
|
|
55
|
+
- `vM`: `logKoc_est <= 3.5`
|
|
56
|
+
|
|
57
|
+
following UBA drinking-water source protection guidance:
|
|
58
|
+
|
|
59
|
+
- https://www.umweltbundesamt.de/en/publikationen/protecting-the-sources-of-our-drinking-water-the
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install kawow
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Or from source (requires RDKit ≥ 2022.9):
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/LucMiaz/kawow.git
|
|
73
|
+
cd kawow
|
|
74
|
+
pip install -e ".[dev]"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Models at a glance
|
|
80
|
+
|
|
81
|
+
| Model key | Class | Approach | log*K*ow R² | log*K*oa R² |
|
|
82
|
+
|-----------|-------|----------|-------------|-------------|
|
|
83
|
+
| `kawow` | `PartitionCalculator` | Ridge regression on Crippen + Naef special-group features | 0.922 (cv) | 0.946 (cv) |
|
|
84
|
+
| `smarts` | `NaefAcreePartitionCalculator` | Pure Naef & Acree 2024 group-additivity (no refitting) | 0.857 (S01) | 0.785 (S02) |
|
|
85
|
+
| `smarts_mixed` | `NaefAcreeCrippenMixedPartitionCalculator` | Naef & Acree additivity + Crippen Ridge hybrid | 0.962 (cv) | 0.968 (cv) |
|
|
86
|
+
| `mqg` | `MQGPartitionCalculator` | Random forest on Molecular Quantum Graph fingerprints | 0.881 (cv) | 0.945 (cv) |
|
|
87
|
+
|
|
88
|
+
Use `run_models()` to run several models at once and get per-molecule B/vB and M/vM flags:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import kawow
|
|
92
|
+
|
|
93
|
+
results = kawow.run_models(
|
|
94
|
+
["CCCCO", "c1ccccc1", "OC(=O)c1ccccc1"],
|
|
95
|
+
models=["kawow", "smarts_mixed"],
|
|
96
|
+
)
|
|
97
|
+
for row in results:
|
|
98
|
+
print(row["smiles"], row["models"]["kawow"]["logKow"],
|
|
99
|
+
row["models"]["kawow"]["b_class"])
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Each element of the returned list is a `dict` with:
|
|
103
|
+
|
|
104
|
+
| Key | Description |
|
|
105
|
+
|-----|-------------|
|
|
106
|
+
| `smiles` | canonical SMILES |
|
|
107
|
+
| `name` | molecule name from input |
|
|
108
|
+
| `models` | `dict` keyed by model name; each value contains `logKow`, `logKoa`, `logKaw`, `b_class`, `m_class`, `ok` |
|
|
109
|
+
| `ok` | `True` if at least one model succeeded |
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## 1 — `PartitionCalculator` (recommended for most uses)
|
|
114
|
+
|
|
115
|
+
Ridge regression fitted on the Naef & Acree 2024 training sets (S01 for log*K*ow, S02 for log*K*oa). Coefficients are stored in `kawow/data/*.json` so no re-fitting is needed at import time.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from kawow import PartitionCalculator
|
|
119
|
+
|
|
120
|
+
calc = PartitionCalculator() # Ridge (default)
|
|
121
|
+
|
|
122
|
+
# Single molecule from SMILES
|
|
123
|
+
result = calc.predict("CCCCO") # 1-butanol
|
|
124
|
+
print(result)
|
|
125
|
+
# {'logKow': 0.88, 'logKoa': 4.12, 'logKaw': -3.24, 'status': 'ok'}
|
|
126
|
+
|
|
127
|
+
# Batch prediction
|
|
128
|
+
smiles = ["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
|
|
129
|
+
for r in calc.predict_batch(smiles):
|
|
130
|
+
print(r["smiles"], r["logKow"], r["logKoa"], r["logKaw"])
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Predict from an InChI string or SDF file:**
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
r = calc.predict("InChI=1S/C4H10O/c1-2-3-4-5/h5H,2-4H2,1H3")
|
|
137
|
+
results = calc.predict("compounds.sdf") # returns list[dict]
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
**Inspect model metadata:**
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
info = calc.model_info
|
|
144
|
+
print(info["logKow"])
|
|
145
|
+
# {'target': 'logKow', 'n_train': 3234, 'alpha': 51.8,
|
|
146
|
+
# 'r2_cv': 0.9221, 'rmse_cv': 0.5775, ...}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Re-fit on your own training data:**
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
import kawow
|
|
153
|
+
kawow.fit(
|
|
154
|
+
sdf_logkow="my_logkow.sdf",
|
|
155
|
+
sdf_logkoa="my_logkoa.sdf",
|
|
156
|
+
logkow_prop="logP",
|
|
157
|
+
logkoa_prop="logKoa",
|
|
158
|
+
)
|
|
159
|
+
calc = kawow.PartitionCalculator() # reload after fitting
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Performance (5-fold cross-validation on Naef & Acree training sets)
|
|
163
|
+
|
|
164
|
+
| Model | Property | n | R² (cv) | RMSE (cv) |
|
|
165
|
+
|-------|----------|---|---------|----------|
|
|
166
|
+
| `kawow` (Ridge) | log*K*ow | 3 234 | **0.922** | 0.578 |
|
|
167
|
+
| `kawow` (Ridge) | log*K*oa | 1 886 | **0.946** | 0.660 |
|
|
168
|
+
| `smarts_mixed` (hybrid) | log*K*ow | 3 234 | **0.962** | 0.403 |
|
|
169
|
+
| `smarts_mixed` (hybrid) | log*K*oa | 1 886 | **0.968** | 0.532 |
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## 2 — `NaefAcreePartitionCalculator` (SMARTS additivity, full transparency)
|
|
174
|
+
|
|
175
|
+
Implements the Naef & Acree 2024 method exactly: each SMARTS pattern from the paper's supplementary tables is matched against the molecule and its tabulated contribution added. No matrix regression — every contribution is directly interpretable.
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
from kawow.smarts_model import NaefAcreePartitionCalculator
|
|
179
|
+
|
|
180
|
+
calc = NaefAcreePartitionCalculator(smiles="c1ccccc1")
|
|
181
|
+
result = calc.predict("c1ccccc1")
|
|
182
|
+
# {'logKow': 2.13, 'logKoa': 2.80, 'logKaw': -0.67, 'in_coverage': True}
|
|
183
|
+
|
|
184
|
+
# Or pass a pre-built RDKit mol:
|
|
185
|
+
from rdkit import Chem
|
|
186
|
+
mol = Chem.MolFromSmiles("CCCCCCCCCC")
|
|
187
|
+
result = calc.predict(mol)
|
|
188
|
+
|
|
189
|
+
# Batch via constructor:
|
|
190
|
+
calc_batch = NaefAcreePartitionCalculator(
|
|
191
|
+
smiles=["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
|
|
192
|
+
)
|
|
193
|
+
for mol, coeffs in calc_batch.results.items():
|
|
194
|
+
print(coeffs)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Performance on the Naef & Acree training sets
|
|
198
|
+
|
|
199
|
+
| Dataset | Property | n | R² | RMSE | MAE |
|
|
200
|
+
|---------|----------|---|-----|------|-----|
|
|
201
|
+
| S01 (Naef 2024) | log*K*ow | 3 344 | **0.857** | 0.786 | 0.543 |
|
|
202
|
+
| S02 (Naef 2024) | log*K*oa | 1 983 | **0.785** | 1.387 | 0.784 |
|
|
203
|
+
| Arp & Hale 2023 (SI) | log*K*ow | 687 | **0.644** | 1.138 | 0.686 |
|
|
204
|
+
|
|
205
|
+
The remaining error is concentrated in specific chemotypes (notably highly heteroatom-rich agrochemical scaffolds), while the broad SMARTS generalization and pi-environment fixes substantially improved overall log*K*oa performance on S02.
|
|
206
|
+
|
|
207
|
+
### Correlation plots
|
|
208
|
+
|
|
209
|
+
| log*K*ow vs Naef S01 | log*K*oa vs Naef S02 | log*K*ow vs Arp & Hale |
|
|
210
|
+
|:--------------------:|:--------------------:|:----------------------:|
|
|
211
|
+
|  |  |  |
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## Feature engineering
|
|
216
|
+
|
|
217
|
+
Each molecule is represented by counts of SMARTS atom-type groups from the Naef & Acree parameter tables, plus the special-group descriptors listed below:
|
|
218
|
+
|
|
219
|
+
- **pi-neighbour moieties** — the number of conjugated systems adjacent to a centre atom (controls which entry in a pi-stratified table applies; computed by `count_conjugated_neighbor_moieties`)
|
|
220
|
+
- **H-acceptor binary presence** — 1 if any intramolecular H-bond donor/acceptor pair is within 5 bonds
|
|
221
|
+
- **Alkane flag** — 1 if the molecule is a pure saturated hydrocarbon
|
|
222
|
+
- **Unsaturated HC flag** — 1 if the molecule is a pure unsaturated hydrocarbon
|
|
223
|
+
- **Extra −COOH count** — number of carboxylic acid groups beyond the first
|
|
224
|
+
- **Endocyclic C−C single bond count**
|
|
225
|
+
|
|
226
|
+
The `PartitionCalculator` additionally uses 72 Crippen atom-type features (from RDKit's `Crippen.txt`) on top of the 6 special-group features named in `SPECIAL_GROUP_LABELS` (`kawow/atom_types.py`).
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Reference
|
|
231
|
+
|
|
232
|
+
Naef, Rudolf, and William E. Acree, Jr. 2024. "Calculation of the Three Partition Coefficients logPow, logKoa and logKaw of Organic Molecules at Standard Conditions at Once by Means of a Generally Applicable Group-Additivity Method." *Liquids* 4, no. 1: 231–260. [10.3390/liquids4010011](https://doi.org/10.3390/liquids4010011)
|
|
233
|
+
|
|
234
|
+
Arp, H.P.H. and Hale, S.E. 2023. "From Measured Partition Coefficients to the Prediction of Environmental Fate." Supplementary data: `vg2c00024_si_001` (ACS).
|
|
235
|
+
|
|
236
|
+
## License
|
|
237
|
+
|
|
238
|
+
MIT
|
kawow-0.1.1/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# Kawow (under development)
|
|
2
|
+
|
|
3
|
+
[](https://github.com/LucMiaz/kawow/actions/workflows/ci.yml)
|
|
4
|
+
[](https://www.python.org/)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
Group-additivity prediction of **log*K*ow**, **log*K*oa**, and **log*K*aw** from molecular structure.
|
|
10
|
+
|
|
11
|
+
*Kawow* implements some models to predict partitioning coefficients (`logKoa`, `logKow` and `logKaw`), in particular the Naef & Acree (2024) group-additivity scheme using RDKit SMARTS pattern matching. Two model families are available depending on how much transparency or accuracy is required.
|
|
12
|
+
|
|
13
|
+
### Flagging criteria used in outputs
|
|
14
|
+
|
|
15
|
+
`run_models(...)` returns B/vB and M/vM flags derived from predicted partition values:
|
|
16
|
+
|
|
17
|
+
- `B`: `logKoa >= 6` and `logKow >= 2`
|
|
18
|
+
- `vB`: `logKoa >= 6` and `logKow >= 5`
|
|
19
|
+
|
|
20
|
+
based on `doi:10.1126/science.1138275`.
|
|
21
|
+
|
|
22
|
+
Mobility is computed via an estimated sorption relation:
|
|
23
|
+
|
|
24
|
+
- `logKoc_est = logKow - 0.4`
|
|
25
|
+
- `M`: `logKoc_est <= 4.5`
|
|
26
|
+
- `vM`: `logKoc_est <= 3.5`
|
|
27
|
+
|
|
28
|
+
following UBA drinking-water source protection guidance:
|
|
29
|
+
|
|
30
|
+
- https://www.umweltbundesamt.de/en/publikationen/protecting-the-sources-of-our-drinking-water-the
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install kawow
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Or from source (requires RDKit ≥ 2022.9):
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
git clone https://github.com/LucMiaz/kawow.git
|
|
44
|
+
cd kawow
|
|
45
|
+
pip install -e ".[dev]"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Models at a glance
|
|
51
|
+
|
|
52
|
+
| Model key | Class | Approach | log*K*ow R² | log*K*oa R² |
|
|
53
|
+
|-----------|-------|----------|-------------|-------------|
|
|
54
|
+
| `kawow` | `PartitionCalculator` | Ridge regression on Crippen + Naef special-group features | 0.922 (cv) | 0.946 (cv) |
|
|
55
|
+
| `smarts` | `NaefAcreePartitionCalculator` | Pure Naef & Acree 2024 group-additivity (no refitting) | 0.857 (S01) | 0.785 (S02) |
|
|
56
|
+
| `smarts_mixed` | `NaefAcreeCrippenMixedPartitionCalculator` | Naef & Acree additivity + Crippen Ridge hybrid | 0.962 (cv) | 0.968 (cv) |
|
|
57
|
+
| `mqg` | `MQGPartitionCalculator` | Random forest on Molecular Quantum Graph fingerprints | 0.881 (cv) | 0.945 (cv) |
|
|
58
|
+
|
|
59
|
+
Use `run_models()` to run several models at once and get per-molecule B/vB and M/vM flags:
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import kawow
|
|
63
|
+
|
|
64
|
+
results = kawow.run_models(
|
|
65
|
+
["CCCCO", "c1ccccc1", "OC(=O)c1ccccc1"],
|
|
66
|
+
models=["kawow", "smarts_mixed"],
|
|
67
|
+
)
|
|
68
|
+
for row in results:
|
|
69
|
+
print(row["smiles"], row["models"]["kawow"]["logKow"],
|
|
70
|
+
row["models"]["kawow"]["b_class"])
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Each element of the returned list is a `dict` with:
|
|
74
|
+
|
|
75
|
+
| Key | Description |
|
|
76
|
+
|-----|-------------|
|
|
77
|
+
| `smiles` | canonical SMILES |
|
|
78
|
+
| `name` | molecule name from input |
|
|
79
|
+
| `models` | `dict` keyed by model name; each value contains `logKow`, `logKoa`, `logKaw`, `b_class`, `m_class`, `ok` |
|
|
80
|
+
| `ok` | `True` if at least one model succeeded |
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 1 — `PartitionCalculator` (recommended for most uses)
|
|
85
|
+
|
|
86
|
+
Ridge regression fitted on the Naef & Acree 2024 training sets (S01 for log*K*ow, S02 for log*K*oa). Coefficients are stored in `kawow/data/*.json` so no re-fitting is needed at import time.
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from kawow import PartitionCalculator
|
|
90
|
+
|
|
91
|
+
calc = PartitionCalculator() # Ridge (default)
|
|
92
|
+
|
|
93
|
+
# Single molecule from SMILES
|
|
94
|
+
result = calc.predict("CCCCO") # 1-butanol
|
|
95
|
+
print(result)
|
|
96
|
+
# {'logKow': 0.88, 'logKoa': 4.12, 'logKaw': -3.24, 'status': 'ok'}
|
|
97
|
+
|
|
98
|
+
# Batch prediction
|
|
99
|
+
smiles = ["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
|
|
100
|
+
for r in calc.predict_batch(smiles):
|
|
101
|
+
print(r["smiles"], r["logKow"], r["logKoa"], r["logKaw"])
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Predict from an InChI string or SDF file:**
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
r = calc.predict("InChI=1S/C4H10O/c1-2-3-4-5/h5H,2-4H2,1H3")
|
|
108
|
+
results = calc.predict("compounds.sdf") # returns list[dict]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Inspect model metadata:**
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
info = calc.model_info
|
|
115
|
+
print(info["logKow"])
|
|
116
|
+
# {'target': 'logKow', 'n_train': 3234, 'alpha': 51.8,
|
|
117
|
+
# 'r2_cv': 0.9221, 'rmse_cv': 0.5775, ...}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Re-fit on your own training data:**
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
import kawow
|
|
124
|
+
kawow.fit(
|
|
125
|
+
sdf_logkow="my_logkow.sdf",
|
|
126
|
+
sdf_logkoa="my_logkoa.sdf",
|
|
127
|
+
logkow_prop="logP",
|
|
128
|
+
logkoa_prop="logKoa",
|
|
129
|
+
)
|
|
130
|
+
calc = kawow.PartitionCalculator() # reload after fitting
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Performance (5-fold cross-validation on Naef & Acree training sets)
|
|
134
|
+
|
|
135
|
+
| Model | Property | n | R² (cv) | RMSE (cv) |
|
|
136
|
+
|-------|----------|---|---------|----------|
|
|
137
|
+
| `kawow` (Ridge) | log*K*ow | 3 234 | **0.922** | 0.578 |
|
|
138
|
+
| `kawow` (Ridge) | log*K*oa | 1 886 | **0.946** | 0.660 |
|
|
139
|
+
| `smarts_mixed` (hybrid) | log*K*ow | 3 234 | **0.962** | 0.403 |
|
|
140
|
+
| `smarts_mixed` (hybrid) | log*K*oa | 1 886 | **0.968** | 0.532 |
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## 2 — `NaefAcreePartitionCalculator` (SMARTS additivity, full transparency)
|
|
145
|
+
|
|
146
|
+
Implements the Naef & Acree 2024 method exactly: each SMARTS pattern from the paper's supplementary tables is matched against the molecule and its tabulated contribution added. No matrix regression — every contribution is directly interpretable.
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from kawow.smarts_model import NaefAcreePartitionCalculator
|
|
150
|
+
|
|
151
|
+
calc = NaefAcreePartitionCalculator(smiles="c1ccccc1")
|
|
152
|
+
result = calc.predict("c1ccccc1")
|
|
153
|
+
# {'logKow': 2.13, 'logKoa': 2.80, 'logKaw': -0.67, 'in_coverage': True}
|
|
154
|
+
|
|
155
|
+
# Or pass a pre-built RDKit mol:
|
|
156
|
+
from rdkit import Chem
|
|
157
|
+
mol = Chem.MolFromSmiles("CCCCCCCCCC")
|
|
158
|
+
result = calc.predict(mol)
|
|
159
|
+
|
|
160
|
+
# Batch via constructor:
|
|
161
|
+
calc_batch = NaefAcreePartitionCalculator(
|
|
162
|
+
smiles=["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
|
|
163
|
+
)
|
|
164
|
+
for mol, coeffs in calc_batch.results.items():
|
|
165
|
+
print(coeffs)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Performance on the Naef & Acree training sets
|
|
169
|
+
|
|
170
|
+
| Dataset | Property | n | R² | RMSE | MAE |
|
|
171
|
+
|---------|----------|---|-----|------|-----|
|
|
172
|
+
| S01 (Naef 2024) | log*K*ow | 3 344 | **0.857** | 0.786 | 0.543 |
|
|
173
|
+
| S02 (Naef 2024) | log*K*oa | 1 983 | **0.785** | 1.387 | 0.784 |
|
|
174
|
+
| Arp & Hale 2023 (SI) | log*K*ow | 687 | **0.644** | 1.138 | 0.686 |
|
|
175
|
+
|
|
176
|
+
The remaining error is concentrated in specific chemotypes (notably highly heteroatom-rich agrochemical scaffolds), while the broad SMARTS generalization and pi-environment fixes substantially improved overall log*K*oa performance on S02.
|
|
177
|
+
|
|
178
|
+
### Correlation plots
|
|
179
|
+
|
|
180
|
+
| log*K*ow vs Naef S01 | log*K*oa vs Naef S02 | log*K*ow vs Arp & Hale |
|
|
181
|
+
|:--------------------:|:--------------------:|:----------------------:|
|
|
182
|
+
|  |  |  |
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Feature engineering
|
|
187
|
+
|
|
188
|
+
Each molecule is represented by counts of SMARTS atom-type groups from the Naef & Acree parameter tables, plus the special-group descriptors listed below:
|
|
189
|
+
|
|
190
|
+
- **pi-neighbour moieties** — the number of conjugated systems adjacent to a centre atom (controls which entry in a pi-stratified table applies; computed by `count_conjugated_neighbor_moieties`)
|
|
191
|
+
- **H-acceptor binary presence** — 1 if any intramolecular H-bond donor/acceptor pair is within 5 bonds
|
|
192
|
+
- **Alkane flag** — 1 if the molecule is a pure saturated hydrocarbon
|
|
193
|
+
- **Unsaturated HC flag** — 1 if the molecule is a pure unsaturated hydrocarbon
|
|
194
|
+
- **Extra −COOH count** — number of carboxylic acid groups beyond the first
|
|
195
|
+
- **Endocyclic C−C single bond count**
|
|
196
|
+
|
|
197
|
+
The `PartitionCalculator` additionally uses 72 Crippen atom-type features (from RDKit's `Crippen.txt`) on top of the 6 special-group features named in `SPECIAL_GROUP_LABELS` (`kawow/atom_types.py`).
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Reference
|
|
202
|
+
|
|
203
|
+
Naef, Rudolf, and William E. Acree, Jr. 2024. "Calculation of the Three Partition Coefficients logPow, logKoa and logKaw of Organic Molecules at Standard Conditions at Once by Means of a Generally Applicable Group-Additivity Method." *Liquids* 4, no. 1: 231–260. [10.3390/liquids4010011](https://doi.org/10.3390/liquids4010011)
|
|
204
|
+
|
|
205
|
+
Arp, H.P.H. and Hale, S.E. 2023. "From Measured Partition Coefficients to the Prediction of Environmental Fate." Supplementary data: `vg2c00024_si_001` (ACS).
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
kawow
|
|
3
|
+
=========
|
|
4
|
+
Group-additivity prediction of logKow, logKoa and logKaw.
|
|
5
|
+
|
|
6
|
+
Quick start
|
|
7
|
+
-----------
|
|
8
|
+
>>> import kawow
|
|
9
|
+
>>> calc = kawow.PartitionCalculator() # loads pre-fitted JSON
|
|
10
|
+
>>> calc.predict("CCCCO")
|
|
11
|
+
{'logKow': 0.88, 'logKoa': 4.12, 'logKaw': -3.24, 'status': 'ok'}
|
|
12
|
+
|
|
13
|
+
Re-fitting (needs SDF training files)
|
|
14
|
+
--------------------------------------
|
|
15
|
+
>>> kawow.fit(sdf_logkow="S01.sdf", sdf_logkoa="S02.sdf")
|
|
16
|
+
>>> calc = kawow.PartitionCalculator() # reload after fitting
|
|
17
|
+
|
|
18
|
+
Reference
|
|
19
|
+
---------
|
|
20
|
+
R. Naef, W.E. Acree Jr., Liquids 4(1):231-260, 2024.
|
|
21
|
+
DOI: 10.3390/liquids4010011
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .model import PartitionCalculator, MQGPartitionCalculator, fit, fit_mqg, run_models
|
|
25
|
+
from .smarts_model import NaefAcreePartitionCalculator, NaefAcreeCrippenMixedPartitionCalculator
|
|
26
|
+
from .io import parse_input
|
|
27
|
+
from .features import compute_features
|
|
28
|
+
from .atom_types import FEATURE_LABELS, N_FEATURES
|
|
29
|
+
|
|
30
|
+
__version__ = "0.1.1"
|
|
31
|
+
__all__ = [
|
|
32
|
+
"PartitionCalculator",
|
|
33
|
+
"MQGPartitionCalculator",
|
|
34
|
+
"NaefAcreePartitionCalculator",
|
|
35
|
+
"NaefAcreeCrippenMixedPartitionCalculator",
|
|
36
|
+
"fit",
|
|
37
|
+
"fit_mqg",
|
|
38
|
+
"run_models",
|
|
39
|
+
"parse_input",
|
|
40
|
+
"compute_features",
|
|
41
|
+
"FEATURE_LABELS",
|
|
42
|
+
"N_FEATURES",
|
|
43
|
+
]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
atom_types.py
|
|
3
|
+
=============
|
|
4
|
+
Atom type definitions for the Naef group-additivity method.
|
|
5
|
+
|
|
6
|
+
We use RDKit's Crippen atom-type SMARTS as a proxy for Naef's atom types.
|
|
7
|
+
Both methods define atom types via atom-centred SMARTS that encode element,
|
|
8
|
+
local hybridization, and neighbour types — the same mathematical structure.
|
|
9
|
+
The Crippen *contributions* (logP values) are NOT used; only the SMARTS are
|
|
10
|
+
used to count atom occurrences. New ridge-regression coefficients are fitted
|
|
11
|
+
on the S01/S02 SDF training data (Naef & Acree, Liquids 2024).
|
|
12
|
+
|
|
13
|
+
Naef special groups (beyond atom counts):
|
|
14
|
+
SG_HB_INTRAMOL — number of intramolecular H-bond donor/acceptor pairs
|
|
15
|
+
within 5 bonds (penalises self-sequestered polarity)
|
|
16
|
+
SG_ALKANE — 1 if pure saturated hydrocarbon (C/H only, no rings)
|
|
17
|
+
SG_UNSAT_HC — 1 if pure unsaturated hydrocarbon (C/H only, π bonds)
|
|
18
|
+
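SG_EXTRA_COOH — number of -COOH groups beyond the first
SG_EXTRA_OH — listed in SPECIAL_GROUP_LABELS; presumably the number of -OH groups beyond the first (TODO: confirm against features.py)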
|
|
19
|
+
SG_ENDOCYCLIC_CC — number of endocyclic C-C single bonds
|
|
20
|
+
|
|
21
|
+
Atom types are parsed from the Crippen.txt data file shipped with RDKit.
|
|
22
|
+
Each unique ID (e.g. "C1", "N3") represents one atom type. Multiple SMARTS
|
|
23
|
+
rows with the same ID are all used during assignment, with earlier rows having
|
|
24
|
+
priority (matching RDKit's _pyGetAtomContribs logic).
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import os
|
|
28
|
+
from rdkit import Chem, RDConfig
|
|
29
|
+
|
|
30
|
+
# ── Parse Crippen.txt ─────────────────────────────────────────────────────────
|
|
31
|
+
# Try the installed RDKit data directory (RDConfig.RDDataDir) first, then fall back to a Crippen.txt in this package's data/ directory
|
|
32
|
+
_CRIPPEN_TXT_CANDIDATES = [
|
|
33
|
+
os.path.join(RDConfig.RDDataDir, "Crippen.txt"),
|
|
34
|
+
os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "Crippen.txt"),
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
def _parse_crippen_txt() -> tuple[list[str], dict[str, list]]:
    """
    Read the Crippen atom-type table from the first existing candidate file.

    The candidates in ``_CRIPPEN_TXT_CANDIDATES`` are probed in order and the
    first path that exists on disk is parsed as a tab-separated file.

    Returns a 2-tuple ``(order, patts)``:
      - order : unique atom-type IDs, in the order they first appear
      - patts : dict {id -> [(smarts_str, compiled_mol), ...]} in file order

    Raises FileNotFoundError when none of the candidate paths exists.
    """
    txt_path = next(
        (path for path in _CRIPPEN_TXT_CANDIDATES if os.path.exists(path)),
        None,
    )
    if txt_path is None:
        tried = "\n".join(f" {c}" for c in _CRIPPEN_TXT_CANDIDATES)
        raise FileNotFoundError("Cannot find Crippen.txt. Tried:\n" + tried)

    order: list[str] = []
    patts: dict[str, list] = {}
    with open(txt_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            # Lines starting with '#' are comments.
            if raw.startswith("#"):
                continue
            fields = raw.split("\t")
            # A usable row has at least four tab-separated columns and a
            # non-blank ID in the first one.
            if len(fields) < 4:
                continue
            type_id = fields[0].strip()
            if not type_id:
                continue
            smarts = fields[1].strip().replace('"', '')
            # Skip rows without a pattern and any un-commented header row.
            if not smarts or smarts == "SMARTS":
                continue
            query = Chem.MolFromSmarts(smarts)
            # Patterns RDKit cannot compile are dropped silently.
            if query is None:
                continue
            # `patts` gains a key exactly when `order` gains an entry, so the
            # dict doubles as the "already seen" set.
            if type_id not in patts:
                order.append(type_id)
                patts[type_id] = []
            patts[type_id].append((smarts, query))
    return order, patts
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Parsed at module import time (fast: just reads a ~4 KB text file once)
|
|
77
|
+
CRIPPEN_ORDER, CRIPPEN_PATTS = _parse_crippen_txt()
|
|
78
|
+
|
|
79
|
+
# ── Public constants ──────────────────────────────────────────────────────────
|
|
80
|
+
CRIPPEN_LABELS: list[str] = CRIPPEN_ORDER # e.g. ["C1", "C2", ..., "Me2"]
|
|
81
|
+
N_ATOM_TYPES: int = len(CRIPPEN_LABELS)
|
|
82
|
+
|
|
83
|
+
# Special group names (appended after atom-type counts in feature vector)
|
|
84
|
+
SPECIAL_GROUP_LABELS: list[str] = [
|
|
85
|
+
"SG_HB_INTRAMOL",
|
|
86
|
+
"SG_ALKANE",
|
|
87
|
+
"SG_UNSAT_HC",
|
|
88
|
+
"SG_EXTRA_COOH",
|
|
89
|
+
"SG_EXTRA_OH",
|
|
90
|
+
"SG_ENDOCYCLIC_CC",
|
|
91
|
+
]
|
|
92
|
+
|
|
93
|
+
N_SPECIAL_GROUPS: int = len(SPECIAL_GROUP_LABELS)
|
|
94
|
+
FEATURE_LABELS: list[str] = CRIPPEN_LABELS + SPECIAL_GROUP_LABELS
|
|
95
|
+
N_FEATURES: int = N_ATOM_TYPES + N_SPECIAL_GROUPS
|