kawow 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kawow-0.1.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Luc Miaz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
kawow-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,238 @@
1
+ Metadata-Version: 2.4
2
+ Name: kawow
3
+ Version: 0.1.1
4
+ Summary: Models for partition coefficients (logKow, logKoa and logKaw) from molecular structure, including a model based on Naef & Acree 2024 (https://doi.org/10.3390/liquids4010011)
5
+ Author: Luc T. Miaz
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/LucMiaz/kawow
8
+ Keywords: cheminformatics,logKow,logKoa,logKaw,partition coefficient,QSPR
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: numpy>=1.24
13
+ Requires-Dist: scikit-learn>=1.3
14
+ Requires-Dist: rdkit>=2022.9
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest; extra == "dev"
17
+ Requires-Dist: pytest-cov; extra == "dev"
18
+ Requires-Dist: matplotlib; extra == "dev"
19
+ Requires-Dist: pandas; extra == "dev"
20
+ Provides-Extra: lint
21
+ Requires-Dist: pylint>=3.0; extra == "lint"
22
+ Provides-Extra: docs
23
+ Requires-Dist: sphinx>=7.0; extra == "docs"
24
+ Requires-Dist: sphinx-rtd-theme>=2.0; extra == "docs"
25
+ Requires-Dist: myst-parser>=2.0; extra == "docs"
26
+ Requires-Dist: sphinx-autodoc-typehints>=1.24; extra == "docs"
27
+ Requires-Dist: numpydoc>=1.6; extra == "docs"
28
+ Dynamic: license-file
29
+
30
+ # Kawow (under development)
31
+
32
+ [![CI](https://github.com/LucMiaz/kawow/actions/workflows/ci.yml/badge.svg)](https://github.com/LucMiaz/kawow/actions/workflows/ci.yml)
33
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/)
34
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
35
+
36
+ ![Kawow!](logo/kawow.svg)
37
+
38
+ Group-additivity prediction of **log*K*ow**, **log*K*oa**, and **log*K*aw** from molecular structure.
39
+
40
+ *Kawow* implements models that predict partition coefficients (`logKoa`, `logKow` and `logKaw`), in particular the Naef & Acree (2024) group-additivity scheme using RDKit SMARTS pattern matching. Four models are available (see *Models at a glance* below); which one to use depends on how much transparency or accuracy is required.
41
+
42
+ ### Flagging criteria used in outputs
43
+
44
+ `run_models(...)` returns B/vB and M/vM flags derived from predicted partition values:
45
+
46
+ - `B`: `logKoa >= 6` and `logKow >= 2`
47
+ - `vB`: `logKoa >= 6` and `logKow >= 5`
48
+
49
+ based on `doi:10.1126/science.1138275`.
50
+
51
+ Mobility is computed via an estimated sorption relation:
52
+
53
+ - `logKoc_est = logKow - 0.4`
54
+ - `M`: `logKoc_est <= 4.5`
55
+ - `vM`: `logKoc_est <= 3.5`
56
+
57
+ following UBA drinking-water source protection guidance:
58
+
59
+ - https://www.umweltbundesamt.de/en/publikationen/protecting-the-sources-of-our-drinking-water-the
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install kawow
67
+ ```
68
+
69
+ Or from source (requires RDKit ≥ 2022.9):
70
+
71
+ ```bash
72
+ git clone https://github.com/LucMiaz/kawow.git
73
+ cd kawow
74
+ pip install -e ".[dev]"
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Models at a glance
80
+
81
+ | Model key | Class | Approach | log*K*ow R² | log*K*oa R² |
82
+ |-----------|-------|----------|-------------|-------------|
83
+ | `kawow` | `PartitionCalculator` | Ridge regression on Crippen + Naef special-group features | 0.922 (cv) | 0.946 (cv) |
84
+ | `smarts` | `NaefAcreePartitionCalculator` | Pure Naef & Acree 2024 group-additivity (no refitting) | 0.857 (S01) | 0.785 (S02) |
85
+ | `smarts_mixed` | `NaefAcreeCrippenMixedPartitionCalculator` | Naef & Acree additivity + Crippen Ridge hybrid | 0.962 (cv) | 0.968 (cv) |
86
+ | `mqg` | `MQGPartitionCalculator` | Random forest on Molecular Quantum Graph fingerprints | 0.881 (cv) | 0.945 (cv) |
87
+
88
+ Use `run_models()` to run several models at once and get per-molecule B/vB and M/vM flags:
89
+
90
+ ```python
91
+ import kawow
92
+
93
+ results = kawow.run_models(
94
+ ["CCCCO", "c1ccccc1", "OC(=O)c1ccccc1"],
95
+ models=["kawow", "smarts_mixed"],
96
+ )
97
+ for row in results:
98
+ print(row["smiles"], row["models"]["kawow"]["logKow"],
99
+ row["models"]["kawow"]["b_class"])
100
+ ```
101
+
102
+ Each element of the returned list is a `dict` with:
103
+
104
+ | Key | Description |
105
+ |-----|-------------|
106
+ | `smiles` | canonical SMILES |
107
+ | `name` | molecule name from input |
108
+ | `models` | `dict` keyed by model name; each value contains `logKow`, `logKoa`, `logKaw`, `b_class`, `m_class`, `ok` |
109
+ | `ok` | `True` if at least one model succeeded |
110
+
111
+ ---
112
+
113
+ ## 1 — `PartitionCalculator` (recommended for most uses)
114
+
115
+ Ridge regression fitted on the Naef & Acree (2024) S01 (log*K*ow) and S02 (log*K*oa) training datasets. Coefficients are stored in `kawow/data/*.json`, so no re-fitting is needed at import time.
116
+
117
+ ```python
118
+ from kawow import PartitionCalculator
119
+
120
+ calc = PartitionCalculator() # Ridge (default)
121
+
122
+ # Single molecule from SMILES
123
+ result = calc.predict("CCCCO") # 1-butanol
124
+ print(result)
125
+ # {'logKow': 0.88, 'logKoa': 4.12, 'logKaw': -3.24, 'status': 'ok'}
126
+
127
+ # Batch prediction
128
+ smiles = ["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
129
+ for r in calc.predict_batch(smiles):
130
+ print(r["smiles"], r["logKow"], r["logKoa"], r["logKaw"])
131
+ ```
132
+
133
+ **Predict from an InChI string or SDF file:**
134
+
135
+ ```python
136
+ r = calc.predict("InChI=1S/C4H10O/c1-2-3-4-5/h5H,2-4H2,1H3")
137
+ results = calc.predict("compounds.sdf") # returns list[dict]
138
+ ```
139
+
140
+ **Inspect model metadata:**
141
+
142
+ ```python
143
+ info = calc.model_info
144
+ print(info["logKow"])
145
+ # {'target': 'logKow', 'n_train': 3234, 'alpha': 51.8,
146
+ # 'r2_cv': 0.9221, 'rmse_cv': 0.5775, ...}
147
+ ```
148
+
149
+ **Re-fit on your own training data:**
150
+
151
+ ```python
152
+ import kawow
153
+ kawow.fit(
154
+ sdf_logkow="my_logkow.sdf",
155
+ sdf_logkoa="my_logkoa.sdf",
156
+ logkow_prop="logP",
157
+ logkoa_prop="logKoa",
158
+ )
159
+ calc = kawow.PartitionCalculator() # reload after fitting
160
+ ```
161
+
162
+ ### Performance (5-fold cross-validation on Naef & Acree training sets)
163
+
164
+ | Model | Property | n | R² (cv) | RMSE (cv) |
165
+ |-------|----------|---|---------|----------|
166
+ | `kawow` (Ridge) | log*K*ow | 3 234 | **0.922** | 0.578 |
167
+ | `kawow` (Ridge) | log*K*oa | 1 886 | **0.946** | 0.660 |
168
+ | `smarts_mixed` (hybrid) | log*K*ow | 3 234 | **0.962** | 0.403 |
169
+ | `smarts_mixed` (hybrid) | log*K*oa | 1 886 | **0.968** | 0.532 |
170
+
171
+ ---
172
+
173
+ ## 2 — `NaefAcreePartitionCalculator` (SMARTS additivity, full transparency)
174
+
175
+ Implements the Naef & Acree 2024 method exactly: each SMARTS pattern from the paper's supplementary tables is matched against the molecule and its tabulated contribution added. No matrix regression — every contribution is directly interpretable.
176
+
177
+ ```python
178
+ from kawow.smarts_model import NaefAcreePartitionCalculator
179
+
180
+ calc = NaefAcreePartitionCalculator(smiles="c1ccccc1")
181
+ result = calc.predict("c1ccccc1")
182
+ # {'logKow': 2.13, 'logKoa': 2.80, 'logKaw': -0.67, 'in_coverage': True}
183
+
184
+ # Or pass a pre-built RDKit mol:
185
+ from rdkit import Chem
186
+ mol = Chem.MolFromSmiles("CCCCCCCCCC")
187
+ result = calc.predict(mol)
188
+
189
+ # Batch via constructor:
190
+ calc_batch = NaefAcreePartitionCalculator(
191
+ smiles=["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
192
+ )
193
+ for mol, coeffs in calc_batch.results.items():
194
+ print(coeffs)
195
+ ```
196
+
197
+ ### Performance on the Naef & Acree training sets
198
+
199
+ | Dataset | Property | n | R² | RMSE | MAE |
200
+ |---------|----------|---|-----|------|-----|
201
+ | S01 (Naef 2024) | log*K*ow | 3 344 | **0.857** | 0.786 | 0.543 |
202
+ | S02 (Naef 2024) | log*K*oa | 1 983 | **0.785** | 1.387 | 0.784 |
203
+ | Arp & Hale 2023 (SI) | log*K*ow | 687 | **0.644** | 1.138 | 0.686 |
204
+
205
+ The remaining error is concentrated in specific chemotypes (notably highly heteroatom-rich agrochemical scaffolds), while the broad SMARTS generalization and pi-environment fixes substantially improved overall log*K*oa performance on S02.
206
+
207
+ ### Correlation plots
208
+
209
+ | log*K*ow vs Naef S01 | log*K*oa vs Naef S02 | log*K*ow vs Arp & Hale |
210
+ |:--------------------:|:--------------------:|:----------------------:|
211
+ | ![logKow vs S01](docs/imgs/corr_smarts_kow_vs_s01.png) | ![logKoa vs S02](docs/imgs/corr_smarts_koa_vs_s02.png) | ![logKow vs Excel](docs/imgs/corr_smarts_kow_vs_excel.png) |
212
+
213
+ ---
214
+
215
+ ## Feature engineering
216
+
217
+ Each molecule is represented by counts of SMARTS atom-type groups from the Naef & Acree parameter tables, plus the following six special-group descriptors:
218
+
219
+ - **pi-neighbour moieties** — the number of conjugated systems adjacent to a centre atom (controls which entry in a pi-stratified table applies; computed by `count_conjugated_neighbor_moieties`)
220
+ - **H-acceptor binary presence** — 1 if any intramolecular H-bond donor/acceptor pair is within 5 bonds
221
+ - **Alkane flag** — 1 if the molecule is a pure saturated hydrocarbon
222
+ - **Unsaturated HC flag** — 1 if the molecule is a pure unsaturated hydrocarbon
223
+ - **Extra −COOH count** — number of carboxylic acid groups beyond the first
224
+ - **Endocyclic C−C single bond count**
225
+
226
+ The `PartitionCalculator` additionally uses 72 Crippen atom-type features (from RDKit's `Crippen.txt`) on top of the 6 Naef special groups.
227
+
228
+ ---
229
+
230
+ ## Reference
231
+
232
+ Naef, Rudolf, and William E. Acree, Jr. 2024. "Calculation of the Three Partition Coefficients logPow, logKoa and logKaw of Organic Molecules at Standard Conditions at Once by Means of a Generally Applicable Group-Additivity Method." *Liquids* 4, no. 1: 231–260. [10.3390/liquids4010011](https://doi.org/10.3390/liquids4010011)
233
+
234
+ Arp, H.P.H. and Hale, S.E. 2023. "From Measured Partition Coefficients to the Prediction of Environmental Fate." Supplementary data: `vg2c00024_si_001` (ACS).
235
+
236
+ ## License
237
+
238
+ MIT
kawow-0.1.1/README.md ADDED
@@ -0,0 +1,209 @@
1
+ # Kawow (under development)
2
+
3
+ [![CI](https://github.com/LucMiaz/kawow/actions/workflows/ci.yml/badge.svg)](https://github.com/LucMiaz/kawow/actions/workflows/ci.yml)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
6
+
7
+ ![Kawow!](logo/kawow.svg)
8
+
9
+ Group-additivity prediction of **log*K*ow**, **log*K*oa**, and **log*K*aw** from molecular structure.
10
+
11
+ *Kawow* implements models that predict partition coefficients (`logKoa`, `logKow` and `logKaw`), in particular the Naef & Acree (2024) group-additivity scheme using RDKit SMARTS pattern matching. Four models are available (see *Models at a glance* below); which one to use depends on how much transparency or accuracy is required.
12
+
13
+ ### Flagging criteria used in outputs
14
+
15
+ `run_models(...)` returns B/vB and M/vM flags derived from predicted partition values:
16
+
17
+ - `B`: `logKoa >= 6` and `logKow >= 2`
18
+ - `vB`: `logKoa >= 6` and `logKow >= 5`
19
+
20
+ based on `doi:10.1126/science.1138275`.
21
+
22
+ Mobility is computed via an estimated sorption relation:
23
+
24
+ - `logKoc_est = logKow - 0.4`
25
+ - `M`: `logKoc_est <= 4.5`
26
+ - `vM`: `logKoc_est <= 3.5`
27
+
28
+ following UBA drinking-water source protection guidance:
29
+
30
+ - https://www.umweltbundesamt.de/en/publikationen/protecting-the-sources-of-our-drinking-water-the
31
+
32
+ ---
33
+
34
+ ## Installation
35
+
36
+ ```bash
37
+ pip install kawow
38
+ ```
39
+
40
+ Or from source (requires RDKit ≥ 2022.9):
41
+
42
+ ```bash
43
+ git clone https://github.com/LucMiaz/kawow.git
44
+ cd kawow
45
+ pip install -e ".[dev]"
46
+ ```
47
+
48
+ ---
49
+
50
+ ## Models at a glance
51
+
52
+ | Model key | Class | Approach | log*K*ow R² | log*K*oa R² |
53
+ |-----------|-------|----------|-------------|-------------|
54
+ | `kawow` | `PartitionCalculator` | Ridge regression on Crippen + Naef special-group features | 0.922 (cv) | 0.946 (cv) |
55
+ | `smarts` | `NaefAcreePartitionCalculator` | Pure Naef & Acree 2024 group-additivity (no refitting) | 0.857 (S01) | 0.785 (S02) |
56
+ | `smarts_mixed` | `NaefAcreeCrippenMixedPartitionCalculator` | Naef & Acree additivity + Crippen Ridge hybrid | 0.962 (cv) | 0.968 (cv) |
57
+ | `mqg` | `MQGPartitionCalculator` | Random forest on Molecular Quantum Graph fingerprints | 0.881 (cv) | 0.945 (cv) |
58
+
59
+ Use `run_models()` to run several models at once and get per-molecule B/vB and M/vM flags:
60
+
61
+ ```python
62
+ import kawow
63
+
64
+ results = kawow.run_models(
65
+ ["CCCCO", "c1ccccc1", "OC(=O)c1ccccc1"],
66
+ models=["kawow", "smarts_mixed"],
67
+ )
68
+ for row in results:
69
+ print(row["smiles"], row["models"]["kawow"]["logKow"],
70
+ row["models"]["kawow"]["b_class"])
71
+ ```
72
+
73
+ Each element of the returned list is a `dict` with:
74
+
75
+ | Key | Description |
76
+ |-----|-------------|
77
+ | `smiles` | canonical SMILES |
78
+ | `name` | molecule name from input |
79
+ | `models` | `dict` keyed by model name; each value contains `logKow`, `logKoa`, `logKaw`, `b_class`, `m_class`, `ok` |
80
+ | `ok` | `True` if at least one model succeeded |
81
+
82
+ ---
83
+
84
+ ## 1 — `PartitionCalculator` (recommended for most uses)
85
+
86
+ Ridge regression fitted on the Naef & Acree (2024) S01 (log*K*ow) and S02 (log*K*oa) training datasets. Coefficients are stored in `kawow/data/*.json`, so no re-fitting is needed at import time.
87
+
88
+ ```python
89
+ from kawow import PartitionCalculator
90
+
91
+ calc = PartitionCalculator() # Ridge (default)
92
+
93
+ # Single molecule from SMILES
94
+ result = calc.predict("CCCCO") # 1-butanol
95
+ print(result)
96
+ # {'logKow': 0.88, 'logKoa': 4.12, 'logKaw': -3.24, 'status': 'ok'}
97
+
98
+ # Batch prediction
99
+ smiles = ["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
100
+ for r in calc.predict_batch(smiles):
101
+ print(r["smiles"], r["logKow"], r["logKoa"], r["logKaw"])
102
+ ```
103
+
104
+ **Predict from an InChI string or SDF file:**
105
+
106
+ ```python
107
+ r = calc.predict("InChI=1S/C4H10O/c1-2-3-4-5/h5H,2-4H2,1H3")
108
+ results = calc.predict("compounds.sdf") # returns list[dict]
109
+ ```
110
+
111
+ **Inspect model metadata:**
112
+
113
+ ```python
114
+ info = calc.model_info
115
+ print(info["logKow"])
116
+ # {'target': 'logKow', 'n_train': 3234, 'alpha': 51.8,
117
+ # 'r2_cv': 0.9221, 'rmse_cv': 0.5775, ...}
118
+ ```
119
+
120
+ **Re-fit on your own training data:**
121
+
122
+ ```python
123
+ import kawow
124
+ kawow.fit(
125
+ sdf_logkow="my_logkow.sdf",
126
+ sdf_logkoa="my_logkoa.sdf",
127
+ logkow_prop="logP",
128
+ logkoa_prop="logKoa",
129
+ )
130
+ calc = kawow.PartitionCalculator() # reload after fitting
131
+ ```
132
+
133
+ ### Performance (5-fold cross-validation on Naef & Acree training sets)
134
+
135
+ | Model | Property | n | R² (cv) | RMSE (cv) |
136
+ |-------|----------|---|---------|----------|
137
+ | `kawow` (Ridge) | log*K*ow | 3 234 | **0.922** | 0.578 |
138
+ | `kawow` (Ridge) | log*K*oa | 1 886 | **0.946** | 0.660 |
139
+ | `smarts_mixed` (hybrid) | log*K*ow | 3 234 | **0.962** | 0.403 |
140
+ | `smarts_mixed` (hybrid) | log*K*oa | 1 886 | **0.968** | 0.532 |
141
+
142
+ ---
143
+
144
+ ## 2 — `NaefAcreePartitionCalculator` (SMARTS additivity, full transparency)
145
+
146
+ Implements the Naef & Acree 2024 method exactly: each SMARTS pattern from the paper's supplementary tables is matched against the molecule and its tabulated contribution added. No matrix regression — every contribution is directly interpretable.
147
+
148
+ ```python
149
+ from kawow.smarts_model import NaefAcreePartitionCalculator
150
+
151
+ calc = NaefAcreePartitionCalculator(smiles="c1ccccc1")
152
+ result = calc.predict("c1ccccc1")
153
+ # {'logKow': 2.13, 'logKoa': 2.80, 'logKaw': -0.67, 'in_coverage': True}
154
+
155
+ # Or pass a pre-built RDKit mol:
156
+ from rdkit import Chem
157
+ mol = Chem.MolFromSmiles("CCCCCCCCCC")
158
+ result = calc.predict(mol)
159
+
160
+ # Batch via constructor:
161
+ calc_batch = NaefAcreePartitionCalculator(
162
+ smiles=["c1ccccc1", "CCCCCCCCCC", "OC(=O)c1ccccc1"]
163
+ )
164
+ for mol, coeffs in calc_batch.results.items():
165
+ print(coeffs)
166
+ ```
167
+
168
+ ### Performance on the Naef & Acree training sets
169
+
170
+ | Dataset | Property | n | R² | RMSE | MAE |
171
+ |---------|----------|---|-----|------|-----|
172
+ | S01 (Naef 2024) | log*K*ow | 3 344 | **0.857** | 0.786 | 0.543 |
173
+ | S02 (Naef 2024) | log*K*oa | 1 983 | **0.785** | 1.387 | 0.784 |
174
+ | Arp & Hale 2023 (SI) | log*K*ow | 687 | **0.644** | 1.138 | 0.686 |
175
+
176
+ The remaining error is concentrated in specific chemotypes (notably highly heteroatom-rich agrochemical scaffolds), while the broad SMARTS generalization and pi-environment fixes substantially improved overall log*K*oa performance on S02.
177
+
178
+ ### Correlation plots
179
+
180
+ | log*K*ow vs Naef S01 | log*K*oa vs Naef S02 | log*K*ow vs Arp & Hale |
181
+ |:--------------------:|:--------------------:|:----------------------:|
182
+ | ![logKow vs S01](docs/imgs/corr_smarts_kow_vs_s01.png) | ![logKoa vs S02](docs/imgs/corr_smarts_koa_vs_s02.png) | ![logKow vs Excel](docs/imgs/corr_smarts_kow_vs_excel.png) |
183
+
184
+ ---
185
+
186
+ ## Feature engineering
187
+
188
+ Each molecule is represented by counts of SMARTS atom-type groups from the Naef & Acree parameter tables, plus the following six special-group descriptors:
189
+
190
+ - **pi-neighbour moieties** — the number of conjugated systems adjacent to a centre atom (controls which entry in a pi-stratified table applies; computed by `count_conjugated_neighbor_moieties`)
191
+ - **H-acceptor binary presence** — 1 if any intramolecular H-bond donor/acceptor pair is within 5 bonds
192
+ - **Alkane flag** — 1 if the molecule is a pure saturated hydrocarbon
193
+ - **Unsaturated HC flag** — 1 if the molecule is a pure unsaturated hydrocarbon
194
+ - **Extra −COOH count** — number of carboxylic acid groups beyond the first
195
+ - **Endocyclic C−C single bond count**
196
+
197
+ The `PartitionCalculator` additionally uses 72 Crippen atom-type features (from RDKit's `Crippen.txt`) on top of the 6 Naef special groups.
198
+
199
+ ---
200
+
201
+ ## Reference
202
+
203
+ Naef, Rudolf, and William E. Acree, Jr. 2024. "Calculation of the Three Partition Coefficients logPow, logKoa and logKaw of Organic Molecules at Standard Conditions at Once by Means of a Generally Applicable Group-Additivity Method." *Liquids* 4, no. 1: 231–260. [10.3390/liquids4010011](https://doi.org/10.3390/liquids4010011)
204
+
205
+ Arp, H.P.H. and Hale, S.E. 2023. "From Measured Partition Coefficients to the Prediction of Environmental Fate." Supplementary data: `vg2c00024_si_001` (ACS).
206
+
207
+ ## License
208
+
209
+ MIT
@@ -0,0 +1,43 @@
1
+ """
2
+ kawow
3
+ =========
4
+ Group-additivity prediction of logKow, logKoa and logKaw.
5
+
6
+ Quick start
7
+ -----------
8
+ >>> import kawow
9
+ >>> calc = kawow.PartitionCalculator() # loads pre-fitted JSON
10
+ >>> calc.predict("CCCCO")
11
+ {'logKow': 0.88, 'logKoa': 4.12, 'logKaw': -3.24, 'status': 'ok'}
12
+
13
+ Re-fitting (needs SDF training files)
14
+ --------------------------------------
15
+ >>> kawow.fit(sdf_logkow="S01.sdf", sdf_logkoa="S02.sdf")
16
+ >>> calc = kawow.PartitionCalculator() # reload after fitting
17
+
18
+ Reference
19
+ ---------
20
+ R. Naef, W.E. Acree Jr., Liquids 4(1):231-260, 2024.
21
+ DOI: 10.3390/liquids4010011
22
+ """
23
+
24
+ from .model import PartitionCalculator, MQGPartitionCalculator, fit, fit_mqg, run_models
25
+ from .smarts_model import NaefAcreePartitionCalculator, NaefAcreeCrippenMixedPartitionCalculator
26
+ from .io import parse_input
27
+ from .features import compute_features
28
+ from .atom_types import FEATURE_LABELS, N_FEATURES
29
+
30
+ __version__ = "0.1.1"  # version string of this package release
31
+ __all__ = [  # names exported by ``from kawow import *``
32
+ "PartitionCalculator",  # imported from .model
33
+ "MQGPartitionCalculator",  # imported from .model
34
+ "NaefAcreePartitionCalculator",  # imported from .smarts_model
35
+ "NaefAcreeCrippenMixedPartitionCalculator",  # imported from .smarts_model
36
+ "fit",  # imported from .model
37
+ "fit_mqg",  # imported from .model
38
+ "run_models",  # imported from .model
39
+ "parse_input",  # imported from .io
40
+ "compute_features",  # imported from .features
41
+ "FEATURE_LABELS",  # imported from .atom_types
42
+ "N_FEATURES",  # imported from .atom_types
43
+ ]
@@ -0,0 +1,95 @@
1
+ """
2
+ atom_types.py
3
+ =============
4
+ Atom type definitions for the Naef group-additivity method.
5
+
6
+ We use RDKit's Crippen atom-type SMARTS as a proxy for Naef's atom types.
7
+ Both methods define atom types via atom-centred SMARTS that encode element,
8
+ local hybridization, and neighbour types — the same mathematical structure.
9
+ The Crippen *contributions* (logP values) are NOT used; only the SMARTS are
10
+ used to count atom occurrences. New ridge-regression coefficients are fitted
11
+ on the S01/S02 SDF training data (Naef & Acree, Liquids 2024).
12
+
13
+ Naef special groups (beyond atom counts):
14
+ SG_HB_INTRAMOL — number of intramolecular H-bond donor/acceptor pairs
15
+ within 5 bonds (penalises self-sequestered polarity)
16
+ SG_ALKANE — 1 if pure saturated hydrocarbon (C/H only, no rings)
17
+ SG_UNSAT_HC — 1 if pure unsaturated hydrocarbon (C/H only, π bonds)
18
+ SG_EXTRA_COOH — number of -COOH groups beyond the first
19
+ SG_ENDOCYCLIC_CC — number of endocyclic C-C single bonds
20
+
21
+ Atom types are parsed from the Crippen.txt data file shipped with RDKit.
22
+ Each unique ID (e.g. "C1", "N3") represents one atom type. Multiple SMARTS
23
+ rows with the same ID are all used during assignment, with earlier rows having
24
+ priority (matching RDKit's _pyGetAtomContribs logic).
25
+ """
26
+
27
+ import os
28
+ from rdkit import Chem, RDConfig
29
+
30
+ # ── Parse Crippen.txt ─────────────────────────────────────────────────────────
31
+ # Candidate locations for Crippen.txt, probed in list order: RDKit's installed data directory (RDConfig.RDDataDir) first, then "data/Crippen.txt" next to this module.
32
+ _CRIPPEN_TXT_CANDIDATES = [
33
+ os.path.join(RDConfig.RDDataDir, "Crippen.txt"),
34
+ os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "Crippen.txt"),
35
+ ]
36
+
37
+ def _parse_crippen_txt() -> tuple[list[str], dict[str, list]]:
38
+ """
39
+ Parse Crippen.txt into:
40
+ - order : list of unique atom-type IDs in priority order
41
+ - patts : dict {id -> [(smarts_str, compiled_mol), ...]}
42
+ """
43
+ txt_path = None
44
+ for candidate in _CRIPPEN_TXT_CANDIDATES:
45
+ if os.path.exists(candidate):
46
+ txt_path = candidate
47
+ break
48
+ if txt_path is None:
49
+ raise FileNotFoundError(
50
+ "Cannot find Crippen.txt. Tried:\n" +
51
+ "\n".join(f" {c}" for c in _CRIPPEN_TXT_CANDIDATES)
52
+ )
53
+
54
+ order: list[str] = []
55
+ patts: dict[str, list] = {}
56
+ with open(txt_path, "r", encoding="utf-8") as f:
57
+ for line in f:
58
+ if line.startswith("#"):
59
+ continue
60
+ cols = line.split("\t")
61
+ if len(cols) < 4 or not cols[0].strip():
62
+ continue
63
+ type_id = cols[0].strip()
64
+ smarts = cols[1].strip().replace('"', '')
65
+ if smarts in ("", "SMARTS"):
66
+ continue
67
+ mol = Chem.MolFromSmarts(smarts)
68
+ if mol is None:
69
+ continue
70
+ if type_id not in order:
71
+ order.append(type_id)
72
+ patts.setdefault(type_id, []).append((smarts, mol))
73
+ return order, patts
74
+
75
+
76
+ # Parsed at module import time (fast: just reads a ~4 KB text file once); a missing Crippen.txt therefore raises FileNotFoundError on import.
77
+ CRIPPEN_ORDER, CRIPPEN_PATTS = _parse_crippen_txt()
78
+
79
+ # ── Public constants ──────────────────────────────────────────────────────────
80
+ CRIPPEN_LABELS: list[str] = CRIPPEN_ORDER # same list object as CRIPPEN_ORDER (alias, not a copy); e.g. ["C1", "C2", ..., "Me2"]
81
+ N_ATOM_TYPES: int = len(CRIPPEN_LABELS)  # number of distinct atom-type IDs parsed from Crippen.txt
82
+
83
+ # Special group names (appended after atom-type counts in feature vector)
84
+ SPECIAL_GROUP_LABELS: list[str] = [
85
+ "SG_HB_INTRAMOL",
86
+ "SG_ALKANE",
87
+ "SG_UNSAT_HC",
88
+ "SG_EXTRA_COOH",
89
+ "SG_EXTRA_OH",  # NOTE(review): not described in the module docstring's special-group list — confirm its definition
90
+ "SG_ENDOCYCLIC_CC",
91
+ ]
92
+
93
+ N_SPECIAL_GROUPS: int = len(SPECIAL_GROUP_LABELS)
94
+ FEATURE_LABELS: list[str] = CRIPPEN_LABELS + SPECIAL_GROUP_LABELS  # new list: atom-type labels followed by special-group labels
95
+ N_FEATURES: int = N_ATOM_TYPES + N_SPECIAL_GROUPS  # equals len(FEATURE_LABELS)