tyche-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tyche_core-0.1.0/LICENSE +21 -0
- tyche_core-0.1.0/PKG-INFO +9 -0
- tyche_core-0.1.0/README.md +576 -0
- tyche_core-0.1.0/pyproject.toml +21 -0
- tyche_core-0.1.0/setup.cfg +4 -0
- tyche_core-0.1.0/tests/test.py +31 -0
- tyche_core-0.1.0/tyche/__init__.py +31 -0
- tyche_core-0.1.0/tyche/exceptions.py +5 -0
- tyche_core-0.1.0/tyche/main.py +33 -0
- tyche_core-0.1.0/tyche/randomization.py +902 -0
- tyche_core-0.1.0/tyche/randomize.py +1197 -0
- tyche_core-0.1.0/tyche_core.egg-info/PKG-INFO +9 -0
- tyche_core-0.1.0/tyche_core.egg-info/SOURCES.txt +14 -0
- tyche_core-0.1.0/tyche_core.egg-info/dependency_links.txt +1 -0
- tyche_core-0.1.0/tyche_core.egg-info/requires.txt +1 -0
- tyche_core-0.1.0/tyche_core.egg-info/top_level.txt +1 -0
tyche_core-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Robert Pollice
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tyche-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight SMILES randomization using SELFIES-based molecular graph traversal.
|
|
5
|
+
Project-URL: Repository, https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: selfies<=2.1.2,>=2.0.0
|
|
9
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="./images/tyche_logo.svg" alt="logo"></img>
|
|
3
|
+
</div>
|
|
4
|
+
|
|
5
|
+
# TYCHE
|
|
6
|
+
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
[Commit activity](https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche/graphs/commit-activity)
|
|
9
|
+
|
|
10
|
+
Main developers: [Robert Pollice](https://git.lwp.rug.nl/robpollice), [AkshatKumar Nigam](https://github.com/akshat998)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
A Python toolkit for SMILES randomization, median molecule generation, local chemical subspace construction, and evolutionary molecular optimization — core tools for molecular machine learning and cheminformatics research.
|
|
15
|
+
|
|
16
|
+
This repository contains **two pip-installable packages**:
|
|
17
|
+
|
|
18
|
+
| Package | Purpose | Dependencies |
|
|
19
|
+
|---|---|---|
|
|
20
|
+
| **tyche-core** | Lightweight SMILES randomization | `selfies` only |
|
|
21
|
+
| **tyche-tools** | Extended cheminformatics: median molecules, subspace enumeration, optimization | `tyche-core`, `selfies`, `rdkit`, `numpy` (optional: `torch`, `pyyaml`) |
|
|
22
|
+
|
|
23
|
+
**tyche-core** is the minimal core — it provides fast, high-quality SMILES randomization with no RDKit dependency. If all you need is data augmentation via randomized SMILES, install `tyche-core` alone.
|
|
24
|
+
|
|
25
|
+
**tyche-tools** is the extended toolkit — it provides median molecule generation, local chemical subspace construction, and evolutionary molecular optimization. It depends on `tyche-core` for randomization and adds RDKit, NumPy, and optionally PyTorch for the full feature set.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Project status
|
|
30
|
+
This package is still under development, and several planned features are yet to come. Nevertheless, the basic functionality is considered feature-complete. We welcome community contributions and new feature requests.
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### Core package only (randomization)
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install tyche-core
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or from source:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
git clone https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche.git
|
|
45
|
+
cd tyche
|
|
46
|
+
pip install -e .
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Extended package (all functionality)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install tyche-tools
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Or from source:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche.git
|
|
59
|
+
cd tyche
|
|
60
|
+
pip install -e . # install tyche-core first
|
|
61
|
+
pip install -e ./tools # install tyche-tools
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
To include the neural network classifier (requires PyTorch):
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install -e "./tools[nn]"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Which package should I install?
|
|
71
|
+
|
|
72
|
+
- **Just need randomized SMILES for data augmentation?** Install `tyche-core`. It is lightweight, depends only on `selfies`, and has no RDKit requirement.
|
|
73
|
+
- **Need median molecules, chemical subspace enumeration, or molecular optimization?** Install `tyche-tools`. It pulls in `tyche-core` automatically as a dependency.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Overview
|
|
78
|
+
|
|
79
|
+
### tyche-core (core)
|
|
80
|
+
|
|
81
|
+
| Function | Description |
|
|
82
|
+
|---|---|
|
|
83
|
+
| `randomize_smiles_tyche` | Generate multiple randomized SMILES representations of a molecule |
|
|
84
|
+
|
|
85
|
+
### tyche-tools (extended)
|
|
86
|
+
|
|
87
|
+
| Function | Description |
|
|
88
|
+
|---|---|
|
|
89
|
+
| `get_median_mols` | Find median molecules that interpolate between two input structures |
|
|
90
|
+
| `get_local_chemical_subspace` | Enumerate a large set of structurally related molecules around a given structure |
|
|
91
|
+
| `optimize_molecules` | Evolve a population of molecules toward a user-defined fitness objective |
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## `randomize_smiles_tyche`
|
|
96
|
+
|
|
97
|
+
Generates randomized SMILES strings for a given molecule. Randomized SMILES represent the same molecule but with a different atom ordering, which is useful for data augmentation in molecular deep learning.
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from tyche import randomize_smiles_tyche
|
|
101
|
+
|
|
102
|
+
results = randomize_smiles_tyche(smiles, n, unique=True)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Parameters
|
|
106
|
+
|
|
107
|
+
| Parameter | Type | Default | Description |
|
|
108
|
+
|---|---|---|---|
|
|
109
|
+
| `smiles` | `str` | — | Input SMILES string |
|
|
110
|
+
| `n` | `int` | — | Number of randomized SMILES to return |
|
|
111
|
+
| `unique` | `bool` | `True` | If `True`, returns exactly `n` distinct SMILES; if `False`, returns `n` samples (may contain duplicates) |
|
|
112
|
+
|
|
113
|
+
### Returns
|
|
114
|
+
|
|
115
|
+
`list[str]` — List of `n` randomized SMILES strings.
|
|
116
|
+
|
|
117
|
+
### Examples
|
|
118
|
+
|
|
119
|
+
**Basic usage — 5 unique randomized SMILES for aspirin:**
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from tyche import randomize_smiles_tyche
|
|
123
|
+
|
|
124
|
+
smiles = "CC(=O)Oc1ccccc1C(=O)O" # aspirin
|
|
125
|
+
results = randomize_smiles_tyche(smiles, n=5, unique=True)
|
|
126
|
+
|
|
127
|
+
for smi in results:
|
|
128
|
+
print(smi)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
OC(=O)c1ccccc1OC(C)=O
|
|
133
|
+
O=C(O)c1ccccc1OC(=O)C
|
|
134
|
+
CC(=O)Oc1ccccc1C(=O)O
|
|
135
|
+
O=C(C)Oc1ccccc1C(=O)O
|
|
136
|
+
OC(=O)c1ccccc1OC(=O)C
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
**Allow duplicates — exactly `n` sampling calls:**
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
results = randomize_smiles_tyche(smiles, n=10, unique=False)
|
|
143
|
+
print(len(results)) # always 10, may contain repeats
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
**Data augmentation for a molecular dataset:**
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from tyche import randomize_smiles_tyche
|
|
150
|
+
|
|
151
|
+
dataset = ["c1ccccc1", "CCO", "CC(=O)O"]
|
|
152
|
+
augmented = []
|
|
153
|
+
for smi in dataset:
|
|
154
|
+
augmented.extend(randomize_smiles_tyche(smi, n=10))
|
|
155
|
+
|
|
156
|
+
print(f"Original: {len(dataset)} | Augmented: {len(augmented)}")
|
|
157
|
+
# Original: 3 | Augmented: 30
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
**Parallel generation for large sample counts:**
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from tyche import randomize_smiles_tyche
|
|
164
|
+
|
|
165
|
+
smiles = "CC(=O)Oc1ccccc1C(=O)O"
|
|
166
|
+
|
|
167
|
+
if __name__ == "__main__":
|
|
168
|
+
# Generate 1 million samples using all CPU cores
|
|
169
|
+
results = randomize_smiles_tyche(smiles, n=1_000_000, unique=False, parallel=True)
|
|
170
|
+
|
|
171
|
+
# Or specify the number of workers
|
|
172
|
+
results = randomize_smiles_tyche(smiles, n=1_000_000, unique=False, parallel=True, num_workers=8)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Note: `parallel=True` only applies when `unique=False`, since collecting unique SMILES requires coordinated deduplication across workers. The snippet above is meant to be run as a standalone `.py` script, which is why the calls sit under the `if __name__ == "__main__":` guard.
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## `get_median_mols`
|
|
180
|
+
|
|
181
|
+
Finds "median molecules" — structures that are chemically intermediate between two input molecules. Uses SELFIES-space interpolation across an ensemble of randomized SMILES orderings to generate diverse candidate structures, then ranks them by a joint Tanimoto similarity score that rewards proximity to both endpoints.
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from tyche_tools import get_median_mols
|
|
185
|
+
|
|
186
|
+
best_smiles, best_scores = get_median_mols(starting_smile, target_smile)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Parameters
|
|
190
|
+
|
|
191
|
+
| Parameter | Type | Default | Description |
|
|
192
|
+
|---|---|---|---|
|
|
193
|
+
| `starting_smile` | `str` | — | SMILES string for the source molecule |
|
|
194
|
+
| `target_smile` | `str` | — | SMILES string for the target molecule |
|
|
195
|
+
| `num_tries` | `int` | `25` | Number of interpolation path attempts per SMILES ordering pair |
|
|
196
|
+
| `num_random_samples` | `int` | `25` | Number of randomized SMILES orderings generated per molecule |
|
|
197
|
+
| `collect_bidirectional` | `bool` | `True` | If `True`, also collects target → starting paths, doubling coverage |
|
|
198
|
+
| `num_top_iter` | `int` | `100` | Number of top-ranked candidates to return |
|
|
199
|
+
|
|
200
|
+
### Returns
|
|
201
|
+
|
|
202
|
+
- `best_smiles` — `list[str]`: Top-ranked median molecule SMILES, sorted by descending score.
|
|
203
|
+
- `best_scores` — `list[float]`: Corresponding joint similarity scores (higher = more central).
|
|
204
|
+
|
|
205
|
+
### How it works
|
|
206
|
+
|
|
207
|
+
1. Generates an ensemble of randomized SMILES orderings for both molecules.
|
|
208
|
+
2. For each pair of orderings, encodes both into SELFIES and constructs random interpolation paths by swapping tokens one at a time.
|
|
209
|
+
3. Decodes all intermediate SELFIES back to SMILES and canonicalizes them.
|
|
210
|
+
4. Scores each candidate by its average Tanimoto similarity to both endpoints, penalized by the gap between the two scores (favouring structures equidistant from both).
|
|
211
|
+
|
|
212
|
+
### Examples
|
|
213
|
+
|
|
214
|
+
**Find median molecules between two drug-like structures:**
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from tyche_tools import get_median_mols
|
|
218
|
+
|
|
219
|
+
# Dihydroergotamine and prinomastat (from original STONED paper)
|
|
220
|
+
smi_start = r"[H][C@]56C[C@@H](C(=O)N[C@]1(C)O[C@]4(O)N(C1=O)[C@@H](Cc2ccccc2)C(=O)N3CCC[C@]34[H])CN(C)[C@]5([H])Cc7c[nH]c8cccc6c78"
|
|
221
|
+
smi_target = r"CC1([C@@H](N(CCS1)S(=O)(=O)C2=CC=C(C=C2)OC3=CC=NC=C3)C(=O)NO)C"
|
|
222
|
+
|
|
223
|
+
best_smiles, best_scores = get_median_mols(
|
|
224
|
+
smi_start,
|
|
225
|
+
smi_target,
|
|
226
|
+
num_tries=25,
|
|
227
|
+
num_random_samples=25,
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
print(f"Found {len(best_smiles)} median molecule candidates")
|
|
231
|
+
print(f"\nTop 5 results:")
|
|
232
|
+
for smi, score in zip(best_smiles[:5], best_scores[:5]):
|
|
233
|
+
print(f" {score:.4f} {smi}")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**Quick exploratory run with reduced compute:**
|
|
237
|
+
|
|
238
|
+
```python
|
|
239
|
+
best_smiles, best_scores = get_median_mols(
|
|
240
|
+
smi_start,
|
|
241
|
+
smi_target,
|
|
242
|
+
num_tries=5,
|
|
243
|
+
num_random_samples=5,
|
|
244
|
+
collect_bidirectional=False,
|
|
245
|
+
num_top_iter=20,
|
|
246
|
+
)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
**Retrieve only the top candidate:**
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
best_smiles, best_scores = get_median_mols(smi_start, smi_target, num_top_iter=1)
|
|
253
|
+
median_molecule = best_smiles[0]
|
|
254
|
+
print(f"Best median: {median_molecule} (score: {best_scores[0]:.4f})")
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## `get_local_chemical_subspace`
|
|
260
|
+
|
|
261
|
+
Constructs a large, diverse set of molecules in the local chemical neighbourhood of a given structure. This is useful for exhaustive analogue enumeration, property landscape mapping, and building training sets for molecular machine learning models.
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from tyche_tools import get_local_chemical_subspace
|
|
265
|
+
|
|
266
|
+
smiles_list, scores = get_local_chemical_subspace(smiles)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### Parameters
|
|
270
|
+
|
|
271
|
+
| Parameter | Type | Default | Description |
|
|
272
|
+
|---|---|---|---|
|
|
273
|
+
| `smiles` | `str` | — | Input SMILES string for the center molecule |
|
|
274
|
+
| `num_random_samples` | `int` | `1_000_000` | Number of unique randomized SMILES orderings to generate before mutation |
|
|
275
|
+
| `num_mutation_ls` | `list of int` | `[1, 2, 3, 4, 5]` | Mutation depths to explore; results from all depths are pooled |
|
|
276
|
+
| `fp_type` | `str` | `"ECFP4"` | Fingerprint type for similarity scoring |
|
|
277
|
+
| `top_n` | `int or None` | `None` | If set, return only the top `n` highest-scoring molecules |
|
|
278
|
+
| `min_score` | `float or None` | `None` | If set, discard molecules with Tanimoto similarity below this threshold |
|
|
279
|
+
| `output_file` | `str or None` | `None` | If set, write the final sorted and filtered SMILES to this file after computation finishes |
|
|
280
|
+
|
|
281
|
+
### Returns
|
|
282
|
+
|
|
283
|
+
- `smiles_list` — `list[str]`: Unique canonical SMILES, sorted by descending Tanimoto similarity to the input. Filtered by `min_score` and truncated to `top_n` if specified.
|
|
284
|
+
- `scores` — `list[float]`: Tanimoto similarity of each molecule to the input, in the same order as `smiles_list`.
|
|
285
|
+
|
|
286
|
+
### How it works
|
|
287
|
+
|
|
288
|
+
1. **Randomization** — Generates `num_random_samples` unique randomized SMILES orderings of the input molecule. Each ordering encodes the same structure but with a different atom traversal order, producing a distinct SELFIES string and a distinct starting point for mutation. A larger value explores a wider variety of encodings before any mutations are applied.
|
|
289
|
+
|
|
290
|
+
2. **SELFIES encoding** — Each randomized SMILES is converted to a SELFIES string, a robust molecular representation guaranteed to decode to a valid molecule.
|
|
291
|
+
|
|
292
|
+
3. **Mutation** — For each depth `d` in `num_mutation_ls`, every SELFIES string undergoes `d` sequential random mutations (insert, replace, or delete a single SELFIES token). Depth 1 produces close structural neighbours — molecules differing by approximately one atom or bond from the input. Depth 5 allows larger structural departures while remaining within the same general chemical neighbourhood. All depths are explored and their outputs pooled together.
|
|
293
|
+
|
|
294
|
+
4. **Filtering and scoring** — Mutated SELFIES are decoded to SMILES, canonicalized, and deduplicated. Each unique structure is scored by Tanimoto similarity to the original molecule.
|
|
295
|
+
|
|
296
|
+
### Supported fingerprint types
|
|
297
|
+
|
|
298
|
+
| `fp_type` | Description |
|
|
299
|
+
|---|---|
|
|
300
|
+
| `ECFP4` | Extended connectivity, radius 2 (default) |
|
|
301
|
+
| `ECFP6` | Extended connectivity, radius 3 |
|
|
302
|
+
| `FCFP4` | Feature-based Morgan, radius 2 |
|
|
303
|
+
| `FCFP6` | Feature-based Morgan, radius 3 |
|
|
304
|
+
| `AP` | Atom pair fingerprint |
|
|
305
|
+
| `PATH` | RDKit path-based fingerprint |
|
|
306
|
+
| `PHCO` | 2D pharmacophore (Gobbi) |
|
|
307
|
+
| `BPF` | Burden-CAS-University of Texas fingerprint |
|
|
308
|
+
| `BTF` | BT fingerprint |
|
|
309
|
+
|
|
310
|
+
### Examples
|
|
311
|
+
|
|
312
|
+
**Quick exploration with reduced compute:**
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
from tyche_tools import get_local_chemical_subspace
|
|
316
|
+
|
|
317
|
+
smi = "CC(C)(C)NCC(c1ccc(O)c(CO)c1)O" # albuterol
|
|
318
|
+
|
|
319
|
+
smiles_list, scores = get_local_chemical_subspace(
|
|
320
|
+
smi,
|
|
321
|
+
num_random_samples=50000,
|
|
322
|
+
num_mutation_ls=[1, 2, 3],
|
|
323
|
+
fp_type="ECFP4",
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
# Results are sorted best to worst automatically
|
|
327
|
+
print(f"Generated {len(smiles_list)} unique molecules")
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
**Exhaustive enumeration (default settings):**
|
|
331
|
+
|
|
332
|
+
```python
|
|
333
|
+
smiles_list, scores = get_local_chemical_subspace(smi)
|
|
334
|
+
# num_random_samples=1_000_000, num_mutation_ls=[1,2,3,4,5]
|
|
335
|
+
|
|
336
|
+
print(f"Generated {len(smiles_list)} unique molecules")
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
**Return only the top 100 closest analogues:**
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
smiles_list, scores = get_local_chemical_subspace(
|
|
343
|
+
smi,
|
|
344
|
+
num_random_samples=50000,
|
|
345
|
+
top_n=100,
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
print(f"Top 100 scores: {scores[0]:.4f} to {scores[-1]:.4f}")
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
**Filter by minimum similarity threshold:**
|
|
352
|
+
|
|
353
|
+
```python
|
|
354
|
+
smiles_list, scores = get_local_chemical_subspace(
|
|
355
|
+
smi,
|
|
356
|
+
num_random_samples=100000,
|
|
357
|
+
min_score=0.4,
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
print(f"Molecules with Tanimoto >= 0.4: {len(smiles_list)}")
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
**Combine top-n and min-score (min_score is applied first, then top_n):**
|
|
364
|
+
|
|
365
|
+
```python
|
|
366
|
+
smiles_list, scores = get_local_chemical_subspace(
|
|
367
|
+
smi,
|
|
368
|
+
num_random_samples=100000,
|
|
369
|
+
min_score=0.3,
|
|
370
|
+
top_n=50,
|
|
371
|
+
)
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**Write the sorted, filtered results to a file:**
|
|
375
|
+
|
|
376
|
+
```python
|
|
377
|
+
smiles_list, scores = get_local_chemical_subspace(
|
|
378
|
+
smi,
|
|
379
|
+
num_random_samples=50000,
|
|
380
|
+
min_score=0.4,
|
|
381
|
+
output_file="albuterol_analogues.smi",
|
|
382
|
+
)
|
|
383
|
+
# albuterol_analogues.smi contains one canonical SMILES per line, best first
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
**Use with a different fingerprint for scoring:**
|
|
387
|
+
|
|
388
|
+
```python
|
|
389
|
+
smiles_list, scores = get_local_chemical_subspace(
|
|
390
|
+
smi,
|
|
391
|
+
num_random_samples=50000,
|
|
392
|
+
fp_type="FCFP4",
|
|
393
|
+
)
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
---
|
|
397
|
+
|
|
398
|
+
## `optimize_molecules`
|
|
399
|
+
|
|
400
|
+
Evolves a population of molecules toward a user-defined property objective using a genetic algorithm over SELFIES string representations. The algorithm alternates between two phases each generation:
|
|
401
|
+
|
|
402
|
+
- **Exploration** — mutates and crosses over the current population to discover structurally diverse new candidates. Crossover uses `get_median_mols` to generate chemically intermediate structures between parent molecules. Optionally, a neural network classifier (trained on all previously evaluated molecules) biases selection toward high-predicted-fitness candidates.
|
|
403
|
+
- **Exploitation** — performs an intensive local search around the current best molecule(s) using `get_local_chemical_subspace`, then injects the top results back into the main population.
|
|
404
|
+
|
|
405
|
+
```python
|
|
406
|
+
from tyche_tools import optimize_molecules
|
|
407
|
+
|
|
408
|
+
results = optimize_molecules(fitness_function, start_population)
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
### Required parameters
|
|
412
|
+
|
|
413
|
+
| Parameter | Type | Description |
|
|
414
|
+
|---|---|---|
|
|
415
|
+
| `fitness_function` | `callable` | Maps a SMILES string to a float. Higher is better. |
|
|
416
|
+
| `start_population` | `list[str]` or `str` | List of SMILES, or path to a file with one SMILES per line. Must contain at least `generation_size` valid molecules. |
|
|
417
|
+
|
|
418
|
+
### Optional parameters
|
|
419
|
+
|
|
420
|
+
| Parameter | Default | Description |
|
|
421
|
+
|---|---|---|
|
|
422
|
+
| `work_dir` | `"tyche_output"` | Directory for output files. Created automatically. |
|
|
423
|
+
| `verbose_out` | `False` | Save per-generation sub-directories with population and fitness files. |
|
|
424
|
+
| `custom_filter` | `None` | Optional callable (SMILES → bool). Molecules returning False are discarded. |
|
|
425
|
+
| `alphabet` | `None` | Custom SELFIES tokens for the mutation alphabet. Combined with fragment tokens when `use_fragments=True`. |
|
|
426
|
+
| `use_gpu` | `True` | Use CUDA for neural network training if available. |
|
|
427
|
+
| `num_workers` | CPU count | Parallel worker processes for fragment generation and mutations. |
|
|
428
|
+
| `generations` | `200` | Number of evolutionary iterations. |
|
|
429
|
+
| `generation_size` | `5000` | Molecules maintained in the exploration population. |
|
|
430
|
+
| `num_exchanges` | `5` | Top local-search molecules injected into exploration each generation. |
|
|
431
|
+
| `use_fragments` | `True` | Extend mutation alphabet with SELFIES fragments from the starting population (radius-3 atom environments). |
|
|
432
|
+
| `num_sample_frags` | `200` | Fragment tokens sampled from the extended alphabet per mutation step. |
|
|
433
|
+
| `use_classifier` | `True` | Use a neural network classifier to bias exploration selection. Requires PyTorch; falls back to random sampling if unavailable. |
|
|
434
|
+
| `explr_num_random_samples` | `5` | Randomized SMILES orderings per molecule during exploration mutation. |
|
|
435
|
+
| `explr_num_mutations` | `5` | Sequential mutations per ordering during exploration. |
|
|
436
|
+
| `crossover_num_random_samples` | `1` | SMILES orderings used by `get_median_mols` per crossover pair. |
|
|
437
|
+
| `exploit_num_random_samples` | `400` | Randomized SMILES orderings used by `get_local_chemical_subspace` during exploitation. |
|
|
438
|
+
| `exploit_num_mutations` | `400` | Mutation depth during exploitation. 400 × 400 = 160,000 candidates around the best molecule per generation. |
|
|
439
|
+
| `top_mols` | `1` | Number of top molecules subjected to local search each generation. |
|
|
440
|
+
|
|
441
|
+
### Returns
|
|
442
|
+
|
|
443
|
+
A dict with three keys:
|
|
444
|
+
|
|
445
|
+
- `best_per_generation` — `list[(str, float)]`: the best (SMILES, fitness) at the end of each generation.
|
|
446
|
+
- `final_population` — `list[(str, float)]`: the exploration population from the last generation, sorted by descending fitness.
|
|
447
|
+
- `smiles_collector` — `dict`: maps every evaluated SMILES to `[fitness, eval_count]`.
|
|
448
|
+
|
|
449
|
+
### Output files
|
|
450
|
+
|
|
451
|
+
All files are written to `work_dir`:
|
|
452
|
+
|
|
453
|
+
| File | Contents |
|
|
454
|
+
|---|---|
|
|
455
|
+
| `hparams.yml` | All hyperparameter values (requires PyYAML) |
|
|
456
|
+
| `init_mols.txt` | Initial population after fitness sorting |
|
|
457
|
+
| `generation_all_best.txt` | Best molecule and fitness appended each generation |
|
|
458
|
+
| `fitness_explore.txt` | Exploration fitness values (overwritten each generation) |
|
|
459
|
+
| `population_explore.txt` | Exploration SMILES (overwritten each generation) |
|
|
460
|
+
| `fitness_local_search.txt` | Exploitation fitness values (overwritten each generation) |
|
|
461
|
+
| `population_local_search.txt` | Exploitation SMILES (overwritten each generation) |
|
|
462
|
+
|
|
463
|
+
When `verbose_out=True`, per-generation sub-directories (`0_DATA/`, `1_DATA/`, …) are created, preserving every generation's population and fitness files.
|
|
464
|
+
|
|
465
|
+
### Examples
|
|
466
|
+
|
|
467
|
+
**Minimize synthetic accessibility (SA) score:**
|
|
468
|
+
|
|
469
|
+
```python
|
|
470
|
+
from rdkit.Chem import RDConfig
|
|
471
|
+
import os, sys
|
|
472
|
+
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
|
|
473
|
+
import sascorer
|
|
474
|
+
|
|
475
|
+
from tyche_tools import optimize_molecules
|
|
476
|
+
|
|
477
|
+
def fitness(smi):
|
|
478
|
+
from rdkit import Chem
|
|
479
|
+
mol = Chem.MolFromSmiles(smi)
|
|
480
|
+
if mol is None:
|
|
481
|
+
return 0.0
|
|
482
|
+
sa = sascorer.calculateScore(mol)
|
|
483
|
+
return -sa # minimize SA score → maximize negative SA
|
|
484
|
+
|
|
485
|
+
results = optimize_molecules(
|
|
486
|
+
fitness_function=fitness,
|
|
487
|
+
start_population="starting_molecules.smi", # one SMILES per line
|
|
488
|
+
work_dir="sa_optimization",
|
|
489
|
+
generations=50,
|
|
490
|
+
generation_size=100,
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
best_smiles, best_score = results['best_per_generation'][-1]
|
|
494
|
+
print(f"Best molecule: {best_smiles} (SA score: {-best_score:.2f})")
|
|
495
|
+
```
|
|
496
|
+
|
|
497
|
+
**Maximize logP with a molecular weight filter:**
|
|
498
|
+
|
|
499
|
+
```python
|
|
500
|
+
from rdkit.Chem import Descriptors
|
|
501
|
+
from tyche_tools import optimize_molecules
|
|
502
|
+
|
|
503
|
+
def logp_fitness(smi):
|
|
504
|
+
from rdkit import Chem
|
|
505
|
+
mol = Chem.MolFromSmiles(smi)
|
|
506
|
+
return Descriptors.MolLogP(mol) if mol else 0.0
|
|
507
|
+
|
|
508
|
+
def mw_filter(smi):
|
|
509
|
+
from rdkit import Chem
|
|
510
|
+
mol = Chem.MolFromSmiles(smi)
|
|
511
|
+
return mol is not None and Descriptors.MolWt(mol) <= 500
|
|
512
|
+
|
|
513
|
+
results = optimize_molecules(
|
|
514
|
+
fitness_function=logp_fitness,
|
|
515
|
+
start_population=my_smiles_list, # list of SMILES strings
|
|
516
|
+
work_dir="logp_run",
|
|
517
|
+
custom_filter=mw_filter,
|
|
518
|
+
generations=100,
|
|
519
|
+
generation_size=500,
|
|
520
|
+
use_classifier=True, # NN-guided selection (requires PyTorch)
|
|
521
|
+
)
|
|
522
|
+
|
|
523
|
+
# Print best molecule per generation
|
|
524
|
+
for i, (smi, score) in enumerate(results['best_per_generation']):
|
|
525
|
+
print(f"Gen {i + 1}: {score:.3f} {smi}")
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
**Quick test run (small population, few generations):**
|
|
529
|
+
|
|
530
|
+
```python
|
|
531
|
+
results = optimize_molecules(
|
|
532
|
+
fitness_function=logp_fitness,
|
|
533
|
+
start_population=my_smiles_list,
|
|
534
|
+
generations=5,
|
|
535
|
+
generation_size=50,
|
|
536
|
+
use_classifier=False, # skip NN (no PyTorch needed)
|
|
537
|
+
exploit_num_random_samples=50,
|
|
538
|
+
exploit_num_mutations=50,
|
|
539
|
+
explr_num_random_samples=3,
|
|
540
|
+
explr_num_mutations=3,
|
|
541
|
+
)
|
|
542
|
+
```
|
|
543
|
+
|
|
544
|
+
**Inspect all evaluated molecules:**
|
|
545
|
+
|
|
546
|
+
```python
|
|
547
|
+
collector = results['smiles_collector']
|
|
548
|
+
# Sort all evaluated molecules by fitness
|
|
549
|
+
ranked = sorted(collector.items(), key=lambda x: x[1][0], reverse=True)
|
|
550
|
+
for smi, (fitness, count) in ranked[:10]:
|
|
551
|
+
print(f"{fitness:.4f} (evaluated {count}x) {smi}")
|
|
552
|
+
```
|
|
553
|
+
|
|
554
|
+
---
|
|
555
|
+
|
|
556
|
+
## Background
|
|
557
|
+
|
|
558
|
+
TYCHE is a package for randomizing SMILES and SELFIES strings. Randomization is applied to the spanning tree, the starting node, branch priorities, kekulization, and stereochemical labels. The underlying algorithm operates both on the molecular graph and at the string level. SMILES randomization is a core building block for molecular data augmentation and generative model training, while the optimization framework enables guided exploration of chemical space toward any user-defined property objective. For chemical space exploration, TYCHE builds on the STONED algorithm ([Nigam et al., 2021](https://doi.org/10.1039/D1SC00231G)) and the genetic algorithm JANUS ([Nigam et al., 2022](https://doi.org/10.1039/D2DD00003B)), which demonstrated that mutating and interpolating through SELFIES space produces chemically valid, diverse molecular structures.
|
|
559
|
+
|
|
560
|
+
---
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
## Support
|
|
564
|
+
If you encounter problems, please open an [issue](https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche/-/issues), describe your Python environment, and provide detailed instructions that allow us to reproduce the problem.
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
## Version History
|
|
568
|
+
The version history is detailed in the [CHANGELOG](https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche/-/blob/master/CHANGELOG.md).
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
## Credits
|
|
572
|
+
No additional credits at this point in time.
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
## License
|
|
576
|
+
[MIT License](https://choosealicense.com/licenses/mit/)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tyche-core"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Lightweight SMILES randomization using SELFIES-based molecular graph traversal."
|
|
9
|
+
requires-python = ">=3.8"
|
|
10
|
+
license-files = ["LICENSE"]
|
|
11
|
+
dependencies = [
|
|
12
|
+
"selfies>=2.0.0,<=2.1.2",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.urls]
|
|
16
|
+
Repository = "https://git.lwp.rug.nl/pollice-research-group/artificial-design/tyche"
|
|
17
|
+
|
|
18
|
+
[tool.setuptools.packages.find]
|
|
19
|
+
where = ["."]
|
|
20
|
+
include = ["tyche*"]
|
|
21
|
+
exclude = ["tools*"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from rdkit.Chem import RDConfig
|
|
2
|
+
import os, sys
|
|
3
|
+
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
|
|
4
|
+
import sascorer
|
|
5
|
+
|
|
6
|
+
from tyche_tools import optimize_molecules
|
|
7
|
+
|
|
8
|
+
def fitness(smi):
    """Return the negated synthetic accessibility (SA) score of a molecule.

    The optimizer maximizes fitness, so the SA score (lower means easier to
    synthesize) is negated to turn its minimization into a maximization.

    Args:
        smi: SMILES string of the candidate molecule.

    Returns:
        ``-SA`` as a float for a parsable SMILES, or ``-10.0`` when RDKit
        cannot parse ``smi``.
    """
    from rdkit import Chem

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # SA scores lie between 1 (easy) and 10 (hard), so every valid
        # molecule scores in [-10, -1]. Returning 0.0 here would rank an
        # unparsable SMILES above every real molecule; return the worst
        # possible score instead.
        return -10.0
    return -sascorer.calculateScore(mol)  # minimize SA score → maximize negative SA
|
|
15
|
+
|
|
16
|
+
if __name__ == "__main__":
    # Smoke run: seed the optimizer with five copies of albuterol, passed as
    # an in-memory list, and keep every budget far below the library defaults.
    seed_smiles = "CC(C)(C)NCC(c1ccc(O)c(CO)c1)O"
    run_settings = {
        "fitness_function": fitness,
        "start_population": [seed_smiles] * 5,
        "work_dir": "sa_optimization",
        "generations": 5,
        "generation_size": 5,
        "use_classifier": False,
        "exploit_num_random_samples": 50,
        "exploit_num_mutations": 50,
        "explr_num_random_samples": 3,
        "explr_num_mutations": 3,
    }
    results = optimize_molecules(**run_settings)

    # The last entry holds the best (SMILES, fitness) pair of the final
    # generation; fitness is the negated SA score, so flip the sign back.
    best_smiles, best_score = results['best_per_generation'][-1]
    print(f"Best molecule: {best_smiles} (SA score: {-best_score:.2f})")
|