chemtsv3 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. chemtsv3-0.1.0/PKG-INFO +296 -0
  2. chemtsv3-0.1.0/README.md +278 -0
  3. chemtsv3-0.1.0/chemtsv3/__init__.py +0 -0
  4. chemtsv3-0.1.0/chemtsv3/cli/__init__.py +0 -0
  5. chemtsv3-0.1.0/chemtsv3/cli/generation.py +66 -0
  6. chemtsv3-0.1.0/chemtsv3/cli/model_training.py +47 -0
  7. chemtsv3-0.1.0/chemtsv3/data/filtering_substruct_oota_cho.csv +125 -0
  8. chemtsv3-0.1.0/chemtsv3/data/gbgm/p1.p +0 -0
  9. chemtsv3-0.1.0/chemtsv3/data/gbgm/p_ring.p +0 -0
  10. chemtsv3-0.1.0/chemtsv3/data/gbgm/r_s1.p +0 -0
  11. chemtsv3-0.1.0/chemtsv3/data/gbgm/rs_make_ring.p +0 -0
  12. chemtsv3-0.1.0/chemtsv3/data/gbgm/rs_ring.p +0 -0
  13. chemtsv3-0.1.0/chemtsv3/data/j_score/SA_scores.txt +10000 -0
  14. chemtsv3-0.1.0/chemtsv3/data/j_score/cycle_scores.txt +10000 -0
  15. chemtsv3-0.1.0/chemtsv3/data/j_score/logP_values.txt +10000 -0
  16. chemtsv3-0.1.0/chemtsv3/data/pubchem_filter/atoms_dict.txt +1132 -0
  17. chemtsv3-0.1.0/chemtsv3/data/pubchem_filter/bonds_dict.txt +560 -0
  18. chemtsv3-0.1.0/chemtsv3/data/pubchem_filter/metadata.py +260 -0
  19. chemtsv3-0.1.0/chemtsv3/filter/__init__.py +34 -0
  20. chemtsv3-0.1.0/chemtsv3/filter/aromatic_ring_filter.py +13 -0
  21. chemtsv3-0.1.0/chemtsv3/filter/atom_count_filter.py +10 -0
  22. chemtsv3-0.1.0/chemtsv3/filter/attachment_points_filter.py +11 -0
  23. chemtsv3-0.1.0/chemtsv3/filter/base.py +100 -0
  24. chemtsv3-0.1.0/chemtsv3/filter/catalog_filter.py +41 -0
  25. chemtsv3-0.1.0/chemtsv3/filter/charge_filter.py +14 -0
  26. chemtsv3-0.1.0/chemtsv3/filter/connectivity_filter.py +13 -0
  27. chemtsv3-0.1.0/chemtsv3/filter/hba_filter.py +13 -0
  28. chemtsv3-0.1.0/chemtsv3/filter/hbd_filter.py +13 -0
  29. chemtsv3-0.1.0/chemtsv3/filter/heavy_atom_count_filter.py +13 -0
  30. chemtsv3-0.1.0/chemtsv3/filter/lipinski_filter.py +41 -0
  31. chemtsv3-0.1.0/chemtsv3/filter/log_p_filter.py +13 -0
  32. chemtsv3-0.1.0/chemtsv3/filter/pains_filter.py +22 -0
  33. chemtsv3-0.1.0/chemtsv3/filter/pubchem_filter.py +277 -0
  34. chemtsv3-0.1.0/chemtsv3/filter/radical_filter.py +13 -0
  35. chemtsv3-0.1.0/chemtsv3/filter/ring_bond_filter.py +18 -0
  36. chemtsv3-0.1.0/chemtsv3/filter/ring_size_filter.py +22 -0
  37. chemtsv3-0.1.0/chemtsv3/filter/roc_filter.py +56 -0
  38. chemtsv3-0.1.0/chemtsv3/filter/rotatable_bonds_filter.py +13 -0
  39. chemtsv3-0.1.0/chemtsv3/filter/sa_score_filter.py +14 -0
  40. chemtsv3-0.1.0/chemtsv3/filter/substructure_filter.py +54 -0
  41. chemtsv3-0.1.0/chemtsv3/filter/tpsa_filter.py +13 -0
  42. chemtsv3-0.1.0/chemtsv3/filter/validity_filter.py +13 -0
  43. chemtsv3-0.1.0/chemtsv3/filter/weight_filter.py +13 -0
  44. chemtsv3-0.1.0/chemtsv3/generator/__init__.py +4 -0
  45. chemtsv3-0.1.0/chemtsv3/generator/base.py +531 -0
  46. chemtsv3-0.1.0/chemtsv3/generator/heapq_generator.py +35 -0
  47. chemtsv3-0.1.0/chemtsv3/generator/mcts.py +218 -0
  48. chemtsv3-0.1.0/chemtsv3/generator/random_generator.py +14 -0
  49. chemtsv3-0.1.0/chemtsv3/language/__init__.py +14 -0
  50. chemtsv3-0.1.0/chemtsv3/language/base.py +198 -0
  51. chemtsv3-0.1.0/chemtsv3/language/fasta.py +59 -0
  52. chemtsv3-0.1.0/chemtsv3/language/helm.py +110 -0
  53. chemtsv3-0.1.0/chemtsv3/language/selfies.py +21 -0
  54. chemtsv3-0.1.0/chemtsv3/language/smiles.py +24 -0
  55. chemtsv3-0.1.0/chemtsv3/language/tokenizer.py +46 -0
  56. chemtsv3-0.1.0/chemtsv3/node/__init__.py +10 -0
  57. chemtsv3-0.1.0/chemtsv3/node/base.py +189 -0
  58. chemtsv3-0.1.0/chemtsv3/node/selfies_string_node.py +19 -0
  59. chemtsv3-0.1.0/chemtsv3/node/sentence_node.py +75 -0
  60. chemtsv3-0.1.0/chemtsv3/node/string_node.py +106 -0
  61. chemtsv3-0.1.0/chemtsv3/policy/__init__.py +9 -0
  62. chemtsv3-0.1.0/chemtsv3/policy/base.py +109 -0
  63. chemtsv3-0.1.0/chemtsv3/policy/puct.py +19 -0
  64. chemtsv3-0.1.0/chemtsv3/policy/puct_with_predictor.py +233 -0
  65. chemtsv3-0.1.0/chemtsv3/policy/uct.py +67 -0
  66. chemtsv3-0.1.0/chemtsv3/reward/__init__.py +10 -0
  67. chemtsv3-0.1.0/chemtsv3/reward/base.py +104 -0
  68. chemtsv3-0.1.0/chemtsv3/reward/j_score_reward.py +48 -0
  69. chemtsv3-0.1.0/chemtsv3/reward/log_p_reward.py +15 -0
  70. chemtsv3-0.1.0/chemtsv3/reward/similarity_reward.py +16 -0
  71. chemtsv3-0.1.0/chemtsv3/transition/__init__.py +25 -0
  72. chemtsv3-0.1.0/chemtsv3/transition/base.py +252 -0
  73. chemtsv3-0.1.0/chemtsv3/transition/biot5.py +18 -0
  74. chemtsv3-0.1.0/chemtsv3/transition/chat_gpt.py +35 -0
  75. chemtsv3-0.1.0/chemtsv3/transition/chat_gpt_with_memory.py +60 -0
  76. chemtsv3-0.1.0/chemtsv3/transition/gbga.py +221 -0
  77. chemtsv3-0.1.0/chemtsv3/transition/gbgm.py +171 -0
  78. chemtsv3-0.1.0/chemtsv3/transition/gpt2.py +229 -0
  79. chemtsv3-0.1.0/chemtsv3/transition/prot_gpt2.py +42 -0
  80. chemtsv3-0.1.0/chemtsv3/transition/rnn.py +303 -0
  81. chemtsv3-0.1.0/chemtsv3/transition/rnn_based_mutation.py +41 -0
  82. chemtsv3-0.1.0/chemtsv3/transition/smirks.py +225 -0
  83. chemtsv3-0.1.0/chemtsv3/utils/__init__.py +19 -0
  84. chemtsv3-0.1.0/chemtsv3/utils/file_utils.py +249 -0
  85. chemtsv3-0.1.0/chemtsv3/utils/helm_utils.py +416 -0
  86. chemtsv3-0.1.0/chemtsv3/utils/logging_utils.py +69 -0
  87. chemtsv3-0.1.0/chemtsv3/utils/math_utils.py +194 -0
  88. chemtsv3-0.1.0/chemtsv3/utils/mol_utils.py +119 -0
  89. chemtsv3-0.1.0/chemtsv3/utils/plot_utils.py +86 -0
  90. chemtsv3-0.1.0/chemtsv3/utils/third_party/fpscores.pkl.gz +0 -0
  91. chemtsv3-0.1.0/chemtsv3/utils/third_party/sascorer.py +192 -0
  92. chemtsv3-0.1.0/chemtsv3/utils/yaml_utils.py +235 -0
  93. chemtsv3-0.1.0/pyproject.toml +24 -0
@@ -0,0 +1,296 @@
1
+ Metadata-Version: 2.4
2
+ Name: chemtsv3
3
+ Version: 0.1.0
4
+ Summary: ChemTSv3:
5
+ License: MIT
6
+ Author: Satoru Fujii
7
+ Author-email: fujii.sat.rk@yokohama-cu.ac.jp
8
+ Requires-Python: >=3.11
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Project-URL: Repository, https://github.com/molecule-generator-collection/ChemTSv3
16
+ Description-Content-Type: text/markdown
17
+
18
+ ## ChemTSv3
19
+ A unified tree search framework for molecular generation.
20
+ - **Node is modular**: Supports any molecular representation (e.g., SMILES, SELFIES, FASTA, or HELM) in either string or tensor format.
21
+ - **Transition is modular**: Allows any molecular transformation strategy, including graph-based editing, sequence generation with RNN or GPT-2, sequence mutation, or LLM-guided modification.
22
+ - **Filter is modular**: Enables flexible constraints such as structural alerts, scaffold preservation, or physicochemical property filters.
23
+ - **Reward is modular**: Anything can be optimized, including QSAR predictions or simulation results, for both single- and multi-objective tasks.
24
+
25
+ ## Setup
26
+
27
+ <details>
28
+ <summary><b>Minimal installation (Mac, Linux)</b></summary><br>
29
+
30
+ ### Available classes
31
+ - **Transition**: `GBGATransition`, `GPT2Transition`, `RNNBasedMutation`, `RNNTransition`, `SMIRKSTransition`
32
+ - **Reward**: `GFPReward`, `SimilarityReward`, `JScoreReward`, `LogPReward`
33
+ - **Policy**: `UCT`, `PUCT`
34
+ - The corresponding Node classes and all implemented Filter classes are also available in this environment.
35
+
36
+ ### Setup steps
37
+
38
+ 1. Clone the repository
39
+ 2. Install uv: https://docs.astral.sh/uv/getting-started/installation/
40
+ 3. Restart the shell
41
+ 4. Move to the repository root (e.g., cd molgen)
42
+ 5. Run the following commands:
43
+ ```bash
44
+ uv venv --python 3.11.11
45
+ source .venv/bin/activate
46
+ uv pip install numpy==1.26.4 pandas==2.3.3 matplotlib==3.10.7 rdkit==2023.09.6 ipykernel==6.30.0 transformers==4.43.4 torch==2.5.1 --torch-backend=auto
47
+ uv pip install -e .
48
+ ```
49
+
50
+ To activate the virtual environment, run the following command from the repository root (this process can also be automated through VS Code settings):
51
+ ```bash
52
+ source .venv/bin/activate
53
+ ```
54
+ To deactivate the virtual environment, run:
55
+ ```bash
56
+ deactivate
57
+ ```
58
+ </details>
59
+
60
+ <details>
61
+ <summary><b>Minimal installation (Windows)</b></summary><br>
62
+
63
+ ### Available classes
64
+ - **Transition**: `GBGATransition`, `GPT2Transition`, `RNNBasedMutation`, `RNNTransition`, `SMIRKSTransition`
65
+ - **Reward**: `GFPReward`, `SimilarityReward`, `JScoreReward`, `LogPReward`
66
+ - **Policy**: `UCT`, `PUCT`
67
+ - The corresponding Node classes and all implemented Filter classes are also available in this environment.
68
+
69
+ ### Setup steps
70
+
71
+ 1. Clone the repository
72
+ 2. Install uv: https://docs.astral.sh/uv/getting-started/installation/
73
+ 3. Restart the shell (and VSCode if used)
74
+ 4. Move to the repository root (e.g., cd molgen)
75
+ 5. Run the following commands:
76
+ ```bash
77
+ uv venv --python 3.11.11
78
+ .venv\Scripts\activate
79
+ uv pip install numpy==1.26.4 pandas==2.3.3 matplotlib==3.10.7 rdkit==2023.09.6 ipykernel==6.30.0 transformers==4.43.4 torch==2.5.1 --torch-backend=auto
80
+ uv pip install -e .
81
+ ```
82
+
83
+ To activate the virtual environment, run the following command from the repository root (this process can also be automated through VS Code settings):
84
+ ```bash
85
+ .venv\Scripts\activate
86
+ ```
87
+ To deactivate the virtual environment, run:
88
+ ```bash
89
+ deactivate
90
+ ```
91
+ </details>
92
+
93
+ <details>
94
+ <summary><b>Full installation (Mac, Linux)</b></summary><br>
95
+
96
+ ### Available classes
97
+ - **Transition**: `BioT5Transition`, `ChatGPTTransition`, `ChatGPTTransitionWithMemory`, `GBGATransition`, `GPT2Transition`, `RNNBasedMutation`, `RNNTransition`, `SMIRKSTransition`
98
+ - **Reward**: `DScoreReward`, `DyRAMOReward`, `GFPReward`, `SimilarityReward`, `JScoreReward`, `LogPReward`, `TDCReward`
99
+ - The corresponding Node classes, along with all implemented Filter and Policy classes, are also available in this environment.
100
+ - `ChatGPTTransition` and `ChatGPTTransitionWithMemory` require an OpenAI API key.
101
+
102
+ ### Setup steps
103
+ 1. Clone the repository
104
+ 2. Install uv: https://docs.astral.sh/uv/getting-started/installation/
105
+ 3. Restart the shell
106
+ 4. Move to the repository root (e.g., cd molgen)
107
+ 5. Run the following commands:
108
+ ```bash
109
+ uv venv --python 3.11.11
110
+ source .venv/bin/activate
111
+ uv pip install pytdc==1.1.14 numpy==1.26.4 pandas==2.3.3 matplotlib==3.10.7 rdkit==2023.09.6 selfies==2.2.0 ipykernel==6.30.0 transformers==4.43.4 setuptools==78.1.1 lightgbm==4.6.0 openai==2.6.0 torch==2.5.1 --torch-backend=auto
112
+ uv pip install -e .
113
+ ```
114
+ To activate the virtual environment, run the following command from the repository root (this process can also be automated through VS Code settings):
115
+ ```bash
116
+ source .venv/bin/activate
117
+ ```
118
+ To deactivate the virtual environment, run:
119
+ ```bash
120
+ deactivate
121
+ ```
122
+ </details>
123
+
124
+ <details>
125
+ <summary><b>Optional dependencies</b></summary><br>
126
+
127
+ The full installation includes the following optional packages:
128
+
129
+ |Package|Required for|Tested version|
130
+ |---|---|---|
131
+ |`lightgbm`|`DScoreReward`, `DyRAMOReward`, `PUCTWithPredictor`|3.3.5, 4.6.0|
132
+ |`selfies`|`SELFIESStringNode`|2.2.0|
133
+ |`openai`|`ChatGPTTransition`, `ChatGPTTransitionWithMemory`|2.6.0|
134
+ |`pytdc`|`TDCReward`|1.1.14|
135
+
136
+ </details>
137
+
138
+ <details>
139
+ <summary><b>Troubleshooting</b></summary><br>
140
+
141
+ ### CUDA not available
142
+ In some cases (for example, when setting up environments on a control node), it may be necessary to reinstall torch with a different backend to enable CUDA support. However, since major implemented classes (including `RNNTransition`) are likely to run faster on the CPU, this is not strictly required. After reinstalling torch, you may also need to downgrade numpy to version 1.26.4 if it was upgraded during the process.
143
+ </details>
144
+
145
+
146
+
147
+ ## Generation via CLI
148
+ See `config/mcts/example.yaml` for an example YAML configuration.
149
+ ```bash
150
+ # Simple generation
151
+ chemtsv3 -c config/mcts/example.yaml
152
+ # Chain generation
153
+ chemtsv3 -c config/mcts/example_chain_1.yaml
154
+ # Load a checkpoint and continue the generation
155
+ chemtsv3 -l generation_results/~~~ --max_generations 100 --time_limit 60
156
+ ```
157
+
158
+ ## Notebooks
159
+ - **Tutorials**: `sandbox/tutorial/***.ipynb`
160
+ - **Generation via notebook**: `sandbox/generation.ipynb`
161
+
162
+ ## Options
163
+ See `config/mcts/example.yaml` for an example and advanced options. More examples (settings used in the paper) can be found in `config/mcts/egfr_de_novo/` and `config/mcts/egfr_lead_opt/`.
164
+
165
+ All options for each component (class) are defined as arguments in the `__init__()` method of the corresponding class.
166
+
167
+ <details>
168
+ <summary><b>Nodes and Transitions</b></summary><br>
169
+
170
+ **For general usage:**
171
+ |Node class|Transition class|Description|
172
+ |---|---|---|
173
+ |`MolSentenceNode`|`RNNTransition`|For de novo generation. Uses the RNN (GRU / LSTM) model specified by `model_dir`.|
174
+ |`MolSentenceNode`|`GPT2Transition`|For de novo generation. Uses the Transformer (GPT-2) model specified by `model_dir`.|
175
+ |`CanonicalSMILESStringNode`|`GBGATransition`|For lead optimization. Uses [GB-GA mutation rules](https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc05372c).|
176
+ |`CanonicalSMILESStringNode`|`SMIRKSTransition`|For lead optimization. Uses the specified SMIRKS rules (e.g. MMP-based ones).|
177
+ |`SMILESStringNode`|`ChatGPTTransition`|For lead optimization. Uses the specified prompt(s) as input to the GPT model specified by `model` (e.g., `"gpt-4o-mini"`). Requires an OpenAI API key.|
178
+
179
+ **For research purposes (did not perform well in our testing):**
180
+ |Node class|Transition class|Description|
181
+ |---|---|---|
182
+ |`CanonicalSMILESStringNode`|`GBGMTransition`|For de novo generation. Uses [GB-GM rules](https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc05372c). Rollouts iteratively apply transitions until the molecule size reaches a sampled value determined by `size_mean` and `size_std`.|
183
+ |`FASTAStringNode`|`ProtGPT2Transition`|For de novo protein generation. Uses the [ProtGPT2 model](https://www.nature.com/articles/s41467-022-32007-7).|
184
+ |`SELFIESStringNode`|`BioT5Transition`|For lead optimization. Uses the specified prompt(s) as input to the [BioT5 text2mol model](https://github.com/QizhiPei/BioT5).|
185
+ |`SMILESStringNode`|`ChatGPTTransitionWithMemory`|For lead optimization. Unlike `ChatGPTTransition`, retains the conversation history and feeds reward calculation results back to the model.|
186
+
187
+ </details>
188
+
189
+ <details>
190
+ <summary><b>Policies</b></summary><br>
191
+
192
+ - `UCT`: Does not use transition probabilities. Performed better with `RNNTransition` in our testing.
193
+ - `PUCT`: Incorporates transition probabilities (follows the modification introduced in [AlphaGo Zero](https://www.nature.com/articles/nature24270)). Performed better with `GBGATransition` in our testing.
194
+ - `PUCTWithPredictor`: Trains an optimistic predictor of leaf-node evaluations using the generation history, and uses its output as the score for unvisited nodes when the model’s performance (measured by the normalized pinball loss) exceeds a specified threshold. This option adds a few seconds of overhead per generation (depending on the number of child nodes per transition and the computational cost of each prediction), and is recommended only when the reward calculations are expensive. Inherits all the arguments of `UCT` and `PUCT`. For non-molecular nodes, a function that returns a feature vector must be defined (see `policy/puct_with_predictor.py` for details.)
195
+
196
+ </details>
197
+
198
+ <details>
199
+ <summary><b>Basic options</b></summary><br>
200
+
201
+ |Class|Option|Default|Description|
202
+ |---|---|---|---|
203
+ |-|`max_generations`|-|Stops generation after producing the specified number of molecules.|
204
+ |-|`time_limit`|-|Stops generation once the time limit (in seconds) is reached.|
205
+ |-|`root`|`""`|Key (string) for the root node (e.g. SMILES of the starting molecule for `SMILESStringNode`). Multiple roots can be specified by list input. If not specified, an empty string `""` will be used as the root node's key.|
206
+ |`MCTS`|`n_eval_width`|∞|By default (= ∞), evaluates all new leaf nodes after each transition. Setting `n_eval_width = 1` often improves sample efficiency and can be beneficial when reward computation is expensive.|
207
+ |`MCTS`|`filter_reward`|0|Substitutes the reward with this value when nodes are filtered. Use a list to specify different reward values for each filtering step. Set to `"ignore"` to skip reward assignment (in this case, other penalty types for filtered nodes, such as `failed_parent_reward`, need to be set).|
208
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`c`|0.3|A larger value prioritizes exploration over exploitation. Recommended range: [0.01, 1]|
209
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`best_rate`|0|A value between 0 and 1. The exploitation term is calculated as: `best_rate` * {best reward} + (1 - `best_rate`) * {average reward}. For better sample efficiency, it might be better to set this value to around 0.5 for de novo generations, and around 0.9 for lead optimizations.|
210
+
211
+ </details>
212
+
213
+ <details>
214
+ <summary><b>Advanced options</b></summary><br>
215
+
216
+
217
+ For other options and further details, please refer to each class’s `__init__()` method.
218
+
219
+
220
+ |Class|Option|Default|Description|
221
+ |---|---|---|---|
222
+ |-|`seed`|-|The seed value for `random`, `np.random` and `torch`.|
223
+ |-|`device`|-|Torch device specification (e.g., "cpu", "cuda", "cuda:0"). For `RNNTransition`, using the CPU tends to be faster.|
224
+ |-|`debug`|False|If True, debug logging is enabled.|
225
+ |-|`silent`|False|If True, console logging is disabled.|
226
+ |-|`save_on_completion`|False|If True, saves a checkpoint upon completion of the generation.|
227
+ |-|`next_yaml_path`|False|If a path to the YAML config for the next generator is set, the generated molecules will be passed for chain generation.|
228
+ |-|`n_keys_to_pass`|3|Number of top-k generated molecules (keys) to be used as root nodes for the next generator.|
229
+ |`MCTS`|`n_eval_iters`|1|The number of child node evaluations. This value should not be > 1 unless the evaluations are non-deterministic (e.g. involve rollouts).|
230
+ |`MCTS`|`n_tries`|1|The number of attempts to obtain an unfiltered node in a single evaluation. This value should not be > 1 unless the evaluations are non-deterministic (e.g. involve rollouts).|
231
+ |`MCTS`|`allow_eval_overlaps`|False|Whether to allow overlapping nodes when sampling evaluation candidates (recommended: False).|
232
+ |`MCTS`|`reward_cutoff`|None|Child nodes are removed if their reward is lower than this value. This applies only to nodes for which `has_reward() = True` (i.e., complete molecules). |
233
+ |`MCTS`|`reward_cutoff_warmups`|None|If specified, `reward_cutoff` will remain inactive for the first `reward_cutoff_warmups` generations.|
234
+ |`MCTS`|`cut_failed_child`|False|If True, child nodes will be removed when {`n_eval_iters` * `n_tries`} evals are filtered.|
235
+ |`MCTS`|`failed_parent_reward`|`"ignore"`|Backpropagates this value when {`n_eval_width` * `n_eval_iters` * `n_tries`} evals are filtered from the node.|
236
+ |`MCTS`|`terminal_reward`|`"ignore"`|If a float value is set, that value is backpropagated when a leaf node reaches a terminal state. If set to `"ignore"`, no value is backpropagated.|
237
+ |`MCTS`|`cut_terminal`|True|If True, terminal nodes are pruned from the search tree and will not be visited more than once.|
238
+ |`MCTS`|`avoid_duplicates`|True|If True, duplicate nodes won't be added to the search tree. Should be True if the transition forms a cyclic graph. Unneeded if the tree structure of the transition graph is guaranteed, and can be set to False to reduce memory usage.|
239
+ |`MCTS`|`discard_unneeded_states`|True|If True, discards node variables that are no longer needed after expansion. Set this to False when using custom classes that utilize these values.|
240
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`pw_c`, `pw_alpha`, `pw_beta`|None, 0, 0|If `pw_c` is set, the number of available child nodes is limited to `pw_c` * ({visit count} ** `pw_alpha`) + `pw_beta`.|
241
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`max_prior`|None (0)|A lower bound for the best reward. If the actual best reward is lower than this value, this value is used instead.|
242
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`epsilon`|0|The probability of randomly selecting a child node while descending the search tree.|
243
+ |`PUCTWithPredictor`|`alpha`|0.9|Quantile level for the predictor, representing the target percentile of the response variable to be estimated and used.|
244
+ |`PUCTWithPredictor`|`score_threshold`|0.6|If the recent prediction score (1 - {pinball loss} / {baseline pinball loss}) is better than this threshold, the model will be used afterwards.|
245
+ |`MolSentenceNode`, `MolStringNode`|`use_canonical_smiles_as_key`|False|Whether to convert generated molecules to canonical SMILES when generating keys. If False, the same molecule may be counted multiple times.|
246
+ |`RNNTransition`, `GPT2Transition`|`top_p`|0.995|Nucleus sampling threshold in (0, 1]; keeps the smallest set of tokens whose cumulative probability mass is ≥ `top_p`.|
247
+ |`RNNTransition`, `GPT2Transition`|`temperature`|1|Logit temperature > 0 applied **before** `top_p`; values < 1.0 sharpen the distribution, values > 1.0 smooth it.|
248
+ |`RNNTransition`|`sharpness`|1|Probability distribution sharpness > 0 applied **after** `top_p`; values < 1.0 smooth the distribution, values > 1.0 sharpen it.|
249
+ |`RNNTransition`|`disable_top_p_on_rollout`|False|If True, `top_p` won't be applied for rollouts.|
250
+ |`SMIRKSTransition`|`limit`|None|If the number of generated SMILES exceeds this value, stops applying further SMIRKS patterns. If this option is enabled, the order of the SMIRKS patterns is shuffled (with weights) before the transition is applied.|
251
+
252
+ </details>
253
+
254
+ <details>
255
+ <summary><b>Filters</b></summary><br>
256
+
257
+ **Sanity**
258
+ - `ValidityFilter`: Excludes invalid molecule objects. Since other filters and rewards typically assume validity and do not recheck it, usually this filter should be applied first in molecular generation.
259
+ - `RadicalFilter`: Excludes molecules whose number of radical electrons is not 0.
260
+ - `ConnectivityFilter`: Excludes molecules whose number of disconnected fragments is not 1.
261
+
262
+ **Topological**
263
+ - `SubstructureFilter`: Excludes molecules that **do not** contain the specified (list of) substructure(s) by `smiles` or `smarts` arguments. If `preserve` is set to False, excludes molecules that **do** contain the specified (list of) substructure(s) instead. By specifying appropriate SMARTS patterns, it is possible to control where substitutions or structural modifications (i.e., adding a substituent or arm) are allowed to occur.
264
+ - `AromaticRingFilter`: Excludes molecules whose number of aromatic rings falls outside the range [`min`, `max`]. (Default: [1, ∞))
265
+ - `HeavyAtomCountFilter`: Excludes molecules whose number of heavy atoms falls outside the range [`min`, `max`]. (Default: [0, 45])
266
+ - `MaxRingSizeFilter`: Excludes molecules whose largest ring size falls outside the range [`min`, `max`]. (Default: [0, 6])
267
+ - `MinRingSizeFilter`: Excludes molecules whose smallest ring size falls outside the range [`min`, `max`]. (Default: (-∞, ∞))
268
+ - `RingBondFilter`: Excludes molecules containing ring allenes (`[R]=[R]=[R]`) or double bonds in small rings (`[r3,r4]=[r3,r4]`).
269
+ - `RotatableBondsFilter`: Excludes molecules whose number of rotatable bonds falls outside the range [`min`, `max`]. (Default: [0, 10])
270
+
271
+ **Structural alert**
272
+ - `ROCFilter`: Excludes molecules that contain structural alerts defined by Ohta and Cho.
273
+ - `CatalogFilter`: Excludes molecules that contain structural alerts in the specified list of [rdkit.Chem.FilterCatalogParams.FilterCatalogs](https://www.rdkit.org/docs/source/rdkit.Chem.rdfiltercatalog.html#rdkit.Chem.rdfiltercatalog.FilterCatalogParams.FilterCatalogs). (e.g. `catalogs = ["PAINS_A", "PAINS_B", "PAINS_C", "NIH", "BRENK"]`)
274
+
275
+ **Drug-likeness**
276
+ - `PubChemFilter`: Excludes molecules based on the frequency of occurrence of molecular patterns in the PubChem database. Reported in [Ma et al.](https://doi.org/10.1021/acs.jcim.1c00679).
277
+ - `LipinskiFilter`: Excludes molecules based on Lipinski’s Rule of Five. Set `rule_of` to 3 to apply the Rule of Three instead.
278
+ - `SAScoreFilter`: Excludes molecules whose synthetic accessibility score (SA Score) falls outside the range [`min`, `max`]. (Default: [1, 3.5])
279
+
280
+ **Physicochemical**
281
+ - `ChargeFilter`: Excludes molecules whose formal charge is not 0.
282
+ - `HBAFilter`: Excludes molecules whose number of hydrogen bond acceptors falls outside the range [`min`, `max`]. (Default: [0, 10])
283
+ - `HBDFilter`: Excludes molecules whose number of hydrogen bond donors falls outside the range [`min`, `max`]. (Default: [0, 5])
284
+ - `LogPFilter`: Excludes molecules whose LogP value falls outside the range [`min`, `max`]. (Default: (-∞, 5])
285
+ - `TPSAFilter`: Excludes molecules whose topological polar surface area (TPSA) falls outside the range [`min`, `max`]. (Default: [0, 140])
286
+ - `WeightFilter`: Excludes molecules whose molecular weight falls outside the range [`min`, `max`]. (Default: [0, 500])
287
+
288
+ Filters can also be specified using `filters` argument of transitions that inherit from `TemplateTransition` (e.g. `GBGATransition`, `SMIRKSTransition`, `ChatGPTTransition`) to directly exclude molecules from child nodes.
289
+
290
+ </details>
291
+
292
+ ## Model training
293
+ - **RNN (GRU) training** (example): `chemtsv3-train -c config/training/train_rnn_smiles.yaml`
294
+ - **Transformer (GPT-2) training** (example): `chemtsv3-train -c config/training/train_gpt2.yaml`
295
+ Change `dataset_path` in YAML to train on an arbitrary dataset (1 sentence per line).
296
+
@@ -0,0 +1,278 @@
1
+ ## ChemTSv3
2
+ A unified tree search framework for molecular generation.
3
+ - **Node is modular**: Supports any molecular representation (e.g., SMILES, SELFIES, FASTA, or HELM) in either string or tensor format.
4
+ - **Transition is modular**: Allows any molecular transformation strategy, including graph-based editing, sequence generation with RNN or GPT-2, sequence mutation, or LLM-guided modification.
5
+ - **Filter is modular**: Enables flexible constraints such as structural alerts, scaffold preservation, or physicochemical property filters.
6
+ - **Reward is modular**: Anything can be optimized, including QSAR predictions or simulation results, for both single- and multi-objective tasks.
7
+
8
+ ## Setup
9
+
10
+ <details>
11
+ <summary><b>Minimal installation (Mac, Linux)</b></summary><br>
12
+
13
+ ### Available classes
14
+ - **Transition**: `GBGATransition`, `GPT2Transition`, `RNNBasedMutation`, `RNNTransition`, `SMIRKSTransition`
15
+ - **Reward**: `GFPReward`, `SimilarityReward`, `JScoreReward`, `LogPReward`
16
+ - **Policy**: `UCT`, `PUCT`
17
+ - The corresponding Node classes and all implemented Filter classes are also available in this environment.
18
+
19
+ ### Setup steps
20
+
21
+ 1. Clone the repository
22
+ 2. Install uv: https://docs.astral.sh/uv/getting-started/installation/
23
+ 3. Restart the shell
24
+ 4. Move to the repository root (e.g., cd molgen)
25
+ 5. Run the following commands:
26
+ ```bash
27
+ uv venv --python 3.11.11
28
+ source .venv/bin/activate
29
+ uv pip install numpy==1.26.4 pandas==2.3.3 matplotlib==3.10.7 rdkit==2023.09.6 ipykernel==6.30.0 transformers==4.43.4 torch==2.5.1 --torch-backend=auto
30
+ uv pip install -e .
31
+ ```
32
+
33
+ To activate the virtual environment, run the following command from the repository root (this process can also be automated through VS Code settings):
34
+ ```bash
35
+ source .venv/bin/activate
36
+ ```
37
+ To deactivate the virtual environment, run:
38
+ ```bash
39
+ deactivate
40
+ ```
41
+ </details>
42
+
43
+ <details>
44
+ <summary><b>Minimal installation (Windows)</b></summary><br>
45
+
46
+ ### Available classes
47
+ - **Transition**: `GBGATransition`, `GPT2Transition`, `RNNBasedMutation`, `RNNTransition`, `SMIRKSTransition`
48
+ - **Reward**: `GFPReward`, `SimilarityReward`, `JScoreReward`, `LogPReward`
49
+ - **Policy**: `UCT`, `PUCT`
50
+ - The corresponding Node classes and all implemented Filter classes are also available in this environment.
51
+
52
+ ### Setup steps
53
+
54
+ 1. Clone the repository
55
+ 2. Install uv: https://docs.astral.sh/uv/getting-started/installation/
56
+ 3. Restart the shell (and VSCode if used)
57
+ 4. Move to the repository root (e.g., cd molgen)
58
+ 5. Run the following commands:
59
+ ```bash
60
+ uv venv --python 3.11.11
61
+ .venv\Scripts\activate
62
+ uv pip install numpy==1.26.4 pandas==2.3.3 matplotlib==3.10.7 rdkit==2023.09.6 ipykernel==6.30.0 transformers==4.43.4 torch==2.5.1 --torch-backend=auto
63
+ uv pip install -e .
64
+ ```
65
+
66
+ To activate the virtual environment, run the following command from the repository root (this process can also be automated through VS Code settings):
67
+ ```bash
68
+ .venv\Scripts\activate
69
+ ```
70
+ To deactivate the virtual environment, run:
71
+ ```bash
72
+ deactivate
73
+ ```
74
+ </details>
75
+
76
+ <details>
77
+ <summary><b>Full installation (Mac, Linux)</b></summary><br>
78
+
79
+ ### Available classes
80
+ - **Transition**: `BioT5Transition`, `ChatGPTTransition`, `ChatGPTTransitionWithMemory`, `GBGATransition`, `GPT2Transition`, `RNNBasedMutation`, `RNNTransition`, `SMIRKSTransition`
81
+ - **Reward**: `DScoreReward`, `DyRAMOReward`, `GFPReward`, `SimilarityReward`, `JScoreReward`, `LogPReward`, `TDCReward`
82
+ - The corresponding Node classes, along with all implemented Filter and Policy classes, are also available in this environment.
83
+ - `ChatGPTTransition` and `ChatGPTTransitionWithMemory` require an OpenAI API key.
84
+
85
+ ### Setup steps
86
+ 1. Clone the repository
87
+ 2. Install uv: https://docs.astral.sh/uv/getting-started/installation/
88
+ 3. Restart the shell
89
+ 4. Move to the repository root (e.g., cd molgen)
90
+ 5. Run the following commands:
91
+ ```bash
92
+ uv venv --python 3.11.11
93
+ source .venv/bin/activate
94
+ uv pip install pytdc==1.1.14 numpy==1.26.4 pandas==2.3.3 matplotlib==3.10.7 rdkit==2023.09.6 selfies==2.2.0 ipykernel==6.30.0 transformers==4.43.4 setuptools==78.1.1 lightgbm==4.6.0 openai==2.6.0 torch==2.5.1 --torch-backend=auto
95
+ uv pip install -e .
96
+ ```
97
+ To activate the virtual environment, run the following command from the repository root (this process can also be automated through VS Code settings):
98
+ ```bash
99
+ source .venv/bin/activate
100
+ ```
101
+ To deactivate the virtual environment, run:
102
+ ```bash
103
+ deactivate
104
+ ```
105
+ </details>
106
+
107
+ <details>
108
+ <summary><b>Optional dependencies</b></summary><br>
109
+
110
+ The full installation includes the following optional packages:
111
+
112
+ |Package|Required for|Tested version|
113
+ |---|---|---|
114
+ |`lightgbm`|`DScoreReward`, `DyRAMOReward`, `PUCTWithPredictor`|3.3.5, 4.6.0|
115
+ |`selfies`|`SELFIESStringNode`|2.2.0|
116
+ |`openai`|`ChatGPTTransition`, `ChatGPTTransitionWithMemory`|2.6.0|
117
+ |`pytdc`|`TDCReward`|1.1.14|
118
+
119
+ </details>
120
+
121
+ <details>
122
+ <summary><b>Troubleshooting</b></summary><br>
123
+
124
+ ### CUDA not available
125
+ In some cases (for example, when setting up environments on a control node), it may be necessary to reinstall torch with a different backend to enable CUDA support. However, since major implemented classes (including `RNNTransition`) are likely to run faster on the CPU, this is not strictly required. After reinstalling torch, you may also need to downgrade numpy to version 1.26.4 if it was upgraded during the process.
126
+ </details>
127
+
128
+ </details>
129
+
130
+ ## Generation via CLI
131
+ See `config/mcts/example.yaml` for an example YAML configuration.
132
+ ```bash
133
+ # Simple generation
134
+ chemtsv3 -c config/mcts/example.yaml
135
+ # Chain generation
136
+ chemtsv3 -c config/mcts/example_chain_1.yaml
137
+ # Load a checkpoint and continue the generation
138
+ chemtsv3 -l generation_results/~~~ --max_generations 100 --time_limit 60
139
+ ```
140
+
141
+ ## Notebooks
142
+ - **Tutorials**: `sandbox/tutorial/***.ipynb`
143
+ - **Generation via notebook**: `sandbox/generation.ipynb`
144
+
145
+ ## Options
146
+ See `config/mcts/example.yaml` for an example and advanced options. More examples (settings used in the paper) can be found in `config/mcts/egfr_de_novo/` and `config/mcts/egfr_lead_opt/`.
147
+
148
+ All options for each component (class) are defined as arguments in the `__init__()` method of the corresponding class.
149
+
150
+ <details>
151
+ <summary><b>Nodes and Transitions</b></summary><br>
152
+
153
+ **For general usage:**
154
+ |Node class|Transition class|Description|
155
+ |---|---|---|
156
+ |`MolSentenceNode`|`RNNTransition`|For de novo generation. Uses the RNN (GRU / LSTM) model specified by `model_dir`.|
157
+ |`MolSentenceNode`|`GPT2Transition`|For de novo generation. Uses the Transformer (GPT-2) model specified by `model_dir`.|
158
+ |`CanonicalSMILESStringNode`|`GBGATransition`|For lead optimization. Uses [GB-GA mutation rules](https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc05372c).|
159
+ |`CanonicalSMILESStringNode`|`SMIRKSTransition`|For lead optimization. Uses the specified SMIRKS rules (e.g. MMP-based ones).|
160
+ |`SMILESStringNode`|`ChatGPTTransition`|For lead optimization. Uses the specified prompt(s) as input to the GPT model specified by `model` (e.g., `"gpt-4o-mini"`). Requires an OpenAI API key.|
161
+
162
+ **For research purposes (did not perform well in our testing):**
163
+ |Node class|Transition class|Description|
164
+ |---|---|---|
165
+ |`CanonicalSMILESStringNode`|`GBGMTransition`|For de novo generation. Uses [GB-GM rules](https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc05372c). Rollouts iteratively apply transitions until the molecule size reaches a sampled value determined by `size_mean` and `size_std`.|
166
+ |`FASTAStringNode`|`ProtGPT2Transition`|For de novo protein generation. Uses the [ProtGPT2 model](https://www.nature.com/articles/s41467-022-32007-7).|
167
+ |`SELFIESStringNode`|`BioT5Transition`|For lead optimization. Uses the specified prompt(s) as input to the [BioT5 text2mol model](https://github.com/QizhiPei/BioT5).|
168
+ |`SMILESStringNode`|`ChatGPTTransitionWithMemory`|For lead optimization. Unlike `ChatGPTTransition`, retains the conversation history and feeds reward calculation results back to the model.|
169
+
170
+ </details>
171
+
172
+ <details>
173
+ <summary><b>Policies</b></summary><br>
174
+
175
+ - `UCT`: Does not use transition probabilities. Performed better with `RNNTransition` in our testing.
176
+ - `PUCT`: Incorporates transition probabilities (follows the modification introduced in [AlphaGo Zero](https://www.nature.com/articles/nature24270)). Performed better with `GBGATransition` in our testing.
177
+ - `PUCTWithPredictor`: Trains an optimistic predictor of leaf-node evaluations using the generation history, and uses its output as the score for unvisited nodes when the model’s performance (measured by the normalized pinball loss) exceeds a specified threshold. This option adds a few seconds of overhead per generation (depending on the number of child nodes per transition and the computational cost of each prediction), and is recommended only when the reward calculations are expensive. Inherits all the arguments of `UCT` and `PUCT`. For non-molecular nodes, a function that returns a feature vector must be defined (see `policy/puct_with_predictor.py` for details.)
178
+
179
+ </details>
180
+
181
+ <details>
182
+ <summary><b>Basic options</b></summary><br>
183
+
184
+ |Class|Option|Default|Description|
185
+ |---|---|---|---|
186
+ |-|`max_generations`|-|Stops generation after producing the specified number of molecules.|
187
+ |-|`time_limit`|-|Stops generation once the time limit (in seconds) is reached.|
188
+ |-|`root`|`""`|Key (string) for the root node (e.g. SMILES of the starting molecule for `SMILESStringNode`). Multiple roots can be specified by list input. If not specified, an empty string `""` will be used as the root node's key.|
189
+ |`MCTS`|`n_eval_width`|∞|By default (= ∞), evaluates all new leaf nodes after each transition. Setting `n_eval_width = 1` often improves sample efficiency and can be beneficial when reward computation is expensive.|
190
+ |`MCTS`|`filter_reward`|0|Substitutes the reward with this value when nodes are filtered. Use a list to specify different reward values for each filtering step. Set to `"ignore"` to skip reward assignment (in this case, other penalty types for filtered nodes, such as `failed_parent_reward`, need to be set).|
191
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`c`|0.3|A larger value prioritizes exploration over exploitation. Recommended range: [0.01, 1]|
192
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`best_rate`|0|A value between 0 and 1. The exploitation term is calculated as: `best_rate` * {best reward} + (1 - `best_rate`) * {average reward}. For better sample efficiency, it might be better to set this value to around 0.5 for de novo generations, and around 0.9 for lead optimizations.|
193
+
194
+ </details>
195
+
196
+ <details>
197
+ <summary><b>Advanced options</b></summary><br>
198
+
199
+
200
+ For other options and further details, please refer to each class’s `__init__()` method.
201
+
202
+
203
+ |Class|Option|Default|Description|
204
+ |---|---|---|---|
205
+ |-|`seed`|-|The seed value for `random`, `np.random` and `torch`.|
206
+ |-|`device`|-|Torch device specification (e.g., "cpu", "cuda", "cuda:0"). For `RNNTransition`, using the CPU tends to be faster.|
207
+ |-|`debug`|False|If True, debug logging is enabled.|
207
+ |-|`silent`|False|If True, console logging is disabled.|
209
+ |-|`save​_on​_completion`|False|If True, saves a checkpoint upon completion of the generation.|
210
+ |-|`next_yaml_path`|False|If a path to the YAML config for the next generator is set, the generated molecules will be passed for chain generation.|
211
+ |-|`n_keys_to_pass`|3|Number of top-k generated molecules (keys) to be used as root nodes for the next generator.|
212
+ |`MCTS`|`n_eval_iters`|1|The number of child node evaluations. This value should not be > 1 unless the evaluations are non-deterministic (e.g. involve rollouts).|
212
+ |`MCTS`|`n_tries`|1|The number of attempts to obtain an unfiltered node in a single evaluation. This value should not be > 1 unless the evaluations are non-deterministic (e.g. involve rollouts).|
214
+ |`MCTS`|`allow​_eval​_overlaps`|False|Whether to allow overlap nodes when sampling eval candidates (recommended: False)|
215
+ |`MCTS`|`reward_cutoff`|None|Child nodes are removed if their reward is lower than this value. This applies only to nodes for which `has_reward() = True` (i.e., complete molecules). |
216
+ |`MCTS`|`reward​_cutoff​_warmups`|None|If specified, reward_cutoff will be inactive until `reward_cutoff_warmups` generations.|
217
+ |`MCTS`|`cut_failed_child`|False|If True, child nodes will be removed when {`n_eval_iters` * `n_tries`} evals are filtered.|
218
+ |`MCTS`|`failed​_parent​_reward`|`"ignore"`|Backpropagate this value when {`n_eval_width` * `n_eval_iters` * `n_tries`} evals are filtered from the node.|
219
+ |`MCTS`|`terminal_reward`|`"ignore"`|If a float value is set, that value is backpropagated when a leaf node reaches a terminal state. If set to `"ignore"`, no value is backpropagated.|
220
+ |`MCTS`|`cut_terminal`|True|If True, terminal nodes are pruned from the search tree and will not be visited more than once.|
221
+ |`MCTS`|`avoid_duplicates`|True|If True, duplicate nodes won't be added to the search tree. Should be True if the transition forms a cyclic graph. Unneeded if the tree structure of the transition graph is guaranteed, and can be set to False to reduce memory usage.|
222
+ |`MCTS`|`discard​_unneeded​_states`|True|If True, discards node variables that are no longer needed after expansion. Set this to False when using custom classes that utilize these values.|
223
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`pw_c`, `pw_alpha`, `pw_beta`|None, 0, 0|If `pw_c` is set, the number of available child nodes is limited to `pw_c` * ({visit count} ** `pw_alpha`) + `pw_beta`.|
224
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`max_prior`|None (0)|A lower bound for the best reward. If the actual best reward is lower than this value, this value is used instead.|
225
+ |`UCT`, `PUCT`, `PUCTWithPredictor`|`epsilon`|0|The probability of randomly selecting a child node while descending the search tree.|
226
+ |`PUCTWithPredictor`|`alpha`|0.9|Quantile level for the predictor, representing the target percentile of the response variable to be estimated and used.|
227
+ |`PUCTWithPredictor`|`score_threshold`|0.6|If the recent prediction score (1 - {pinball loss} / {baseline pinball loss}) is better than this threshold, the model will be used afterwards.|
228
+ |`MolSentenceNode​`, `MolStringNode`|`use​_canonical​_smiles​_as​_key`|False|Whether to convert generated molecules to canonical SMILES when generating keys. If False, the same molecule may be counted multiple times.|
229
+ |`RNNTransition`, `GPT2Transition`|`top_p`|0.995|Nucleus sampling threshold in (0, 1]; keeps the smallest set of tokens whose cumulative probability mass is ≥ `top_p`.|
230
+ |`RNNTransition`, `GPT2Transition`|`temperature`|1|Logit temperature > 0 applied **before** `top_p`; values < 1.0 sharpen the distribution, values > 1.0 smooth it.|
230
+ |`RNNTransition`|`sharpness`|1|Probability distribution sharpness > 0 applied **after** `top_p`; values < 1.0 smooth the distribution, values > 1.0 sharpen it.|
232
+ |`RNNTransition`|`disable​_top​_p​_on​_rollout`|False|If True, `top_p` won't be applied for rollouts.|
233
+ |`SMIRKSTransition`|`limit`|None|If the number of generated SMILES exceeds this value, stops applying further SMIRKS patterns. If this option is enabled, the order of SMIRKS patterns is shuffled with weights before applying the transition.|
234
+
235
+ </details>
236
+
237
+ <details>
238
+ <summary><b>Filters</b></summary><br>
239
+
240
+ **Sanity**
241
+ - `ValidityFilter`: Excludes invalid molecule objects. Since other filters and rewards typically assume validity and do not recheck it, usually this filter should be applied first in molecular generation.
242
+ - `RadicalFilter`: Excludes molecules whose number of radical electrons is not 0.
243
+ - `ConnectivityFilter`: Excludes molecules whose number of disconnected fragments is not 1.
244
+
245
+ **Topological**
246
+ - `SubstructureFilter`: Excludes molecules that **do not** contain the specified (list of) substructure(s) by `smiles` or `smarts` arguments. If `preserve` is set to False, excludes molecules that **do** contain the specified (list of) substructure(s) instead. By specifying appropriate SMARTS patterns, it is possible to control where substitutions or structural modifications (i.e., adding a substituent or arm) are allowed to occur.
247
+ - `AromaticRingFilter`: Excludes molecules whose number of aromatic rings falls outside the range [`min`, `max`]. (Default: [1, ∞))
248
+ - `HeavyAtomCountFilter`: Excludes molecules whose number of heavy atoms falls outside the range [`min`, `max`]. (Default: [0, 45])
249
+ - `MaxRingSizeFilter`: Excludes molecules whose largest ring size falls outside the range [`min`, `max`]. (Default: [0, 6])
250
+ - `MinRingSizeFilter`: Excludes molecules whose smallest ring size falls outside the range [`min`, `max`]. (Default: (-∞, ∞))
251
+ - `RingBondFilter`: Excludes molecules containing ring allenes (`[R]=[R]=[R]`) or double bonds in small rings (`[r3,r4]=[r3,r4]`).
252
+ - `RotatableBondsFilter`: Excludes molecules whose number of rotatable bonds falls outside the range [`min`, `max`]. (Default: [0, 10])
253
+
254
+ **Structural alert**
255
+ - `ROCFilter`: Excludes molecules that contain structural alerts defined by Ohta and Cho.
256
+ - `CatalogFilter`: Excludes molecules that contain structural alerts in the specified list of [rdkit.Chem.FilterCatalogParams.FilterCatalogs](https://www.rdkit.org/docs/source/rdkit.Chem.rdfiltercatalog.html#rdkit.Chem.rdfiltercatalog.FilterCatalogParams.FilterCatalogs). (e.g. `catalogs = ["PAINS_A", "PAINS_B", "PAINS_C", "NIH", "BRENK"]`)
257
+
258
+ **Drug-likeness**
259
+ - `PubChemFilter`: Excludes molecules based on the frequency of occurrence of molecular patterns in the PubChem database. Reported in [Ma et al.](https://doi.org/10.1021/acs.jcim.1c00679).
260
+ - `LipinskiFilter`: Excludes molecules based on Lipinski’s Rule of Five. Set `rule_of` to 3 to apply the Rule of Three instead.
261
+ - `SAScoreFilter`: Excludes molecules whose synthetic accessibility score (SA Score) falls outside the range [`min`, `max`]. (Default: [1, 3.5])
262
+
263
+ **Physicochemical**
264
+ - `ChargeFilter`: Excludes molecules whose formal charge is not 0.
265
+ - `HBAFilter`: Excludes molecules whose number of hydrogen bond acceptors falls outside the range [`min`, `max`]. (Default: [0, 10])
266
+ - `HBDFilter`: Excludes molecules whose number of hydrogen bond donors falls outside the range [`min`, `max`]. (Default: [0, 5])
267
+ - `LogPFilter`: Excludes molecules whose LogP value falls outside the range [`min`, `max`]. (Default: (-∞, 5])
268
+ - `TPSAFilter`: Excludes molecules whose topological polar surface area (TPSA) falls outside the range [`min`, `max`]. (Default: [0, 140])
269
+ - `WeightFilter`: Excludes molecules whose molecular weight falls outside the range [`min`, `max`]. (Default: [0, 500])
270
+
271
+ Filters can also be specified using `filters` argument of transitions that inherit from `TemplateTransition` (e.g. `GBGATransition`, `SMIRKSTransition`, `ChatGPTTransition`) to directly exclude molecules from child nodes.
272
+
273
+ </details>
274
+
275
+ ## Model training
276
+ - **RNN (GRU) training** (example): `chemtsv3-train -c config/training/train_rnn_smiles.yaml`
277
+ - **Transformer (GPT-2) training** (example): `chemtsv3-train -c config/training/train_gpt2.yaml`
278
+ Change `dataset_path` in YAML to train on an arbitrary dataset (1 sentence per line).
File without changes
File without changes
@@ -0,0 +1,66 @@
1
+ # Example (RNN): chemtsv3 -c config/mcts/example.yaml
2
+ # Example (Chain): chemtsv3 -c config/mcts/example_chain_1.yaml
3
+ # Example (Load): chemtsv3 -l generation_results/~~~ --max_generations 100
4
+
5
+ # Path setup / Imports
6
+ import faulthandler
7
+ # import sys
8
+ # import os
9
+ # repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
10
+ # if repo_root not in sys.path:
11
+ # sys.path.insert(0, repo_root)
12
+
13
+ import argparse
14
+ from chemtsv3.generator import Generator
15
+ from chemtsv3.utils import conf_from_yaml, generator_from_conf
16
+
17
def main():
    """Command-line entry point for running a generation.

    Exactly one of the following must be given:

    * ``-c/--yaml_path``: build a generator from a YAML config and run it.
      While the current config has a non-empty ``next_yaml_path``, the next
      config is loaded and a new generator is built with the previous one
      passed as ``predecessor`` (chain generation). Only the last generator
      in the chain is plotted and analyzed.
    * ``-l/--load_dir``: load a saved generator from a directory and continue
      generating, using ``--max_generations`` / ``--time_limit`` from the
      command line.

    Raises:
        ValueError: If neither or both of ``-c`` and ``-l`` are specified.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--yaml_path", type=str, help="Path to the config file (.yaml)")
    parser.add_argument("-l", "--load_dir", type=str, help="Path to the save directory (contains config.yaml and save.gtr)")

    parser.add_argument("--max_generations", type=int, help="Only used when loading the generator from the save.")
    parser.add_argument("-t", "--time_limit", type=int, help="Only used when loading the generator from the save.")

    args = parser.parse_args()

    yaml_path = args.yaml_path
    load_dir = args.load_dir

    # Validate the mutually exclusive options up front.
    if yaml_path is None and load_dir is None:
        raise ValueError("Specify either 'yaml_path' (-c) or 'load_dir' (-l).")
    if yaml_path is not None and load_dir is not None:
        raise ValueError("Specify one of 'yaml_path' (-c) or 'load_dir' (-l), not both.")

    if yaml_path is not None:
        conf = conf_from_yaml(yaml_path)
        generator = generator_from_conf(conf)
        while yaml_path:
            generator.generate(time_limit=conf.get("time_limit"), max_generations=conf.get("max_generations"))
            # Test the value, not key presence: an explicit
            # `next_yaml_path: false` / `null` in the YAML means "no chain"
            # and must not be handed to conf_from_yaml().
            next_yaml_path = conf.get("next_yaml_path")
            if not next_yaml_path:
                # End of the chain: plot and analyze the final generator.
                yaml_path = None
                # `or {}` also covers an explicit `plot_args: null`.
                plot_args = conf.get("plot_args") or {}
                plot_args.setdefault("save_only", True)
                generator.plot(**plot_args)
                generator.analyze()
            else:
                # Read n_keys_to_pass from the *current* config before it is
                # replaced by the next one.
                n_top_keys_to_pass = conf.get("n_keys_to_pass", 3)
                yaml_path = next_yaml_path
                conf = conf_from_yaml(yaml_path)
                generator = generator_from_conf(conf, predecessor=generator, n_top_keys_to_pass=n_top_keys_to_pass)
    else:
        # Resume from a saved generator; limits come from the command line.
        generator = Generator.load_dir(load_dir)
        generator.generate(max_generations=args.max_generations, time_limit=args.time_limit)
        generator.analyze()
        plot_args = generator.yaml_copy.get("plot_args") or {}
        plot_args.setdefault("save_only", True)
        generator.plot(**plot_args)


if __name__ == "__main__":
    # Dump Python tracebacks on fatal errors such as segfaults.
    faulthandler.enable()
    main()