bayesianflow-for-chem 2.2.1__tar.gz → 2.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bayesianflow-for-chem might be problematic. Click here for more details.
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/PKG-INFO +2 -2
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/__init__.py +1 -1
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/cli.py +25 -7
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/data.py +22 -21
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/tool.py +6 -4
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/PKG-INFO +2 -2
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/LICENSE +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/README.md +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/_data/vocab.txt +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/model.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/scorer.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/spectra.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/train.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/SOURCES.txt +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/dependency_links.txt +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/entry_points.txt +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/requires.txt +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/top_level.txt +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/pyproject.toml +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/setup.cfg +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/setup.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/test/test_cli_plugin.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/test/test_jit_compatibility.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/test/test_merge_lora.py +0 -0
- {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/test/test_molecular_embedding.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bayesianflow_for_chem
|
|
3
|
-
Version: 2.2.1
|
|
3
|
+
Version: 2.2.2
|
|
4
4
|
Summary: Bayesian flow network framework for Chemistry
|
|
5
5
|
Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
|
|
6
6
|
Author: Nianze A. Tao
|
|
@@ -96,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
|
|
|
96
96
|
|
|
97
97
|
## Dataset Handling
|
|
98
98
|
|
|
99
|
-
We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#
|
|
99
|
+
We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
|
|
100
100
|
|
|
101
101
|
1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
|
|
102
102
|
```python
|
|
@@ -18,11 +18,11 @@ from bayesianflow_for_chem.scorer import smiles_valid, Scorer
|
|
|
18
18
|
from bayesianflow_for_chem.data import (
|
|
19
19
|
VOCAB_COUNT,
|
|
20
20
|
VOCAB_KEYS,
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
FASTA_VOCAB_COUNT,
|
|
22
|
+
FASTA_VOCAB_KEYS,
|
|
23
23
|
load_vocab,
|
|
24
24
|
smiles2token,
|
|
25
|
-
|
|
25
|
+
fasta2token,
|
|
26
26
|
split_selfies,
|
|
27
27
|
collate,
|
|
28
28
|
CSVData,
|
|
@@ -116,6 +116,23 @@ madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
|
|
|
116
116
|
madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
|
|
117
117
|
"""
|
|
118
118
|
|
|
119
|
+
_END_MESSAGE = r"""
|
|
120
|
+
If you find this project helpful, please cite us:
|
|
121
|
+
1. N. Tao, and M. Abe, J. Chem. Inf. Model., 2025, 65, 1178-1187.
|
|
122
|
+
2. N. Tao, 2024, arXiv:2412.11439.
|
|
123
|
+
"""
|
|
124
|
+
|
|
125
|
+
_ERROR_MESSAGE = r"""
|
|
126
|
+
Some who believe in inductive logic are anxious to point out, with
|
|
127
|
+
Reichenbach, that 'the principle of induction is unreservedly accepted
|
|
128
|
+
by the whole of science and that no man can seriously doubt this
|
|
129
|
+
principle in everyday life either'. Yet even supposing this were the
|
|
130
|
+
case—for after all, 'the whole of science' might err—I should still
|
|
131
|
+
contend that a principle of induction is superfluous, and that it must
|
|
132
|
+
lead to logical inconsistencies.
|
|
133
|
+
-- Karl Popper --
|
|
134
|
+
"""
|
|
135
|
+
|
|
119
136
|
_ALLOWED_PLUGINS = [
|
|
120
137
|
"collate_fn",
|
|
121
138
|
"num_workers",
|
|
@@ -396,7 +413,7 @@ def main_script(version: str) -> None:
|
|
|
396
413
|
print("Configuration check passed.")
|
|
397
414
|
return
|
|
398
415
|
if flag_critical != 0:
|
|
399
|
-
raise RuntimeError
|
|
416
|
+
raise RuntimeError(_ERROR_MESSAGE)
|
|
400
417
|
print(_MESSAGE.format(version))
|
|
401
418
|
# ####### build tokeniser #######
|
|
402
419
|
tokeniser_config = runtime_config["tokeniser"]
|
|
@@ -406,9 +423,9 @@ def main_script(version: str) -> None:
|
|
|
406
423
|
vocab_keys = VOCAB_KEYS
|
|
407
424
|
tokeniser = smiles2token
|
|
408
425
|
if tokeniser_name == "fasta":
|
|
409
|
-
num_vocab =
|
|
410
|
-
vocab_keys =
|
|
411
|
-
tokeniser =
|
|
426
|
+
num_vocab = FASTA_VOCAB_COUNT
|
|
427
|
+
vocab_keys = FASTA_VOCAB_KEYS
|
|
428
|
+
tokeniser = fasta2token
|
|
412
429
|
if tokeniser_name == "selfies":
|
|
413
430
|
vocab_data = load_vocab(tokeniser_config["vocab"])
|
|
414
431
|
num_vocab = vocab_data["vocab_count"]
|
|
@@ -680,6 +697,7 @@ def main_script(version: str) -> None:
|
|
|
680
697
|
f.write("\n".join(mols))
|
|
681
698
|
# ------- finished -------
|
|
682
699
|
print(" ####### job finished #######")
|
|
700
|
+
print(_END_MESSAGE)
|
|
683
701
|
|
|
684
702
|
|
|
685
703
|
if __name__ == "__main__":
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
# Author: Nianze A. TAO (Omozawa SUENO)
|
|
3
3
|
"""
|
|
4
|
-
Tokenise SMILES/SAFE/SELFIES/
|
|
4
|
+
Tokenise SMILES/SAFE/SELFIES/FASTA strings.
|
|
5
5
|
"""
|
|
6
6
|
import os
|
|
7
7
|
import re
|
|
@@ -14,7 +14,7 @@ from torch.utils.data import Dataset
|
|
|
14
14
|
|
|
15
15
|
__filedir__ = Path(__file__).parent
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
_SMI_REGEX_PATTERN = (
|
|
18
18
|
r"(\[|\]|H[e,f,g,s,o]?|"
|
|
19
19
|
r"L[i,v,a,r,u]|"
|
|
20
20
|
r"B[e,r,a,i,h,k]?|"
|
|
@@ -31,11 +31,11 @@ SMI_REGEX_PATTERN = (
|
|
|
31
31
|
r"\(|\)|\.|=|#|-|\+|\\|\/|:|"
|
|
32
32
|
r"~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
|
|
33
33
|
)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
34
|
+
_SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
|
|
35
|
+
_FAS_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|-|\*|\.)"
|
|
36
|
+
_smi_regex = re.compile(_SMI_REGEX_PATTERN)
|
|
37
|
+
_sel_regex = re.compile(_SEL_REGEX_PATTERN)
|
|
38
|
+
_fas_regex = re.compile(_FAS_REGEX_PATTERN)
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def load_vocab(
|
|
@@ -65,11 +65,12 @@ _DEFUALT_VOCAB = load_vocab(__filedir__ / "_data/vocab.txt")
|
|
|
65
65
|
VOCAB_KEYS: List[str] = _DEFUALT_VOCAB["vocab_keys"]
|
|
66
66
|
VOCAB_DICT: Dict[str, int] = _DEFUALT_VOCAB["vocab_dict"]
|
|
67
67
|
VOCAB_COUNT: int = _DEFUALT_VOCAB["vocab_count"]
|
|
68
|
-
|
|
69
|
-
VOCAB_KEYS[0:3]
|
|
68
|
+
FASTA_VOCAB_KEYS = (
|
|
69
|
+
VOCAB_KEYS[0:3]
|
|
70
|
+
+ "A B C D E F G H I K L M N P Q R S T V W Y Z - . J O U X *".split()
|
|
70
71
|
)
|
|
71
|
-
|
|
72
|
-
|
|
72
|
+
FASTA_VOCAB_COUNT = len(FASTA_VOCAB_KEYS)
|
|
73
|
+
FASTA_VOCAB_DICT = dict(zip(FASTA_VOCAB_KEYS, range(FASTA_VOCAB_COUNT)))
|
|
73
74
|
|
|
74
75
|
|
|
75
76
|
def smiles2vec(smiles: str) -> List[int]:
|
|
@@ -81,21 +82,21 @@ def smiles2vec(smiles: str) -> List[int]:
|
|
|
81
82
|
:return: tokens w/o `<start>` and `<end>`
|
|
82
83
|
:rtype: list
|
|
83
84
|
"""
|
|
84
|
-
tokens = [token for token in
|
|
85
|
+
tokens = [token for token in _smi_regex.findall(smiles)]
|
|
85
86
|
return [VOCAB_DICT[token] for token in tokens]
|
|
86
87
|
|
|
87
88
|
|
|
88
|
-
def
|
|
89
|
+
def fasta2vec(fasta: str) -> List[int]:
|
|
89
90
|
"""
|
|
90
|
-
|
|
91
|
+
FASTA sequence tokenisation using a dataset-independent regex pattern.
|
|
91
92
|
|
|
92
|
-
:param
|
|
93
|
-
:type
|
|
93
|
+
:param fasta: protein (amino acid) sequence
|
|
94
|
+
:type fasta: str
|
|
94
95
|
:return: tokens w/o `<start>` and `<end>`
|
|
95
96
|
:rtype: list
|
|
96
97
|
"""
|
|
97
|
-
tokens = [token for token in
|
|
98
|
-
return [
|
|
98
|
+
tokens = [token for token in _fas_regex.findall(fasta)]
|
|
99
|
+
return [FASTA_VOCAB_DICT[token] for token in tokens]
|
|
99
100
|
|
|
100
101
|
|
|
101
102
|
def split_selfies(selfies: str) -> List[str]:
|
|
@@ -107,7 +108,7 @@ def split_selfies(selfies: str) -> List[str]:
|
|
|
107
108
|
:return: SELFIES vocab
|
|
108
109
|
:rtype: list
|
|
109
110
|
"""
|
|
110
|
-
return [token for token in
|
|
111
|
+
return [token for token in _sel_regex.findall(selfies)]
|
|
111
112
|
|
|
112
113
|
|
|
113
114
|
def smiles2token(smiles: str) -> Tensor:
|
|
@@ -115,9 +116,9 @@ def smiles2token(smiles: str) -> Tensor:
|
|
|
115
116
|
return torch.tensor([1] + smiles2vec(smiles) + [2], dtype=torch.long)
|
|
116
117
|
|
|
117
118
|
|
|
118
|
-
def
|
|
119
|
+
def fasta2token(fasta: str) -> Tensor:
|
|
119
120
|
# start token: <start> = 1; end token: <end> = 2
|
|
120
|
-
return torch.tensor([1] +
|
|
121
|
+
return torch.tensor([1] + fasta2vec(fasta) + [2], dtype=torch.long)
|
|
121
122
|
|
|
122
123
|
|
|
123
124
|
def collate(batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
|
|
@@ -7,7 +7,7 @@ import csv
|
|
|
7
7
|
import random
|
|
8
8
|
import warnings
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import List, Dict, Tuple, Union, Optional
|
|
10
|
+
from typing import List, Dict, Tuple, Union, Optional, Literal
|
|
11
11
|
import torch
|
|
12
12
|
import numpy as np
|
|
13
13
|
from torch import cuda, Tensor, softmax
|
|
@@ -103,7 +103,7 @@ def test(
|
|
|
103
103
|
model: ChemBFN,
|
|
104
104
|
mlp: MLP,
|
|
105
105
|
data: DataLoader,
|
|
106
|
-
mode:
|
|
106
|
+
mode: Literal["regression", "classification"] = "regression",
|
|
107
107
|
device: Union[str, torch.device, None] = None,
|
|
108
108
|
) -> Dict[str, float]:
|
|
109
109
|
"""
|
|
@@ -184,7 +184,9 @@ def test(
|
|
|
184
184
|
|
|
185
185
|
|
|
186
186
|
def split_dataset(
|
|
187
|
-
file: Union[str, Path],
|
|
187
|
+
file: Union[str, Path],
|
|
188
|
+
split_ratio: List[int] = [8, 1, 1],
|
|
189
|
+
method: Literal["random", "scaffold"] = "random",
|
|
188
190
|
) -> None:
|
|
189
191
|
"""
|
|
190
192
|
Split a dataset.
|
|
@@ -526,7 +528,7 @@ class GeometryConverter:
|
|
|
526
528
|
def smiles2cartesian(
|
|
527
529
|
smiles: str,
|
|
528
530
|
num_conformers: int = 250,
|
|
529
|
-
rdkit_ff_type:
|
|
531
|
+
rdkit_ff_type: Literal["MMFF", "UFF"] = "MMFF",
|
|
530
532
|
refine_with_crest: bool = False,
|
|
531
533
|
spin: float = 0.0,
|
|
532
534
|
) -> Tuple[List[str], np.ndarray]:
|
{bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bayesianflow_for_chem
|
|
3
|
-
Version: 2.2.1
|
|
3
|
+
Version: 2.2.2
|
|
4
4
|
Summary: Bayesian flow network framework for Chemistry
|
|
5
5
|
Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
|
|
6
6
|
Author: Nianze A. Tao
|
|
@@ -96,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
|
|
|
96
96
|
|
|
97
97
|
## Dataset Handling
|
|
98
98
|
|
|
99
|
-
We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#
|
|
99
|
+
We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
|
|
100
100
|
|
|
101
101
|
1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
|
|
102
102
|
```python
|
|
File without changes
|
|
File without changes
|
{bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/_data/vocab.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/bayesianflow_for_chem/spectra.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.2}/test/test_molecular_embedding.py
RENAMED
|
File without changes
|