bayesianflow-for-chem 2.2.1.tar.gz → 2.2.3.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of bayesianflow-for-chem has been flagged by the registry as a potentially problematic release.

Files changed (25)
  1. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/PKG-INFO +2 -2
  2. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/__init__.py +1 -1
  3. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/cli.py +26 -8
  4. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/data.py +22 -21
  5. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/model.py +3 -0
  6. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/tool.py +6 -4
  7. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/PKG-INFO +2 -2
  8. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/LICENSE +0 -0
  9. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/README.md +0 -0
  10. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/_data/vocab.txt +0 -0
  11. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/scorer.py +0 -0
  12. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/spectra.py +0 -0
  13. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/train.py +0 -0
  14. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/SOURCES.txt +0 -0
  15. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/dependency_links.txt +0 -0
  16. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/entry_points.txt +0 -0
  17. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/requires.txt +0 -0
  18. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/top_level.txt +0 -0
  19. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/pyproject.toml +0 -0
  20. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/setup.cfg +0 -0
  21. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/setup.py +0 -0
  22. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_cli_plugin.py +0 -0
  23. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_jit_compatibility.py +0 -0
  24. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_merge_lora.py +0 -0
  25. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_molecular_embedding.py +0 -0
--- bayesianflow_for_chem-2.2.1/PKG-INFO
+++ bayesianflow_for_chem-2.2.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bayesianflow_for_chem
-Version: 2.2.1
+Version: 2.2.3
 Summary: Bayesian flow network framework for Chemistry
 Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
 Author: Nianze A. Tao
@@ -96,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
 
 ## Dataset Handling
 
-We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L152) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
+We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
 
 1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
 ```python
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/__init__.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/__init__.py
@@ -16,7 +16,7 @@ __all__ = [
     "MLP",
     "EnsembleChemBFN",
 ]
-__version__ = "2.2.1"
+__version__ = "2.2.3"
 __author__ = "Nianze A. Tao (Omozawa Sueno)"
 
 
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/cli.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/cli.py
@@ -18,11 +18,11 @@ from bayesianflow_for_chem.scorer import smiles_valid, Scorer
 from bayesianflow_for_chem.data import (
     VOCAB_COUNT,
     VOCAB_KEYS,
-    AA_VOCAB_COUNT,
-    AA_VOCAB_KEYS,
+    FASTA_VOCAB_COUNT,
+    FASTA_VOCAB_KEYS,
     load_vocab,
     smiles2token,
-    aa2token,
+    fasta2token,
     split_selfies,
     collate,
     CSVData,
@@ -116,6 +116,23 @@ madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
 madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
 """
 
+_END_MESSAGE = r"""
+If you find this project helpful, please cite us:
+1. N. Tao, and M. Abe, J. Chem. Inf. Model., 2025, 65, 1178-1187.
+2. N. Tao, 2024, arXiv:2412.11439.
+"""
+
+_ERROR_MESSAGE = r"""
+Some who believe in inductive logic are anxious to point out, with
+Reichenbach, that 'the principle of induction is unreservedly accepted
+by the whole of science and that no man can seriously doubt this
+principle in everyday life either'. Yet even supposing this were the
+case—for after all, 'the whole of science' might err—I should still
+contend that a principle of induction is superfluous, and that it must
+lead to logical inconsistencies.
+                                                  -- Karl Popper --
+"""
+
 _ALLOWED_PLUGINS = [
     "collate_fn",
     "num_workers",
@@ -396,7 +413,7 @@ def main_script(version: str) -> None:
         print("Configuration check passed.")
         return
     if flag_critical != 0:
-        raise RuntimeError
+        raise RuntimeError(_ERROR_MESSAGE)
     print(_MESSAGE.format(version))
     # ####### build tokeniser #######
     tokeniser_config = runtime_config["tokeniser"]
@@ -406,9 +423,9 @@ def main_script(version: str) -> None:
     vocab_keys = VOCAB_KEYS
     tokeniser = smiles2token
     if tokeniser_name == "fasta":
-        num_vocab = AA_VOCAB_COUNT
-        vocab_keys = AA_VOCAB_KEYS
-        tokeniser = aa2token
+        num_vocab = FASTA_VOCAB_COUNT
+        vocab_keys = FASTA_VOCAB_KEYS
+        tokeniser = fasta2token
     if tokeniser_name == "selfies":
         vocab_data = load_vocab(tokeniser_config["vocab"])
         num_vocab = vocab_data["vocab_count"]
@@ -679,7 +696,8 @@ def main_script(version: str) -> None:
         with open(runtime_config["inference"]["result_file"], "w") as f:
             f.write("\n".join(mols))
     # ------- finished -------
-    print(" ####### job finished #######")
+    print("*" * 25 + " job finished " + "*" * 25)
+    print(_END_MESSAGE)
 
 
 if __name__ == "__main__":
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/data.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/data.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Author: Nianze A. TAO (Omozawa SUENO)
 """
-Tokenise SMILES/SAFE/SELFIES/protein-sequence strings.
+Tokenise SMILES/SAFE/SELFIES/FASTA strings.
 """
 import os
 import re
@@ -14,7 +14,7 @@ from torch.utils.data import Dataset
 
 __filedir__ = Path(__file__).parent
 
-SMI_REGEX_PATTERN = (
+_SMI_REGEX_PATTERN = (
     r"(\[|\]|H[e,f,g,s,o]?|"
     r"L[i,v,a,r,u]|"
     r"B[e,r,a,i,h,k]?|"
@@ -31,11 +31,11 @@ SMI_REGEX_PATTERN = (
     r"\(|\)|\.|=|#|-|\+|\\|\/|:|"
     r"~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
 )
-SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
-AA_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y|Z|-|.)"
-smi_regex = re.compile(SMI_REGEX_PATTERN)
-sel_regex = re.compile(SEL_REGEX_PATTERN)
-aa_regex = re.compile(AA_REGEX_PATTERN)
+_SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
+_FAS_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|-|\*|\.)"
+_smi_regex = re.compile(_SMI_REGEX_PATTERN)
+_sel_regex = re.compile(_SEL_REGEX_PATTERN)
+_fas_regex = re.compile(_FAS_REGEX_PATTERN)
 
 
 def load_vocab(
@@ -65,11 +65,12 @@ _DEFUALT_VOCAB = load_vocab(__filedir__ / "_data/vocab.txt")
 VOCAB_KEYS: List[str] = _DEFUALT_VOCAB["vocab_keys"]
 VOCAB_DICT: Dict[str, int] = _DEFUALT_VOCAB["vocab_dict"]
 VOCAB_COUNT: int = _DEFUALT_VOCAB["vocab_count"]
-AA_VOCAB_KEYS = (
-    VOCAB_KEYS[0:3] + "A B C D E F G H I K L M N P Q R S T V W Y Z - .".split()
+FASTA_VOCAB_KEYS = (
+    VOCAB_KEYS[0:3]
+    + "A B C D E F G H I K L M N P Q R S T V W Y Z - . J O U X *".split()
 )
-AA_VOCAB_COUNT = len(AA_VOCAB_KEYS)
-AA_VOCAB_DICT = dict(zip(AA_VOCAB_KEYS, range(AA_VOCAB_COUNT)))
+FASTA_VOCAB_COUNT = len(FASTA_VOCAB_KEYS)
+FASTA_VOCAB_DICT = dict(zip(FASTA_VOCAB_KEYS, range(FASTA_VOCAB_COUNT)))
 
 
 def smiles2vec(smiles: str) -> List[int]:
@@ -81,21 +82,21 @@ def smiles2vec(smiles: str) -> List[int]:
     :return: tokens w/o `<start>` and `<end>`
     :rtype: list
     """
-    tokens = [token for token in smi_regex.findall(smiles)]
+    tokens = [token for token in _smi_regex.findall(smiles)]
     return [VOCAB_DICT[token] for token in tokens]
 
 
-def aa2vec(aa_seq: str) -> List[int]:
+def fasta2vec(fasta: str) -> List[int]:
     """
-    Protein sequence tokenisation using a dataset-independent regex pattern.
+    FASTA sequence tokenisation using a dataset-independent regex pattern.
 
-    :param aa_seq: protein (amino acid) sequence
-    :type aa_seq: str
+    :param fasta: protein (amino acid) sequence
+    :type fasta: str
     :return: tokens w/o `<start>` and `<end>`
     :rtype: list
    """
-    tokens = [token for token in aa_regex.findall(aa_seq)]
-    return [AA_VOCAB_DICT[token] for token in tokens]
+    tokens = [token for token in _fas_regex.findall(fasta)]
+    return [FASTA_VOCAB_DICT[token] for token in tokens]
 
 
 def split_selfies(selfies: str) -> List[str]:
@@ -107,7 +108,7 @@ def split_selfies(selfies: str) -> List[str]:
     :return: SELFIES vocab
     :rtype: list
     """
-    return [token for token in sel_regex.findall(selfies)]
+    return [token for token in _sel_regex.findall(selfies)]
 
 
 def smiles2token(smiles: str) -> Tensor:
@@ -115,9 +116,9 @@ def smiles2token(smiles: str) -> Tensor:
     return torch.tensor([1] + smiles2vec(smiles) + [2], dtype=torch.long)
 
 
-def aa2token(aa_seq: str) -> Tensor:
+def fasta2token(fasta: str) -> Tensor:
     # start token: <start> = 1; end token: <end> = 2
-    return torch.tensor([1] + aa2vec(aa_seq) + [2], dtype=torch.long)
+    return torch.tensor([1] + fasta2vec(fasta) + [2], dtype=torch.long)
 
 
 def collate(batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
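Two things about the FASTA rename are worth noting. First, the new `_FAS_REGEX_PATTERN` adds the ambiguous residue codes J/O/U/X and the stop symbol `*`, and escapes `\*` and `\.`, where the old `AA_REGEX_PATTERN` ended in an unescaped `.` that matched any character. Second, the vocabulary grows from 27 to 32 entries, so any dimension sized with the old `AA_VOCAB_COUNT` changes under `FASTA_VOCAB_COUNT`. The following is a minimal sketch reconstructing the new tokenisation path from the hunks above; the names used for `VOCAB_KEYS[0:3]` are assumptions, since the source only confirms `<start> = 1` and `<end> = 2`.

```python
import re
import torch

# Reconstructed from the data.py hunks above. The special-token names are
# placeholders (assumed); the diff only confirms <start> = 1 and <end> = 2.
_FAS_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|-|\*|\.)"
_fas_regex = re.compile(_FAS_REGEX_PATTERN)
_SPECIAL = ["<pad>", "<start>", "<end>"]  # assumed stand-in for VOCAB_KEYS[0:3]
FASTA_VOCAB_KEYS = (
    _SPECIAL + "A B C D E F G H I K L M N P Q R S T V W Y Z - . J O U X *".split()
)
FASTA_VOCAB_DICT = dict(zip(FASTA_VOCAB_KEYS, range(len(FASTA_VOCAB_KEYS))))


def fasta2token(fasta: str) -> torch.Tensor:
    # One single-character token per regex match, wrapped in <start>/<end>.
    tokens = _fas_regex.findall(fasta)
    return torch.tensor(
        [1] + [FASTA_VOCAB_DICT[t] for t in tokens] + [2], dtype=torch.long
    )


# 2.2.3 accepts ambiguous residues (J/O/U/X) and the stop symbol '*', which
# the 2.2.1 AA_* vocabulary would have rejected with a KeyError:
print(fasta2token("MKTXU*"))  # tensor([ 1, 14, 12, 20, 30, 29, 31,  2])
```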
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/model.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/model.py
@@ -1171,6 +1171,9 @@ class EnsembleChemBFN(ChemBFN):
         assert (
             isinstance(c, dict) is self._label_is_dict
         ), f"`c` should be a {'`dict` instance' if self._label_is_dict else '`list` instance'} but got {type(c)} instand."
+        assert len(c) == len(
+            self.models
+        ), f"Number of conditions should match the number of LoRA models. We have {len(self.models)} LoRA models but {len(c)} conditions were provided."
         out: Dict[str, Tensor] = {}
         if isinstance(c, list):
             c = dict(zip([f"val_{i}" for i in range(len(c))], c))
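The added assertion closes a quiet failure mode: in 2.2.1 a conditions list shorter than the LoRA model list slipped past this point and only surfaced, or silently misbehaved, downstream. Below is a standalone toy sketch of the new guard; the class and method names are assumptions for illustration, not the real `EnsembleChemBFN` API.

```python
# Toy illustration of the 2.2.3 guard; names and structure are assumptions,
# not the real EnsembleChemBFN implementation.
class ToyEnsemble:
    def __init__(self, models):
        self.models = models

    def build_conditions(self, c):
        # New in 2.2.3: fail fast when conditions and LoRA models mismatch.
        assert len(c) == len(self.models), (
            f"We have {len(self.models)} LoRA models "
            f"but {len(c)} conditions were provided."
        )
        return dict(zip([f"val_{i}" for i in range(len(c))], c))


ens = ToyEnsemble(models=["lora_a", "lora_b"])
print(ens.build_conditions(["c0", "c1"]))  # {'val_0': 'c0', 'val_1': 'c1'}
# ens.build_conditions(["c0"])  # AssertionError under the new check;
#                               # 2.2.1 accepted the short list and failed later
```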
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/tool.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/tool.py
@@ -7,7 +7,7 @@ import csv
 import random
 import warnings
 from pathlib import Path
-from typing import List, Dict, Tuple, Union, Optional
+from typing import List, Dict, Tuple, Union, Optional, Literal
 import torch
 import numpy as np
 from torch import cuda, Tensor, softmax
@@ -103,7 +103,7 @@ def test(
     model: ChemBFN,
     mlp: MLP,
     data: DataLoader,
-    mode: str = "regression",
+    mode: Literal["regression", "classification"] = "regression",
     device: Union[str, torch.device, None] = None,
 ) -> Dict[str, float]:
     """
@@ -184,7 +184,9 @@ def test(
 
 
 def split_dataset(
-    file: Union[str, Path], split_ratio: List[int] = [8, 1, 1], method: str = "random"
+    file: Union[str, Path],
+    split_ratio: List[int] = [8, 1, 1],
+    method: Literal["random", "scaffold"] = "random",
 ) -> None:
     """
     Split a dataset.
@@ -526,7 +528,7 @@ class GeometryConverter:
     def smiles2cartesian(
         smiles: str,
         num_conformers: int = 250,
-        rdkit_ff_type: str = "MMFF",
+        rdkit_ff_type: Literal["MMFF", "UFF"] = "MMFF",
         refine_with_crest: bool = False,
         spin: float = 0.0,
     ) -> Tuple[List[str], np.ndarray]:
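The `Literal` annotations in `tool.py` change nothing at runtime; they document the accepted values and let a static checker such as mypy reject misspelled mode strings at analysis time. A minimal sketch of the effect, independent of the package:

```python
from typing import Literal


def run_test(mode: Literal["regression", "classification"] = "regression") -> str:
    # Runtime does not enforce Literal; it is a contract for static checkers.
    return mode


run_test("classification")  # OK
run_test("clasification")   # runs fine at runtime, but mypy reports an
                            # incompatible-argument error for the bad literal
```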
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem.egg-info/PKG-INFO
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bayesianflow_for_chem
-Version: 2.2.1
+Version: 2.2.3
 Summary: Bayesian flow network framework for Chemistry
 Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
 Author: Nianze A. Tao
@@ -96,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
 
 ## Dataset Handling
 
-We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L152) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
+We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
 
 1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
 ```python