bayesianflow-for-chem 2.2.1.tar.gz → 2.2.3.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of bayesianflow-for-chem has been flagged by the registry as a potentially problematic release.

Files changed (25)
  1. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/PKG-INFO +2 -2
  2. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/__init__.py +1 -1
  3. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/cli.py +26 -8
  4. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/data.py +22 -21
  5. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/model.py +3 -0
  6. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/tool.py +6 -4
  7. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/PKG-INFO +2 -2
  8. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/LICENSE +0 -0
  9. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/README.md +0 -0
  10. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/_data/vocab.txt +0 -0
  11. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/scorer.py +0 -0
  12. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/spectra.py +0 -0
  13. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem/train.py +0 -0
  14. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/SOURCES.txt +0 -0
  15. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/dependency_links.txt +0 -0
  16. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/entry_points.txt +0 -0
  17. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/requires.txt +0 -0
  18. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/bayesianflow_for_chem.egg-info/top_level.txt +0 -0
  19. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/pyproject.toml +0 -0
  20. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/setup.cfg +0 -0
  21. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/setup.py +0 -0
  22. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_cli_plugin.py +0 -0
  23. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_jit_compatibility.py +0 -0
  24. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_merge_lora.py +0 -0
  25. {bayesianflow_for_chem-2.2.1 → bayesianflow_for_chem-2.2.3}/test/test_molecular_embedding.py +0 -0
--- bayesianflow_for_chem-2.2.1/PKG-INFO
+++ bayesianflow_for_chem-2.2.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bayesianflow_for_chem
-Version: 2.2.1
+Version: 2.2.3
 Summary: Bayesian flow network framework for Chemistry
 Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
 Author: Nianze A. Tao
@@ -96,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
 
 ## Dataset Handling
 
-We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L152) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
+We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
 
 1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
 ```python
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/__init__.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/__init__.py
@@ -16,7 +16,7 @@ __all__ = [
     "MLP",
     "EnsembleChemBFN",
 ]
-__version__ = "2.2.1"
+__version__ = "2.2.3"
 __author__ = "Nianze A. Tao (Omozawa Sueno)"
 
 
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/cli.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/cli.py
@@ -18,11 +18,11 @@ from bayesianflow_for_chem.scorer import smiles_valid, Scorer
 from bayesianflow_for_chem.data import (
     VOCAB_COUNT,
     VOCAB_KEYS,
-    AA_VOCAB_COUNT,
-    AA_VOCAB_KEYS,
+    FASTA_VOCAB_COUNT,
+    FASTA_VOCAB_KEYS,
     load_vocab,
     smiles2token,
-    aa2token,
+    fasta2token,
     split_selfies,
     collate,
     CSVData,
@@ -116,6 +116,23 @@ madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
 madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
 """
 
+_END_MESSAGE = r"""
+If you find this project helpful, please cite us:
+1. N. Tao, and M. Abe, J. Chem. Inf. Model., 2025, 65, 1178-1187.
+2. N. Tao, 2024, arXiv:2412.11439.
+"""
+
+_ERROR_MESSAGE = r"""
+Some who believe in inductive logic are anxious to point out, with
+Reichenbach, that 'the principle of induction is unreservedly accepted
+by the whole of science and that no man can seriously doubt this
+principle in everyday life either'. Yet even supposing this were the
+case—for after all, 'the whole of science' might err—I should still
+contend that a principle of induction is superfluous, and that it must
+lead to logical inconsistencies.
+                                                  -- Karl Popper --
+"""
+
 _ALLOWED_PLUGINS = [
     "collate_fn",
     "num_workers",
@@ -396,7 +413,7 @@ def main_script(version: str) -> None:
         print("Configuration check passed.")
         return
     if flag_critical != 0:
-        raise RuntimeError
+        raise RuntimeError(_ERROR_MESSAGE)
     print(_MESSAGE.format(version))
     # ####### build tokeniser #######
     tokeniser_config = runtime_config["tokeniser"]
@@ -406,9 +423,9 @@ def main_script(version: str) -> None:
     vocab_keys = VOCAB_KEYS
     tokeniser = smiles2token
     if tokeniser_name == "fasta":
-        num_vocab = AA_VOCAB_COUNT
-        vocab_keys = AA_VOCAB_KEYS
-        tokeniser = aa2token
+        num_vocab = FASTA_VOCAB_COUNT
+        vocab_keys = FASTA_VOCAB_KEYS
+        tokeniser = fasta2token
     if tokeniser_name == "selfies":
         vocab_data = load_vocab(tokeniser_config["vocab"])
         num_vocab = vocab_data["vocab_count"]
@@ -679,7 +696,8 @@ def main_script(version: str) -> None:
         with open(runtime_config["inference"]["result_file"], "w") as f:
             f.write("\n".join(mols))
     # ------- finished -------
-    print(" ####### job finished #######")
+    print("*" * 25 + " job finished " + "*" * 25)
+    print(_END_MESSAGE)
 
 
 if __name__ == "__main__":
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/data.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/data.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Author: Nianze A. TAO (Omozawa SUENO)
 """
-Tokenise SMILES/SAFE/SELFIES/protein-sequence strings.
+Tokenise SMILES/SAFE/SELFIES/FASTA strings.
 """
 import os
 import re
@@ -14,7 +14,7 @@ from torch.utils.data import Dataset
 
 __filedir__ = Path(__file__).parent
 
-SMI_REGEX_PATTERN = (
+_SMI_REGEX_PATTERN = (
     r"(\[|\]|H[e,f,g,s,o]?|"
     r"L[i,v,a,r,u]|"
     r"B[e,r,a,i,h,k]?|"
@@ -31,11 +31,11 @@ SMI_REGEX_PATTERN = (
     r"\(|\)|\.|=|#|-|\+|\\|\/|:|"
     r"~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
 )
-SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
-AA_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y|Z|-|.)"
-smi_regex = re.compile(SMI_REGEX_PATTERN)
-sel_regex = re.compile(SEL_REGEX_PATTERN)
-aa_regex = re.compile(AA_REGEX_PATTERN)
+_SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
+_FAS_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|-|\*|\.)"
+_smi_regex = re.compile(_SMI_REGEX_PATTERN)
+_sel_regex = re.compile(_SEL_REGEX_PATTERN)
+_fas_regex = re.compile(_FAS_REGEX_PATTERN)
 
 
 def load_vocab(
@@ -65,11 +65,12 @@ _DEFUALT_VOCAB = load_vocab(__filedir__ / "_data/vocab.txt")
 VOCAB_KEYS: List[str] = _DEFUALT_VOCAB["vocab_keys"]
 VOCAB_DICT: Dict[str, int] = _DEFUALT_VOCAB["vocab_dict"]
 VOCAB_COUNT: int = _DEFUALT_VOCAB["vocab_count"]
-AA_VOCAB_KEYS = (
-    VOCAB_KEYS[0:3] + "A B C D E F G H I K L M N P Q R S T V W Y Z - .".split()
+FASTA_VOCAB_KEYS = (
+    VOCAB_KEYS[0:3]
+    + "A B C D E F G H I K L M N P Q R S T V W Y Z - . J O U X *".split()
 )
-AA_VOCAB_COUNT = len(AA_VOCAB_KEYS)
-AA_VOCAB_DICT = dict(zip(AA_VOCAB_KEYS, range(AA_VOCAB_COUNT)))
+FASTA_VOCAB_COUNT = len(FASTA_VOCAB_KEYS)
+FASTA_VOCAB_DICT = dict(zip(FASTA_VOCAB_KEYS, range(FASTA_VOCAB_COUNT)))
 
 
 def smiles2vec(smiles: str) -> List[int]:
@@ -81,21 +82,21 @@ def smiles2vec(smiles: str) -> List[int]:
     :return: tokens w/o `<start>` and `<end>`
     :rtype: list
     """
-    tokens = [token for token in smi_regex.findall(smiles)]
+    tokens = [token for token in _smi_regex.findall(smiles)]
     return [VOCAB_DICT[token] for token in tokens]
 
 
-def aa2vec(aa_seq: str) -> List[int]:
+def fasta2vec(fasta: str) -> List[int]:
     """
-    Protein sequence tokenisation using a dataset-independent regex pattern.
+    FASTA sequence tokenisation using a dataset-independent regex pattern.
 
-    :param aa_seq: protein (amino acid) sequence
-    :type aa_seq: str
+    :param fasta: protein (amino acid) sequence
+    :type fasta: str
     :return: tokens w/o `<start>` and `<end>`
     :rtype: list
    """
-    tokens = [token for token in aa_regex.findall(aa_seq)]
-    return [AA_VOCAB_DICT[token] for token in tokens]
+    tokens = [token for token in _fas_regex.findall(fasta)]
+    return [FASTA_VOCAB_DICT[token] for token in tokens]
 
 
 def split_selfies(selfies: str) -> List[str]:
@@ -107,7 +108,7 @@ def split_selfies(selfies: str) -> List[str]:
     :return: SELFIES vocab
     :rtype: list
     """
-    return [token for token in sel_regex.findall(selfies)]
+    return [token for token in _sel_regex.findall(selfies)]
 
 
 def smiles2token(smiles: str) -> Tensor:
@@ -115,9 +116,9 @@ def smiles2token(smiles: str) -> Tensor:
     return torch.tensor([1] + smiles2vec(smiles) + [2], dtype=torch.long)
 
 
-def aa2token(aa_seq: str) -> Tensor:
+def fasta2token(fasta: str) -> Tensor:
     # start token: <start> = 1; end token: <end> = 2
-    return torch.tensor([1] + aa2vec(aa_seq) + [2], dtype=torch.long)
+    return torch.tensor([1] + fasta2vec(fasta) + [2], dtype=torch.long)
 
 
 def collate(batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
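Two things about the FASTA rename are worth noting. First, the new `_FAS_REGEX_PATTERN` adds the ambiguous residue codes J/O/U/X and the stop symbol `*`, and escapes `\*` and `\.`, where the old `AA_REGEX_PATTERN` ended in an unescaped `.` that matched any character. Second, the vocabulary grows from 27 to 32 entries, so any dimension sized with the old `AA_VOCAB_COUNT` changes under `FASTA_VOCAB_COUNT`. The following is a minimal sketch reconstructing the new tokenisation path from the hunks above; the names used for `VOCAB_KEYS[0:3]` are assumptions, since the source only confirms `<start> = 1` and `<end> = 2`.

```python
import re
import torch

# Reconstructed from the data.py hunks above. The special-token names are
# placeholders (assumed); the diff only confirms <start> = 1 and <end> = 2.
_FAS_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|-|\*|\.)"
_fas_regex = re.compile(_FAS_REGEX_PATTERN)
_SPECIAL = ["<pad>", "<start>", "<end>"]  # assumed stand-in for VOCAB_KEYS[0:3]
FASTA_VOCAB_KEYS = (
    _SPECIAL + "A B C D E F G H I K L M N P Q R S T V W Y Z - . J O U X *".split()
)
FASTA_VOCAB_DICT = dict(zip(FASTA_VOCAB_KEYS, range(len(FASTA_VOCAB_KEYS))))


def fasta2token(fasta: str) -> torch.Tensor:
    # One single-character token per regex match, wrapped in <start>/<end>.
    tokens = _fas_regex.findall(fasta)
    return torch.tensor(
        [1] + [FASTA_VOCAB_DICT[t] for t in tokens] + [2], dtype=torch.long
    )


# 2.2.3 accepts ambiguous residues (J/O/U/X) and the stop symbol '*', which
# the 2.2.1 AA_* vocabulary would have rejected with a KeyError:
print(fasta2token("MKTXU*"))  # tensor([ 1, 14, 12, 20, 30, 29, 31,  2])
```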
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/model.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/model.py
@@ -1171,6 +1171,9 @@ class EnsembleChemBFN(ChemBFN):
         assert (
             isinstance(c, dict) is self._label_is_dict
         ), f"`c` should be a {'`dict` instance' if self._label_is_dict else '`list` instance'} but got {type(c)} instand."
+        assert len(c) == len(
+            self.models
+        ), f"Number of conditions should match the number of LoRA models. We have {len(self.models)} LoRA models but {len(c)} conditions were provided."
         out: Dict[str, Tensor] = {}
         if isinstance(c, list):
             c = dict(zip([f"val_{i}" for i in range(len(c))], c))
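The added assertion closes a quiet failure mode: in 2.2.1 a conditions list shorter than the LoRA model list slipped past this point and only surfaced, or silently misbehaved, downstream. Below is a standalone toy sketch of the new guard; the class and method names are assumptions for illustration, not the real `EnsembleChemBFN` API.

```python
# Toy illustration of the 2.2.3 guard; names and structure are assumptions,
# not the real EnsembleChemBFN implementation.
class ToyEnsemble:
    def __init__(self, models):
        self.models = models

    def build_conditions(self, c):
        # New in 2.2.3: fail fast when conditions and LoRA models mismatch.
        assert len(c) == len(self.models), (
            f"We have {len(self.models)} LoRA models "
            f"but {len(c)} conditions were provided."
        )
        return dict(zip([f"val_{i}" for i in range(len(c))], c))


ens = ToyEnsemble(models=["lora_a", "lora_b"])
print(ens.build_conditions(["c0", "c1"]))  # {'val_0': 'c0', 'val_1': 'c1'}
# ens.build_conditions(["c0"])  # AssertionError under the new check;
#                               # 2.2.1 accepted the short list and failed later
```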
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/tool.py
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem/tool.py
@@ -7,7 +7,7 @@ import csv
 import random
 import warnings
 from pathlib import Path
-from typing import List, Dict, Tuple, Union, Optional
+from typing import List, Dict, Tuple, Union, Optional, Literal
 import torch
 import numpy as np
 from torch import cuda, Tensor, softmax
@@ -103,7 +103,7 @@ def test(
     model: ChemBFN,
     mlp: MLP,
     data: DataLoader,
-    mode: str = "regression",
+    mode: Literal["regression", "classification"] = "regression",
     device: Union[str, torch.device, None] = None,
 ) -> Dict[str, float]:
     """
@@ -184,7 +184,9 @@ def test(
 
 
 def split_dataset(
-    file: Union[str, Path], split_ratio: List[int] = [8, 1, 1], method: str = "random"
+    file: Union[str, Path],
+    split_ratio: List[int] = [8, 1, 1],
+    method: Literal["random", "scaffold"] = "random",
 ) -> None:
     """
     Split a dataset.
@@ -526,7 +528,7 @@ class GeometryConverter:
     def smiles2cartesian(
         smiles: str,
         num_conformers: int = 250,
-        rdkit_ff_type: str = "MMFF",
+        rdkit_ff_type: Literal["MMFF", "UFF"] = "MMFF",
         refine_with_crest: bool = False,
         spin: float = 0.0,
     ) -> Tuple[List[str], np.ndarray]:
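The `Literal` annotations in `tool.py` change nothing at runtime; they document the accepted values and let a static checker such as mypy reject misspelled mode strings at analysis time. A minimal sketch of the effect, independent of the package:

```python
from typing import Literal


def run_test(mode: Literal["regression", "classification"] = "regression") -> str:
    # Runtime does not enforce Literal; it is a contract for static checkers.
    return mode


run_test("classification")  # OK
run_test("clasification")   # runs fine at runtime, but mypy reports an
                            # incompatible-argument error for the bad literal
```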
--- bayesianflow_for_chem-2.2.1/bayesianflow_for_chem.egg-info/PKG-INFO
+++ bayesianflow_for_chem-2.2.3/bayesianflow_for_chem.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bayesianflow_for_chem
-Version: 2.2.1
+Version: 2.2.3
 Summary: Bayesian flow network framework for Chemistry
 Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
 Author: Nianze A. Tao
@@ -96,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
 
 ## Dataset Handling
 
-We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L152) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
+We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
 
 1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
 ```python