bayesianflow-for-chem 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.

This release of bayesianflow-for-chem has been flagged as potentially problematic.

@@ -3,10 +3,8 @@
3
3
  """
4
4
  ChemBFN package.
5
5
  """
6
- import colorama
7
6
  from . import data, tool, train, scorer, spectra
8
7
  from .model import ChemBFN, MLP, EnsembleChemBFN
9
- from .cli import main_script
10
8
 
11
9
  __all__ = [
12
10
  "data",
@@ -18,7 +16,7 @@ __all__ = [
18
16
  "MLP",
19
17
  "EnsembleChemBFN",
20
18
  ]
21
- __version__ = "2.1.0"
19
+ __version__ = "2.2.2"
22
20
  __author__ = "Nianze A. Tao (Omozawa Sueno)"
23
21
 
24
22
 
@@ -29,6 +27,9 @@ def main() -> None:
29
27
  :return:
30
28
  :rtype: None
31
29
  """
30
+ import colorama
31
+ from bayesianflow_for_chem.cli import main_script
32
+
32
33
  colorama.just_fix_windows_console()
33
34
  main_script(__version__)
34
35
  colorama.deinit()
@@ -12,22 +12,17 @@ from pathlib import Path
12
12
  from functools import partial
13
13
  from typing import List, Tuple, Dict, Union, Callable
14
14
  import torch
15
- import lightning as L
16
15
  from rdkit.Chem import MolFromSmiles, CanonSmiles
17
- from torch.utils.data import DataLoader
18
- from lightning.pytorch import loggers
19
- from lightning.pytorch.callbacks import ModelCheckpoint
20
16
  from bayesianflow_for_chem import ChemBFN, MLP
21
- from bayesianflow_for_chem.train import Model
22
17
  from bayesianflow_for_chem.scorer import smiles_valid, Scorer
23
18
  from bayesianflow_for_chem.data import (
24
19
  VOCAB_COUNT,
25
20
  VOCAB_KEYS,
26
- AA_VOCAB_COUNT,
27
- AA_VOCAB_KEYS,
21
+ FASTA_VOCAB_COUNT,
22
+ FASTA_VOCAB_KEYS,
28
23
  load_vocab,
29
24
  smiles2token,
30
- aa2token,
25
+ fasta2token,
31
26
  split_selfies,
32
27
  collate,
33
28
  CSVData,
@@ -90,6 +85,7 @@ checkpoint_save_path = "home/user/project/ckpt"
90
85
  train_strategy = "auto" # or any strategy supported by Lightning, e.g., "ddp"
91
86
  accumulate_grad_batches = 1
92
87
  enable_progress_bar = false
88
+ plugin_script = "" # define customised behaviours of dataset, datasetloader, etc in a python script
93
89
 
94
90
  # Remove this table if inference is unnecessary
95
91
  [inference]
@@ -120,6 +116,49 @@ madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
120
116
  madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
121
117
  """
122
118
 
119
+ _END_MESSAGE = r"""
120
+ If you find this project helpful, please cite us:
121
+ 1. N. Tao, and M. Abe, J. Chem. Inf. Model., 2025, 65, 1178-1187.
122
+ 2. N. Tao, 2024, arXiv:2412.11439.
123
+ """
124
+
125
+ _ERROR_MESSAGE = r"""
126
+ Some who believe in inductive logic are anxious to point out, with
127
+ Reichenbach, that 'the principle of induction is unreservedly accepted
128
+ by the whole of science and that no man can seriously doubt this
129
+ principle in everyday life either'. Yet even supposing this were the
130
+ case—for after all, 'the whole of science' might err—I should still
131
+ contend that a principle of induction is superfluous, and that it must
132
+ lead to logical inconsistencies.
133
+ -- Karl Popper --
134
+ """
135
+
136
+ _ALLOWED_PLUGINS = [
137
+ "collate_fn",
138
+ "num_workers",
139
+ "max_sequence_length",
140
+ "shuffle",
141
+ "CustomData",
142
+ ]
143
+
144
+
145
+ def _load_plugin(plugin_file: str) -> Dict[str, Union[int, Callable, object, None]]:
146
+ if not plugin_file:
147
+ return {n: None for n in _ALLOWED_PLUGINS}
148
+ from importlib import util as iutil
149
+
150
+ spec = iutil.spec_from_file_location(Path(plugin_file).stem, plugin_file)
151
+ plugins = iutil.module_from_spec(spec)
152
+ spec.loader.exec_module(plugins)
153
+ plugin_names: List[str] = plugins.__all__
154
+ plugin_dict = {}
155
+ for n in _ALLOWED_PLUGINS:
156
+ if n in plugin_names:
157
+ plugin_dict[n] = getattr(plugins, n)
158
+ else:
159
+ plugin_dict[n] = None
160
+ return plugin_dict
161
+
123
162
 
124
163
  def parse_cli(version: str) -> argparse.Namespace:
125
164
  """
@@ -267,6 +306,14 @@ def load_runtime_config(
267
306
  f"\033[0;31mCritical\033[0;0m in {config_file}: Restart checkpoint file {ckpt_file} does not exist."
268
307
  )
269
308
  flag_critical += 1
309
+ # ↓ added in v2.2.0; needs to stay compatible with old versions.
310
+ plugin_script: str = config["train"].get("plugin_script", "")
311
+ if plugin_script:
312
+ if not os.path.exists(plugin_script):
313
+ print(
314
+ f"\033[0;31mCritical\033[0;0m in {config_file}: Plugin script {plugin_script} does not exist."
315
+ )
316
+ flag_critical += 1
270
317
  if "inference" in config:
271
318
  if not "train" in config:
272
319
  if not isinstance(config["inference"]["sequence_length"], int):
@@ -335,14 +382,15 @@ def main_script(version: str) -> None:
335
382
  )
336
383
  flag_warning += 1
337
384
  if not os.path.exists(runtime_config["train"]["checkpoint_save_path"]):
338
- os.makedirs(runtime_config["train"]["checkpoint_save_path"])
385
+ if not parser.dryrun: # only create it in real tasks
386
+ os.makedirs(runtime_config["train"]["checkpoint_save_path"])
339
387
  else:
340
388
  if not model_config["ChemBFN"]["base_model"]:
341
389
  print(
342
390
  f"\033[0;33mWarning\033[0;0m in {parser.model_config}: You should load a pretrained ChemBFN model."
343
391
  )
344
392
  flag_warning += 1
345
- if not model_config["MLP"]["base_model"]:
393
+ if "MLP" in model_config and not model_config["MLP"]["base_model"]:
346
394
  print(
347
395
  f"\033[0;33mWarning\033[0;0m in {parser.model_config}: You should load a pretrained MLP."
348
396
  )
@@ -350,18 +398,22 @@ def main_script(version: str) -> None:
350
398
  if "inference" in runtime_config:
351
399
  if runtime_config["inference"]["guidance_objective"]:
352
400
  if not "MLP" in model_config:
353
- print(f"Warning in {parser.model_config}: Oh no, you don't have a MLP.")
401
+ print(
402
+ f"\033[0;33mWarning\033[0;0m in {parser.model_config}: Oh no, you don't have a MLP."
403
+ )
354
404
  flag_warning += 1
355
405
  if parser.dryrun:
356
406
  if flag_critical != 0:
357
407
  print("Configuration check failed!")
358
408
  elif flag_warning != 0:
359
- print("Your job will probably run, but it may not follow your expectation.")
409
+ print(
410
+ "Your job will probably run, but it may not follow your expectations."
411
+ )
360
412
  else:
361
413
  print("Configuration check passed.")
362
414
  return
363
415
  if flag_critical != 0:
364
- raise RuntimeError
416
+ raise RuntimeError(_ERROR_MESSAGE)
365
417
  print(_MESSAGE.format(version))
366
418
  # ####### build tokeniser #######
367
419
  tokeniser_config = runtime_config["tokeniser"]
@@ -371,9 +423,9 @@ def main_script(version: str) -> None:
371
423
  vocab_keys = VOCAB_KEYS
372
424
  tokeniser = smiles2token
373
425
  if tokeniser_name == "fasta":
374
- num_vocab = AA_VOCAB_COUNT
375
- vocab_keys = AA_VOCAB_KEYS
376
- tokeniser = aa2token
426
+ num_vocab = FASTA_VOCAB_COUNT
427
+ vocab_keys = FASTA_VOCAB_KEYS
428
+ tokeniser = fasta2token
377
429
  if tokeniser_name == "selfies":
378
430
  vocab_data = load_vocab(tokeniser_config["vocab"])
379
431
  num_vocab = vocab_data["vocab_count"]
@@ -415,6 +467,15 @@ def main_script(version: str) -> None:
415
467
  mlp = None
416
468
  # ------- train -------
417
469
  if "train" in runtime_config:
470
+ import lightning as L
471
+ from torch.utils.data import DataLoader
472
+ from lightning.pytorch import loggers
473
+ from lightning.pytorch.callbacks import ModelCheckpoint
474
+ from bayesianflow_for_chem.train import Model
475
+
476
+ # ####### get plugins #######
477
+ plugin_file = runtime_config["train"].get("plugin_script", "")
478
+ plugins = _load_plugin(plugin_file)
418
479
  # ####### build scorer #######
419
480
  if (tokeniser_name == "smiles" or tokeniser_name == "safe") and runtime_config[
420
481
  "train"
@@ -428,30 +489,43 @@ def main_script(version: str) -> None:
428
489
  mol_tag = runtime_config["train"]["molecule_tag"]
429
490
  obj_tag = runtime_config["train"]["objective_tag"]
430
491
  dataset_file = runtime_config["train"]["dataset"]
431
- with open(dataset_file, "r") as db:
432
- _data = db.readlines()
433
- header = _data[0]
434
- mol_idx = []
435
- for i, tag in enumerate(header.replace("\n", "").split(",")):
436
- if tag == mol_tag:
437
- mol_idx.append(i)
438
- _data_len = []
439
- for i in _data[1:]:
440
- i = i.replace("\n", "").split(",")
441
- _mol = ".".join([i[j] for j in mol_idx])
442
- _data_len.append(tokeniser(_mol).shape[-1])
443
- lmax = max(_data_len)
444
- dataset = CSVData(dataset_file)
492
+ if plugins["max_sequence_length"]:
493
+ lmax = plugins["max_sequence_length"]
494
+ else:
495
+ with open(dataset_file, "r") as db:
496
+ _data = db.readlines()
497
+ _header = _data[0]
498
+ _mol_idx = []
499
+ for i, tag in enumerate(_header.replace("\n", "").split(",")):
500
+ if tag == mol_tag:
501
+ _mol_idx.append(i)
502
+ _data_len = []
503
+ for i in _data[1:]:
504
+ i = i.replace("\n", "").split(",")
505
+ _mol = ".".join([i[j] for j in _mol_idx])
506
+ _data_len.append(tokeniser(_mol).shape[-1])
507
+ lmax = max(_data_len)
508
+ del _data, _data_len, _header, _mol_idx # clear memory
509
+ if plugins["CustomData"] is not None:
510
+ dataset = plugins["CustomData"](dataset_file)
511
+ else:
512
+ dataset = CSVData(dataset_file)
445
513
  dataset.map(
446
514
  partial(_encode, mol_tag=mol_tag, obj_tag=obj_tag, tokeniser=tokeniser)
447
515
  )
448
516
  dataloader = DataLoader(
449
517
  dataset,
450
518
  runtime_config["train"]["batch_size"],
451
- True,
452
- num_workers=4,
453
- collate_fn=collate,
454
- persistent_workers=True,
519
+ True if plugins["shuffle"] is None else plugins["shuffle"],
520
+ num_workers=4 if plugins["num_workers"] is None else plugins["num_workers"],
521
+ collate_fn=(
522
+ collate if plugins["collate_fn"] is None else plugins["collate_fn"]
523
+ ),
524
+ persistent_workers=(
525
+ True
526
+ if (plugins["num_workers"] is None or plugins["num_workers"] > 0)
527
+ else False
528
+ ),
455
529
  )
456
530
  # ####### build trainer #######
457
531
  logger_name = runtime_config["train"]["logger_name"].lower()
@@ -530,6 +604,7 @@ def main_script(version: str) -> None:
530
604
  if "train" in runtime_config:
531
605
  bfn = model.model
532
606
  mlp = model.mlp
607
+ # ↓ added in v2.1.0; needs to stay compatible with old versions
533
608
  lora_scaling = runtime_config["inference"].get("lora_scaling", 1.0)
534
609
  # ####### start inference #######
535
610
  bfn.semi_autoregressive = runtime_config["inference"]["semi_autoregressive"]
@@ -622,6 +697,7 @@ def main_script(version: str) -> None:
622
697
  f.write("\n".join(mols))
623
698
  # ------- finished -------
624
699
  print(" ####### job finished #######")
700
+ print(_END_MESSAGE)
625
701
 
626
702
 
627
703
  if __name__ == "__main__":
@@ -1,7 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # Author: Nianze A. TAO (Omozawa SUENO)
3
3
  """
4
- Tokenise SMILES/SAFE/SELFIES/protein-sequence strings.
4
+ Tokenise SMILES/SAFE/SELFIES/FASTA strings.
5
5
  """
6
6
  import os
7
7
  import re
@@ -14,7 +14,7 @@ from torch.utils.data import Dataset
14
14
 
15
15
  __filedir__ = Path(__file__).parent
16
16
 
17
- SMI_REGEX_PATTERN = (
17
+ _SMI_REGEX_PATTERN = (
18
18
  r"(\[|\]|H[e,f,g,s,o]?|"
19
19
  r"L[i,v,a,r,u]|"
20
20
  r"B[e,r,a,i,h,k]?|"
@@ -31,11 +31,11 @@ SMI_REGEX_PATTERN = (
31
31
  r"\(|\)|\.|=|#|-|\+|\\|\/|:|"
32
32
  r"~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
33
33
  )
34
- SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
35
- AA_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y|Z|-|.)"
36
- smi_regex = re.compile(SMI_REGEX_PATTERN)
37
- sel_regex = re.compile(SEL_REGEX_PATTERN)
38
- aa_regex = re.compile(AA_REGEX_PATTERN)
34
+ _SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
35
+ _FAS_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|-|\*|\.)"
36
+ _smi_regex = re.compile(_SMI_REGEX_PATTERN)
37
+ _sel_regex = re.compile(_SEL_REGEX_PATTERN)
38
+ _fas_regex = re.compile(_FAS_REGEX_PATTERN)
39
39
 
40
40
 
41
41
  def load_vocab(
@@ -61,15 +61,16 @@ def load_vocab(
61
61
  }
62
62
 
63
63
 
64
- _DEFUALT_VOCAB = load_vocab(__filedir__ / "vocab.txt")
64
+ _DEFUALT_VOCAB = load_vocab(__filedir__ / "_data/vocab.txt")
65
65
  VOCAB_KEYS: List[str] = _DEFUALT_VOCAB["vocab_keys"]
66
66
  VOCAB_DICT: Dict[str, int] = _DEFUALT_VOCAB["vocab_dict"]
67
67
  VOCAB_COUNT: int = _DEFUALT_VOCAB["vocab_count"]
68
- AA_VOCAB_KEYS = (
69
- VOCAB_KEYS[0:3] + "A B C D E F G H I K L M N P Q R S T V W Y Z - .".split()
68
+ FASTA_VOCAB_KEYS = (
69
+ VOCAB_KEYS[0:3]
70
+ + "A B C D E F G H I K L M N P Q R S T V W Y Z - . J O U X *".split()
70
71
  )
71
- AA_VOCAB_COUNT = len(AA_VOCAB_KEYS)
72
- AA_VOCAB_DICT = dict(zip(AA_VOCAB_KEYS, range(AA_VOCAB_COUNT)))
72
+ FASTA_VOCAB_COUNT = len(FASTA_VOCAB_KEYS)
73
+ FASTA_VOCAB_DICT = dict(zip(FASTA_VOCAB_KEYS, range(FASTA_VOCAB_COUNT)))
73
74
 
74
75
 
75
76
  def smiles2vec(smiles: str) -> List[int]:
@@ -81,21 +82,21 @@ def smiles2vec(smiles: str) -> List[int]:
81
82
  :return: tokens w/o `<start>` and `<end>`
82
83
  :rtype: list
83
84
  """
84
- tokens = [token for token in smi_regex.findall(smiles)]
85
+ tokens = [token for token in _smi_regex.findall(smiles)]
85
86
  return [VOCAB_DICT[token] for token in tokens]
86
87
 
87
88
 
88
- def aa2vec(aa_seq: str) -> List[int]:
89
+ def fasta2vec(fasta: str) -> List[int]:
89
90
  """
90
- Protein sequence tokenisation using a dataset-independent regex pattern.
91
+ FASTA sequence tokenisation using a dataset-independent regex pattern.
91
92
 
92
- :param aa_seq: protein (amino acid) sequence
93
- :type aa_seq: str
93
+ :param fasta: protein (amino acid) sequence
94
+ :type fasta: str
94
95
  :return: tokens w/o `<start>` and `<end>`
95
96
  :rtype: list
96
97
  """
97
- tokens = [token for token in aa_regex.findall(aa_seq)]
98
- return [AA_VOCAB_DICT[token] for token in tokens]
98
+ tokens = [token for token in _fas_regex.findall(fasta)]
99
+ return [FASTA_VOCAB_DICT[token] for token in tokens]
99
100
 
100
101
 
101
102
  def split_selfies(selfies: str) -> List[str]:
@@ -107,7 +108,7 @@ def split_selfies(selfies: str) -> List[str]:
107
108
  :return: SELFIES vocab
108
109
  :rtype: list
109
110
  """
110
- return [token for token in sel_regex.findall(selfies)]
111
+ return [token for token in _sel_regex.findall(selfies)]
111
112
 
112
113
 
113
114
  def smiles2token(smiles: str) -> Tensor:
@@ -115,9 +116,9 @@ def smiles2token(smiles: str) -> Tensor:
115
116
  return torch.tensor([1] + smiles2vec(smiles) + [2], dtype=torch.long)
116
117
 
117
118
 
118
- def aa2token(aa_seq: str) -> Tensor:
119
+ def fasta2token(fasta: str) -> Tensor:
119
120
  # start token: <start> = 1; end token: <end> = 2
120
- return torch.tensor([1] + aa2vec(aa_seq) + [2], dtype=torch.long)
121
+ return torch.tensor([1] + fasta2vec(fasta) + [2], dtype=torch.long)
121
122
 
122
123
 
123
124
  def collate(batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
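With the `aa*` names renamed to `fasta*` above, the tokeniser is used exactly as before; a small usage sketch (assuming the 2.2.2 package is installed):

```python
# quick check of the renamed FASTA tokeniser (names taken from the hunks above)
from bayesianflow_for_chem.data import FASTA_VOCAB_COUNT, FASTA_VOCAB_KEYS, fasta2token

tokens = fasta2token("MKTAYIAKQR")          # <start> + one token per residue + <end>
print(tokens.dtype, tokens.shape)           # torch.int64, 1-D tensor of length 12
print(FASTA_VOCAB_COUNT)                    # vocabulary now also covers J, O, U, X and *
print([FASTA_VOCAB_KEYS[i] for i in tokens.tolist()])
```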
@@ -659,12 +659,91 @@ class ChemBFN(nn.Module):
659
659
  return (-logits.gather(-1, x[..., :1]).squeeze(-1)).mean()
660
660
 
661
661
  @staticmethod
662
- def reshape_y(y: Tensor) -> Tensor:
662
+ def _reshape(y: Tensor) -> Tensor:
663
663
  assert y.dim() <= 3 # this doesn't work if the model is frozen in JIT.
664
664
  if y.dim() == 2:
665
665
  return y[:, None, :]
666
666
  return y
667
667
 
668
+ def _process(
669
+ self,
670
+ theta: Tensor,
671
+ mask: Optional[Tuple[Tensor, Tensor]],
672
+ y: Optional[Tensor],
673
+ sample_step: int,
674
+ guidance_strength: float,
675
+ token_mask: Optional[Tensor],
676
+ ) -> Tuple[Tensor, Tensor]:
677
+ # BFN inference process.
678
+ #
679
+ # theta: prior distribution; shape: (n_b, n_t, n_vocab)
680
+ # mask: masked condition distribution; shape: (n_b, n_t, n_vocab)
681
+ # condition distribution mask; shape: (n_b, n_t, 1)
682
+ n_b = theta.shape[0]
683
+ if y is not None:
684
+ y = self._reshape(y)
685
+ for i in torch.linspace(1, sample_step, sample_step, device=self.beta.device):
686
+ t = (i - 1).view(1, 1, 1).repeat(n_b, 1, 1) / sample_step
687
+ p = self.discrete_output_distribution(theta, t, y, guidance_strength)
688
+ if token_mask is not None:
689
+ p = p.masked_fill_(token_mask, 0.0)
690
+ alpha = self.calc_discrete_alpha(t, t + 1 / sample_step)
691
+ e_k = nn.functional.one_hot(torch.argmax(p, -1), self.K).float()
692
+ mu = alpha * (self.K * e_k - 1)
693
+ sigma = (alpha * self.K).sqrt()
694
+ theta = (mu + sigma * torch.randn_like(mu)).exp() * theta
695
+ theta = theta / theta.sum(-1, True)
696
+ if mask is not None:
697
+ x_onehot, x_mask = mask
698
+ theta = x_onehot + (1 - x_mask) * theta
699
+ t_final = torch.ones((n_b, 1, 1), device=self.beta.device)
700
+ p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
701
+ entropy = -(p * p.log()).sum(-1).mean(-1)
702
+ if token_mask is not None:
703
+ p = p.masked_fill_(token_mask, 0.0)
704
+ return torch.argmax(p, -1), entropy
705
+
706
+ def _ode_process(
707
+ self,
708
+ z: Tensor,
709
+ mask: Optional[Tuple[Tensor, Tensor]],
710
+ y: Optional[Tensor],
711
+ sample_step: int,
712
+ guidance_strength: float,
713
+ token_mask: Optional[Tensor],
714
+ temperature: float,
715
+ ) -> Tuple[Tensor, Tensor]:
716
+ # ODE-solver engaged inference process.
717
+ #
718
+ # z: prior latent vector; shape: (n_b, n_t, n_vocab)
719
+ # mask: masked condition distribution; shape: (n_b, n_t, n_vocab)
720
+ # condition distribution mask; shape: (n_b, n_t, 1)
721
+ n_b = z.shape[0]
722
+ if y is not None:
723
+ y = self._reshape(y)
724
+ for i in torch.linspace(1, sample_step, sample_step, device=self.beta.device):
725
+ t = (i - 1).view(1, 1, 1).repeat(n_b, 1, 1) / sample_step
726
+ theta = softmax(z, -1)
727
+ if mask is not None:
728
+ x_onehot, x_mask = mask
729
+ theta = x_onehot + (1 - x_mask) * theta
730
+ beta = self.calc_beta(t + 1 / sample_step)
731
+ p = self.discrete_output_distribution(theta, t, y, guidance_strength)
732
+ if token_mask is not None:
733
+ p = p.masked_fill_(token_mask, 0.0)
734
+ u = torch.randn_like(z)
735
+ z = (self.K * p - 1) * beta + (self.K * beta * temperature).sqrt() * u
736
+ t_final = torch.ones((n_b, 1, 1), device=self.beta.device)
737
+ theta = softmax(z, -1)
738
+ if mask is not None:
739
+ x_onehot, x_mask = mask
740
+ theta = x_onehot + (1 - x_mask) * theta
741
+ p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
742
+ entropy = -(p * p.log()).sum(-1).mean(-1)
743
+ if token_mask is not None:
744
+ p = p.masked_fill_(token_mask, 0.0)
745
+ return torch.argmax(p, -1), entropy
746
+
668
747
  @torch.jit.export
669
748
  def sample(
670
749
  self,
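For reference, the loop that `_process` factors out of `sample`, `inpaint`, and `optimise` is the discrete-variable BFN receiver update. Reading it directly off the code above, and assuming `calc_discrete_alpha(t, t + 1/n)` returns the accuracy increment α for that step, one iteration is

$$
\mathbf{e}_k = \operatorname{one\_hot}\!\big(\arg\max\nolimits_k p\big),\qquad
\boldsymbol{\mu} = \alpha\,(K\,\mathbf{e}_k - \mathbf{1}),\qquad
\sigma = \sqrt{\alpha K},
$$
$$
\theta \;\leftarrow\; \frac{\exp(\boldsymbol{\mu} + \sigma\,\boldsymbol{\epsilon})\odot\theta}{\sum_{k}\big[\exp(\boldsymbol{\mu} + \sigma\,\boldsymbol{\epsilon})\odot\theta\big]_k},
\qquad \boldsymbol{\epsilon}\sim\mathcal{N}(\mathbf{0},\mathbf{I}),
$$

while `_ode_process` instead propagates the latent as $z = (Kp - 1)\,\beta(t + 1/n) + \sqrt{K\,\beta(t + 1/n)\,\tau}\,u$ with $\theta = \mathrm{softmax}(z)$, where τ is the sampling temperature. The inpainting variants additionally overwrite the known positions of θ with the one-hot context after every step.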
@@ -680,44 +759,26 @@ class ChemBFN(nn.Module):
680
759
 
681
760
  :param batch_size: batch size
682
761
  :param sequence_size: max sequence length
683
- :param y: conditioning vector; shape: (n_b, 1, n_f) or (n_b, n_f)
762
+ :param y: conditioning vector; shape: (n_b, 1, n_f) or (n_b, n_f)
684
763
  :param sample_step: number of sampling steps
685
764
  :param guidance_strength: strength of conditional generation. It is not used if y is null.
686
765
  :param token_mask: token mask assigning unwanted token(s) with `True`;
687
- shape: (1, 1, n_vocab)
766
+ shape: (1, 1, n_vocab)
688
767
  :type batch_size: int
689
768
  :type sequence_size: int
690
769
  :type y: torch.Tensor | None
691
770
  :type sample_step: int
692
771
  :type guidance_strength: float
693
772
  :type token_mask: torch.Tensor | None
694
- :return: sampled token indices; shape: (n_b, n_t) \n
695
- entropy of the tokens; shape: (n_b)
773
+ :return: sampled token indices; shape: (n_b, n_t) \n
774
+ entropy of the tokens; shape: (n_b)
696
775
  :rtype: tuple
697
776
  """
698
777
  theta = (
699
778
  torch.ones((batch_size, sequence_size, self.K), device=self.beta.device)
700
779
  / self.K
701
780
  )
702
- if y is not None:
703
- y = self.reshape_y(y)
704
- for i in torch.linspace(1, sample_step, sample_step, device=self.beta.device):
705
- t = (i - 1).view(1, 1, 1).repeat(batch_size, 1, 1) / sample_step
706
- p = self.discrete_output_distribution(theta, t, y, guidance_strength)
707
- if token_mask is not None:
708
- p = p.masked_fill_(token_mask, 0.0)
709
- alpha = self.calc_discrete_alpha(t, t + 1 / sample_step)
710
- e_k = nn.functional.one_hot(torch.argmax(p, -1), self.K).float()
711
- mu = alpha * (self.K * e_k - 1)
712
- sigma = (alpha * self.K).sqrt()
713
- theta = (mu + sigma * torch.randn_like(mu)).exp() * theta
714
- theta = theta / theta.sum(-1, True)
715
- t_final = torch.ones((batch_size, 1, 1), device=self.beta.device)
716
- p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
717
- entropy = -(p * p.log()).sum(-1).mean(-1)
718
- if token_mask is not None:
719
- p = p.masked_fill_(token_mask, 0.0)
720
- return torch.argmax(p, -1), entropy
781
+ return self._process(theta, None, y, sample_step, guidance_strength, token_mask)
721
782
 
722
783
  @torch.jit.export
723
784
  def ode_sample(
@@ -735,11 +796,11 @@ class ChemBFN(nn.Module):
735
796
 
736
797
  :param batch_size: batch size
737
798
  :param sequence_size: max sequence length
738
- :param y: conditioning vector; shape: (n_b, 1, n_f) or (n_b, n_f)
799
+ :param y: conditioning vector; shape: (n_b, 1, n_f) or (n_b, n_f)
739
800
  :param sample_step: number of sampling steps
740
801
  :param guidance_strength: strength of conditional generation. It is not used if y is null.
741
802
  :param token_mask: token mask assigning unwanted token(s) with `True`;
742
- shape: (1, 1, n_vocab)
803
+ shape: (1, 1, n_vocab)
743
804
  :param temperature: sampling temperature
744
805
  :type batch_size: int
745
806
  :type sequence_size: int
@@ -748,29 +809,14 @@ class ChemBFN(nn.Module):
748
809
  :type guidance_strength: float
749
810
  :type token_mask: torch.Tensor | None
750
811
  :type temperature: float
751
- :return: sampled token indices; shape: (n_b, n_t) \n
752
- entropy of the tokens; shape: (n_b)
812
+ :return: sampled token indices; shape: (n_b, n_t) \n
813
+ entropy of the tokens; shape: (n_b)
753
814
  :rtype: tuple
754
815
  """
755
816
  z = torch.zeros((batch_size, sequence_size, self.K), device=self.beta.device)
756
- if y is not None:
757
- y = self.reshape_y(y)
758
- for i in torch.linspace(1, sample_step, sample_step, device=self.beta.device):
759
- t = (i - 1).view(1, 1, 1).repeat(batch_size, 1, 1) / sample_step
760
- theta = torch.softmax(z, -1)
761
- beta = self.calc_beta(t + 1 / sample_step)
762
- p = self.discrete_output_distribution(theta, t, y, guidance_strength)
763
- if token_mask is not None:
764
- p = p.masked_fill_(token_mask, 0.0)
765
- u = torch.randn_like(z)
766
- z = (self.K * p - 1) * beta + (self.K * beta * temperature).sqrt() * u
767
- t_final = torch.ones((batch_size, 1, 1), device=self.beta.device)
768
- theta = torch.softmax(z, -1)
769
- p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
770
- entropy = -(p * p.log()).sum(-1).mean(-1)
771
- if token_mask is not None:
772
- p = p.masked_fill_(token_mask, 0.0)
773
- return torch.argmax(p, -1), entropy
817
+ return self._ode_process(
818
+ z, None, y, sample_step, guidance_strength, token_mask, temperature
819
+ )
774
820
 
775
821
  @torch.jit.export
776
822
  def inpaint(
@@ -800,30 +846,12 @@ class ChemBFN(nn.Module):
800
846
  :rtype: tuple
801
847
  """
802
848
  n_b, n_t = x.shape
803
- mask = (x != 0).float()[..., None]
849
+ x_mask = (x != 0).float()[..., None]
804
850
  theta = torch.ones((n_b, n_t, self.K), device=x.device) / self.K
805
- x_onehot = nn.functional.one_hot(x, self.K) * mask
806
- theta = x_onehot + (1 - mask) * theta
807
- if y is not None:
808
- y = self.reshape_y(y)
809
- for i in torch.linspace(1, sample_step, sample_step, device=x.device):
810
- t = (i - 1).view(1, 1, 1).repeat(n_b, 1, 1) / sample_step
811
- p = self.discrete_output_distribution(theta, t, y, guidance_strength)
812
- if token_mask is not None:
813
- p = p.masked_fill_(token_mask, 0.0)
814
- alpha = self.calc_discrete_alpha(t, t + 1 / sample_step)
815
- e_k = nn.functional.one_hot(torch.argmax(p, -1), self.K).float()
816
- mu = alpha * (self.K * e_k - 1)
817
- sigma = (alpha * self.K).sqrt()
818
- theta = (mu + sigma * torch.randn_like(mu)).exp() * theta
819
- theta = theta / theta.sum(-1, True)
820
- theta = x_onehot + (1 - mask) * theta
821
- t_final = torch.ones((n_b, 1, 1), device=x.device)
822
- p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
823
- entropy = -(p * p.log()).sum(-1).mean(-1)
824
- if token_mask is not None:
825
- p = p.masked_fill_(token_mask, 0.0)
826
- return torch.argmax(p, -1), entropy
851
+ x_onehot = nn.functional.one_hot(x, self.K) * x_mask
852
+ theta = x_onehot + (1 - x_mask) * theta
853
+ mask = (x_onehot, x_mask)
854
+ return self._process(theta, mask, y, sample_step, guidance_strength, token_mask)
827
855
 
828
856
  @torch.jit.export
829
857
  def ode_inpaint(
@@ -856,29 +884,13 @@ class ChemBFN(nn.Module):
856
884
  :rtype: tuple
857
885
  """
858
886
  n_b, n_t = x.shape
859
- mask = (x != 0).float()[..., None]
860
- x_onehot = nn.functional.one_hot(x, self.K) * mask
887
+ x_mask = (x != 0).float()[..., None]
888
+ x_onehot = nn.functional.one_hot(x, self.K) * x_mask
861
889
  z = torch.zeros((n_b, n_t, self.K), device=self.beta.device)
862
- if y is not None:
863
- y = self.reshape_y(y)
864
- for i in torch.linspace(1, sample_step, sample_step, device=self.beta.device):
865
- t = (i - 1).view(1, 1, 1).repeat(n_b, 1, 1) / sample_step
866
- theta = torch.softmax(z, -1)
867
- theta = x_onehot + (1 - mask) * theta
868
- beta = self.calc_beta(t + 1 / sample_step)
869
- p = self.discrete_output_distribution(theta, t, y, guidance_strength)
870
- if token_mask is not None:
871
- p = p.masked_fill_(token_mask, 0.0)
872
- u = torch.randn_like(z)
873
- z = (self.K * p - 1) * beta + (self.K * beta * temperature).sqrt() * u
874
- t_final = torch.ones((n_b, 1, 1), device=self.beta.device)
875
- theta = torch.softmax(z, -1)
876
- theta = x_onehot + (1 - mask) * theta
877
- p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
878
- entropy = -(p * p.log()).sum(-1).mean(-1)
879
- if token_mask is not None:
880
- p = p.masked_fill_(token_mask, 0.0)
881
- return torch.argmax(p, -1), entropy
890
+ mask = (x_onehot, x_mask)
891
+ return self._ode_process(
892
+ z, mask, y, sample_step, guidance_strength, token_mask, temperature
893
+ )
882
894
 
883
895
  @torch.jit.export
884
896
  def optimise(
@@ -908,28 +920,9 @@ class ChemBFN(nn.Module):
908
920
  entropy of the tokens; shape: (n_b)
909
921
  :rtype: tuple
910
922
  """
911
- n_b = x.shape[0]
912
923
  x_onehot = nn.functional.one_hot(x, self.K).float()
913
- theta = nn.functional.softmax(x_onehot, -1)
914
- if y is not None:
915
- y = self.reshape_y(y)
916
- for i in torch.linspace(1, sample_step, sample_step, device=x.device):
917
- t = (i - 1).view(1, 1, 1).repeat(n_b, 1, 1) / sample_step
918
- p = self.discrete_output_distribution(theta, t, y, guidance_strength)
919
- if token_mask is not None:
920
- p = p.masked_fill_(token_mask, 0.0)
921
- alpha = self.calc_discrete_alpha(t, t + 1 / sample_step)
922
- e_k = nn.functional.one_hot(torch.argmax(p, -1), self.K).float()
923
- mu = alpha * (self.K * e_k - 1)
924
- sigma = (alpha * self.K).sqrt()
925
- theta = (mu + sigma * torch.randn_like(mu)).exp() * theta
926
- theta = theta / theta.sum(-1, True)
927
- t_final = torch.ones((n_b, 1, 1), device=x.device)
928
- p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
929
- entropy = -(p * p.log()).sum(-1).mean(-1)
930
- if token_mask is not None:
931
- p = p.masked_fill_(token_mask, 0.0)
932
- return torch.argmax(p, -1), entropy
924
+ theta = softmax(x_onehot, -1)
925
+ return self._process(theta, None, y, sample_step, guidance_strength, token_mask)
933
926
 
934
927
  @torch.jit.export
935
928
  def ode_optimise(
@@ -961,26 +954,10 @@ class ChemBFN(nn.Module):
961
954
  entropy of the tokens; shape: (n_b)
962
955
  :rtype: tuple
963
956
  """
964
- n_b = x.shape[0]
965
957
  z = nn.functional.one_hot(x, self.K).float()
966
- if y is not None:
967
- y = self.reshape_y(y)
968
- for i in torch.linspace(1, sample_step, sample_step, device=self.beta.device):
969
- t = (i - 1).view(1, 1, 1).repeat(n_b, 1, 1) / sample_step
970
- theta = torch.softmax(z, -1)
971
- beta = self.calc_beta(t + 1 / sample_step)
972
- p = self.discrete_output_distribution(theta, t, y, guidance_strength)
973
- if token_mask is not None:
974
- p = p.masked_fill_(token_mask, 0.0)
975
- u = torch.randn_like(z)
976
- z = (self.K * p - 1) * beta + (self.K * beta * temperature).sqrt() * u
977
- t_final = torch.ones((n_b, 1, 1), device=self.beta.device)
978
- theta = torch.softmax(z, -1)
979
- p = self.discrete_output_distribution(theta, t_final, y, guidance_strength)
980
- entropy = -(p * p.log()).sum(-1).mean(-1)
981
- if token_mask is not None:
982
- p = p.masked_fill_(token_mask, 0.0)
983
- return torch.argmax(p, -1), entropy
958
+ return self._ode_process(
959
+ z, None, y, sample_step, guidance_strength, token_mask, temperature
960
+ )
984
961
 
985
962
  def inference(
986
963
  self, x: Tensor, mlp: MLP, embed_fn: Optional[Callable[[Tensor], Tensor]] = None
@@ -1154,22 +1131,6 @@ class EnsembleChemBFN(ChemBFN):
1154
1131
  module.lora_dropout = None
1155
1132
  v.lora_enabled = False
1156
1133
 
1157
- def construct_y(
1158
- self, c: Union[List[Tensor], Dict[str, Tensor]]
1159
- ) -> Dict[str, Tensor]:
1160
- assert (
1161
- isinstance(c, dict) is self._label_is_dict
1162
- ), f"`c` should be a {'`dict` instance' if self._label_is_dict else '`list` instance'} but got {type(c)} instand."
1163
- out: Dict[str, Tensor] = {}
1164
- if isinstance(c, list):
1165
- c = dict(zip([f"val_{i}" for i in range(len(c))], c))
1166
- for name, model in self.cond_heads.items():
1167
- y = model.forward(c[name])
1168
- if y.dim() == 2:
1169
- y = y[:, None, :]
1170
- out[name] = y
1171
- return out
1172
-
1173
1134
  def discrete_output_distribution(
1174
1135
  self, theta: Tensor, t: Tensor, y: Dict[str, Tensor], w: float
1175
1136
  ) -> Tensor:
@@ -1204,8 +1165,24 @@ class EnsembleChemBFN(ChemBFN):
1204
1165
  p_cond += p_cond_ * self.adapter_weights[name]
1205
1166
  return softmax((1 + w) * p_cond - w * p_uncond, -1)
1206
1167
 
1168
+ def _map_to_dict(
1169
+ self, c: Union[List[Tensor], Dict[str, Tensor]]
1170
+ ) -> Dict[str, Tensor]:
1171
+ assert (
1172
+ isinstance(c, dict) is self._label_is_dict
1173
+ ), f"`c` should be a {'`dict` instance' if self._label_is_dict else '`list` instance'} but got {type(c)} instand."
1174
+ out: Dict[str, Tensor] = {}
1175
+ if isinstance(c, list):
1176
+ c = dict(zip([f"val_{i}" for i in range(len(c))], c))
1177
+ for name, model in self.cond_heads.items():
1178
+ y = model.forward(c[name])
1179
+ if y.dim() == 2:
1180
+ y = y[:, None, :]
1181
+ out[name] = y
1182
+ return out
1183
+
1207
1184
  @staticmethod
1208
- def reshape_y(y: Dict[str, Tensor]) -> Dict[str, Tensor]:
1185
+ def _reshape(y: Dict[str, Tensor]) -> Dict[str, Tensor]:
1209
1186
  for k in y:
1210
1187
  assert y[k].dim() <= 3
1211
1188
  if y[k].dim() == 2:
@@ -1241,7 +1218,7 @@ class EnsembleChemBFN(ChemBFN):
1241
1218
  entropy of the tokens; shape: (n_b)
1242
1219
  :rtype: tuple
1243
1220
  """
1244
- y = self.construct_y(conditions)
1221
+ y = self._map_to_dict(conditions)
1245
1222
  return super().sample(
1246
1223
  batch_size, sequence_size, y, sample_step, guidance_strength, token_mask
1247
1224
  )
@@ -1278,7 +1255,7 @@ class EnsembleChemBFN(ChemBFN):
1278
1255
  entropy of the tokens; shape: (n_b)
1279
1256
  :rtype: tuple
1280
1257
  """
1281
- y = self.construct_y(conditions)
1258
+ y = self._map_to_dict(conditions)
1282
1259
  return super().ode_sample(
1283
1260
  batch_size,
1284
1261
  sequence_size,
@@ -1315,7 +1292,7 @@ class EnsembleChemBFN(ChemBFN):
1315
1292
  entropy of the tokens; shape: (n_b)
1316
1293
  :rtype: tuple
1317
1294
  """
1318
- y = self.construct_y(conditions)
1295
+ y = self._map_to_dict(conditions)
1319
1296
  return super().inpaint(x, y, sample_step, guidance_strength, token_mask)
1320
1297
 
1321
1298
  @torch.inference_mode()
@@ -1347,7 +1324,7 @@ class EnsembleChemBFN(ChemBFN):
1347
1324
  entropy of the tokens; shape: (n_b)
1348
1325
  :rtype: tuple
1349
1326
  """
1350
- y = self.construct_y(conditions)
1327
+ y = self._map_to_dict(conditions)
1351
1328
  return super().ode_inpaint(
1352
1329
  x, y, sample_step, guidance_strength, token_mask, temperature
1353
1330
  )
@@ -1380,7 +1357,7 @@ class EnsembleChemBFN(ChemBFN):
1380
1357
  entropy of the tokens; shape: (n_b)
1381
1358
  :rtype: tuple
1382
1359
  """
1383
- y = self.construct_y(conditions)
1360
+ y = self._map_to_dict(conditions)
1384
1361
  return super().optimise(x, y, sample_step, guidance_strength, token_mask)
1385
1362
 
1386
1363
  @torch.inference_mode()
@@ -1412,7 +1389,7 @@ class EnsembleChemBFN(ChemBFN):
1412
1389
  entropy of the tokens; shape: (n_b)
1413
1390
  :rtype: tuple
1414
1391
  """
1415
- y = self.construct_y(conditions)
1392
+ y = self._map_to_dict(conditions)
1416
1393
  return super().ode_optimise(
1417
1394
  x, y, sample_step, guidance_strength, token_mask, temperature
1418
1395
  )
@@ -7,9 +7,8 @@ import csv
7
7
  import random
8
8
  import warnings
9
9
  from pathlib import Path
10
- from typing import List, Dict, Tuple, Union, Optional
10
+ from typing import List, Dict, Tuple, Union, Optional, Literal
11
11
  import torch
12
- import colorama
13
12
  import numpy as np
14
13
  from torch import cuda, Tensor, softmax
15
14
  from torch.utils.data import DataLoader
@@ -24,15 +23,7 @@ from rdkit.Chem import (
24
23
  AddHs,
25
24
  Mol,
26
25
  )
27
- from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles # type: ignore
28
- from sklearn.metrics import (
29
- roc_auc_score,
30
- auc,
31
- precision_recall_curve,
32
- r2_score,
33
- mean_absolute_error,
34
- root_mean_squared_error,
35
- )
26
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
36
27
  from .data import VOCAB_KEYS
37
28
  from .model import ChemBFN, MLP, EnsembleChemBFN
38
29
 
@@ -45,12 +36,74 @@ def _find_device() -> torch.device:
45
36
  return torch.device("cpu")
46
37
 
47
38
 
39
+ def _parse_and_assert_param(
40
+ model: Union[ChemBFN, EnsembleChemBFN],
41
+ y: Optional[Union[Tensor, Dict[str, Tensor], List[Tensor]]],
42
+ method: str,
43
+ ) -> Optional[float]:
44
+ assert method.split(":")[0].lower() in ("ode", "bfn")
45
+ if isinstance(model, EnsembleChemBFN):
46
+ assert y is not None, "conditioning is required while using an ensemble model."
47
+ assert isinstance(y, list) or isinstance(y, dict)
48
+ else:
49
+ assert isinstance(y, Tensor) or (y is None)
50
+ if "ode" in method.lower():
51
+ tp = float(method.split(":")[-1])
52
+ assert tp > 0, "Sampling temperature should be higher than 0."
53
+ return tp
54
+ return None
55
+
56
+
57
+ def _map_to_device(
58
+ y: Optional[Union[Tensor, Dict[str, Tensor], List[Tensor]]],
59
+ device: Union[str, torch.device],
60
+ ) -> Optional[Union[Tensor, Dict[str, Tensor], List[Tensor]]]:
61
+ if y is not None:
62
+ if isinstance(y, Tensor):
63
+ y = y.to(device)
64
+ elif isinstance(y, list):
65
+ y = [i.to(device) for i in y]
66
+ elif isinstance(y, dict):
67
+ y = {k: v.to(device) for k, v in y.items()}
68
+ else:
69
+ raise NotImplementedError
70
+ return y
71
+
72
+
73
+ def _build_token_mask(
74
+ allowed_tokens: Union[str, List[str]],
75
+ vocab_keys: List[str],
76
+ device: Union[str, torch.tensor],
77
+ ) -> Optional[Tensor]:
78
+ if isinstance(allowed_tokens, list):
79
+ token_mask = [0 if i in allowed_tokens else 1 for i in vocab_keys]
80
+ token_mask = torch.tensor([[token_mask]], dtype=torch.bool).to(device)
81
+ else:
82
+ token_mask = None
83
+ return token_mask
84
+
85
+
86
+ def _token_to_seq(
87
+ tokens: Tensor, entropy: Tensor, vocab_keys: List[str], separator: str, sort: bool
88
+ ) -> List[str]:
89
+ if sort:
90
+ sorted_idx = entropy.argsort(stable=True)
91
+ tokens = tokens[sorted_idx]
92
+ return [
93
+ separator.join([vocab_keys[i] for i in j])
94
+ .split("<start>" + separator)[-1]
95
+ .split(separator + "<end>")[0]
96
+ .replace("<pad>", "")
97
+ for j in tokens
98
+ ]
99
+
100
+
48
101
  @torch.no_grad()
49
102
  def test(
50
103
  model: ChemBFN,
51
104
  mlp: MLP,
52
105
  data: DataLoader,
53
- mode: str = "regression",
106
+ mode: Literal["regression", "classification"] = "regression",
54
107
  device: Union[str, torch.device, None] = None,
55
108
  ) -> Dict[str, float]:
56
109
  """
@@ -86,6 +139,12 @@ def test(
86
139
  predict_y.append(y_hat.detach().to("cpu"))
87
140
  predict_y, label_y = torch.cat(predict_y, 0), torch.cat(label_y, 0).split(1, -1)
88
141
  if mode == "regression":
142
+ from sklearn.metrics import (
143
+ r2_score,
144
+ mean_absolute_error,
145
+ root_mean_squared_error,
146
+ )
147
+
89
148
  predict_y = [
90
149
  predict[label_y[i] != torch.inf]
91
150
  for (i, predict) in enumerate(predict_y.split(1, -1))
@@ -99,6 +158,8 @@ def test(
99
158
  r2 = [r2_score(label, predict) for (label, predict) in y_zipped]
100
159
  return {"MAE": mae, "RMSE": rmse, "R^2": r2}
101
160
  if mode == "classification":
161
+ from sklearn.metrics import roc_auc_score, auc, precision_recall_curve
162
+
102
163
  n_c = len(label_y)
103
164
  predict_y = predict_y.chunk(n_c, -1)
104
165
  y_zipped = list(zip(label_y, predict_y))
@@ -123,7 +184,9 @@ def test(
123
184
 
124
185
 
125
186
  def split_dataset(
126
- file: Union[str, Path], split_ratio: List[int] = [8, 1, 1], method: str = "random"
187
+ file: Union[str, Path],
188
+ split_ratio: List[int] = [8, 1, 1],
189
+ method: Literal["random", "scaffold"] = "random",
127
190
  ) -> None:
128
191
  """
129
192
  Split a dataset.
@@ -142,7 +205,6 @@ def split_dataset(
142
205
  assert file.endswith(".csv")
143
206
  assert len(split_ratio) == 3
144
207
  assert method in ("random", "scaffold")
145
- colorama.just_fix_windows_console()
146
208
  with open(file, "r") as f:
147
209
  data = list(csv.reader(f))
148
210
  header = data[0]
@@ -167,10 +229,8 @@ def split_dataset(
167
229
  # compute Bemis-Murcko scaffold
168
230
  if len(smiles_idx) > 1:
169
231
  warnings.warn(
170
- "\033[32;1m"
171
232
  f"We found {len(smiles_idx)} SMILES strings in a row!"
172
- " Only the first SMILES will be used to compute the molecular scaffold."
173
- "\033[0m",
233
+ " Only the first SMILES will be used to compute the molecular scaffold.",
174
234
  stacklevel=2,
175
235
  )
176
236
  try:
@@ -197,10 +257,10 @@ def split_dataset(
197
257
  with open(file.replace(".csv", "_test.csv"), "w", newline="") as fte:
198
258
  writer = csv.writer(fte)
199
259
  writer.writerows([header] + test_set)
200
- with open(file.replace(".csv", "_val.csv"), "w", newline="") as fva:
201
- writer = csv.writer(fva)
202
- writer.writerows([header] + val_set)
203
- colorama.deinit()
260
+ if val_set:
261
+ with open(file.replace(".csv", "_val.csv"), "w", newline="") as fva:
262
+ writer = csv.writer(fva)
263
+ writer.writerows([header] + val_set)
204
264
 
205
265
 
206
266
  @torch.no_grad()
@@ -250,32 +310,12 @@ def sample(
250
310
  :return: a list of generated molecular strings
251
311
  :rtype: list
252
312
  """
253
- assert method.split(":")[0].lower() in ("ode", "bfn")
254
- if isinstance(model, EnsembleChemBFN):
255
- assert y is not None, "conditioning is required while using an ensemble model."
256
- assert isinstance(y, list) or isinstance(y, dict)
257
- else:
258
- assert isinstance(y, Tensor) or y is None
259
- if device is None:
260
- device = _find_device()
313
+ tp = _parse_and_assert_param(model, y, method)
314
+ device = _find_device() if device is None else device
261
315
  model.to(device).eval()
262
- if y is not None:
263
- if isinstance(y, Tensor):
264
- y = y.to(device)
265
- elif isinstance(y, list):
266
- y = [i.to(device) for i in y]
267
- elif isinstance(y, dict):
268
- y = {k: v.to(device) for k, v in y.items()}
269
- else:
270
- raise NotImplementedError
271
- if isinstance(allowed_tokens, list):
272
- token_mask = [0 if i in allowed_tokens else 1 for i in vocab_keys]
273
- token_mask = torch.tensor([[token_mask]], dtype=torch.bool).to(device)
274
- else:
275
- token_mask = None
276
- if "ode" in method.lower():
277
- tp = float(method.split(":")[-1])
278
- assert tp > 0, "Sampling temperature should be higher than 0."
316
+ y = _map_to_device(y, device)
317
+ token_mask = _build_token_mask(allowed_tokens, vocab_keys, device)
318
+ if tp:
279
319
  tokens, entropy = model.ode_sample(
280
320
  batch_size, sequence_size, y, sample_step, guidance_strength, token_mask, tp
281
321
  )
@@ -283,16 +323,7 @@ def sample(
283
323
  tokens, entropy = model.sample(
284
324
  batch_size, sequence_size, y, sample_step, guidance_strength, token_mask
285
325
  )
286
- if sort:
287
- sorted_idx = entropy.argsort(stable=True)
288
- tokens = tokens[sorted_idx]
289
- return [
290
- seperator.join([vocab_keys[i] for i in j])
291
- .split("<start>" + seperator)[-1]
292
- .split(seperator + "<end>")[0]
293
- .replace("<pad>", "")
294
- for j in tokens
295
- ]
326
+ return _token_to_seq(tokens, entropy, vocab_keys, seperator, sort)
296
327
 
297
328
 
298
329
  @torch.no_grad()
@@ -339,33 +370,13 @@ def inpaint(
339
370
  :return: a list of generated molecular strings
340
371
  :rtype: list
341
372
  """
342
- assert method.split(":")[0].lower() in ("ode", "bfn")
343
- if isinstance(model, EnsembleChemBFN):
344
- assert y is not None, "conditioning is required while using an ensemble model."
345
- assert isinstance(y, list) or isinstance(y, dict)
346
- else:
347
- assert isinstance(y, Tensor) or y is None
348
- if device is None:
349
- device = _find_device()
373
+ tp = _parse_and_assert_param(model, y, method)
374
+ device = _find_device() if device is None else device
350
375
  model.to(device).eval()
351
376
  x = x.to(device)
352
- if y is not None:
353
- if isinstance(y, Tensor):
354
- y = y.to(device)
355
- elif isinstance(y, list):
356
- y = [i.to(device) for i in y]
357
- elif isinstance(y, dict):
358
- y = {k: v.to(device) for k, v in y.items()}
359
- else:
360
- raise NotImplementedError
361
- if isinstance(allowed_tokens, list):
362
- token_mask = [0 if i in allowed_tokens else 1 for i in vocab_keys]
363
- token_mask = torch.tensor([[token_mask]], dtype=torch.bool).to(device)
364
- else:
365
- token_mask = None
366
- if "ode" in method.lower():
367
- tp = float(method.split(":")[-1])
368
- assert tp > 0, "Sampling temperature should be higher than 0."
377
+ y = _map_to_device(y, device)
378
+ token_mask = _build_token_mask(allowed_tokens, vocab_keys, device)
379
+ if tp:
369
380
  tokens, entropy = model.ode_inpaint(
370
381
  x, y, sample_step, guidance_strength, token_mask, tp
371
382
  )
@@ -373,16 +384,7 @@ def inpaint(
373
384
  tokens, entropy = model.inpaint(
374
385
  x, y, sample_step, guidance_strength, token_mask
375
386
  )
376
- if sort:
377
- sorted_idx = entropy.argsort(stable=True)
378
- tokens = tokens[sorted_idx]
379
- return [
380
- separator.join([vocab_keys[i] for i in j])
381
- .split("<start>" + separator)[-1]
382
- .split(separator + "<end>")[0]
383
- .replace("<pad>", "")
384
- for j in tokens
385
- ]
387
+ return _token_to_seq(tokens, entropy, vocab_keys, separator, sort)
386
388
 
387
389
 
388
390
  @torch.no_grad()
@@ -429,33 +431,13 @@ def optimise(
429
431
  :return: a list of generated molecular strings
430
432
  :rtype: list
431
433
  """
432
- assert method.split(":")[0].lower() in ("ode", "bfn")
433
- if isinstance(model, EnsembleChemBFN):
434
- assert y is not None, "conditioning is required while using an ensemble model."
435
- assert isinstance(y, list) or isinstance(y, dict)
436
- else:
437
- assert isinstance(y, Tensor) or y is None
438
- if device is None:
439
- device = _find_device()
434
+ tp = _parse_and_assert_param(model, y, method)
435
+ device = _find_device() if device is None else device
440
436
  model.to(device).eval()
441
437
  x = x.to(device)
442
- if y is not None:
443
- if isinstance(y, Tensor):
444
- y = y.to(device)
445
- elif isinstance(y, list):
446
- y = [i.to(device) for i in y]
447
- elif isinstance(y, dict):
448
- y = {k: v.to(device) for k, v in y.items()}
449
- else:
450
- raise NotImplementedError
451
- if isinstance(allowed_tokens, list):
452
- token_mask = [0 if i in allowed_tokens else 1 for i in vocab_keys]
453
- token_mask = torch.tensor([[token_mask]], dtype=torch.bool).to(device)
454
- else:
455
- token_mask = None
456
- if "ode" in method.lower():
457
- tp = float(method.split(":")[-1])
458
- assert tp > 0, "Sampling temperature should be higher than 0."
438
+ y = _map_to_device(y, device)
439
+ token_mask = _build_token_mask(allowed_tokens, vocab_keys, device)
440
+ if tp:
459
441
  tokens, entropy = model.ode_optimise(
460
442
  x, y, sample_step, guidance_strength, token_mask, tp
461
443
  )
@@ -463,16 +445,7 @@ def optimise(
463
445
  tokens, entropy = model.optimise(
464
446
  x, y, sample_step, guidance_strength, token_mask
465
447
  )
466
- if sort:
467
- sorted_idx = entropy.argsort(stable=True)
468
- tokens = tokens[sorted_idx]
469
- return [
470
- separator.join([vocab_keys[i] for i in j])
471
- .split("<start>" + separator)[-1]
472
- .split(separator + "<end>")[0]
473
- .replace("<pad>", "")
474
- for j in tokens
475
- ]
448
+ return _token_to_seq(tokens, entropy, vocab_keys, separator, sort)
476
449
 
477
450
 
478
451
  def quantise_model_(model: ChemBFN) -> None:
@@ -555,7 +528,7 @@ class GeometryConverter:
555
528
  def smiles2cartesian(
556
529
  smiles: str,
557
530
  num_conformers: int = 250,
558
- rdkit_ff_type: str = "MMFF",
531
+ rdkit_ff_type: Literal["MMFF", "UFF"] = "MMFF",
559
532
  refine_with_crest: bool = False,
560
533
  spin: float = 0.0,
561
534
  ) -> Tuple[List[str], np.ndarray]:
@@ -8,7 +8,6 @@ from typing import Dict, Tuple, Union, Optional
8
8
  import torch
9
9
  import torch.optim as op
10
10
  import torch.nn.functional as F
11
- from loralib import lora_state_dict, mark_only_lora_as_trainable
12
11
  from torch import Tensor
13
12
  from torch.optim.lr_scheduler import ReduceLROnPlateau
14
13
  from lightning import LightningModule
@@ -55,6 +54,8 @@ class Model(LightningModule):
55
54
  self.scorer = scorer
56
55
  self.save_hyperparameters(hparam, ignore=["model", "mlp", "scorer"])
57
56
  if model.lora_enabled:
57
+ from loralib import mark_only_lora_as_trainable
58
+
58
59
  mark_only_lora_as_trainable(self.model)
59
60
  self.use_scorer = self.scorer is not None
60
61
 
@@ -107,6 +108,8 @@ class Model(LightningModule):
107
108
  :rtype: None
108
109
  """
109
110
  if self.model.lora_enabled:
111
+ from loralib import lora_state_dict
112
+
110
113
  torch.save(
111
114
  {
112
115
  "lora_nn": lora_state_dict(self.model),
@@ -152,6 +155,8 @@ class Regressor(LightningModule):
152
155
  self.model.requires_grad_(not hparam["freeze"])
153
156
  self.save_hyperparameters(hparam, ignore=["model", "mlp"])
154
157
  if model.lora_enabled:
158
+ from loralib import mark_only_lora_as_trainable
159
+
155
160
  mark_only_lora_as_trainable(self.model)
156
161
  assert hparam["mode"] in ("regression", "classification")
157
162
 
@@ -231,6 +236,8 @@ class Regressor(LightningModule):
231
236
  )
232
237
  if not self.hparams.freeze:
233
238
  if self.model.lora_enabled:
239
+ from loralib import lora_state_dict
240
+
234
241
  torch.save(
235
242
  {
236
243
  "lora_nn": lora_state_dict(self.model),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bayesianflow_for_chem
3
- Version: 2.1.0
3
+ Version: 2.2.2
4
4
  Summary: Bayesian flow network framework for Chemistry
5
5
  Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
6
6
  Author: Nianze A. Tao
@@ -54,12 +54,14 @@ This is the repository of the PyTorch implementation of ChemBFN model.
54
54
 
55
55
  [![PyPI](https://img.shields.io/pypi/v/bayesianflow-for-chem?color=ff69b4)](https://pypi.org/project/bayesianflow-for-chem/)
56
56
  ![pytest](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/actions/workflows/pytest.yml/badge.svg)
57
+ [![document](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/actions/workflows/pages/pages-build-deployment/badge.svg)](https://augus1999.github.io/bayesian-flow-network-for-chemistry/)
57
58
 
58
59
  ## Features
59
60
 
60
61
  ChemBFN provides the state-of-the-art functionalities of
61
62
  * SMILES or SELFIES-based *de novo* molecule generation
62
63
  * Protein sequence *de novo* generation
64
+ * Template optimisation (mol2mol)
63
65
  * Classifier-free guidance conditional generation (single or multi-objective optimisation)
64
66
  * Context-guided conditional generation (inpaint)
65
67
  * Outstanding out-of-distribution chemical space sampling
@@ -71,6 +73,7 @@ in an all-in-one-model style.
71
73
 
72
74
  ## News
73
75
 
76
+ * [09/10/2025] A web app [`chembfn_webui`](https://github.com/Augus1999/ChemBFN-WebUI) for hosting ChemBFN models is available on [PyPI](https://pypi.org/project/chembfn-webui/).
74
77
  * [30/01/2025] The package `bayesianflow_for_chem` is available on [PyPI](https://pypi.org/project/bayesianflow-for-chem/).
75
78
  * [21/01/2025] Our first paper has been accepted by [JCIM](https://pubs.acs.org/doi/10.1021/acs.jcim.4c01792).
76
79
  * [17/12/2024] The second paper of out-of-distribution generation is available on [arxiv.org](https://arxiv.org/abs/2412.11439).
@@ -93,7 +96,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
93
96
 
94
97
  ## Dataset Handling
95
98
 
96
- We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L152) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
99
+ We provide a Python class [`CSVData`](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/blob/main/bayesianflow_for_chem/data.py#L153) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
97
100
 
98
101
  1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
99
102
  ```python
@@ -0,0 +1,15 @@
1
+ bayesianflow_for_chem/__init__.py,sha256=5nyapMRF-NM610Y_FzZQxu97LQvFaQDxwsyYFBJFxdw,642
2
+ bayesianflow_for_chem/cli.py,sha256=wSDQ5EpETB0-o_YeSIuFt4hP1gI4if566a3qehspgB0,27353
3
+ bayesianflow_for_chem/data.py,sha256=jOzcOO5FDNju8hnaimT_WI8sjdaiOHDalDIOEOpLjEE,6643
4
+ bayesianflow_for_chem/model.py,sha256=M35G4u4mX4btl9vOK3Iqs6yOSuIKI_OoCTmLhmjbwNk,57559
5
+ bayesianflow_for_chem/scorer.py,sha256=gQFUlkyxitch02ntqcRh1ZS8aondKLynW5U6NfTQTb4,4084
6
+ bayesianflow_for_chem/spectra.py,sha256=Ba9ib1aDvTtDYbH3b4d-lIty3ZSQMu7jwehuV2KmhwA,1781
7
+ bayesianflow_for_chem/tool.py,sha256=pAEGfYzEiquu9cTM0Te8EAAr2RPRRObGCzLk9uXaw8o,23686
8
+ bayesianflow_for_chem/train.py,sha256=7AU0A-eZwzSYsLyIe3OxGTNWPnhGpHmVUaQLplV2Fn8,9886
9
+ bayesianflow_for_chem/_data/vocab.txt,sha256=HgtAZmpWYk4y8PqEVC4vqut1vE75DfRKE_10s2UW0rU,790
10
+ bayesianflow_for_chem-2.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
11
+ bayesianflow_for_chem-2.2.2.dist-info/METADATA,sha256=nh6i_LRZTBSoJr3KP3iD0Q-CgrveQSj8AHrdg75FsU4,6476
12
+ bayesianflow_for_chem-2.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ bayesianflow_for_chem-2.2.2.dist-info/entry_points.txt,sha256=N63RMoJsr8rxuKxc7Fj802SL8J5AlpCoPkS8E3IFPLI,54
14
+ bayesianflow_for_chem-2.2.2.dist-info/top_level.txt,sha256=KHsanI3BMCt8D9Qpze2ycrF6nMa3PyojgO6eS1c8kco,22
15
+ bayesianflow_for_chem-2.2.2.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- bayesianflow_for_chem/__init__.py,sha256=bmqERRnnmyK7UgUYn3BFH_YipKfTxNNjRzu9__RVid4,612
2
- bayesianflow_for_chem/cli.py,sha256=A1cFz6jhpLEpbK9r8GxCLdbnPCzQ4RrsavLKg_lssVg,24208
3
- bayesianflow_for_chem/data.py,sha256=Pl0gGWHmMKTKHpsxznvLgYPCwwlLNL7nqH19Vipjkxs,6584
4
- bayesianflow_for_chem/model.py,sha256=UW5hfAofYK9dH9euDPYWfJedVMRFxk8WtY427fObf70,59641
5
- bayesianflow_for_chem/scorer.py,sha256=gQFUlkyxitch02ntqcRh1ZS8aondKLynW5U6NfTQTb4,4084
6
- bayesianflow_for_chem/spectra.py,sha256=Ba9ib1aDvTtDYbH3b4d-lIty3ZSQMu7jwehuV2KmhwA,1781
7
- bayesianflow_for_chem/tool.py,sha256=bqoIMas8bmcjYBghuQWLh75Eq8ZlG6mh9ZeDzWGOmuw,24790
8
- bayesianflow_for_chem/train.py,sha256=jYkhSguW50lrcTEydCQ20yig_mmc1j7WH9KmVwBCTAo,9727
9
- bayesianflow_for_chem/vocab.txt,sha256=HgtAZmpWYk4y8PqEVC4vqut1vE75DfRKE_10s2UW0rU,790
10
- bayesianflow_for_chem-2.1.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
11
- bayesianflow_for_chem-2.1.0.dist-info/METADATA,sha256=ctwor5jnPCmAo-1wuIGkX70aqkXj-8YQxr27AJOdEjM,6057
12
- bayesianflow_for_chem-2.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- bayesianflow_for_chem-2.1.0.dist-info/entry_points.txt,sha256=N63RMoJsr8rxuKxc7Fj802SL8J5AlpCoPkS8E3IFPLI,54
14
- bayesianflow_for_chem-2.1.0.dist-info/top_level.txt,sha256=KHsanI3BMCt8D9Qpze2ycrF6nMa3PyojgO6eS1c8kco,22
15
- bayesianflow_for_chem-2.1.0.dist-info/RECORD,,