bayesianflow-for-chem 2.0.5__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (25)
  1. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/PKG-INFO +4 -1
  2. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/README.md +3 -0
  3. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/__init__.py +4 -3
  4. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/cli.py +120 -31
  5. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/data.py +1 -1
  6. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/model.py +257 -113
  7. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/tool.py +150 -89
  8. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/train.py +8 -1
  9. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem.egg-info/PKG-INFO +4 -1
  10. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem.egg-info/SOURCES.txt +3 -1
  11. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/pyproject.toml +3 -0
  12. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/setup.py +1 -1
  13. bayesianflow_for_chem-2.2.1/test/test_cli_plugin.py +55 -0
  14. bayesianflow_for_chem-2.2.1/test/test_jit_compatibility.py +28 -0
  15. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/LICENSE +0 -0
  16. {bayesianflow_for_chem-2.0.5/bayesianflow_for_chem → bayesianflow_for_chem-2.2.1/bayesianflow_for_chem/_data}/vocab.txt +0 -0
  17. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/scorer.py +0 -0
  18. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/spectra.py +0 -0
  19. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem.egg-info/dependency_links.txt +0 -0
  20. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem.egg-info/entry_points.txt +0 -0
  21. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem.egg-info/requires.txt +0 -0
  22. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem.egg-info/top_level.txt +0 -0
  23. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/setup.cfg +0 -0
  24. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/test/test_merge_lora.py +0 -0
  25. {bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/test/test_molecular_embedding.py +0 -0
{bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bayesianflow_for_chem
-Version: 2.0.5
+Version: 2.2.1
 Summary: Bayesian flow network framework for Chemistry
 Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
 Author: Nianze A. Tao
@@ -54,12 +54,14 @@ This is the repository of the PyTorch implementation of ChemBFN model.
 
 [![PyPI](https://img.shields.io/pypi/v/bayesianflow-for-chem?color=ff69b4)](https://pypi.org/project/bayesianflow-for-chem/)
 ![pytest](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/actions/workflows/pytest.yml/badge.svg)
+[![document](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/actions/workflows/pages/pages-build-deployment/badge.svg)](https://augus1999.github.io/bayesian-flow-network-for-chemistry/)
 
 ## Features
 
 ChemBFN provides the state-of-the-art functionalities of
 * SMILES or SELFIES-based *de novo* molecule generation
 * Protein sequence *de novo* generation
+* Template optimisation (mol2mol)
 * Classifier-free guidance conditional generation (single or multi-objective optimisation)
 * Context-guided conditional generation (inpaint)
 * Outstanding out-of-distribution chemical space sampling
@@ -71,6 +73,7 @@ in an all-in-one-model style.
 
 ## News
 
+* [09/10/2025] A web app [`chembfn_webui`](https://github.com/Augus1999/ChemBFN-WebUI) for hosting ChemBFN models is available on [PyPI](https://pypi.org/project/chembfn-webui/).
 * [30/01/2025] The package `bayesianflow_for_chem` is available on [PyPI](https://pypi.org/project/bayesianflow-for-chem/).
 * [21/01/2025] Our first paper has been accepted by [JCIM](https://pubs.acs.org/doi/10.1021/acs.jcim.4c01792).
 * [17/12/2024] The second paper of out-of-distribution generation is available on [arxiv.org](https://arxiv.org/abs/2412.11439).
{bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/README.md

@@ -9,12 +9,14 @@ This is the repository of the PyTorch implementation of ChemBFN model.
 
 [![PyPI](https://img.shields.io/pypi/v/bayesianflow-for-chem?color=ff69b4)](https://pypi.org/project/bayesianflow-for-chem/)
 ![pytest](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/actions/workflows/pytest.yml/badge.svg)
+[![document](https://github.com/Augus1999/bayesian-flow-network-for-chemistry/actions/workflows/pages/pages-build-deployment/badge.svg)](https://augus1999.github.io/bayesian-flow-network-for-chemistry/)
 
 ## Features
 
 ChemBFN provides the state-of-the-art functionalities of
 * SMILES or SELFIES-based *de novo* molecule generation
 * Protein sequence *de novo* generation
+* Template optimisation (mol2mol)
 * Classifier-free guidance conditional generation (single or multi-objective optimisation)
 * Context-guided conditional generation (inpaint)
 * Outstanding out-of-distribution chemical space sampling
@@ -26,6 +28,7 @@ in an all-in-one-model style.
 
 ## News
 
+* [09/10/2025] A web app [`chembfn_webui`](https://github.com/Augus1999/ChemBFN-WebUI) for hosting ChemBFN models is available on [PyPI](https://pypi.org/project/chembfn-webui/).
 * [30/01/2025] The package `bayesianflow_for_chem` is available on [PyPI](https://pypi.org/project/bayesianflow-for-chem/).
 * [21/01/2025] Our first paper has been accepted by [JCIM](https://pubs.acs.org/doi/10.1021/acs.jcim.4c01792).
 * [17/12/2024] The second paper of out-of-distribution generation is available on [arxiv.org](https://arxiv.org/abs/2412.11439).
{bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/__init__.py

@@ -3,10 +3,8 @@
 """
 ChemBFN package.
 """
-import colorama
 from . import data, tool, train, scorer, spectra
 from .model import ChemBFN, MLP, EnsembleChemBFN
-from .cli import main_script
 
 __all__ = [
     "data",
@@ -18,7 +16,7 @@ __all__ = [
     "MLP",
     "EnsembleChemBFN",
 ]
-__version__ = "2.0.5"
+__version__ = "2.2.1"
 __author__ = "Nianze A. Tao (Omozawa Sueno)"
 
 
@@ -29,6 +27,9 @@ def main() -> None:
     :return:
     :rtype: None
     """
+    import colorama
+    from bayesianflow_for_chem.cli import main_script
+
     colorama.just_fix_windows_console()
     main_script(__version__)
     colorama.deinit()
{bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/cli.py

@@ -12,13 +12,8 @@ from pathlib import Path
 from functools import partial
 from typing import List, Tuple, Dict, Union, Callable
 import torch
-import lightning as L
 from rdkit.Chem import MolFromSmiles, CanonSmiles
-from torch.utils.data import DataLoader
-from lightning.pytorch import loggers
-from lightning.pytorch.callbacks import ModelCheckpoint
 from bayesianflow_for_chem import ChemBFN, MLP
-from bayesianflow_for_chem.train import Model
 from bayesianflow_for_chem.scorer import smiles_valid, Scorer
 from bayesianflow_for_chem.data import (
     VOCAB_COUNT,
@@ -32,7 +27,7 @@ from bayesianflow_for_chem.data import (
     collate,
     CSVData,
 )
-from bayesianflow_for_chem.tool import sample, inpaint
+from bayesianflow_for_chem.tool import sample, inpaint, optimise, adjust_lora_
 
 
 """
@@ -90,6 +85,7 @@ checkpoint_save_path = "home/user/project/ckpt"
 train_strategy = "auto" # or any strategy supported by Lightning, e.g., "ddp"
 accumulate_grad_batches = 1
 enable_progress_bar = false
+plugin_script = "" # define customised behaviours of dataset, dataloader, etc. in a Python script
 
 # Remove this table if inference is unnecessary
 [inference]
@@ -99,9 +95,11 @@ sample_size = 1000 # the minimum number of samples you want
 sample_step = 100
 sample_method = "ODE:0.5" # ODE-solver with temperature of 0.5; another choice is "BFN"
 semi_autoregressive = false
+lora_scaling = 1.0 # LoRA scaling if applied
 guidance_objective = [-0.023, 0.09, 0.113] # if no objective is needed set it to empty array []
 guidance_objective_strength = 4.0 # unnecessary if guidance_objective = []
 guidance_scaffold = "c1ccccc1" # if no scaffold is used set it to empty string ""
+sample_template = "" # template for mol2mol task; leave it blank if scaffold is used
 unwanted_token = []
 exclude_invalid = true # to only store valid samples
 exclude_duplicate = true # to only store unique samples
@@ -118,6 +116,32 @@ madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
 madmadmadmadmadmadmadmadmadmadmadmadmadmadmad
 """
 
+_ALLOWED_PLUGINS = [
+    "collate_fn",
+    "num_workers",
+    "max_sequence_length",
+    "shuffle",
+    "CustomData",
+]
+
+
+def _load_plugin(plugin_file: str) -> Dict[str, Union[int, Callable, object, None]]:
+    if not plugin_file:
+        return {n: None for n in _ALLOWED_PLUGINS}
+    from importlib import util as iutil
+
+    spec = iutil.spec_from_file_location(Path(plugin_file).stem, plugin_file)
+    plugins = iutil.module_from_spec(spec)
+    spec.loader.exec_module(plugins)
+    plugin_names: List[str] = plugins.__all__
+    plugin_dict = {}
+    for n in _ALLOWED_PLUGINS:
+        if n in plugin_names:
+            plugin_dict[n] = getattr(plugins, n)
+        else:
+            plugin_dict[n] = None
+    return plugin_dict
+
 
 def parse_cli(version: str) -> argparse.Namespace:
     """
@@ -130,7 +154,7 @@ def parse_cli(version: str) -> argparse.Namespace:
     """
     parser = argparse.ArgumentParser(
         description="Madmol: a CLI molecular design tool for "
-        "de novo design, R-group replacement, and sequence in-filling, "
+        "de novo design, R-group replacement, molecule optimisation, and sequence in-filling, "
         "based on generative route of ChemBFN method. "
         "Let's make some craziest molecules.",
         epilog=f"Madmol {version}, developed in Hiroshima University by chemists for chemists. "
@@ -157,7 +181,7 @@ def parse_cli(version: str) -> argparse.Namespace:
         "-D",
         "--dryrun",
         action="store_true",
-        help="dry-run to check the configurations",
+        help="dry-run to check the configurations and exit",
     )
     parser.add_argument("-V", "--version", action="version", version=version)
     return parser.parse_args()
@@ -265,6 +289,14 @@ def load_runtime_config(
                 f"\033[0;31mCritical\033[0;0m in {config_file}: Restart checkpoint file {ckpt_file} does not exist."
             )
             flag_critical += 1
+        # ↓ added in v2.2.0; need to be compatible with old versions.
+        plugin_script: str = config["train"].get("plugin_script", "")
+        if plugin_script:
+            if not os.path.exists(plugin_script):
+                print(
+                    f"\033[0;31mCritical\033[0;0m in {config_file}: Plugin script {plugin_script} does not exist."
+                )
+                flag_critical += 1
     if "inference" in config:
         if not "train" in config:
             if not isinstance(config["inference"]["sequence_length"], int):
@@ -284,6 +316,14 @@
                 f"\033[0;33mWarning\033[0;0m in {config_file}: Directory {result_dir} to save the result does not exist."
             )
             flag_warning += 1
+        if (
+            config["inference"]["guidance_scaffold"] != ""
+            and config["inference"]["sample_template"] != ""
+        ):
+            print(
+                f"\033[0;33mWarning\033[0;0m in {config_file}: Inpaint task or mol2mol task?"
+            )
+            flag_warning += 1
     return config, flag_critical, flag_warning
 
 
@@ -325,14 +365,15 @@ def main_script(version: str) -> None:
             )
             flag_warning += 1
         if not os.path.exists(runtime_config["train"]["checkpoint_save_path"]):
-            os.makedirs(runtime_config["train"]["checkpoint_save_path"])
+            if not parser.dryrun:  # only create it in real tasks
+                os.makedirs(runtime_config["train"]["checkpoint_save_path"])
     else:
         if not model_config["ChemBFN"]["base_model"]:
             print(
                 f"\033[0;33mWarning\033[0;0m in {parser.model_config}: You should load a pretrained ChemBFN model."
             )
             flag_warning += 1
-        if not model_config["MLP"]["base_model"]:
+        if "MLP" in model_config and not model_config["MLP"]["base_model"]:
             print(
                 f"\033[0;33mWarning\033[0;0m in {parser.model_config}: You should load a pretrained MLP."
             )
@@ -340,13 +381,17 @@
     if "inference" in runtime_config:
         if runtime_config["inference"]["guidance_objective"]:
             if not "MLP" in model_config:
-                print(f"Warning in {parser.model_config}: Oh no, you don't have a MLP.")
+                print(
+                    f"\033[0;33mWarning\033[0;0m in {parser.model_config}: Oh no, you don't have a MLP."
+                )
                 flag_warning += 1
     if parser.dryrun:
         if flag_critical != 0:
             print("Configuration check failed!")
         elif flag_warning != 0:
-            print("Your job will probably run, but it may not follow your expectation.")
+            print(
+                "Your job will probably run, but it may not follow your expectations."
+            )
         else:
             print("Configuration check passed.")
         return
@@ -405,6 +450,15 @@
     mlp = None
     # ------- train -------
     if "train" in runtime_config:
+        import lightning as L
+        from torch.utils.data import DataLoader
+        from lightning.pytorch import loggers
+        from lightning.pytorch.callbacks import ModelCheckpoint
+        from bayesianflow_for_chem.train import Model
+
+        # ####### get plugins #######
+        plugin_file = runtime_config["train"].get("plugin_script", "")
+        plugins = _load_plugin(plugin_file)
         # ####### build scorer #######
         if (tokeniser_name == "smiles" or tokeniser_name == "safe") and runtime_config[
             "train"
@@ -418,30 +472,43 @@
         mol_tag = runtime_config["train"]["molecule_tag"]
         obj_tag = runtime_config["train"]["objective_tag"]
         dataset_file = runtime_config["train"]["dataset"]
-        with open(dataset_file, "r") as db:
-            _data = db.readlines()
-        header = _data[0]
-        mol_idx = []
-        for i, tag in enumerate(header.replace("\n", "").split(",")):
-            if tag == mol_tag:
-                mol_idx.append(i)
-        _data_len = []
-        for i in _data[1:]:
-            i = i.replace("\n", "").split(",")
-            _mol = ".".join([i[j] for j in mol_idx])
-            _data_len.append(tokeniser(_mol).shape[-1])
-        lmax = max(_data_len)
-        dataset = CSVData(dataset_file)
+        if plugins["max_sequence_length"]:
+            lmax = plugins["max_sequence_length"]
+        else:
+            with open(dataset_file, "r") as db:
+                _data = db.readlines()
+            _header = _data[0]
+            _mol_idx = []
+            for i, tag in enumerate(_header.replace("\n", "").split(",")):
+                if tag == mol_tag:
+                    _mol_idx.append(i)
+            _data_len = []
+            for i in _data[1:]:
+                i = i.replace("\n", "").split(",")
+                _mol = ".".join([i[j] for j in _mol_idx])
+                _data_len.append(tokeniser(_mol).shape[-1])
+            lmax = max(_data_len)
+            del _data, _data_len, _header, _mol_idx  # clear memory
+        if plugins["CustomData"] is not None:
+            dataset = plugins["CustomData"](dataset_file)
+        else:
+            dataset = CSVData(dataset_file)
         dataset.map(
             partial(_encode, mol_tag=mol_tag, obj_tag=obj_tag, tokeniser=tokeniser)
         )
         dataloader = DataLoader(
             dataset,
             runtime_config["train"]["batch_size"],
-            True,
-            num_workers=4,
-            collate_fn=collate,
-            persistent_workers=True,
+            True if plugins["shuffle"] is None else plugins["shuffle"],
+            num_workers=4 if plugins["num_workers"] is None else plugins["num_workers"],
+            collate_fn=(
+                collate if plugins["collate_fn"] is None else plugins["collate_fn"]
+            ),
+            persistent_workers=(
+                True
+                if (plugins["num_workers"] is None or plugins["num_workers"] > 0)
+                else False
+            ),
         )
         # ####### build trainer #######
         logger_name = runtime_config["train"]["logger_name"].lower()
@@ -520,6 +587,8 @@
     if "train" in runtime_config:
         bfn = model.model
         mlp = model.mlp
+    # ↓ added in v2.1.0; need to be compatible with old versions
+    lora_scaling = runtime_config["inference"].get("lora_scaling", 1.0)
     # ####### strat inference #######
     bfn.semi_autoregressive = runtime_config["inference"]["semi_autoregressive"]
     _device = (
@@ -550,8 +619,16 @@
             x[:-1], (0, sequence_length - x.shape[-1] + 1), value=0
         )
         x = x[None, :].repeat(batch_size, 1)
+    # then sample template will be ignored.
+    elif runtime_config["inference"]["sample_template"]:
+        template = runtime_config["inference"]["sample_template"]
+        x = tokeniser(template)
+        x = torch.nn.functional.pad(x, (0, sequence_length - x.shape[-1]), value=0)
+        x = x[None, :].repeat(batch_size, 1)
     else:
         x = None
+    if bfn.lora_enabled:
+        adjust_lora_(bfn, lora_scaling)
     mols = []
     while len(mols) < runtime_config["inference"]["sample_size"]:
         if x is None:
@@ -567,7 +644,7 @@
                 method=sample_method,
                 allowed_tokens=allowed_token,
             )
-        else:
+        elif runtime_config["inference"]["guidance_scaffold"]:
             s = inpaint(
                 bfn,
                 x,
@@ -579,6 +656,18 @@
                 method=sample_method,
                 allowed_tokens=allowed_token,
             )
+        else:
+            s = optimise(
+                bfn,
+                x,
+                sample_step,
+                y,
+                guidance_strength,
+                _device,
+                vocab_keys,
+                method=sample_method,
+                allowed_tokens=allowed_token,
+            )
         if runtime_config["inference"]["exclude_invalid"]:
             s = [i for i in s if i]
         if tokeniser_name == "smiles" or tokeniser_name == "safe":
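
With this hunk the sampling dispatch has three routes: no context falls to `sample`, a non-empty `guidance_scaffold` selects `inpaint`, and a non-empty `sample_template` reaches `optimise` (the mol2mol route). A minimal sketch of driving the template route directly follows; the positional argument order of `optimise` is taken from this diff, while the helper name, the default hyperparameters, and passing `y=None` for unconditioned optimisation are assumptions:

import torch
from bayesianflow_for_chem.data import VOCAB_KEYS
from bayesianflow_for_chem.tool import optimise


def mol2mol(bfn, tokeniser, template, sequence_length, batch_size,
            y=None, guidance_strength=4.0, device=torch.device("cpu"),
            sample_step=100):
    # Tokenise the template, pad it to the model sequence length, and tile it
    # over the batch -- the same preprocessing as the CLI branch above.
    x = tokeniser(template)
    x = torch.nn.functional.pad(x, (0, sequence_length - x.shape[-1]), value=0)
    x = x[None, :].repeat(batch_size, 1)
    return optimise(
        bfn, x, sample_step, y, guidance_strength, device, VOCAB_KEYS,
        method="ODE:0.5",
    )
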
{bayesianflow_for_chem-2.0.5 → bayesianflow_for_chem-2.2.1}/bayesianflow_for_chem/data.py

@@ -61,7 +61,7 @@ def load_vocab(
     }
 
 
-_DEFUALT_VOCAB = load_vocab(__filedir__ / "vocab.txt")
+_DEFUALT_VOCAB = load_vocab(__filedir__ / "_data/vocab.txt")
 VOCAB_KEYS: List[str] = _DEFUALT_VOCAB["vocab_keys"]
 VOCAB_DICT: Dict[str, int] = _DEFUALT_VOCAB["vocab_dict"]
 VOCAB_COUNT: int = _DEFUALT_VOCAB["vocab_count"]