PyPI - bayesianflow-for-chem - Versions diffs - 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl - Mend

bayesianflow-for-chem 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bayesianflow-for-chem might be problematic. Click here for more details.

Files changed (10) hide show

bayesianflow_for_chem/__init__.py CHANGED Viewed

@@ -7,5 +7,5 @@ from . import data, tool, train, scorer
 from .model import ChemBFN, MLP, EnsembleChemBFN
 __all__ = ["data", "tool", "train", "scorer", "ChemBFN", "MLP", "EnsembleChemBFN"]
-__version__ = "1.3.0"
+__version__ = "1.4.0"
 __author__ = "Nianze A. Tao (Omozawa Sueno)"

bayesianflow_for_chem/data.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Author: Nianze A. TAO (Omozawa SUENO)
 """
-Tokenise SMILES/SAFE/SELFIES/GEO2SEQ/protein-sequence strings.
+Tokenise SMILES/SAFE/SELFIES/protein-sequence strings.
 """
 import os
 import re
@@ -32,25 +32,9 @@ SMI_REGEX_PATTERN = (
     r"~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
 )
 SEL_REGEX_PATTERN = r"(\[[^\]]+]|\.)"
-GEO_REGEX_PATTERN = (
-    r"(H[e,f,g,s,o]?|"
-    r"L[i,v,a,r,u]|"
-    r"B[e,r,a,i,h,k]?|"
-    r"C[l,a,r,o,u,d,s,n,e,m,f]?|"
-    r"N[e,a,i,b,h,d,o,p]?|"
-    r"O[s,g]?|S[i,c,e,r,n,m,b,g]?|"
-    r"K[r]?|T[i,c,e,a,l,b,h,m,s]|"
-    r"G[a,e,d]|R[b,u,h,e,n,a,f,g]|"
-    r"Yb?|Z[n,r]|P[t,o,d,r,a,u,b,m]?|"
-    r"F[e,r,l,m]?|M[g,n,o,t,c,d]|"
-    r"A[l,r,s,g,u,t,c,m]|I[n,r]?|"
-    r"W|X[e]|E[u,r,s]|U|D[b,s,y]|"
-    r"-|.| |[0-9])"
-)
 AA_REGEX_PATTERN = r"(A|B|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y|Z|-|.)"
 smi_regex = re.compile(SMI_REGEX_PATTERN)
 sel_regex = re.compile(SEL_REGEX_PATTERN)
-geo_regex = re.compile(GEO_REGEX_PATTERN)
 aa_regex = re.compile(AA_REGEX_PATTERN)
@@ -86,9 +70,6 @@ AA_VOCAB_KEYS = (
 )
 AA_VOCAB_COUNT = len(AA_VOCAB_KEYS)
 AA_VOCAB_DICT = dict(zip(AA_VOCAB_KEYS, range(AA_VOCAB_COUNT)))
-GEO_VOCAB_KEYS = VOCAB_KEYS[0:3] + [" "] + VOCAB_KEYS[22:150] + [".", "-"]
-GEO_VOCAB_COUNT = len(GEO_VOCAB_KEYS)
-GEO_VOCAB_DICT = dict(zip(GEO_VOCAB_KEYS, range(GEO_VOCAB_COUNT)))
 def smiles2vec(smiles: str) -> List[int]:
@@ -104,19 +85,6 @@ def smiles2vec(smiles: str) -> List[int]:
     return [VOCAB_DICT[token] for token in tokens]
-def geo2vec(geo2seq: str) -> List[int]:
-    """
-    Geo2Seq tokenisation using a dataset-independent regex pattern.
-    :param geo2seq: `GEO2SEQ` string
-    :type geo2seq: str
-    :return: tokens w/o `<start>` and `<end>`
-    :rtype: list
-    """
-    tokens = [token for token in geo_regex.findall(geo2seq)]
-    return [GEO_VOCAB_DICT[token] for token in tokens]
 def aa2vec(aa_seq: str) -> List[int]:
     """
     Protein sequence tokenisation using a dataset-independent regex pattern.
@@ -147,11 +115,6 @@ def smiles2token(smiles: str) -> Tensor:
     return torch.tensor([1] + smiles2vec(smiles) + [2], dtype=torch.long)
-def geo2token(geo2seq: str) -> Tensor:
-    # start token: <start> = 1; end token: <esc> = 2
-    return torch.tensor([1] + geo2vec(geo2seq) + [2], dtype=torch.long)
 def aa2token(aa_seq: str) -> Tensor:
     # start token: <start> = 1; end token: <end> = 2
     return torch.tensor([1] + aa2vec(aa_seq) + [2], dtype=torch.long)

bayesianflow_for_chem/model.py CHANGED Viewed

@@ -162,8 +162,8 @@ class Attention(nn.Module):
         :return: attentioned output;   shape: (n_b, n_t, n_f)
         :rtype: torch.Tensor
         """
-        n_b, n_a, _ = shape = x.shape
-        split = (n_b, n_a, self.nh, self.d)
+        n_b, n_t, _ = shape = x.shape
+        split = (n_b, n_t, self.nh, self.d)
         q, k, v = self.qkv(x).chunk(3, -1)
         q = q.view(split).permute(2, 0, 1, 3).contiguous()
         k = k.view(split).permute(2, 0, 1, 3).contiguous()
@@ -428,12 +428,12 @@ class ChemBFN(nn.Module):
         c = self.time_embed(t)
         if y is not None:
             c += y
-        pe = self.position(x.shape[1])
+        pe = self.position(n_t)
         x = self.embedding(x)
         attn_mask: Optional[Tensor] = None
         if self.semi_autoregressive:
             attn_mask = torch.tril(
-                torch.ones((1, n_b, n_t, n_t), device=self.beta.device), diagonal=0
+                torch.ones((1, n_b, n_t, n_t), device=x.device), diagonal=0
             )
         else:
             if mask is not None:

bayesianflow_for_chem/tool.py CHANGED Viewed

@@ -3,7 +3,6 @@
 """
 Essential tools.
 """
-import re
 import csv
 import random
 import warnings
@@ -18,7 +17,15 @@ from torch.ao import quantization
 from torch.utils.data import DataLoader
 from typing_extensions import Self
 from rdkit.Chem.rdchem import Mol, Bond
-from rdkit.Chem import rdDetermineBonds, MolFromXYZBlock, MolToSmiles, CanonSmiles
+from rdkit.Chem import (
+    rdDetermineBonds,
+    MolFromXYZBlock,
+    MolFromSmiles,
+    MolToSmiles,
+    CanonSmiles,
+    AllChem,
+    AddHs,
+)
 from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles  # type: ignore
 from sklearn.metrics import (
     roc_auc_score,
@@ -28,36 +35,10 @@ from sklearn.metrics import (
     mean_absolute_error,
     root_mean_squared_error,
 )
-try:
-    from pynauty import Graph, canon_label  # type: ignore
-    _use_pynauty = True
-except ImportError:
-    import platform
-    _use_pynauty = False
 from .data import VOCAB_KEYS
 from .model import ChemBFN, MLP, Linear, EnsembleChemBFN
-_atom_regex_pattern = (
-    r"(H[e,f,g,s,o]?|"
-    r"L[i,v,a,r,u]|"
-    r"B[e,r,a,i,h,k]?|"
-    r"C[l,a,r,o,u,d,s,n,e,m,f]?|"
-    r"N[e,a,i,b,h,d,o,p]?|"
-    r"O[s,g]?|S[i,c,e,r,n,m,b,g]?|"
-    r"K[r]?|T[i,c,e,a,l,b,h,m,s]|"
-    r"G[a,e,d]|R[b,u,h,e,n,a,f,g]|"
-    r"Yb?|Z[n,r]|P[t,o,d,r,a,u,b,m]?|"
-    r"F[e,r,l,m]?|M[g,n,o,t,c,d]|"
-    r"A[l,r,s,g,u,t,c,m]|I[n,r]?|"
-    r"W|X[e]|E[u,r,s]|U|D[b,s,y])"
-)
-_atom_regex = re.compile(_atom_regex_pattern)
 def _find_device() -> torch.device:
     if cuda.is_available():
         return torch.device("cuda")
@@ -66,10 +47,6 @@ def _find_device() -> torch.device:
     return torch.device("cpu")
-def _bond_pair_idx(bonds: Bond) -> List[List[int]]:
-    return [[i.GetBeginAtomIdx(), i.GetEndAtomIdx()] for i in bonds]
 @torch.no_grad()
 def test(
     model: ChemBFN,
@@ -493,6 +470,8 @@ def quantise_model(model: ChemBFN) -> nn.Module:
             assert hasattr(
                 mod, "qconfig"
             ), "Input float module must have qconfig defined"
+            if use_precomputed_fake_quant:
+                warnings.warn("Fake quantize operator is not implemented.")
             if mod.qconfig is not None and mod.qconfig.weight is not None:
                 weight_observer = mod.qconfig.weight()
             else:
@@ -560,6 +539,42 @@ class GeometryConverter:
             xyz_block.append(f"{atom} {r[i][0]:.10f} {r[i][1]:.10f} {r[i][2]:.10f}")
         return MolFromXYZBlock("\n".join(xyz_block))
+    @staticmethod
+    def _bond_pair_idx(bonds: Bond) -> List[List[int]]:
+        return [[i.GetBeginAtomIdx(), i.GetEndAtomIdx()] for i in bonds]
+    @staticmethod
+    def smiles2cartesian(
+        smiles: str, num_conformers: int = 50, random_seed: int = 42
+    ) -> Tuple[List[str], np.ndarray]:
+        """
+        Guess the 3D geometry from SMILES string via MMFF conformer search.
+        :param smiles: a valid SMILES string
+        :param num_conformers: number of initial conformers
+        :param random_seed: random seed used to generate conformers
+        :type smiles: str
+        :type num_conformers: int
+        :type random_seed: int
+        :return: atomic symbols \n
+                 cartesian coordinates;  shape: (n_a, 3)
+        :rtype: tuple
+        """
+        mol = MolFromSmiles(smiles)
+        mol = AddHs(mol)
+        AllChem.EmbedMultipleConfs(mol, numConfs=num_conformers, randomSeed=random_seed)
+        symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
+        energies = []
+        for conf_id in range(num_conformers):
+            ff = AllChem.MMFFGetMoleculeForceField(
+                mol, AllChem.MMFFGetMoleculeProperties(mol), confId=conf_id
+            )
+            energy = ff.CalcEnergy()
+            energies.append((conf_id, energy))
+        lowest_energy_conf = min(energies, key=lambda x: x[1])
+        coordinates = mol.GetConformer(id=lowest_energy_conf[0]).GetPositions()
+        return symbols, coordinates
     def cartesian2smiles(
         self,
         symbols: List[str],
@@ -587,136 +602,3 @@ class GeometryConverter:
         if canonical:
             smiles = CanonSmiles(smiles)
         return smiles
-    def canonicalise(
-        self, symbols: List[str], coordinates: np.ndarray
-    ) -> Tuple[List[str], np.ndarray]:
-        """
-        Canonicalising the 3D molecular graph.
-        :param symbols: a list of atomic symbols
-        :param coordinates: Cartesian coordinates;  shape: (n_a, 3)
-        :type symbols: list
-        :type coordinates: numpy.ndarray
-        :return: canonicalised symbols \n
-                 canonicalised coordinates;         shape: (n_a, 3)
-        :rtype: tuple
-        """
-        if not _use_pynauty:
-            if platform.system() == "Windows":
-                raise NotImplementedError(
-                    "This method is not implemented on Windows platform."
-                )
-            else:
-                raise ImportError("`pynauty` is not installed.")
-        n = len(symbols)
-        if n == 1:
-            return symbols, coordinates
-        mol = self._xyz2mol(symbols, coordinates)
-        rdDetermineBonds.DetermineConnectivity(mol)
-        # ------- Canonicalization -------
-        pair_idx = np.array(_bond_pair_idx(mol.GetBonds())).T.tolist()
-        pair_dict: Dict[int, List[int]] = {}
-        for key, i in enumerate(pair_idx[0]):
-            if i not in pair_dict:
-                pair_dict[i] = [pair_idx[1][key]]
-            else:
-                pair_dict[i].append(pair_idx[1][key])
-        g = Graph(n, adjacency_dict=pair_dict)
-        cl = canon_label(g)  # type: list
-        symbols = np.array([[s] for s in symbols])[cl].flatten().tolist()
-        coordinates = coordinates[cl]
-        return symbols, coordinates
-    @staticmethod
-    def cartesian2spherical(coordinates: np.ndarray) -> np.ndarray:
-        """
-        Transforming Cartesian coordinate to spherical form.\n
-        The method is adapted from the paper: https://arxiv.org/abs/2408.10120.
-        :param coordinates: Cartesian coordinates;  shape: (n_a, 3)
-        :type coordinates: numpy.ndarray
-        :return: spherical coordinates;             shape: (n_a, 3)
-        :rtype: numpy.ndarray
-        """
-        n = coordinates.shape[0]
-        if n == 1:
-            return np.array([[0.0, 0.0, 0.0]])
-        # ------- Find global coordinate frame -------
-        if n == 2:
-            d = np.linalg.norm(coordinates[0] - coordinates[1], 2)
-            return np.array([[0.0, 0.0, 0.0], [d, 0.0, 0.0]])
-        for idx_0 in range(n - 2):
-            _vec0 = coordinates[idx_0] - coordinates[idx_0 + 1]
-            _vec1 = coordinates[idx_0] - coordinates[idx_0 + 2]
-            _d1 = np.linalg.norm(_vec0, 2)
-            _d2 = np.linalg.norm(_vec1, 2)
-            if 1 - np.abs(np.dot(_vec0, _vec1) / (_d1 * _d2)) > 1e-6:
-                break
-        x = (coordinates[idx_0 + 1] - coordinates[idx_0]) / _d1
-        y = np.cross((coordinates[idx_0 + 2] - coordinates[idx_0]), x)
-        y_d = np.linalg.norm(y, 2)
-        y = y / np.ma.filled(np.ma.array(y_d, mask=y_d == 0), np.inf)
-        z = np.cross(x, y)
-        # ------- Build spherical coordinates -------
-        vec = coordinates - coordinates[idx_0]
-        d = np.linalg.norm(vec, 2, axis=-1)
-        _d = np.ma.filled(np.ma.array(d, mask=d == 0), np.inf)
-        theta = np.arccos(np.dot(vec, z) / _d)  # in [0, \pi]
-        phi = np.arctan2(np.dot(vec, y), np.dot(vec, x))  # in [-\pi, \pi]
-        info = np.vstack([d, theta, phi]).T
-        info[idx_0] = np.zeros_like(info[idx_0])
-        return info
-    def geo2seq(
-        self, symbols: List[str], coordinates: np.ndarray, decimals: int = 2
-    ) -> str:
-        """
-        Geometry-to-sequence function.\n
-        The algorithm follows the descriptions in paper: https://arxiv.org/abs/2408.10120.
-        :param symbols: a list of atomic symbols
-        :param coordinates: Cartesian coordinates;  shape: (n_a, 3)
-        :param decimals: the maxmium number of decimals to keep; default is 2
-        :type symbols: list
-        :type coordinates: numpy.ndarray
-        :type decimals: int
-        :return: `Geo2Seq` string
-        :rtype: str
-        """
-        symbols, coordinates = self.canonicalise(symbols, coordinates)
-        info = self.cartesian2spherical(coordinates)
-        info = [
-            f"{symbols[i]} {r[0]} {r[1]} {r[2]}"
-            for i, r in enumerate(np.round(info, decimals))
-        ]
-        return " ".join(info)
-    @staticmethod
-    def seq2geo(seq: str) -> Tuple[Optional[List[str]], Optional[np.ndarray]]:
-        """
-        Sequence-to-geometry function.\n
-        The method follows the descriptions in paper: https://arxiv.org/abs/2408.10120.
-        :param seq: `Geo2Seq` string
-        :type seq: str
-        :return: (symbols, coordinates) if `seq` is valid
-        :rtype: tuple
-        """
-        tokens = seq.split()
-        if len(tokens) % 4 != 0:
-            return None, None
-        tokens = np.array(tokens).reshape(-1, 4)
-        symbols, coordinates = tokens[::, 0], tokens[::, 1:]
-        if sum([len(_atom_regex.findall(sym)) for sym in symbols]) != len(symbols):
-            return None, None
-        try:
-            coord = [[float(i) for i in j] for j in coordinates]
-            coord = np.array(coord)
-        except ValueError:
-            return None, None
-        d, theta, phi = coord[::, 0, None], coord[::, 1, None], coord[::, 2, None]
-        x = d * np.sin(theta) * np.cos(phi)
-        y = d * np.sin(theta) * np.sin(phi)
-        z = d * np.cos(theta)
-        return symbols, np.concatenate([x, y, z], -1)

{bayesianflow_for_chem-1.3.0.dist-info → bayesianflow_for_chem-1.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bayesianflow_for_chem
-Version: 1.3.0
+Version: 1.4.0
 Summary: Bayesian flow network framework for Chemistry
 Home-page: https://augus1999.github.io/bayesian-flow-network-for-chemistry/
 Author: Nianze A. Tao
@@ -28,8 +28,6 @@ Requires-Dist: loralib>=0.1.2
 Requires-Dist: lightning>=2.2.0
 Requires-Dist: scikit-learn>=1.5.0
 Requires-Dist: typing_extensions>=4.8.0
-Provides-Extra: geo2seq
-Requires-Dist: pynauty>=2.8.8.1; extra == "geo2seq"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -40,7 +38,6 @@ Dynamic: keywords
 Dynamic: license
 Dynamic: license-file
 Dynamic: project-url
-Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary
@@ -92,7 +89,7 @@ You can find pretrained models on our [🤗Hugging Face model page](https://hugg
 We provide a Python class [`CSVData`](./bayesianflow_for_chem/data.py) to handle data stored in CSV or similar format containing headers to identify the entities. The following is a quickstart.
-1. Download your dataset file (e.g., ESOL form [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
+1. Download your dataset file (e.g., ESOL from [MoleculeNet](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv)) and split the file:
 ```python
 >>> from bayesianflow_for_chem.tool import split_data

bayesianflow_for_chem-1.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+bayesianflow_for_chem/__init__.py,sha256=3sP8nM4_idOX-ksvpBJEApxPAVAPijKvQHxidTO5790,329
+bayesianflow_for_chem/data.py,sha256=WoOCOVmJX4WeHa2WeO4i66J2FS8rvRaYRCdlBN7ZeOM,6576
+bayesianflow_for_chem/model.py,sha256=fUrXKhn2U9FrVPJyb4lqACqPTePkIgI0v6_1jPs5c0Q,50784
+bayesianflow_for_chem/scorer.py,sha256=7G1TVSwC0qONtNm6kiDZUWwvuFPzasNSjp4eJAk5TL0,4101
+bayesianflow_for_chem/tool.py,sha256=NMMRHk2FJY0fyA76zCrz6tkcylCuExMUMj5hohWTnkE,23155
+bayesianflow_for_chem/train.py,sha256=hGKyhGhLch-exSYPZdLXrLn3gf39Q1VLSJs2qtuikQE,9709
+bayesianflow_for_chem/vocab.txt,sha256=HgtAZmpWYk4y8PqEVC4vqut1vE75DfRKE_10s2UW0rU,790
+bayesianflow_for_chem-1.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+bayesianflow_for_chem-1.4.0.dist-info/METADATA,sha256=1Y5mLIOaPsHcyCCm2SkWz7OCniQYVJ67-cVq3cUU0Mw,5643
+bayesianflow_for_chem-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+bayesianflow_for_chem-1.4.0.dist-info/top_level.txt,sha256=KHsanI3BMCt8D9Qpze2ycrF6nMa3PyojgO6eS1c8kco,22
+bayesianflow_for_chem-1.4.0.dist-info/RECORD,,

bayesianflow_for_chem-1.3.0.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-bayesianflow_for_chem/__init__.py,sha256=3BW4-ri8OcMZAIPJBT2q-48L3LAY776xluMDC6WXaZU,329
-bayesianflow_for_chem/data.py,sha256=EbCfhA1bCieVHVOYVk7nvgsaOzhKyFdnHd261qNR4BY,7763
-bayesianflow_for_chem/model.py,sha256=fFcfg1RZuoJeptAtglo2U8j1EGNSGjItMHqlKdLGGhU,50799
-bayesianflow_for_chem/scorer.py,sha256=7G1TVSwC0qONtNm6kiDZUWwvuFPzasNSjp4eJAk5TL0,4101
-bayesianflow_for_chem/tool.py,sha256=Z9qF80qzK-CJk9MJaWuSNOLnA-LPiD6CiC7S3sZbBP8,27704
-bayesianflow_for_chem/train.py,sha256=hGKyhGhLch-exSYPZdLXrLn3gf39Q1VLSJs2qtuikQE,9709
-bayesianflow_for_chem/vocab.txt,sha256=HgtAZmpWYk4y8PqEVC4vqut1vE75DfRKE_10s2UW0rU,790
-bayesianflow_for_chem-1.3.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-bayesianflow_for_chem-1.3.0.dist-info/METADATA,sha256=2BDjaVhIkd0TLolVETa2kb7xUGYhn8kdlq2CMfF-i7Y,5746
-bayesianflow_for_chem-1.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-bayesianflow_for_chem-1.3.0.dist-info/top_level.txt,sha256=KHsanI3BMCt8D9Qpze2ycrF6nMa3PyojgO6eS1c8kco,22
-bayesianflow_for_chem-1.3.0.dist-info/RECORD,,

{bayesianflow_for_chem-1.3.0.dist-info → bayesianflow_for_chem-1.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{bayesianflow_for_chem-1.3.0.dist-info → bayesianflow_for_chem-1.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{bayesianflow_for_chem-1.3.0.dist-info → bayesianflow_for_chem-1.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

bayesianflow-for-chem 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

Potentially problematic release.

bayesianflow-for-chem 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl