PyPI - boltz-vsynthes - Versions diffs - 1.0.0__py3-none-any.whl - Mend

boltz-vsynthes 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (112) hide show

boltz/__init__.py +7 -0
boltz/data/__init__.py +0 -0
boltz/data/const.py +1184 -0
boltz/data/crop/__init__.py +0 -0
boltz/data/crop/affinity.py +164 -0
boltz/data/crop/boltz.py +296 -0
boltz/data/crop/cropper.py +45 -0
boltz/data/feature/__init__.py +0 -0
boltz/data/feature/featurizer.py +1230 -0
boltz/data/feature/featurizerv2.py +2208 -0
boltz/data/feature/symmetry.py +602 -0
boltz/data/filter/__init__.py +0 -0
boltz/data/filter/dynamic/__init__.py +0 -0
boltz/data/filter/dynamic/date.py +76 -0
boltz/data/filter/dynamic/filter.py +24 -0
boltz/data/filter/dynamic/max_residues.py +37 -0
boltz/data/filter/dynamic/resolution.py +34 -0
boltz/data/filter/dynamic/size.py +38 -0
boltz/data/filter/dynamic/subset.py +42 -0
boltz/data/filter/static/__init__.py +0 -0
boltz/data/filter/static/filter.py +26 -0
boltz/data/filter/static/ligand.py +37 -0
boltz/data/filter/static/polymer.py +299 -0
boltz/data/module/__init__.py +0 -0
boltz/data/module/inference.py +307 -0
boltz/data/module/inferencev2.py +429 -0
boltz/data/module/training.py +684 -0
boltz/data/module/trainingv2.py +660 -0
boltz/data/mol.py +900 -0
boltz/data/msa/__init__.py +0 -0
boltz/data/msa/mmseqs2.py +235 -0
boltz/data/pad.py +84 -0
boltz/data/parse/__init__.py +0 -0
boltz/data/parse/a3m.py +134 -0
boltz/data/parse/csv.py +100 -0
boltz/data/parse/fasta.py +138 -0
boltz/data/parse/mmcif.py +1239 -0
boltz/data/parse/mmcif_with_constraints.py +1607 -0
boltz/data/parse/schema.py +1851 -0
boltz/data/parse/yaml.py +68 -0
boltz/data/sample/__init__.py +0 -0
boltz/data/sample/cluster.py +283 -0
boltz/data/sample/distillation.py +57 -0
boltz/data/sample/random.py +39 -0
boltz/data/sample/sampler.py +49 -0
boltz/data/tokenize/__init__.py +0 -0
boltz/data/tokenize/boltz.py +195 -0
boltz/data/tokenize/boltz2.py +396 -0
boltz/data/tokenize/tokenizer.py +24 -0
boltz/data/types.py +777 -0
boltz/data/write/__init__.py +0 -0
boltz/data/write/mmcif.py +305 -0
boltz/data/write/pdb.py +171 -0
boltz/data/write/utils.py +23 -0
boltz/data/write/writer.py +330 -0
boltz/main.py +1292 -0
boltz/model/__init__.py +0 -0
boltz/model/layers/__init__.py +0 -0
boltz/model/layers/attention.py +132 -0
boltz/model/layers/attentionv2.py +111 -0
boltz/model/layers/confidence_utils.py +231 -0
boltz/model/layers/dropout.py +34 -0
boltz/model/layers/initialize.py +100 -0
boltz/model/layers/outer_product_mean.py +98 -0
boltz/model/layers/pair_averaging.py +135 -0
boltz/model/layers/pairformer.py +337 -0
boltz/model/layers/relative.py +58 -0
boltz/model/layers/transition.py +78 -0
boltz/model/layers/triangular_attention/__init__.py +0 -0
boltz/model/layers/triangular_attention/attention.py +189 -0
boltz/model/layers/triangular_attention/primitives.py +409 -0
boltz/model/layers/triangular_attention/utils.py +380 -0
boltz/model/layers/triangular_mult.py +212 -0
boltz/model/loss/__init__.py +0 -0
boltz/model/loss/bfactor.py +49 -0
boltz/model/loss/confidence.py +590 -0
boltz/model/loss/confidencev2.py +621 -0
boltz/model/loss/diffusion.py +171 -0
boltz/model/loss/diffusionv2.py +134 -0
boltz/model/loss/distogram.py +48 -0
boltz/model/loss/distogramv2.py +105 -0
boltz/model/loss/validation.py +1025 -0
boltz/model/models/__init__.py +0 -0
boltz/model/models/boltz1.py +1286 -0
boltz/model/models/boltz2.py +1249 -0
boltz/model/modules/__init__.py +0 -0
boltz/model/modules/affinity.py +223 -0
boltz/model/modules/confidence.py +481 -0
boltz/model/modules/confidence_utils.py +181 -0
boltz/model/modules/confidencev2.py +495 -0
boltz/model/modules/diffusion.py +844 -0
boltz/model/modules/diffusion_conditioning.py +116 -0
boltz/model/modules/diffusionv2.py +677 -0
boltz/model/modules/encoders.py +639 -0
boltz/model/modules/encodersv2.py +565 -0
boltz/model/modules/transformers.py +322 -0
boltz/model/modules/transformersv2.py +261 -0
boltz/model/modules/trunk.py +688 -0
boltz/model/modules/trunkv2.py +828 -0
boltz/model/modules/utils.py +303 -0
boltz/model/optim/__init__.py +0 -0
boltz/model/optim/ema.py +389 -0
boltz/model/optim/scheduler.py +99 -0
boltz/model/potentials/__init__.py +0 -0
boltz/model/potentials/potentials.py +497 -0
boltz/model/potentials/schedules.py +32 -0
boltz_vsynthes-1.0.0.dist-info/METADATA +151 -0
boltz_vsynthes-1.0.0.dist-info/RECORD +112 -0
boltz_vsynthes-1.0.0.dist-info/WHEEL +5 -0
boltz_vsynthes-1.0.0.dist-info/entry_points.txt +2 -0
boltz_vsynthes-1.0.0.dist-info/licenses/LICENSE +21 -0
boltz_vsynthes-1.0.0.dist-info/top_level.txt +1 -0

boltz/data/write/__init__.py ADDED Viewed

File without changes

boltz/data/write/mmcif.py ADDED Viewed

@@ -0,0 +1,305 @@
+import io
+import re
+from collections.abc import Iterator
+from typing import Optional
+import ihm
+import modelcif
+from modelcif import Assembly, AsymUnit, Entity, System, dumper
+from modelcif.model import AbInitioModel, Atom, ModelGroup
+from rdkit import Chem
+from torch import Tensor
+from boltz.data import const
+from boltz.data.types import Structure
+def to_mmcif(
+    structure: Structure,
+    plddts: Optional[Tensor] = None,
+    boltz2: bool = False,
+) -> str:  # noqa: C901, PLR0915, PLR0912
+    """Write a structure into an MMCIF file.
+    Chains are grouped into entities by ``entity_id``, every chain becomes
+    one asymmetric unit of a single "Modeled assembly", and the atoms are
+    emitted through a ``modelcif`` ab-initio model that is dumped to text.
+    Parameters
+    ----------
+    structure : Structure
+        The input structure
+    plddts : Optional[Tensor]
+        Confidence values that are multiplied by 100 and written as the
+        per-atom B-factor and as local pLDDT QA metrics. They are indexed
+        with one entry per residue for polymer chains and one entry per
+        atom for NONPOLYMER chains, following the chain order. When
+        ``None``, every B-factor is 100.0 and no QA metrics are added.
+    boltz2 : bool
+        If True, ``atom["name"]`` is used as a string and the element is
+        derived from that name (``const.ambiguous_atoms`` or its first
+        character). If False, the name is decoded from integer codes and
+        the element comes from ``atom["element"]`` via the RDKit periodic
+        table.
+    Returns
+    -------
+    str
+        the output MMCIF file
+    """
+    system = System()
+    # Load periodic table for element mapping
+    periodic_table = Chem.GetPeriodicTable()
+    # Map entities to chain_ids
+    # entity_to_chains: entity_id -> list of chain records sharing it.
+    # entity_to_moltype: entity_id -> mol_type of the last chain seen for it.
+    entity_to_chains = {}
+    entity_to_moltype = {}
+    for chain in structure.chains:
+        entity_id = chain["entity_id"]
+        mol_type = chain["mol_type"]
+        entity_to_chains.setdefault(entity_id, []).append(chain)
+        entity_to_moltype[entity_id] = mol_type
+    # Map entities to sequences
+    sequences = {}
+    for entity in entity_to_chains:
+        # Get the first chain
+        # (only the first chain of an entity is used to read its sequence)
+        chain = entity_to_chains[entity][0]
+        # Get the sequence
+        res_start = chain["res_idx"]
+        res_end = chain["res_idx"] + chain["res_num"]
+        residues = structure.residues[res_start:res_end]
+        sequence = [str(res["name"]) for res in residues]
+        sequences[entity] = sequence
+    # Create entity objects
+    # lig_entity is the single Entity shared by every one-residue "LIG" entity.
+    lig_entity = None
+    # entities_map: chain asym_id -> Entity object used for that chain.
+    entities_map = {}
+    for entity, sequence in sequences.items():
+        mol_type = entity_to_moltype[entity]
+        # Pick the alphabet used to resolve known residue names and the
+        # fallback factory used for names the alphabet does not contain.
+        if mol_type == const.chain_type_ids["PROTEIN"]:
+            alphabet = ihm.LPeptideAlphabet()
+            chem_comp = lambda x: ihm.LPeptideChemComp(id=x, code=x, code_canonical="X")  # noqa: E731
+        elif mol_type == const.chain_type_ids["DNA"]:
+            alphabet = ihm.DNAAlphabet()
+            chem_comp = lambda x: ihm.DNAChemComp(id=x, code=x, code_canonical="N")  # noqa: E731
+        elif mol_type == const.chain_type_ids["RNA"]:
+            alphabet = ihm.RNAAlphabet()
+            chem_comp = lambda x: ihm.RNAChemComp(id=x, code=x, code_canonical="N")  # noqa: E731
+        elif len(sequence) > 1:
+            # Any other multi-residue chain is written as a saccharide.
+            alphabet = {}
+            chem_comp = lambda x: ihm.SaccharideChemComp(id=x)  # noqa: E731
+        else:
+            # Any other single-residue chain is written as a non-polymer.
+            alphabet = {}
+            chem_comp = lambda x: ihm.NonPolymerChemComp(id=x)  # noqa: E731
+        # Handle smiles
+        # Every single-residue entity named "LIG" reuses one Entity object,
+        # created the first time such an entity is met.
+        if len(sequence) == 1 and (sequence[0] == "LIG"):
+            if lig_entity is None:
+                seq = [chem_comp(sequence[0])]
+                lig_entity = Entity(seq)
+            model_e = lig_entity
+        else:
+            seq = [
+                alphabet[item] if item in alphabet else chem_comp(item)
+                for item in sequence
+            ]
+            model_e = Entity(seq)
+        for chain in entity_to_chains[entity]:
+            chain_idx = chain["asym_id"]
+            entities_map[chain_idx] = model_e
+    # We don't assume that symmetry is perfect, so we dump everything
+    # into the asymmetric unit, and produce just a single assembly
+    # asym_unit_map: chain asym_id -> asym unit created for that chain.
+    asym_unit_map = {}
+    for chain in structure.chains:
+        # Define the model assembly
+        chain_idx = chain["asym_id"]
+        chain_tag = str(chain["name"])
+        entity = entities_map[chain_idx]
+        if entity.type == "water":
+            # NOTE(review): the positional ``1`` is presumably the number of
+            # waters in the unit - confirm against the ihm API.
+            asym = ihm.WaterAsymUnit(
+                entity,
+                1,
+                details="Model subunit %s" % chain_tag,
+                id=chain_tag,
+            )
+        else:
+            asym = AsymUnit(
+                entity,
+                details="Model subunit %s" % chain_tag,
+                id=chain_tag,
+            )
+        asym_unit_map[chain_idx] = asym
+    modeled_assembly = Assembly(asym_unit_map.values(), name="Modeled assembly")
+    # Local (per-residue) pLDDT metric type attached to the model below.
+    class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+        name = "pLDDT"
+        software = None
+        description = "Predicted lddt"
+    # Model whose atoms and QA metrics are produced from the enclosing
+    # function's ``structure``, ``plddts``, ``boltz2`` and ``asym_unit_map``.
+    class _MyModel(AbInitioModel):
+        def get_atoms(self) -> Iterator[Atom]:
+            # Yields one Atom per present atom of the structure, chain by chain.
+            # Index into plddt tensor for current residue.
+            res_num = 0
+            # Tracks non-ligand plddt tensor indices,
+            # Initializing to -1 handles case where ligand is resnum 0
+            prev_polymer_resnum = -1
+            # Tracks ligand indices.
+            # (incremented once for every NONPOLYMER atom that is yielded)
+            ligand_index_offset = 0
+            # Add all atom sites.
+            for chain in structure.chains:
+                # NOTE(review): the original comment here said chains are
+                # renamed in alphabetical order, but no renaming happens in
+                # this loop; the asym units keep the ids built from
+                # chain["name"] above.
+                het = chain["mol_type"] == const.chain_type_ids["NONPOLYMER"]
+                chain_idx = chain["asym_id"]
+                res_start = chain["res_idx"]
+                res_end = chain["res_idx"] + chain["res_num"]
+                record_type = (
+                    "ATOM"
+                    if chain["mol_type"] != const.chain_type_ids["NONPOLYMER"]
+                    else "HETATM"
+                )
+                residues = structure.residues[res_start:res_end]
+                for residue in residues:
+                    res_name = str(residue["name"])
+                    atom_start = residue["atom_idx"]
+                    atom_end = residue["atom_idx"] + residue["atom_num"]
+                    atoms = structure.atoms[atom_start:atom_end]
+                    atom_coords = atoms["coords"]
+                    for i, atom in enumerate(atoms):
+                        # This should not happen on predictions, but just in case.
+                        if not atom["is_present"]:
+                            continue
+                        if boltz2:
+                            # The name is already text; strip digits to get the
+                            # key used to resolve the element.
+                            atom_name = str(atom["name"])
+                            atom_key = re.sub(r"\d", "", atom_name)
+                            if atom_key in const.ambiguous_atoms:
+                                # An entry is either a plain element string or a
+                                # mapping from residue name to element with a
+                                # "*" fallback.
+                                if isinstance(const.ambiguous_atoms[atom_key], str):
+                                    element = const.ambiguous_atoms[atom_key]
+                                elif res_name in const.ambiguous_atoms[atom_key]:
+                                    element = const.ambiguous_atoms[atom_key][res_name]
+                                else:
+                                    element = const.ambiguous_atoms[atom_key]["*"]
+                            else:
+                                element = atom_key[0]
+                        else:
+                            # The name is a sequence of integer codes: zeros are
+                            # dropped and every other code is shifted by 32 to
+                            # obtain its character.
+                            atom_name = atom["name"]
+                            atom_name = [chr(c + 32) for c in atom_name if c != 0]
+                            atom_name = "".join(atom_name)
+                            element = periodic_table.GetElementSymbol(
+                                atom["element"].item()
+                            )
+                        element = element.upper()
+                        # Sequence ids are written 1-based.
+                        residue_index = residue["res_idx"] + 1
+                        pos = atom_coords[i]
+                        if record_type != "HETATM":
+                            # The current residue plddt is stored at the res_num
+                            # index, shifted by the number of ligand atoms
+                            # already written.
+                            biso = (
+                                100.00
+                                if plddts is None
+                                else round(
+                                    plddts[res_num + ligand_index_offset].item() * 100,
+                                    3,
+                                )
+                            )
+                            prev_polymer_resnum = res_num
+                        else:
+                            # If not a polymer resnum, we can get index into
+                            # plddts by adding offset relative to previous
+                            # polymer resnum.
+                            ligand_index_offset += 1
+                            biso = (
+                                100.00
+                                if plddts is None
+                                else round(
+                                    plddts[
+                                        prev_polymer_resnum + ligand_index_offset
+                                    ].item()
+                                    * 100,
+                                    3,
+                                )
+                            )
+                        yield Atom(
+                            asym_unit=asym_unit_map[chain_idx],
+                            type_symbol=element,
+                            seq_id=residue_index,
+                            atom_id=atom_name,
+                            x=f"{pos[0]:.5f}",
+                            y=f"{pos[1]:.5f}",
+                            z=f"{pos[2]:.5f}",
+                            het=het,
+                            biso=biso,
+                            occupancy=1,
+                        )
+                    # Only polymer residues advance the per-residue index.
+                    if record_type != "HETATM":
+                        res_num += 1
+        def add_plddt(self, plddts):
+            # Appends one _LocalPLDDT metric per residue to self.qa_metrics,
+            # using the same index bookkeeping as get_atoms.
+            res_num = 0
+            prev_polymer_resnum = (
+                -1
+            )  # -1 handles case where ligand is the first residue
+            ligand_index_offset = 0
+            for chain in structure.chains:
+                chain_idx = chain["asym_id"]
+                res_start = chain["res_idx"]
+                res_end = chain["res_idx"] + chain["res_num"]
+                residues = structure.residues[res_start:res_end]
+                record_type = (
+                    "ATOM"
+                    if chain["mol_type"] != const.chain_type_ids["NONPOLYMER"]
+                    else "HETATM"
+                )
+                # NOTE(review): the original comment here said chains are
+                # renamed in alphabetical order; no renaming happens in this loop.
+                for residue in residues:
+                    residue_idx = residue["res_idx"] + 1
+                    # NOTE(review): atom_start and atom_end are not used below.
+                    atom_start = residue["atom_idx"]
+                    atom_end = residue["atom_idx"] + residue["atom_num"]
+                    if record_type != "HETATM":
+                        # The current residue plddt is stored at the res_num
+                        # index, shifted by the number of ligand atoms already
+                        # accounted for.
+                        self.qa_metrics.append(
+                            _LocalPLDDT(
+                                asym_unit_map[chain_idx].residue(residue_idx),
+                                round(
+                                    plddts[res_num + ligand_index_offset].item() * 100,
+                                    3,
+                                ),
+                            )
+                        )
+                        prev_polymer_resnum = res_num
+                    else:
+                        # If not a polymer resnum, we can get index into plddts
+                        # by adding offset relative to previous polymer resnum.
+                        # The residue value is the mean over its atom_num
+                        # consecutive entries.
+                        # NOTE(review): this is rounded to 2 decimals while the
+                        # polymer branch uses 3, and the offset grows by
+                        # atom_num here but by one per *present* atom in
+                        # get_atoms - the two agree only when no atom is absent.
+                        self.qa_metrics.append(
+                            _LocalPLDDT(
+                                asym_unit_map[chain_idx].residue(residue_idx),
+                                round(
+                                    plddts[
+                                        prev_polymer_resnum
+                                        + ligand_index_offset
+                                        + 1 : prev_polymer_resnum
+                                        + ligand_index_offset
+                                        + residue["atom_num"]
+                                        + 1
+                                    ]
+                                    .mean()
+                                    .item()
+                                    * 100,
+                                    2,
+                                ),
+                            )
+                        )
+                        ligand_index_offset += residue["atom_num"]
+                    if record_type != "HETATM":
+                        res_num += 1
+    # Add the model and modeling protocol to the file and write them out:
+    model = _MyModel(assembly=modeled_assembly, name="Model")
+    # QA metrics are only attached when confidences were supplied.
+    if plddts is not None:
+        model.add_plddt(plddts)
+    model_group = ModelGroup([model], name="All models")
+    system.model_groups.append(model_group)
+    # Dump into an in-memory buffer and return its text.
+    fh = io.StringIO()
+    dumper.write(fh, [system])
+    return fh.getvalue()

boltz/data/write/pdb.py ADDED Viewed

@@ -0,0 +1,171 @@
+import re
+from typing import Optional
+from rdkit import Chem
+from torch import Tensor
+from boltz.data import const
+from boltz.data.types import Structure
+def to_pdb(
+    structure: Structure,
+    plddts: Optional[Tensor] = None,
+    boltz2: bool = False,
+) -> str:  # noqa: PLR0915
+    """Write a structure into a PDB file.
+    Parameters
+    ----------
+    structure : Structure
+        The input structure
+    Returns
+    -------
+    str
+        the output PDB file
+    """
+    pdb_lines = []
+    atom_index = 1
+    atom_reindex_ter = []
+    # Load periodic table for element mapping
+    periodic_table = Chem.GetPeriodicTable()
+    # Index into plddt tensor for current residue.
+    res_num = 0
+    # Tracks non-ligand plddt tensor indices,
+    # Initializing to -1 handles case where ligand is resnum 0
+    prev_polymer_resnum = -1
+    # Tracks ligand indices.
+    ligand_index_offset = 0
+    # Add all atom sites.
+    for chain in structure.chains:
+        # We rename the chains in alphabetical order
+        chain_idx = chain["asym_id"]
+        chain_tag = chain["name"]
+        res_start = chain["res_idx"]
+        res_end = chain["res_idx"] + chain["res_num"]
+        residues = structure.residues[res_start:res_end]
+        for residue in residues:
+            res_name = str(residue["name"])
+            atom_start = residue["atom_idx"]
+            atom_end = residue["atom_idx"] + residue["atom_num"]
+            atoms = structure.atoms[atom_start:atom_end]
+            atom_coords = atoms["coords"]
+            for i, atom in enumerate(atoms):
+                # This should not happen on predictions, but just in case.
+                if not atom["is_present"]:
+                    continue
+                record_type = (
+                    "ATOM"
+                    if chain["mol_type"] != const.chain_type_ids["NONPOLYMER"]
+                    else "HETATM"
+                )
+                name = str(atom["name"])
+                if boltz2:
+                    atom_name = str(atom["name"])
+                    atom_key = re.sub(r"\d", "", atom_name)
+                    if atom_key in const.ambiguous_atoms:
+                        if isinstance(const.ambiguous_atoms[atom_key], str):
+                            element = const.ambiguous_atoms[atom_key]
+                        elif res_name in const.ambiguous_atoms[atom_key]:
+                            element = const.ambiguous_atoms[atom_key][res_name]
+                        else:
+                            element = const.ambiguous_atoms[atom_key]["*"]
+                    else:
+                        element = atom_key[0]
+                else:
+                    atom_name = atom["name"]
+                    atom_name = [chr(c + 32) for c in atom_name if c != 0]
+                    atom_name = "".join(atom_name)
+                    element = periodic_table.GetElementSymbol(atom["element"].item())
+                name = name if len(name) == 4 else f" {name}"  # noqa: PLR2004
+                alt_loc = ""
+                insertion_code = ""
+                occupancy = 1.00
+                element = element.upper()
+                charge = ""
+                residue_index = residue["res_idx"] + 1
+                pos = atom_coords[i]
+                res_name_3 = (
+                    "LIG" if record_type == "HETATM" else str(residue["name"][:3])
+                )
+                if record_type != "HETATM":
+                    # The current residue plddt is stored at the res_num index unless a ligand has previouly been added.
+                    b_factor = (
+                        100.00
+                        if plddts is None
+                        else round(
+                            plddts[res_num + ligand_index_offset].item() * 100, 2
+                        )
+                    )
+                    prev_polymer_resnum = res_num
+                else:
+                    # If not a polymer resnum, we can get index into plddts by adding offset relative to previous polymer resnum.
+                    ligand_index_offset += 1
+                    b_factor = (
+                        100.00
+                        if plddts is None
+                        else round(
+                            plddts[prev_polymer_resnum + ligand_index_offset].item()
+                            * 100,
+                            2,
+                        )
+                    )
+                # PDB is a columnar format, every space matters here!
+                atom_line = (
+                    f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
+                    f"{res_name_3:>3} {chain_tag:>1}"
+                    f"{residue_index:>4}{insertion_code:>1}   "
+                    f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
+                    f"{occupancy:>6.2f}{b_factor:>6.2f}          "
+                    f"{element:>2}{charge:>2}"
+                )
+                pdb_lines.append(atom_line)
+                atom_reindex_ter.append(atom_index)
+                atom_index += 1
+            if record_type != "HETATM":
+                res_num += 1
+        should_terminate = chain_idx < (len(structure.chains) - 1)
+        if should_terminate:
+            # Close the chain.
+            chain_end = "TER"
+            chain_termination_line = (
+                f"{chain_end:<6}{atom_index:>5}      "
+                f"{res_name_3:>3} "
+                f"{chain_tag:>1}{residue_index:>4}"
+            )
+            pdb_lines.append(chain_termination_line)
+            atom_index += 1
+    # Dump CONECT records.
+    all_bonds = structure.bonds
+    if hasattr(structure, "connections"):
+        all_bonds = all_bonds + structure.connections
+    for bond in all_bonds:
+        atom1 = structure.atoms[bond["atom_1"]]
+        atom2 = structure.atoms[bond["atom_2"]]
+        if not atom1["is_present"] or not atom2["is_present"]:
+            continue
+        atom1_idx = atom_reindex_ter[bond["atom_1"]]
+        atom2_idx = atom_reindex_ter[bond["atom_2"]]
+        conect_line = f"CONECT{atom1_idx:>5}{atom2_idx:>5}"
+        pdb_lines.append(conect_line)
+    pdb_lines.append("END")
+    pdb_lines.append("")
+    pdb_lines = [line.ljust(80) for line in pdb_lines]
+    return "\n".join(pdb_lines)

boltz/data/write/utils.py ADDED Viewed

@@ -0,0 +1,23 @@
+import string
+from collections.abc import Iterator
+def generate_tags() -> Iterator[str]:
+    """Generate chain tags.
+    Yields
+    ------
+    str
+        The next chain tag
+    """
+    for i in range(1, 4):
+        for j in range(len(string.ascii_uppercase) ** i):
+            tag = ""
+            for k in range(i):
+                tag += string.ascii_uppercase[
+                    j
+                    // (len(string.ascii_uppercase) ** k)
+                    % len(string.ascii_uppercase)
+                ]
+            yield tag