boltz-vsynthes 1.0.8__py3-none-any.whl → 1.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boltz/data/mol.py +0 -4
- boltz/data/parse/__init__.py +21 -0
- boltz/data/parse/pdb.py +71 -0
- boltz/data/parse/pdb_download.py +114 -0
- boltz/data/parse/schema.py +802 -141
- boltz/data/parse/sdf.py +60 -0
- boltz/main.py +176 -208
- {boltz_vsynthes-1.0.8.dist-info → boltz_vsynthes-1.0.10.dist-info}/METADATA +2 -2
- {boltz_vsynthes-1.0.8.dist-info → boltz_vsynthes-1.0.10.dist-info}/RECORD +13 -10
- {boltz_vsynthes-1.0.8.dist-info → boltz_vsynthes-1.0.10.dist-info}/WHEEL +0 -0
- {boltz_vsynthes-1.0.8.dist-info → boltz_vsynthes-1.0.10.dist-info}/entry_points.txt +0 -0
- {boltz_vsynthes-1.0.8.dist-info → boltz_vsynthes-1.0.10.dist-info}/licenses/LICENSE +0 -0
- {boltz_vsynthes-1.0.8.dist-info → boltz_vsynthes-1.0.10.dist-info}/top_level.txt +0 -0
boltz/data/parse/schema.py
CHANGED
@@ -621,9 +621,6 @@ def get_mol(ccd: str, mols: dict, moldir: str) -> Mol:
     Return mol with ccd from mols if it is in mols. Otherwise load it from moldir,
     add it to mols, and return the mol.
     """
-    # Skip if it's a SMILES string (starts with LIG)
-    if ccd.startswith("LIG"):
-        return None
     mol = mols.get(ccd)
     if mol is None:
         mol = load_molecules(moldir, [ccd])[ccd]
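The new get_mol body is a plain cache-with-fallback: return the molecule from the in-memory mols mapping when present, otherwise load it from the molecule directory and remember it. A minimal standalone sketch of that pattern follows; load_molecule_from_dir is a hypothetical stand-in for the package's load_molecules helper, and the one-SDF-per-CCD-code layout is an assumption, not the package's actual on-disk format.

from pathlib import Path
from typing import Dict

from rdkit import Chem
from rdkit.Chem import Mol


def load_molecule_from_dir(moldir: Path, ccd: str) -> Mol:
    # Hypothetical loader: read <CCD>.sdf from the molecule directory.
    return Chem.SDMolSupplier(str(moldir / f"{ccd}.sdf"), removeHs=False)[0]


def get_mol_cached(ccd: str, mols: Dict[str, Mol], moldir: Path) -> Mol:
    # Return the cached Mol if we already have it; otherwise load it from
    # disk, add it to the cache, and return it (mirroring the docstring above).
    mol = mols.get(ccd)
    if mol is None:
        mol = load_molecule_from_dir(moldir, ccd)
        mols[ccd] = mol
    return mol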
@@ -658,10 +655,6 @@ def parse_ccd_residue(
         The output ParsedResidue, if successful.

     """
-    # Skip if it's a SMILES string (starts with LIG)
-    if name.startswith("LIG"):
-        return None
-
     unk_chirality = const.chirality_type_ids[const.unk_chirality_type]

     # Check if this is a single heavy atom CCD residue
@@ -936,111 +929,100 @@ def token_spec_to_ids(
         contacts.append((chain_to_idx[chain_name], residue_index_or_atom_name - 1))


-def parse_boltz_schema(
-
+def parse_boltz_schema(  # noqa: C901, PLR0915, PLR0912
+    name: str,
+    schema: dict,
+    ccd: Mapping[str, Mol],
+    mol_dir: Optional[Path] = None,
+    boltz_2: bool = False,
+) -> Target:
+    """Parse a Boltz input yaml / json.
+
+    The input file should be a dictionary with the following format:
+
+    version: 1
+    sequences:
+        - protein:
+            id: A
+            sequence: "MADQLTEEQIAEFKEAFSLF"
+            msa: path/to/msa1.a3m
+        - protein:
+            id: [B, C]
+            sequence: "AKLSILPWGHC"
+            msa: path/to/msa2.a3m
+        - rna:
+            id: D
+            sequence: "GCAUAGC"
+        - ligand:
+            id: E
+            smiles: "CC1=CC=CC=C1"
+    constraints:
+        - bond:
+            atom1: [A, 1, CA]
+            atom2: [A, 2, N]
+        - pocket:
+            binder: E
+            contacts: [[B, 1], [B, 2]]
+            max_distance: 6
+        - contact:
+            token1: [A, 1]
+            token2: [B, 1]
+            max_distance: 6
+    templates:
+        - cif: path/to/template.cif
+    properties:
+        - affinity:
+            binder: E

     Parameters
     ----------
+    name : str
+        A name for the input.
     schema : dict
         The input schema.
+    components : dict
+        Dictionary of CCD components.
+    mol_dir: Path
+        Path to the directory containing the molecules.
+    boltz2: bool
+        Whether to parse the input for Boltz2.

     Returns
     -------
-
-        The parsed
+    Target
+        The parsed target.

     """
-    #
-
-
+    # Assert version 1
+    version = schema.get("version", 1)
+    if version != 1:
+        msg = f"Invalid version {version} in input!"
         raise ValueError(msg)

-    #
+    # Disable rdkit warnings
+    blocker = rdBase.BlockLogs()  # noqa: F841
+
+    # First group items that have the same type, sequence and modifications
     items_to_group = {}
     chain_name_to_entity_type = {}

-    # Keep track of ligand IDs
-    ligand_id = 1
-    ligand_id_map = {}
-
-    # Parse sequences
     for item in schema["sequences"]:
-
-
-
+        # Get entity type
+        entity_type = next(iter(item.keys())).lower()
+        if entity_type not in {"protein", "dna", "rna", "ligand"}:
+            msg = f"Invalid entity type: {entity_type}"
+            raise ValueError(msg)

         # Get sequence
-        if entity_type
-
-            seq = item[entity_type]["sequence"]
-        elif "pdb" in item[entity_type]:
-            pdb_input = item[entity_type]["pdb"]
-            if pdb_input.startswith(("http://", "https://")):
-                # It's a PDB ID
-                import requests
-                response = requests.get(f"https://files.rcsb.org/download/{pdb_input}.pdb")
-                if response.status_code != 200:
-                    msg = f"Failed to download PDB file: {pdb_input}"
-                    raise FileNotFoundError(msg)
-                pdb_data = response.text
-            else:
-                # It's a file path
-                pdb_path = Path(pdb_input)
-                if not pdb_path.exists():
-                    msg = f"PDB file not found: {pdb_path}"
-                    raise FileNotFoundError(msg)
-                with pdb_path.open("r") as f:
-                    pdb_data = f.read()
-
-            # Parse PDB data
-            from Bio.PDB import PDBParser
-            from io import StringIO
-            parser = PDBParser()
-            structure = parser.get_structure("protein", StringIO(pdb_data))
-
-            # Extract sequence
-            seq = ""
-            for model in structure:
-                for chain in model:
-                    for residue in chain:
-                        if residue.id[0] == " ":  # Only standard residues
-                            seq += residue.resname
-            seq = "".join(seq)
-        else:
-            msg = "Protein must have either 'sequence' or 'pdb' field"
-            raise ValueError(msg)
+        if entity_type in {"protein", "dna", "rna"}:
+            seq = str(item[entity_type]["sequence"])
         elif entity_type == "ligand":
-
+            assert "smiles" in item[entity_type] or "ccd" in item[entity_type]
+            assert "smiles" not in item[entity_type] or "ccd" not in item[entity_type]
             if "smiles" in item[entity_type]:
                 seq = str(item[entity_type]["smiles"])
-                # Map user-provided ID to internal LIG1, LIG2, etc.
-                for id in entity_id:
-                    ligand_id_map[id] = f"LIG{ligand_id}"
-                ligand_id += 1
-            elif "ccd" in item[entity_type]:
-                seq = str(item[entity_type]["ccd"])
-                # For CCD ligands, use the CCD code as the internal ID
-                for id in entity_id:
-                    ligand_id_map[id] = seq
-            elif "sdf" in item[entity_type]:
-                sdf_path = Path(item[entity_type]["sdf"])
-                if not sdf_path.exists():
-                    msg = f"SDF file not found: {sdf_path}"
-                    raise FileNotFoundError(msg)
-                # Read SDF and convert to SMILES
-                from rdkit import Chem
-                mol = Chem.SDMolSupplier(str(sdf_path))[0]
-                if mol is None:
-                    msg = f"Failed to read SDF file: {sdf_path}"
-                    raise ValueError(msg)
-                seq = Chem.MolToSmiles(mol)
-                # Map user-provided ID to internal LIG1, LIG2, etc.
-                for id in entity_id:
-                    ligand_id_map[id] = f"LIG{ligand_id}"
-                ligand_id += 1
             else:
-
-                raise ValueError(msg)
+                seq = str(item[entity_type]["ccd"])

         # Group items by entity
         items_to_group.setdefault((entity_type, seq), []).append(item)
@@ -1051,60 +1033,739 @@ def parse_boltz_schema(schema: dict) -> dict:
         for chain_name in chain_names:
             chain_name_to_entity_type[chain_name] = entity_type

-    #
-
-
-
-
-
-
+    # Check if any affinity ligand is present
+    affinity_ligands = set()
+    properties = schema.get("properties", [])
+    if properties and not boltz_2:
+        msg = "Affinity prediction is only supported for Boltz2!"
+        raise ValueError(msg)
+
+    for prop in properties:
+        prop_type = next(iter(prop.keys())).lower()
+        if prop_type == "affinity":
+            binder = prop["affinity"]["binder"]
+            if not isinstance(binder, str):
+                # TODO: support multi residue ligands and ccd's
+                msg = "Binder must be a single chain."
+                raise ValueError(msg)
+
+            if binder not in chain_name_to_entity_type:
+                msg = f"Could not find binder with name {binder} in the input!"
+                raise ValueError(msg)
+
+            if chain_name_to_entity_type[binder] != "ligand":
+                msg = (
+                    f"Chain {binder} is not a ligand! "
+                    "Affinity is currently only supported for ligands."
+                )
+                raise ValueError(msg)
+
+            affinity_ligands.add(binder)
+
+    # Check only one affinity ligand is present
+    if len(affinity_ligands) > 1:
+        msg = "Only one affinity ligand is currently supported!"
+        raise ValueError(msg)
+
+    # Go through entities and parse them
+    extra_mols: dict[str, Mol] = {}
+    chains: dict[str, ParsedChain] = {}
+    chain_to_msa: dict[str, str] = {}
+    entity_to_seq: dict[str, str] = {}
+    is_msa_custom = False
+    is_msa_auto = False
+    ligand_id = 1
+    for entity_id, items in enumerate(items_to_group.values()):
+        # Get entity type and sequence
+        entity_type = next(iter(items[0].keys())).lower()
+
+        # Get ids
+        ids = []
+        for item in items:
+            if isinstance(item[entity_type]["id"], str):
+                ids.append(item[entity_type]["id"])
+            elif isinstance(item[entity_type]["id"], list):
+                ids.extend(item[entity_type]["id"])
+
+        # Check if any affinity ligand is present
+        if len(ids) == 1:
+            affinity = ids[0] in affinity_ligands
+        elif (len(ids) > 1) and any(x in affinity_ligands for x in ids):
+            msg = "Cannot compute affinity for a ligand that has multiple copies!"
+            raise ValueError(msg)
+        else:
+            affinity = False
+
+        # Ensure all the items share the same msa
+        msa = -1
         if entity_type == "protein":
-
-
-
-
-
-
-
-
-
-
-
-
-
-            msg = f"Protein {binder} not found in sequences"
+            # Get the msa, default to 0, meaning auto-generated
+            msa = items[0][entity_type].get("msa", 0)
+            if (msa is None) or (msa == ""):
+                msa = 0
+
+            # Check if all MSAs are the same within the same entity
+            for item in items:
+                item_msa = item[entity_type].get("msa", 0)
+                if (item_msa is None) or (item_msa == ""):
+                    item_msa = 0
+
+                if item_msa != msa:
+                    msg = "All proteins with the same sequence must share the same MSA!"
                     raise ValueError(msg)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Set the MSA, warn if passed in single-sequence mode
+        if msa == "empty":
+            msa = -1
+            msg = (
+                "Found explicit empty MSA for some proteins, will run "
+                "these in single sequence mode. Keep in mind that the "
+                "model predictions will be suboptimal without an MSA."
+            )
+            click.echo(msg)
+
+        if msa not in (0, -1):
+            is_msa_custom = True
+        elif msa == 0:
+            is_msa_auto = True
+
+        # Parse a polymer
+        if entity_type in {"protein", "dna", "rna"}:
+            # Get token map
+            if entity_type == "rna":
+                token_map = const.rna_letter_to_token
+            elif entity_type == "dna":
+                token_map = const.dna_letter_to_token
+            elif entity_type == "protein":
+                token_map = const.prot_letter_to_token
+            else:
+                msg = f"Unknown polymer type: {entity_type}"
+                raise ValueError(msg)
+
+            # Get polymer info
+            chain_type = const.chain_type_ids[entity_type.upper()]
+            unk_token = const.unk_token[entity_type.upper()]
+
+            # Extract sequence
+            raw_seq = items[0][entity_type]["sequence"]
+            entity_to_seq[entity_id] = raw_seq
+
+            # Convert sequence to tokens
+            seq = [token_map.get(c, unk_token) for c in list(raw_seq)]
+
+            # Apply modifications
+            for mod in items[0][entity_type].get("modifications", []):
+                code = mod["ccd"]
+                idx = mod["position"] - 1  # 1-indexed
+                seq[idx] = code
+
+            cyclic = items[0][entity_type].get("cyclic", False)
+
+            # Parse a polymer
+            parsed_chain = parse_polymer(
+                sequence=seq,
+                raw_sequence=raw_seq,
+                entity=entity_id,
+                chain_type=chain_type,
+                components=ccd,
+                cyclic=cyclic,
+                mol_dir=mol_dir,
+            )
+
+        # Parse a non-polymer
+        elif (entity_type == "ligand") and "ccd" in (items[0][entity_type]):
+            seq = items[0][entity_type]["ccd"]
+
+            if isinstance(seq, str):
+                seq = [seq]
+
+            if affinity and len(seq) > 1:
+                msg = "Cannot compute affinity for multi residue ligands!"
+                raise ValueError(msg)
+
+            residues = []
+            affinity_mw = None
+            for res_idx, code in enumerate(seq):
+                # Get mol
+                ref_mol = get_mol(code, ccd, mol_dir)
+
+                if affinity:
+                    affinity_mw = AllChem.Descriptors.MolWt(ref_mol)
+
+                # Parse residue
+                residue = parse_ccd_residue(
+                    name=code,
+                    ref_mol=ref_mol,
+                    res_idx=res_idx,
+                )
+                residues.append(residue)
+
+            # Create multi ligand chain
+            parsed_chain = ParsedChain(
+                entity=entity_id,
+                residues=residues,
+                type=const.chain_type_ids["NONPOLYMER"],
+                cyclic_period=0,
+                sequence=None,
+                affinity=affinity,
+                affinity_mw=affinity_mw,
+            )
+
+            assert not items[0][entity_type].get(
+                "cyclic", False
+            ), "Cyclic flag is not supported for ligands"
+
+        elif (entity_type == "ligand") and ("smiles" in items[0][entity_type]):
+            seq = items[0][entity_type]["smiles"]
+
+            if affinity:
+                seq = standardize(seq)
+
+            mol = AllChem.MolFromSmiles(seq)
+            mol = AllChem.AddHs(mol)
+
+            # Set atom names
+            canonical_order = AllChem.CanonicalRankAtoms(mol)
+            for atom, can_idx in zip(mol.GetAtoms(), canonical_order):
+                atom_name = atom.GetSymbol().upper() + str(can_idx + 1)
+                if len(atom_name) > 4:
+                    msg = (
+                        f"{seq} has an atom with a name longer than "
+                        f"4 characters: {atom_name}."
+                    )
                     raise ValueError(msg)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                atom.SetProp("name", atom_name)
+
+            success = compute_3d_conformer(mol)
+            if not success:
+                msg = f"Failed to compute 3D conformer for {seq}"
+                raise ValueError(msg)
+
+            mol_no_h = AllChem.RemoveHs(mol, sanitize=False)
+            affinity_mw = AllChem.Descriptors.MolWt(mol_no_h) if affinity else None
+            extra_mols[f"LIG{ligand_id}"] = mol_no_h
+            residue = parse_ccd_residue(
+                name=f"LIG{ligand_id}",
+                ref_mol=mol,
+                res_idx=0,
+            )
+
+            ligand_id += 1
+            parsed_chain = ParsedChain(
+                entity=entity_id,
+                residues=[residue],
+                type=const.chain_type_ids["NONPOLYMER"],
+                cyclic_period=0,
+                sequence=None,
+                affinity=affinity,
+                affinity_mw=affinity_mw,
+            )
+
+            assert not items[0][entity_type].get(
+                "cyclic", False
+            ), "Cyclic flag is not supported for ligands"
+
+        else:
+            msg = f"Invalid entity type: {entity_type}"
+            raise ValueError(msg)
+
+        # Add as many chains as provided ids
+        for item in items:
+            ids = item[entity_type]["id"]
+            if isinstance(ids, str):
+                ids = [ids]
+            for chain_name in ids:
+                chains[chain_name] = parsed_chain
+                chain_to_msa[chain_name] = msa
+
+    # Check if msa is custom or auto
+    if is_msa_custom and is_msa_auto:
+        msg = "Cannot mix custom and auto-generated MSAs in the same input!"
+        raise ValueError(msg)
+
+    # If no chains parsed fail
+    if not chains:
+        msg = "No chains parsed!"
+        raise ValueError(msg)
+
+    # Create tables
+    atom_data = []
+    bond_data = []
+    res_data = []
+    chain_data = []
+    protein_chains = set()
+    affinity_info = None
+
+    rdkit_bounds_constraint_data = []
+    chiral_atom_constraint_data = []
+    stereo_bond_constraint_data = []
+    planar_bond_constraint_data = []
+    planar_ring_5_constraint_data = []
+    planar_ring_6_constraint_data = []
+
+    # Convert parsed chains to tables
+    atom_idx = 0
+    res_idx = 0
+    asym_id = 0
+    sym_count = {}
+    chain_to_idx = {}
+
+    # Keep a mapping of (chain_name, residue_idx, atom_name) to atom_idx
+    atom_idx_map = {}
+
+    for asym_id, (chain_name, chain) in enumerate(chains.items()):
+        # Compute number of atoms and residues
+        res_num = len(chain.residues)
+        atom_num = sum(len(res.atoms) for res in chain.residues)
+
+        # Save protein chains for later
+        if chain.type == const.chain_type_ids["PROTEIN"]:
+            protein_chains.add(chain_name)
+
+        # Add affinity info
+        if chain.affinity and affinity_info is not None:
+            msg = "Cannot compute affinity for multiple ligands!"
+            raise ValueError(msg)
+
+        if chain.affinity:
+            affinity_info = AffinityInfo(
+                chain_id=asym_id,
+                mw=chain.affinity_mw,
+            )
+
+        # Find all copies of this chain in the assembly
+        entity_id = int(chain.entity)
+        sym_id = sym_count.get(entity_id, 0)
+        chain_data.append(
+            (
+                chain_name,
+                chain.type,
+                entity_id,
+                sym_id,
+                asym_id,
+                atom_idx,
+                atom_num,
+                res_idx,
+                res_num,
+                chain.cyclic_period,
+            )
+        )
+        chain_to_idx[chain_name] = asym_id
+        sym_count[entity_id] = sym_id + 1
+
+        # Add residue, atom, bond, data
+        for res in chain.residues:
+            atom_center = atom_idx + res.atom_center
+            atom_disto = atom_idx + res.atom_disto
+            res_data.append(
+                (
+                    res.name,
+                    res.type,
+                    res.idx,
+                    atom_idx,
+                    len(res.atoms),
+                    atom_center,
+                    atom_disto,
+                    res.is_standard,
+                    res.is_present,
+                )
+            )
+
+            if res.rdkit_bounds_constraints is not None:
+                for constraint in res.rdkit_bounds_constraints:
+                    rdkit_bounds_constraint_data.append(  # noqa: PERF401
+                        (
+                            tuple(
+                                c_atom_idx + atom_idx
+                                for c_atom_idx in constraint.atom_idxs
+                            ),
+                            constraint.is_bond,
+                            constraint.is_angle,
+                            constraint.upper_bound,
+                            constraint.lower_bound,
+                        )
+                    )
+            if res.chiral_atom_constraints is not None:
+                for constraint in res.chiral_atom_constraints:
+                    chiral_atom_constraint_data.append(  # noqa: PERF401
+                        (
+                            tuple(
+                                c_atom_idx + atom_idx
+                                for c_atom_idx in constraint.atom_idxs
+                            ),
+                            constraint.is_reference,
+                            constraint.is_r,
+                        )
+                    )
+            if res.stereo_bond_constraints is not None:
+                for constraint in res.stereo_bond_constraints:
+                    stereo_bond_constraint_data.append(  # noqa: PERF401
+                        (
+                            tuple(
+                                c_atom_idx + atom_idx
+                                for c_atom_idx in constraint.atom_idxs
+                            ),
+                            constraint.is_check,
+                            constraint.is_e,
+                        )
+                    )
+            if res.planar_bond_constraints is not None:
+                for constraint in res.planar_bond_constraints:
+                    planar_bond_constraint_data.append(  # noqa: PERF401
+                        (
+                            tuple(
+                                c_atom_idx + atom_idx
+                                for c_atom_idx in constraint.atom_idxs
+                            ),
+                        )
+                    )
+            if res.planar_ring_5_constraints is not None:
+                for constraint in res.planar_ring_5_constraints:
+                    planar_ring_5_constraint_data.append(  # noqa: PERF401
+                        (
+                            tuple(
+                                c_atom_idx + atom_idx
+                                for c_atom_idx in constraint.atom_idxs
+                            ),
+                        )
+                    )
+            if res.planar_ring_6_constraints is not None:
+                for constraint in res.planar_ring_6_constraints:
+                    planar_ring_6_constraint_data.append(  # noqa: PERF401
+                        (
+                            tuple(
+                                c_atom_idx + atom_idx
+                                for c_atom_idx in constraint.atom_idxs
+                            ),
+                        )
+                    )
+
+            for bond in res.bonds:
+                atom_1 = atom_idx + bond.atom_1
+                atom_2 = atom_idx + bond.atom_2
+                bond_data.append(
+                    (
+                        asym_id,
+                        asym_id,
+                        res_idx,
+                        res_idx,
+                        atom_1,
+                        atom_2,
+                        bond.type,
+                    )
+                )
+
+            for atom in res.atoms:
+                # Add atom to map
+                atom_idx_map[(chain_name, res.idx, atom.name)] = (
+                    asym_id,
+                    res_idx,
+                    atom_idx,
+                )
+
+                # Add atom to data
+                atom_data.append(
+                    (
+                        atom.name,
+                        atom.element,
+                        atom.charge,
+                        atom.coords,
+                        atom.conformer,
+                        atom.is_present,
+                        atom.chirality,
+                    )
+                )
+                atom_idx += 1
+
+            res_idx += 1
+
+    # Parse constraints
+    connections = []
+    pocket_constraints = []
+    contact_constraints = []
+    constraints = schema.get("constraints", [])
+    for constraint in constraints:
+        if "bond" in constraint:
+            if "atom1" not in constraint["bond"] or "atom2" not in constraint["bond"]:
+                msg = f"Bond constraint was not properly specified"
+                raise ValueError(msg)
+
+            c1, r1, a1 = tuple(constraint["bond"]["atom1"])
+            c2, r2, a2 = tuple(constraint["bond"]["atom2"])
+            c1, r1, a1 = atom_idx_map[(c1, r1 - 1, a1)]  # 1-indexed
+            c2, r2, a2 = atom_idx_map[(c2, r2 - 1, a2)]  # 1-indexed
+            connections.append((c1, c2, r1, r2, a1, a2))
+        elif "pocket" in constraint:
+            if (
+                "binder" not in constraint["pocket"]
+                or "contacts" not in constraint["pocket"]
+            ):
+                msg = f"Pocket constraint was not properly specified"
+                raise ValueError(msg)
+
+            if len(pocket_constraints) > 0 and not boltz_2:
+                msg = f"Only one pocket binders is supported in Boltz-1!"
+                raise ValueError(msg)
+
+            max_distance = constraint["pocket"].get("max_distance", 6.0)
+            if max_distance != 6.0 and not boltz_2:
+                msg = f"Max distance != 6.0 is not supported in Boltz-1!"
+                raise ValueError(msg)
+
+            binder = constraint["pocket"]["binder"]
+            binder = chain_to_idx[binder]
+
+            contacts = []
+            for chain_name, residue_index_or_atom_name in constraint["pocket"][
+                "contacts"
+            ]:
+                if chains[chain_name].type == const.chain_type_ids["NONPOLYMER"]:
+                    # Non-polymer chains are indexed by atom name
+                    _, _, atom_idx = atom_idx_map[
+                        (chain_name, 0, residue_index_or_atom_name)
+                    ]
+                    contact = (chain_to_idx[chain_name], atom_idx)
+                else:
+                    # Polymer chains are indexed by residue index
+                    contact = (chain_to_idx[chain_name], residue_index_or_atom_name - 1)
+                contacts.append(contact)
+
+            pocket_constraints.append((binder, contacts, max_distance))
+        elif "contact" in constraint:
+            if (
+                "token1" not in constraint["contact"]
+                or "token2" not in constraint["contact"]
+            ):
+                msg = f"Contact constraint was not properly specified"
+                raise ValueError(msg)
+
+            if not boltz_2:
+                msg = f"Contact constraint is not supported in Boltz-1!"
+                raise ValueError(msg)
+
+            max_distance = constraint["contact"].get("max_distance", 6.0)
+
+            chain_name1, residue_index_or_atom_name1 = constraint["contact"]["token1"]
+            if chains[chain_name1].type == const.chain_type_ids["NONPOLYMER"]:
+                # Non-polymer chains are indexed by atom name
+                _, _, atom_idx = atom_idx_map[
+                    (chain_name1, 0, residue_index_or_atom_name1)
+                ]
+                token1 = (chain_to_idx[chain_name1], atom_idx)
+            else:
+                # Polymer chains are indexed by residue index
+                token1 = (chain_to_idx[chain_name1], residue_index_or_atom_name1 - 1)
+
+            pocket_constraints.append((binder, contacts, max_distance))
+        else:
+            msg = f"Invalid constraint: {constraint}"
+            raise ValueError(msg)
+
+    # Get protein sequences in this YAML
+    protein_seqs = {name: chains[name].sequence for name in protein_chains}
+
+    # Parse templates
+    template_schema = schema.get("templates", [])
+    if template_schema and not boltz_2:
+        msg = "Templates are not supported in Boltz 1.0!"
+        raise ValueError(msg)
+
+    templates = {}
+    template_records = []
+    for template in template_schema:
+        if "cif" not in template:
+            msg = "Template was not properly specified, missing CIF path!"
+            raise ValueError(msg)
+
+        path = template["cif"]
+        template_id = Path(path).stem
+        chain_ids = template.get("chain_id", None)
+        template_chain_ids = template.get("template_id", None)
+
+        # Check validity of input
+        matched = False
+
+        if chain_ids is not None and not isinstance(chain_ids, list):
+            chain_ids = [chain_ids]
+        if template_chain_ids is not None and not isinstance(template_chain_ids, list):
+            template_chain_ids = [template_chain_ids]
+
+        if (
+            template_chain_ids is not None
+            and chain_ids is not None
+            and len(template_chain_ids) != len(chain_ids)
+        ):
+            matched = True
+            if len(template_chain_ids) != len(chain_ids):
+                msg = (
+                    "When providing both the chain_id and template_id, the number of"
+                    "template_ids provided must match the number of chain_ids!"
+                )
+                raise ValueError(msg)
+
+        # Get relevant chains ids
+        if chain_ids is None:
+            chain_ids = list(protein_chains)
+
+        for chain_id in chain_ids:
+            if chain_id not in protein_chains:
+                msg = (
+                    f"Chain {chain_id} assigned for template"
+                    f"{template_id} is not one of the protein chains!"
+                )
+                raise ValueError(msg)
+
+        # Get relevant template chain ids
+        parsed_template = parse_mmcif(
+            path,
+            mols=ccd,
+            moldir=mol_dir,
+            use_assembly=False,
+            compute_interfaces=False,
+        )
+        template_proteins = {
+            str(c["name"])
+            for c in parsed_template.data.chains
+            if c["mol_type"] == const.chain_type_ids["PROTEIN"]
+        }
+        if template_chain_ids is None:
+            template_chain_ids = list(template_proteins)
+
+        for chain_id in template_chain_ids:
+            if chain_id not in template_proteins:
+                msg = (
+                    f"Template chain {chain_id} assigned for template"
+                    f"{template_id} is not one of the protein chains!"
+                )
+                raise ValueError(msg)
+
+        # Compute template records
+        if matched:
+            template_records.extend(
+                get_template_records_from_matching(
+                    template_id=template_id,
+                    chain_ids=chain_ids,
+                    sequences=protein_seqs,
+                    template_chain_ids=template_chain_ids,
+                    template_sequences=parsed_template.sequences,
+                )
+            )
+        else:
+            template_records.extend(
+                get_template_records_from_search(
+                    template_id=template_id,
+                    chain_ids=chain_ids,
+                    sequences=protein_seqs,
+                    template_chain_ids=template_chain_ids,
+                    template_sequences=parsed_template.sequences,
+                )
+            )
+        # Save template
+        templates[template_id] = parsed_template.data
+
+    # Convert into datatypes
+    residues = np.array(res_data, dtype=Residue)
+    chains = np.array(chain_data, dtype=Chain)
+    interfaces = np.array([], dtype=Interface)
+    mask = np.ones(len(chain_data), dtype=bool)
+    rdkit_bounds_constraints = np.array(
+        rdkit_bounds_constraint_data, dtype=RDKitBoundsConstraint
+    )
+    chiral_atom_constraints = np.array(
+        chiral_atom_constraint_data, dtype=ChiralAtomConstraint
+    )
+    stereo_bond_constraints = np.array(
+        stereo_bond_constraint_data, dtype=StereoBondConstraint
+    )
+    planar_bond_constraints = np.array(
+        planar_bond_constraint_data, dtype=PlanarBondConstraint
+    )
+    planar_ring_5_constraints = np.array(
+        planar_ring_5_constraint_data, dtype=PlanarRing5Constraint
+    )
+    planar_ring_6_constraints = np.array(
+        planar_ring_6_constraint_data, dtype=PlanarRing6Constraint
+    )
+
+    if boltz_2:
+        atom_data = [(a[0], a[3], a[5], 0.0, 1.0) for a in atom_data]
+        connections = [(*c, const.bond_type_ids["COVALENT"]) for c in connections]
+        bond_data = bond_data + connections
+        atoms = np.array(atom_data, dtype=AtomV2)
+        bonds = np.array(bond_data, dtype=BondV2)
+        coords = [(x,) for x in atoms["coords"]]
+        coords = np.array(coords, Coords)
+        ensemble = np.array([(0, len(coords))], dtype=Ensemble)
+        data = StructureV2(
+            atoms=atoms,
+            bonds=bonds,
+            residues=residues,
+            chains=chains,
+            interfaces=interfaces,
+            mask=mask,
+            coords=coords,
+            ensemble=ensemble,
+        )
+    else:
+        bond_data = [(b[4], b[5], b[6]) for b in bond_data]
+        atom_data = [(convert_atom_name(a[0]), *a[1:]) for a in atom_data]
+        atoms = np.array(atom_data, dtype=Atom)
+        bonds = np.array(bond_data, dtype=Bond)
+        connections = np.array(connections, dtype=Connection)
+        data = Structure(
+            atoms=atoms,
+            bonds=bonds,
+            residues=residues,
+            chains=chains,
+            connections=connections,
+            interfaces=interfaces,
+            mask=mask,
+        )
+
+    # Create metadata
+    struct_info = StructureInfo(num_chains=len(chains))
+    chain_infos = []
+    for chain in chains:
+        chain_info = ChainInfo(
+            chain_id=int(chain["asym_id"]),
+            chain_name=chain["name"],
+            mol_type=int(chain["mol_type"]),
+            cluster_id=-1,
+            msa_id=chain_to_msa[chain["name"]],
+            num_residues=int(chain["res_num"]),
+            valid=True,
+            entity_id=int(chain["entity_id"]),
+        )
+        chain_infos.append(chain_info)
+
+    options = InferenceOptions(pocket_constraints=pocket_constraints)
+    record = Record(
+        id=name,
+        structure=struct_info,
+        chains=chain_infos,
+        interfaces=[],
+        inference_options=options,
+        templates=template_records,
+        affinity=affinity_info,
+    )
+
+    residue_constraints = ResidueConstraints(
+        rdkit_bounds_constraints=rdkit_bounds_constraints,
+        chiral_atom_constraints=chiral_atom_constraints,
+        stereo_bond_constraints=stereo_bond_constraints,
+        planar_bond_constraints=planar_bond_constraints,
+        planar_ring_5_constraints=planar_ring_5_constraints,
+        planar_ring_6_constraints=planar_ring_6_constraints,
+    )
+
+    return Target(
+        record=record,
+        structure=data,
+        sequences=entity_to_seq,
+        residue_constraints=residue_constraints,
+        templates=templates,
+        extra_mols=extra_mols,
+    )


 def standardize(smiles: str) -> Optional[str]:
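For SMILES ligands, the added code names every atom as its element symbol plus its canonical rank (C1, N7, ...) before building a 3D conformer. A standalone sketch of that naming step with plain RDKit is below; it mirrors the logic in the diff but calls RDKit's generic embedder where the package uses its own compute_3d_conformer helper.

from rdkit import Chem
from rdkit.Chem import AllChem

smiles = "CC1=CC=CC=C1"  # same example ligand as the docstring
mol = Chem.MolFromSmiles(smiles)
mol = Chem.AddHs(mol)

# Canonical ranks make the generated names independent of input atom order.
canonical_order = Chem.CanonicalRankAtoms(mol)
for atom, can_idx in zip(mol.GetAtoms(), canonical_order):
    atom_name = atom.GetSymbol().upper() + str(can_idx + 1)
    if len(atom_name) > 4:
        raise ValueError(f"atom name longer than 4 characters: {atom_name}")
    atom.SetProp("name", atom_name)

# Generic RDKit embedding; the package calls compute_3d_conformer(mol) here.
AllChem.EmbedMolecule(mol, randomSeed=0)

print(sorted(a.GetProp("name") for a in mol.GetAtoms()))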