boltz-vsynthes 1.0.8__py3-none-any.whl → 1.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
boltz/data/mol.py CHANGED
@@ -30,10 +30,6 @@ def load_molecules(moldir: str, molecules: list[str]) -> dict[str, Mol]:
30
30
  """
31
31
  loaded_mols = {}
32
32
  for molecule in molecules:
33
- # Skip if it's a SMILES string (starts with LIG)
34
- if molecule.startswith("LIG"):
35
- continue
36
-
37
33
  path = Path(moldir) / f"{molecule}.pkl"
38
34
  if not path.exists():
39
35
  msg = f"CCD component {molecule} not found!"
@@ -0,0 +1,21 @@
1
+ from boltz.data.parse.pdb import parse_pdb
2
+ from boltz.data.parse.sdf import parse_sdf
3
+ from boltz.data.parse.pdb_download import parse_pdb_id
4
+ from boltz.data.parse.yaml import parse_yaml
5
+ from boltz.data.parse.fasta import parse_fasta
6
+ from boltz.data.parse.a3m import parse_a3m
7
+ from boltz.data.parse.csv import parse_csv
8
+ from boltz.data.parse.mmcif import parse_mmcif
9
+ from boltz.data.parse.mmcif_with_constraints import parse_mmcif_with_constraints
10
+
11
+ __all__ = [
12
+ "parse_pdb",
13
+ "parse_sdf",
14
+ "parse_pdb_id",
15
+ "parse_yaml",
16
+ "parse_fasta",
17
+ "parse_a3m",
18
+ "parse_csv",
19
+ "parse_mmcif",
20
+ "parse_mmcif_with_constraints",
21
+ ]
@@ -0,0 +1,71 @@
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+ from Bio import PDB
5
+ from Bio.PDB.Polypeptide import three_to_one
6
+ from rdkit import Chem
7
+ from rdkit.Chem.rdchem import Mol
8
+
9
+ from boltz.data.types import Target
10
+ from boltz.data.parse.yaml import parse_boltz_schema
11
+
12
+
13
def parse_pdb(
    path: Path,
    ccd: dict[str, Mol],
    mol_dir: Path,
    boltz2: bool = False,
) -> Target:
    """Parse a PDB file into a target via the Boltz schema parser.

    The protein sequence of every chain in the first model of the file is
    extracted, wrapped in a schema dictionary (``sequences``, ``bonds``,
    ``version``) and handed to ``parse_boltz_schema``.

    Parameters
    ----------
    path : Path
        Path to the PDB file. Its stem is used as the target name.
    ccd : Dict
        Dictionary of CCD components.
    mol_dir : Path
        Path to the directory containing the molecules.
    boltz2 : bool
        Whether to parse the input for Boltz2.

    Returns
    -------
    Target
        The parsed target.

    Notes
    -----
    Only residues whose hetero flag is blank are considered, and residues
    whose name ``three_to_one`` cannot translate are skipped. Chains that
    end up with an empty sequence are omitted.
    """
    # Read PDB file
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", str(path))

    # Use the first model only: multi-model files (e.g. NMR ensembles) repeat
    # the same chains in every model, and iterating all of them would emit
    # several entries sharing one chain ID.
    first_model = next(iter(structure), None)
    chains = first_model if first_model is not None else ()

    # Convert to yaml format
    sequences = []
    for chain in chains:
        letters = []
        for residue in chain:
            if residue.id[0] != " ":  # Skip hetero residues and waters
                continue
            # NOTE(review): `three_to_one` appears to be deprecated/removed in
            # newer Biopython releases -- confirm the pinned version.
            try:
                letter = three_to_one(residue.resname)
            except KeyError:
                # Residue name has no one-letter code; leave it out.
                continue
            letters.append(letter)

        seq = "".join(letters)
        if not seq:  # Only add if sequence is not empty
            continue

        sequences.append(
            {
                "protein": {
                    "id": chain.id,
                    "sequence": seq,
                    "modifications": [],
                },
            }
        )

    data = {
        "sequences": sequences,
        "bonds": [],
        "version": 1,
    }

    name = path.stem
    return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
@@ -0,0 +1,114 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import requests
6
+ from Bio import PDB
7
+ from Bio.PDB.Polypeptide import three_to_one
8
+ from rdkit import Chem
9
+ from rdkit.Chem.rdchem import Mol
10
+
11
+ from boltz.data.types import Target
12
+ from boltz.data.parse.yaml import parse_boltz_schema
13
+
14
+
15
def download_pdb(pdb_id: str, cache_dir: Path) -> Path:
    """Download a PDB file by ID, reusing a cached copy when present.

    Parameters
    ----------
    pdb_id : str
        The PDB ID to download. It is lower-cased for both the cache file
        name and the download URL.
    cache_dir : Path
        The directory to cache the downloaded file. Created if missing.

    Returns
    -------
    Path
        The path to the downloaded (or already cached) PDB file.

    Raises
    ------
    ValueError
        If ``pdb_id`` contains anything other than ASCII letters, digits
        and underscores.
    requests.HTTPError
        If the server answers with an error status.
    """
    # The ID is interpolated into a file path and a URL, so reject anything
    # that is not a plain token (e.g. "../x") before touching either.
    if not (pdb_id.isascii() and pdb_id.replace("_", "").isalnum()):
        msg = f"Invalid PDB ID: {pdb_id!r}"
        raise ValueError(msg)

    entry = pdb_id.lower()

    # Create cache directory if it doesn't exist
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Check if file already exists in cache
    pdb_path = cache_dir / f"{entry}.pdb"
    if pdb_path.exists():
        return pdb_path

    # Download from RCSB PDB
    url = f"https://files.rcsb.org/download/{entry}.pdb"

    # Stream into a per-process temporary file and rename it into place only
    # after the download completed, so an interrupted transfer can never be
    # mistaken for a valid cache entry on the next call.
    tmp_path = cache_dir / f"{entry}.pdb.{os.getpid()}.part"
    try:
        # (connect, read) timeouts keep a stalled server from hanging forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            with tmp_path.open("wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, pdb_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

    return pdb_path
49
+
50
+
51
def parse_pdb_id(
    pdb_id: str,
    ccd: dict[str, Mol],
    mol_dir: Path,
    cache_dir: Path,
    boltz2: bool = False,
) -> Target:
    """Fetch a PDB entry by ID and parse it via the Boltz schema parser.

    The file is obtained through ``download_pdb``. The protein sequence of
    every chain in its first model is extracted, wrapped in a schema
    dictionary (``sequences``, ``bonds``, ``version``) and handed to
    ``parse_boltz_schema`` with ``pdb_id`` as the target name.

    Parameters
    ----------
    pdb_id : str
        The PDB ID to parse.
    ccd : Dict
        Dictionary of CCD components.
    mol_dir : Path
        Path to the directory containing the molecules.
    cache_dir : Path
        The directory to cache downloaded PDB files.
    boltz2 : bool
        Whether to parse the input for Boltz2.

    Returns
    -------
    Target
        The parsed target.

    Notes
    -----
    Only residues whose hetero flag is blank are considered, and residues
    whose name ``three_to_one`` cannot translate are skipped. Chains that
    end up with an empty sequence are omitted.
    """
    # Download PDB file
    pdb_path = download_pdb(pdb_id, cache_dir)

    # Read PDB file
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", str(pdb_path))

    # Use the first model only: multi-model entries (e.g. NMR ensembles)
    # repeat the same chains in every model, and iterating all of them would
    # emit several entries sharing one chain ID.
    first_model = next(iter(structure), None)
    chains = first_model if first_model is not None else ()

    # Convert to yaml format
    sequences = []
    for chain in chains:
        letters = []
        for residue in chain:
            if residue.id[0] != " ":  # Skip hetero residues and waters
                continue
            # NOTE(review): `three_to_one` appears to be deprecated/removed in
            # newer Biopython releases -- confirm the pinned version.
            try:
                letter = three_to_one(residue.resname)
            except KeyError:
                # Residue name has no one-letter code; leave it out.
                continue
            letters.append(letter)

        seq = "".join(letters)
        if not seq:  # Only add if sequence is not empty
            continue

        sequences.append(
            {
                "protein": {
                    "id": chain.id,
                    "sequence": seq,
                    "modifications": [],
                },
            }
        )

    data = {
        "sequences": sequences,
        "bonds": [],
        "version": 1,
    }

    return parse_boltz_schema(pdb_id, data, ccd, mol_dir, boltz2)