boltz-vsynthes 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boltz/data/mol.py +0 -4
- boltz/data/parse/__init__.py +21 -0
- boltz/data/parse/pdb.py +72 -0
- boltz/data/parse/pdb_download.py +114 -0
- boltz/data/parse/schema.py +799 -146
- boltz/data/parse/sdf.py +60 -0
- boltz/main.py +176 -208
- {boltz_vsynthes-1.0.9.dist-info → boltz_vsynthes-1.0.11.dist-info}/METADATA +2 -2
- {boltz_vsynthes-1.0.9.dist-info → boltz_vsynthes-1.0.11.dist-info}/RECORD +13 -10
- {boltz_vsynthes-1.0.9.dist-info → boltz_vsynthes-1.0.11.dist-info}/WHEEL +0 -0
- {boltz_vsynthes-1.0.9.dist-info → boltz_vsynthes-1.0.11.dist-info}/entry_points.txt +0 -0
- {boltz_vsynthes-1.0.9.dist-info → boltz_vsynthes-1.0.11.dist-info}/licenses/LICENSE +0 -0
- {boltz_vsynthes-1.0.9.dist-info → boltz_vsynthes-1.0.11.dist-info}/top_level.txt +0 -0
boltz/data/mol.py
CHANGED
@@ -30,10 +30,6 @@ def load_molecules(moldir: str, molecules: list[str]) -> dict[str, Mol]:
|
|
30
30
|
"""
|
31
31
|
loaded_mols = {}
|
32
32
|
for molecule in molecules:
|
33
|
-
# Skip if it's a SMILES string (starts with LIG)
|
34
|
-
if molecule.startswith("LIG"):
|
35
|
-
continue
|
36
|
-
|
37
33
|
path = Path(moldir) / f"{molecule}.pkl"
|
38
34
|
if not path.exists():
|
39
35
|
msg = f"CCD component {molecule} not found!"
|
boltz/data/parse/__init__.py
CHANGED
@@ -0,0 +1,21 @@
|
|
1
|
+
from boltz.data.parse.pdb import parse_pdb
|
2
|
+
from boltz.data.parse.sdf import parse_sdf
|
3
|
+
from boltz.data.parse.pdb_download import parse_pdb_id
|
4
|
+
from boltz.data.parse.yaml import parse_yaml
|
5
|
+
from boltz.data.parse.fasta import parse_fasta
|
6
|
+
from boltz.data.parse.a3m import parse_a3m
|
7
|
+
from boltz.data.parse.csv import parse_csv
|
8
|
+
from boltz.data.parse.mmcif import parse_mmcif
|
9
|
+
from boltz.data.parse.mmcif_with_constraints import parse_mmcif_with_constraints
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
"parse_pdb",
|
13
|
+
"parse_sdf",
|
14
|
+
"parse_pdb_id",
|
15
|
+
"parse_yaml",
|
16
|
+
"parse_fasta",
|
17
|
+
"parse_a3m",
|
18
|
+
"parse_csv",
|
19
|
+
"parse_mmcif",
|
20
|
+
"parse_mmcif_with_constraints",
|
21
|
+
]
|
boltz/data/parse/pdb.py
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from Bio import PDB
|
5
|
+
from Bio.PDB.Polypeptide import three_to_one
|
6
|
+
from Bio.Data.IUPACData import protein_letters_3to1
|
7
|
+
from rdkit import Chem
|
8
|
+
from rdkit.Chem.rdchem import Mol
|
9
|
+
|
10
|
+
from boltz.data.types import Target
|
11
|
+
from boltz.data.parse.yaml import parse_boltz_schema
|
12
|
+
|
13
|
+
|
14
|
+
def parse_pdb(
    path: Path,
    ccd: dict[str, Mol],
    mol_dir: Path,
    boltz2: bool = False,
) -> Target:
    """Parse a PDB file.

    The protein sequence of every chain is read from the coordinate
    records, wrapped in a version-1 schema dictionary (no bonds, no
    modifications) and handed to ``parse_boltz_schema``. The target name is
    the file name without its extension.

    Parameters
    ----------
    path : Path
        Path to the PDB file.
    ccd : Dict
        Dictionary of CCD components.
    mol_dir : Path
        Path to the directory containing the molecules.
    boltz2 : bool
        Whether to parse the input for Boltz2.

    Returns
    -------
    Target
        The parsed target.

    """
    # Read PDB file
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", str(path))

    # Multi-model files (e.g. NMR ensembles) repeat every chain once per
    # model; reading all of them would emit the same chain id several times,
    # so only the first model is used.
    first_model = next(iter(structure), None)
    chains = first_model if first_model is not None else ()

    # Convert to yaml format
    sequences = []
    for chain in chains:
        letters = []
        for residue in chain:
            # A blank hetero flag marks a standard (ATOM record) residue.
            if residue.id[0] != " ":
                continue
            # IUPACData.protein_letters_3to1 is keyed by title-case codes
            # ("Ala"), while PDB residue names are upper-case ("ALA").
            letter = protein_letters_3to1.get(residue.resname.strip().capitalize())
            if letter is not None:
                letters.append(letter)

        seq = "".join(letters)
        if seq:  # Only add if sequence is not empty
            sequences.append(
                {
                    "protein": {
                        "id": chain.id,
                        "sequence": seq,
                        "modifications": [],
                    },
                }
            )

    data = {
        "sequences": sequences,
        "bonds": [],
        "version": 1,
    }

    name = path.stem
    return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
|
boltz/data/parse/pdb_download.py
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
import os
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
import requests
|
6
|
+
from Bio import PDB
|
7
|
+
from Bio.Data.IUPACData import protein_letters_3to1
|
8
|
+
from rdkit import Chem
|
9
|
+
from rdkit.Chem.rdchem import Mol
|
10
|
+
|
11
|
+
from boltz.data.types import Target
|
12
|
+
from boltz.data.parse.yaml import parse_boltz_schema
|
13
|
+
|
14
|
+
|
15
|
+
def download_pdb(pdb_id: str, cache_dir: Path) -> Path:
    """Download a PDB file by ID.

    The file is fetched from RCSB once and stored in ``cache_dir`` under the
    lower-cased ID; later calls return the cached copy without touching the
    network.

    Parameters
    ----------
    pdb_id : str
        The PDB ID to download.
    cache_dir : Path
        The directory to cache the downloaded file.

    Returns
    -------
    Path
        The path to the downloaded PDB file.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an error status.

    """
    # Create cache directory if it doesn't exist
    cache_dir.mkdir(parents=True, exist_ok=True)

    entry = pdb_id.lower()

    # Check if file already exists in cache. A zero-byte file can only be the
    # leftover of an aborted download, so it is fetched again.
    pdb_path = cache_dir / f"{entry}.pdb"
    if pdb_path.is_file() and pdb_path.stat().st_size > 0:
        return pdb_path

    # Download from RCSB PDB
    url = f"https://files.rcsb.org/download/{entry}.pdb"

    # Stream into a per-process scratch file and rename it into place only
    # after the whole body has arrived, so an interrupted download never
    # leaves a truncated file that would be served as a cache hit.
    tmp_path = pdb_path.with_name(f"{pdb_path.name}.{os.getpid()}.part")
    try:
        # (connect, read) timeouts: without them a stalled server would block
        # the caller forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            with tmp_path.open("wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        tmp_path.replace(pdb_path)
    finally:
        # No-op after a successful rename; removes the scratch file on error.
        tmp_path.unlink(missing_ok=True)

    return pdb_path
|
49
|
+
|
50
|
+
|
51
|
+
def parse_pdb_id(
    pdb_id: str,
    ccd: dict[str, Mol],
    mol_dir: Path,
    cache_dir: Path,
    boltz2: bool = False,
) -> Target:
    """Parse a PDB file by ID.

    The entry is fetched with ``download_pdb`` (which caches it in
    ``cache_dir``), the protein sequence of every chain is read from the
    coordinate records, and the result is wrapped in a version-1 schema
    dictionary (no bonds, no modifications) that is handed to
    ``parse_boltz_schema`` with ``pdb_id`` as the target name.

    Parameters
    ----------
    pdb_id : str
        The PDB ID to parse.
    ccd : Dict
        Dictionary of CCD components.
    mol_dir : Path
        Path to the directory containing the molecules.
    cache_dir : Path
        The directory to cache downloaded PDB files.
    boltz2 : bool
        Whether to parse the input for Boltz2.

    Returns
    -------
    Target
        The parsed target.

    """
    # Download PDB file
    pdb_path = download_pdb(pdb_id, cache_dir)

    # Read PDB file
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", str(pdb_path))

    # Multi-model entries (e.g. NMR ensembles) repeat every chain once per
    # model; reading all of them would emit the same chain id several times,
    # so only the first model is used.
    first_model = next(iter(structure), None)
    chains = first_model if first_model is not None else ()

    # Convert to yaml format
    sequences = []
    for chain in chains:
        letters = []
        for residue in chain:
            # A blank hetero flag marks a standard (ATOM record) residue.
            if residue.id[0] != " ":
                continue
            # IUPACData.protein_letters_3to1 is keyed by title-case codes
            # ("Ala"), while PDB residue names are upper-case ("ALA").
            letter = protein_letters_3to1.get(residue.resname.strip().capitalize())
            if letter is not None:
                letters.append(letter)

        seq = "".join(letters)
        if seq:  # Only add if sequence is not empty
            sequences.append(
                {
                    "protein": {
                        "id": chain.id,
                        "sequence": seq,
                        "modifications": [],
                    },
                }
            )

    data = {
        "sequences": sequences,
        "bonds": [],
        "version": 1,
    }

    return parse_boltz_schema(pdb_id, data, ccd, mol_dir, boltz2)
|