boltz-vsynthes 1.0.15__py3-none-any.whl → 1.0.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boltz/data/parse/pdb.py +23 -28
- boltz/data/parse/pdb_download.py +18 -23
- boltz/data/parse/schema.py +5 -3
- boltz/data/parse/sdf.py +52 -22
- {boltz_vsynthes-1.0.15.dist-info → boltz_vsynthes-1.0.17.dist-info}/METADATA +1 -1
- {boltz_vsynthes-1.0.15.dist-info → boltz_vsynthes-1.0.17.dist-info}/RECORD +10 -10
- {boltz_vsynthes-1.0.15.dist-info → boltz_vsynthes-1.0.17.dist-info}/WHEEL +0 -0
- {boltz_vsynthes-1.0.15.dist-info → boltz_vsynthes-1.0.17.dist-info}/entry_points.txt +0 -0
- {boltz_vsynthes-1.0.15.dist-info → boltz_vsynthes-1.0.17.dist-info}/licenses/LICENSE +0 -0
- {boltz_vsynthes-1.0.15.dist-info → boltz_vsynthes-1.0.17.dist-info}/top_level.txt +0 -0
boltz/data/parse/pdb.py
CHANGED
@@ -1,17 +1,19 @@
|
|
1
|
+
import os
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
5
|
from Bio import PDB
|
5
|
-
from Bio.
|
6
|
+
from Bio.PDB.PDBParser import PDBParser
|
7
|
+
from Bio.PDB.Polypeptide import PPBuilder
|
6
8
|
from rdkit import Chem
|
7
9
|
from rdkit.Chem.rdchem import Mol
|
8
10
|
|
9
11
|
from boltz.data.types import Target
|
10
|
-
from boltz.data.parse.
|
12
|
+
from boltz.data.parse.schema import parse_boltz_schema
|
11
13
|
|
12
14
|
|
13
15
|
def parse_pdb(
|
14
|
-
|
16
|
+
pdb_path: Path,
|
15
17
|
ccd: dict[str, Mol],
|
16
18
|
mol_dir: Path,
|
17
19
|
boltz2: bool = False,
|
@@ -20,14 +22,14 @@ def parse_pdb(
|
|
20
22
|
|
21
23
|
Parameters
|
22
24
|
----------
|
23
|
-
|
25
|
+
pdb_path : Path
|
24
26
|
Path to the PDB file.
|
25
27
|
ccd : Dict
|
26
28
|
Dictionary of CCD components.
|
27
29
|
mol_dir : Path
|
28
30
|
Path to the directory containing the molecules.
|
29
|
-
boltz2 : bool
|
30
|
-
Whether to
|
31
|
+
boltz2 : bool, optional
|
32
|
+
Whether to use Boltz2 format, by default False.
|
31
33
|
|
32
34
|
Returns
|
33
35
|
-------
|
@@ -35,31 +37,25 @@ def parse_pdb(
|
|
35
37
|
The parsed target.
|
36
38
|
"""
|
37
39
|
# Read PDB file
|
38
|
-
parser =
|
39
|
-
structure = parser.get_structure("protein", str(
|
40
|
+
parser = PDBParser(QUIET=True)
|
41
|
+
structure = parser.get_structure("protein", str(pdb_path))
|
42
|
+
ppb = PPBuilder()
|
40
43
|
|
41
44
|
# Convert to yaml format
|
42
45
|
sequences = []
|
43
46
|
for model in structure:
|
44
47
|
for chain in model:
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
"protein": {
|
57
|
-
"id": chain.id,
|
58
|
-
"sequence": seq,
|
59
|
-
"modifications": [],
|
60
|
-
},
|
61
|
-
}
|
62
|
-
sequences.append(molecule)
|
48
|
+
for pp in ppb.build_peptides(chain):
|
49
|
+
seq = str(pp.get_sequence())
|
50
|
+
if seq: # Only add if sequence is not empty
|
51
|
+
molecule = {
|
52
|
+
"protein": {
|
53
|
+
"id": chain.id,
|
54
|
+
"sequence": seq,
|
55
|
+
"modifications": [],
|
56
|
+
},
|
57
|
+
}
|
58
|
+
sequences.append(molecule)
|
63
59
|
|
64
60
|
data = {
|
65
61
|
"sequences": sequences,
|
@@ -67,5 +63,4 @@ def parse_pdb(
|
|
67
63
|
"version": 1,
|
68
64
|
}
|
69
65
|
|
70
|
-
|
71
|
-
return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
|
66
|
+
return parse_boltz_schema(pdb_path.stem, data, ccd, mol_dir, boltz2)
|
boltz/data/parse/pdb_download.py
CHANGED
@@ -4,12 +4,13 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
import requests
|
6
6
|
from Bio import PDB
|
7
|
-
from Bio.
|
7
|
+
from Bio.PDB.PDBParser import PDBParser
|
8
|
+
from Bio.PDB.Polypeptide import PPBuilder
|
8
9
|
from rdkit import Chem
|
9
10
|
from rdkit.Chem.rdchem import Mol
|
10
11
|
|
11
12
|
from boltz.data.types import Target
|
12
|
-
from boltz.data.parse.
|
13
|
+
from boltz.data.parse.schema import parse_boltz_schema
|
13
14
|
|
14
15
|
|
15
16
|
def download_pdb(pdb_id: str, cache_dir: Path) -> Path:
|
@@ -67,8 +68,8 @@ def parse_pdb_id(
|
|
67
68
|
Path to the directory containing the molecules.
|
68
69
|
cache_dir : Path
|
69
70
|
The directory to cache downloaded PDB files.
|
70
|
-
boltz2 : bool
|
71
|
-
Whether to
|
71
|
+
boltz2 : bool, optional
|
72
|
+
Whether to use Boltz2 format, by default False.
|
72
73
|
|
73
74
|
Returns
|
74
75
|
-------
|
@@ -79,31 +80,25 @@ def parse_pdb_id(
|
|
79
80
|
pdb_path = download_pdb(pdb_id, cache_dir)
|
80
81
|
|
81
82
|
# Read PDB file
|
82
|
-
parser =
|
83
|
+
parser = PDBParser(QUIET=True)
|
83
84
|
structure = parser.get_structure("protein", str(pdb_path))
|
85
|
+
ppb = PPBuilder()
|
84
86
|
|
85
87
|
# Convert to yaml format
|
86
88
|
sequences = []
|
87
89
|
for model in structure:
|
88
90
|
for chain in model:
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
"protein": {
|
101
|
-
"id": chain.id,
|
102
|
-
"sequence": seq,
|
103
|
-
"modifications": [],
|
104
|
-
},
|
105
|
-
}
|
106
|
-
sequences.append(molecule)
|
91
|
+
for pp in ppb.build_peptides(chain):
|
92
|
+
seq = str(pp.get_sequence())
|
93
|
+
if seq: # Only add if sequence is not empty
|
94
|
+
molecule = {
|
95
|
+
"protein": {
|
96
|
+
"id": chain.id,
|
97
|
+
"sequence": seq,
|
98
|
+
"modifications": [],
|
99
|
+
},
|
100
|
+
}
|
101
|
+
sequences.append(molecule)
|
107
102
|
|
108
103
|
data = {
|
109
104
|
"sequences": sequences,
|
boltz/data/parse/schema.py
CHANGED
@@ -1024,12 +1024,14 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
|
|
1024
1024
|
# This is a PDB ID
|
1025
1025
|
from boltz.data.parse.pdb_download import parse_pdb_id
|
1026
1026
|
target = parse_pdb_id(pdb_path.stem, ccd, mol_dir, pdb_path.parent)
|
1027
|
+
# Get sequence from the first chain
|
1028
|
+
seq = target.sequences[0]["protein"]["sequence"]
|
1027
1029
|
else:
|
1028
1030
|
# This is a PDB file
|
1029
1031
|
from boltz.data.parse.pdb import parse_pdb
|
1030
1032
|
target = parse_pdb(pdb_path, ccd, mol_dir)
|
1031
|
-
|
1032
|
-
|
1033
|
+
# Get sequence from the first chain
|
1034
|
+
seq = target.sequences[0]["protein"]["sequence"]
|
1033
1035
|
else:
|
1034
1036
|
msg = f"Protein must have either 'sequence' or 'pdb' field: {item}"
|
1035
1037
|
raise ValueError(msg)
|
@@ -1042,7 +1044,7 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
|
|
1042
1044
|
from boltz.data.parse.sdf import parse_sdf
|
1043
1045
|
target = parse_sdf(sdf_path, ccd, mol_dir)
|
1044
1046
|
# Get sequence from the first ligand
|
1045
|
-
seq = target.sequences[0]
|
1047
|
+
seq = target.sequences[0]["ligand"]["sequence"]
|
1046
1048
|
elif "ccd" in item[entity_type]:
|
1047
1049
|
seq = str(item[entity_type]["ccd"])
|
1048
1050
|
else:
|
boltz/data/parse/sdf.py
CHANGED
@@ -1,15 +1,49 @@
|
|
1
|
+
import os
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
5
|
from rdkit import Chem
|
6
|
+
from rdkit.Chem import AllChem
|
5
7
|
from rdkit.Chem.rdchem import Mol
|
8
|
+
import rdkit.Chem.rdmolfiles as rdmolfiles
|
6
9
|
|
7
10
|
from boltz.data.types import Target
|
8
|
-
from boltz.data.parse.
|
11
|
+
from boltz.data.parse.schema import parse_boltz_schema
|
12
|
+
|
13
|
+
|
14
|
+
def _process_sdf(sdf_path: str) -> dict[str, str]:
    """Collect a SMILES string for every molecule read from an SDF file.

    Parameters
    ----------
    sdf_path : str
        Path to the SDF file.

    Returns
    -------
    dict[str, str]
        Mapping from molecule name to SMILES string. The name is the
        molecule's ``_Name`` property; when that property is absent or
        empty, the SMILES string itself is used as the name. If two
        molecules end up with the same name, the later one replaces the
        earlier one in the mapping.

    """
    smiles_by_name: dict[str, str] = {}

    for entry in rdmolfiles.ForwardSDMolSupplier(sdf_path):
        # Entries that come back as None are skipped rather than reported.
        if entry is None:
            continue

        smiles = rdmolfiles.MolToSmiles(entry)
        title = entry.GetProp("_Name") if entry.HasProp("_Name") else ""
        # A missing or empty title falls back to the SMILES string as the key.
        smiles_by_name[title or smiles] = smiles

    return smiles_by_name
|
9
43
|
|
10
44
|
|
11
45
|
def parse_sdf(
|
12
|
-
|
46
|
+
sdf_path: Path,
|
13
47
|
ccd: dict[str, Mol],
|
14
48
|
mol_dir: Path,
|
15
49
|
boltz2: bool = False,
|
@@ -18,37 +52,34 @@ def parse_sdf(
|
|
18
52
|
|
19
53
|
Parameters
|
20
54
|
----------
|
21
|
-
|
55
|
+
sdf_path : Path
|
22
56
|
Path to the SDF file.
|
23
57
|
ccd : Dict
|
24
58
|
Dictionary of CCD components.
|
25
59
|
mol_dir : Path
|
26
60
|
Path to the directory containing the molecules.
|
27
|
-
boltz2 : bool
|
28
|
-
Whether to
|
61
|
+
boltz2 : bool, optional
|
62
|
+
Whether to use Boltz2 format, by default False.
|
29
63
|
|
30
64
|
Returns
|
31
65
|
-------
|
32
66
|
Target
|
33
67
|
The parsed target.
|
34
68
|
"""
|
35
|
-
#
|
36
|
-
|
37
|
-
|
69
|
+
# Process SDF file
|
70
|
+
mol_dict = _process_sdf(str(sdf_path))
|
71
|
+
|
38
72
|
# Convert to yaml format
|
39
73
|
sequences = []
|
40
|
-
for
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
},
|
50
|
-
}
|
51
|
-
sequences.append(molecule)
|
74
|
+
for mol_name, smiles in mol_dict.items():
|
75
|
+
molecule = {
|
76
|
+
"ligand": {
|
77
|
+
"id": mol_name,
|
78
|
+
"sequence": smiles,
|
79
|
+
"modifications": [],
|
80
|
+
},
|
81
|
+
}
|
82
|
+
sequences.append(molecule)
|
52
83
|
|
53
84
|
data = {
|
54
85
|
"sequences": sequences,
|
@@ -56,5 +87,4 @@ def parse_sdf(
|
|
56
87
|
"version": 1,
|
57
88
|
}
|
58
89
|
|
59
|
-
|
60
|
-
return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
|
90
|
+
return parse_boltz_schema(sdf_path.stem, data, ccd, mol_dir, boltz2)
|
@@ -38,10 +38,10 @@ boltz/data/parse/csv.py,sha256=Hcq8rJW2njczahEr8jfd_o-zxLaNSgJ3YIoC9srIqpw,2518
|
|
38
38
|
boltz/data/parse/fasta.py,sha256=taI4s_CqPtyF0XaLJAsVAJHCL0GXm2g1g8Qeccdxikk,3906
|
39
39
|
boltz/data/parse/mmcif.py,sha256=25kEXCkx-OuaawAs7cdz0fxdRu5_CCO0AV00u84PrjQ,36822
|
40
40
|
boltz/data/parse/mmcif_with_constraints.py,sha256=WHYZckSqUwu-Nb9vmVmxHmC7uxwVrF7AVUeVKsc5wGQ,51473
|
41
|
-
boltz/data/parse/pdb.py,sha256=
|
42
|
-
boltz/data/parse/pdb_download.py,sha256=
|
43
|
-
boltz/data/parse/schema.py,sha256=
|
44
|
-
boltz/data/parse/sdf.py,sha256=
|
41
|
+
boltz/data/parse/pdb.py,sha256=R6rDCh6AKLn7n9ELltF1wDORDtR1CFgFnjFdYXKJCNg,1697
|
42
|
+
boltz/data/parse/pdb_download.py,sha256=YEvSoYqKAan6EizCICMruC_3v-Vpvc4vpCpwmHPY67E,2808
|
43
|
+
boltz/data/parse/schema.py,sha256=pq9PRrm3pyaOH0kJEjtG-rVHEPpr_KmiykvxKsMWnn0,60979
|
44
|
+
boltz/data/parse/sdf.py,sha256=z7XTH-a38Rq-grC8bEZUPxGmLyP-g17LCxYWzVkBS_0,2107
|
45
45
|
boltz/data/parse/yaml.py,sha256=GRFRMtDD4PQ4PIpA_S1jj0vRaEu2LlZd_g4rN1zUrNo,1505
|
46
46
|
boltz/data/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
47
|
boltz/data/sample/cluster.py,sha256=9Sx8qP7zGZOAyEspwYFtCTbGTBZnuN-zfCKFbbA_6oI,8175
|
@@ -107,9 +107,9 @@ boltz/model/optim/scheduler.py,sha256=nB4jz0CZ4pR4n08LQngExL_pNycIdYI8AXVoHPnZWQ
|
|
107
107
|
boltz/model/potentials/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
108
|
boltz/model/potentials/potentials.py,sha256=vev8Vjfs-ML1hyrdv_R8DynG4wSFahJ6nzPWp7CYQqw,17507
|
109
109
|
boltz/model/potentials/schedules.py,sha256=m7XJjfuF9uTX3bR9VisXv1rvzJjxiD8PobXRpcBBu1c,968
|
110
|
-
boltz_vsynthes-1.0.
|
111
|
-
boltz_vsynthes-1.0.
|
112
|
-
boltz_vsynthes-1.0.
|
113
|
-
boltz_vsynthes-1.0.
|
114
|
-
boltz_vsynthes-1.0.
|
115
|
-
boltz_vsynthes-1.0.
|
110
|
+
boltz_vsynthes-1.0.17.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
|
111
|
+
boltz_vsynthes-1.0.17.dist-info/METADATA,sha256=s2lJPUPQJ4BMEFa2Hc57Q8e5EYMPuEKr5IO_SxCVTmc,7171
|
112
|
+
boltz_vsynthes-1.0.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
113
|
+
boltz_vsynthes-1.0.17.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
|
114
|
+
boltz_vsynthes-1.0.17.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
|
115
|
+
boltz_vsynthes-1.0.17.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|