boltz-vsynthes 1.0.15__py3-none-any.whl → 1.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
boltz/data/parse/pdb.py CHANGED
@@ -1,17 +1,19 @@
1
+ import os
1
2
  from pathlib import Path
2
3
  from typing import Optional
3
4
 
4
5
  from Bio import PDB
5
- from Bio.Data.IUPACData import protein_letters_3to1
6
+ from Bio.PDB.PDBParser import PDBParser
7
+ from Bio.PDB.Polypeptide import PPBuilder
6
8
  from rdkit import Chem
7
9
  from rdkit.Chem.rdchem import Mol
8
10
 
9
11
  from boltz.data.types import Target
10
- from boltz.data.parse.yaml import parse_boltz_schema
12
+ from boltz.data.parse.schema import parse_boltz_schema
11
13
 
12
14
 
13
15
  def parse_pdb(
14
- path: Path,
16
+ pdb_path: Path,
15
17
  ccd: dict[str, Mol],
16
18
  mol_dir: Path,
17
19
  boltz2: bool = False,
@@ -20,14 +22,14 @@ def parse_pdb(
20
22
 
21
23
  Parameters
22
24
  ----------
23
- path : Path
25
+ pdb_path : Path
24
26
  Path to the PDB file.
25
27
  ccd : Dict
26
28
  Dictionary of CCD components.
27
29
  mol_dir : Path
28
30
  Path to the directory containing the molecules.
29
- boltz2 : bool
30
- Whether to parse the input for Boltz2.
31
+ boltz2 : bool, optional
32
+ Whether to use Boltz2 format, by default False.
31
33
 
32
34
  Returns
33
35
  -------
@@ -35,31 +37,25 @@ def parse_pdb(
35
37
  The parsed target.
36
38
  """
37
39
  # Read PDB file
38
- parser = PDB.PDBParser(QUIET=True)
39
- structure = parser.get_structure("protein", str(path))
40
+ parser = PDBParser(QUIET=True)
41
+ structure = parser.get_structure("protein", str(pdb_path))
42
+ ppb = PPBuilder()
40
43
 
41
44
  # Convert to yaml format
42
45
  sequences = []
43
46
  for model in structure:
44
47
  for chain in model:
45
- # Get chain sequence
46
- seq = ""
47
- for residue in chain:
48
- if residue.id[0] == " ": # Only standard residues
49
- try:
50
- seq += protein_letters_3to1[residue.resname]
51
- except KeyError:
52
- continue
53
-
54
- if seq: # Only add if sequence is not empty
55
- molecule = {
56
- "protein": {
57
- "id": chain.id,
58
- "sequence": seq,
59
- "modifications": [],
60
- },
61
- }
62
- sequences.append(molecule)
48
+ for pp in ppb.build_peptides(chain):
49
+ seq = str(pp.get_sequence())
50
+ if seq: # Only add if sequence is not empty
51
+ molecule = {
52
+ "protein": {
53
+ "id": chain.id,
54
+ "sequence": seq,
55
+ "modifications": [],
56
+ },
57
+ }
58
+ sequences.append(molecule)
63
59
 
64
60
  data = {
65
61
  "sequences": sequences,
@@ -67,5 +63,4 @@ def parse_pdb(
67
63
  "version": 1,
68
64
  }
69
65
 
70
- name = path.stem
71
- return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
66
+ return parse_boltz_schema(pdb_path.stem, data, ccd, mol_dir, boltz2)
@@ -4,12 +4,13 @@ from typing import Optional
4
4
 
5
5
  import requests
6
6
  from Bio import PDB
7
- from Bio.Data.IUPACData import protein_letters_3to1
7
+ from Bio.PDB.PDBParser import PDBParser
8
+ from Bio.PDB.PPBuilder import PPBuilder
8
9
  from rdkit import Chem
9
10
  from rdkit.Chem.rdchem import Mol
10
11
 
11
12
  from boltz.data.types import Target
12
- from boltz.data.parse.yaml import parse_boltz_schema
13
+ from boltz.data.parse.schema import parse_boltz_schema
13
14
 
14
15
 
15
16
  def download_pdb(pdb_id: str, cache_dir: Path) -> Path:
@@ -67,8 +68,8 @@ def parse_pdb_id(
67
68
  Path to the directory containing the molecules.
68
69
  cache_dir : Path
69
70
  The directory to cache downloaded PDB files.
70
- boltz2 : bool
71
- Whether to parse the input for Boltz2.
71
+ boltz2 : bool, optional
72
+ Whether to use Boltz2 format, by default False.
72
73
 
73
74
  Returns
74
75
  -------
@@ -79,31 +80,25 @@ def parse_pdb_id(
79
80
  pdb_path = download_pdb(pdb_id, cache_dir)
80
81
 
81
82
  # Read PDB file
82
- parser = PDB.PDBParser(QUIET=True)
83
+ parser = PDBParser(QUIET=True)
83
84
  structure = parser.get_structure("protein", str(pdb_path))
85
+ ppb = PPBuilder()
84
86
 
85
87
  # Convert to yaml format
86
88
  sequences = []
87
89
  for model in structure:
88
90
  for chain in model:
89
- # Get chain sequence
90
- seq = ""
91
- for residue in chain:
92
- if residue.id[0] == " ": # Only standard residues
93
- try:
94
- seq += protein_letters_3to1[residue.resname]
95
- except KeyError:
96
- continue
97
-
98
- if seq: # Only add if sequence is not empty
99
- molecule = {
100
- "protein": {
101
- "id": chain.id,
102
- "sequence": seq,
103
- "modifications": [],
104
- },
105
- }
106
- sequences.append(molecule)
91
+ for pp in ppb.build_peptides(chain):
92
+ seq = str(pp.get_sequence())
93
+ if seq: # Only add if sequence is not empty
94
+ molecule = {
95
+ "protein": {
96
+ "id": chain.id,
97
+ "sequence": seq,
98
+ "modifications": [],
99
+ },
100
+ }
101
+ sequences.append(molecule)
107
102
 
108
103
  data = {
109
104
  "sequences": sequences,
@@ -1024,12 +1024,14 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1024
1024
  # This is a PDB ID
1025
1025
  from boltz.data.parse.pdb_download import parse_pdb_id
1026
1026
  target = parse_pdb_id(pdb_path.stem, ccd, mol_dir, pdb_path.parent)
1027
+ # Get sequence from the first chain
1028
+ seq = target.sequences[0]["protein"]["sequence"]
1027
1029
  else:
1028
1030
  # This is a PDB file
1029
1031
  from boltz.data.parse.pdb import parse_pdb
1030
1032
  target = parse_pdb(pdb_path, ccd, mol_dir)
1031
- # Get sequence from the first chain
1032
- seq = target.sequences[0]
1033
+ # Get sequence from the first chain
1034
+ seq = target.sequences[0]["protein"]["sequence"]
1033
1035
  else:
1034
1036
  msg = f"Protein must have either 'sequence' or 'pdb' field: {item}"
1035
1037
  raise ValueError(msg)
@@ -1042,7 +1044,7 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1042
1044
  from boltz.data.parse.sdf import parse_sdf
1043
1045
  target = parse_sdf(sdf_path, ccd, mol_dir)
1044
1046
  # Get sequence from the first ligand
1045
- seq = target.sequences[0]
1047
+ seq = target.sequences[0]["ligand"]["sequence"]
1046
1048
  elif "ccd" in item[entity_type]:
1047
1049
  seq = str(item[entity_type]["ccd"])
1048
1050
  else:
boltz/data/parse/sdf.py CHANGED
@@ -1,15 +1,49 @@
1
+ import os
1
2
  from pathlib import Path
2
3
  from typing import Optional
3
4
 
4
5
  from rdkit import Chem
6
+ from rdkit.Chem import AllChem
5
7
  from rdkit.Chem.rdchem import Mol
8
+ import rdkit.Chem.rdmolfiles as rdmolfiles
6
9
 
7
10
  from boltz.data.types import Target
8
- from boltz.data.parse.yaml import parse_boltz_schema
11
+ from boltz.data.parse.schema import parse_boltz_schema
12
+
13
+
14
+ def _process_sdf(sdf_path: str) -> dict[str, str]:
15
+ """Process an SDF file and extract SMILES strings.
16
+
17
+ Parameters
18
+ ----------
19
+ sdf_path : str
20
+ Path to the SDF file.
21
+
22
+ Returns
23
+ -------
24
+ dict[str, str]
25
+ Dictionary mapping molecule names to SMILES strings.
26
+ """
27
+ output_dict = {}
28
+ suppl = rdmolfiles.ForwardSDMolSupplier(sdf_path)
29
+
30
+ for mol in suppl:
31
+ if mol is not None:
32
+ mol_smiles = rdmolfiles.MolToSmiles(mol)
33
+ if mol.HasProp("_Name"):
34
+ mol_name = mol.GetProp("_Name")
35
+ if mol_name == "":
36
+ mol_name = mol_smiles
37
+ else:
38
+ mol_name = mol_smiles
39
+
40
+ output_dict[mol_name] = mol_smiles
41
+
42
+ return output_dict
9
43
 
10
44
 
11
45
  def parse_sdf(
12
- path: Path,
46
+ sdf_path: Path,
13
47
  ccd: dict[str, Mol],
14
48
  mol_dir: Path,
15
49
  boltz2: bool = False,
@@ -18,37 +52,34 @@ def parse_sdf(
18
52
 
19
53
  Parameters
20
54
  ----------
21
- path : Path
55
+ sdf_path : Path
22
56
  Path to the SDF file.
23
57
  ccd : Dict
24
58
  Dictionary of CCD components.
25
59
  mol_dir : Path
26
60
  Path to the directory containing the molecules.
27
- boltz2 : bool
28
- Whether to parse the input for Boltz2.
61
+ boltz2 : bool, optional
62
+ Whether to use Boltz2 format, by default False.
29
63
 
30
64
  Returns
31
65
  -------
32
66
  Target
33
67
  The parsed target.
34
68
  """
35
- # Read SDF file
36
- supplier = Chem.SDMolSupplier(str(path))
37
-
69
+ # Process SDF file
70
+ mol_dict = _process_sdf(str(sdf_path))
71
+
38
72
  # Convert to yaml format
39
73
  sequences = []
40
- for i, mol in enumerate(supplier):
41
- if mol is not None:
42
- # Get SMILES
43
- smiles = Chem.MolToSmiles(mol)
44
-
45
- molecule = {
46
- "ligand": {
47
- "id": f"L{i+1}", # Use L1, L2, etc. as chain IDs
48
- "smiles": smiles,
49
- },
50
- }
51
- sequences.append(molecule)
74
+ for mol_name, smiles in mol_dict.items():
75
+ molecule = {
76
+ "ligand": {
77
+ "id": mol_name,
78
+ "sequence": smiles,
79
+ "modifications": [],
80
+ },
81
+ }
82
+ sequences.append(molecule)
52
83
 
53
84
  data = {
54
85
  "sequences": sequences,
@@ -56,5 +87,4 @@ def parse_sdf(
56
87
  "version": 1,
57
88
  }
58
89
 
59
- name = path.stem
60
- return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
90
+ return parse_boltz_schema(sdf_path.stem, data, ccd, mol_dir, boltz2)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: boltz-vsynthes
3
- Version: 1.0.15
3
+ Version: 1.0.17
4
4
  Summary: Boltz for VSYNTHES
5
5
  Requires-Python: <3.13,>=3.10
6
6
  Description-Content-Type: text/markdown
@@ -38,10 +38,10 @@ boltz/data/parse/csv.py,sha256=Hcq8rJW2njczahEr8jfd_o-zxLaNSgJ3YIoC9srIqpw,2518
38
38
  boltz/data/parse/fasta.py,sha256=taI4s_CqPtyF0XaLJAsVAJHCL0GXm2g1g8Qeccdxikk,3906
39
39
  boltz/data/parse/mmcif.py,sha256=25kEXCkx-OuaawAs7cdz0fxdRu5_CCO0AV00u84PrjQ,36822
40
40
  boltz/data/parse/mmcif_with_constraints.py,sha256=WHYZckSqUwu-Nb9vmVmxHmC7uxwVrF7AVUeVKsc5wGQ,51473
41
- boltz/data/parse/pdb.py,sha256=o2gGGVDxLaKhqPaZjuwfY6gdIyu4iUxczmkvDnEnNls,1808
42
- boltz/data/parse/pdb_download.py,sha256=FJTX7qHKJ_sBRcHg1PLKlqy6gVuluPDAWOAuYKI4-gM,2931
43
- boltz/data/parse/schema.py,sha256=uKv1toawysagqi7wnl1njCDrQKM1tNCNDxvcNc6VXK0,60801
44
- boltz/data/parse/sdf.py,sha256=myFA3bL6MkdPdMFfZHotxJ8yNMGpsc_u6w06YFadeiw,1364
41
+ boltz/data/parse/pdb.py,sha256=R6rDCh6AKLn7n9ELltF1wDORDtR1CFgFnjFdYXKJCNg,1697
42
+ boltz/data/parse/pdb_download.py,sha256=YEvSoYqKAan6EizCICMruC_3v-Vpvc4vpCpwmHPY67E,2808
43
+ boltz/data/parse/schema.py,sha256=pq9PRrm3pyaOH0kJEjtG-rVHEPpr_KmiykvxKsMWnn0,60979
44
+ boltz/data/parse/sdf.py,sha256=z7XTH-a38Rq-grC8bEZUPxGmLyP-g17LCxYWzVkBS_0,2107
45
45
  boltz/data/parse/yaml.py,sha256=GRFRMtDD4PQ4PIpA_S1jj0vRaEu2LlZd_g4rN1zUrNo,1505
46
46
  boltz/data/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  boltz/data/sample/cluster.py,sha256=9Sx8qP7zGZOAyEspwYFtCTbGTBZnuN-zfCKFbbA_6oI,8175
@@ -107,9 +107,9 @@ boltz/model/optim/scheduler.py,sha256=nB4jz0CZ4pR4n08LQngExL_pNycIdYI8AXVoHPnZWQ
107
107
  boltz/model/potentials/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
108
  boltz/model/potentials/potentials.py,sha256=vev8Vjfs-ML1hyrdv_R8DynG4wSFahJ6nzPWp7CYQqw,17507
109
109
  boltz/model/potentials/schedules.py,sha256=m7XJjfuF9uTX3bR9VisXv1rvzJjxiD8PobXRpcBBu1c,968
110
- boltz_vsynthes-1.0.15.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
111
- boltz_vsynthes-1.0.15.dist-info/METADATA,sha256=dZV2p_UkKrGC3QZh40tY3ZEYLO7MylqdH77xA5pSe9I,7171
112
- boltz_vsynthes-1.0.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
113
- boltz_vsynthes-1.0.15.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
114
- boltz_vsynthes-1.0.15.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
115
- boltz_vsynthes-1.0.15.dist-info/RECORD,,
110
+ boltz_vsynthes-1.0.17.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
111
+ boltz_vsynthes-1.0.17.dist-info/METADATA,sha256=s2lJPUPQJ4BMEFa2Hc57Q8e5EYMPuEKr5IO_SxCVTmc,7171
112
+ boltz_vsynthes-1.0.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
113
+ boltz_vsynthes-1.0.17.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
114
+ boltz_vsynthes-1.0.17.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
115
+ boltz_vsynthes-1.0.17.dist-info/RECORD,,