boltz-vsynthes 1.0.40__py3-none-any.whl → 1.0.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,6 @@ from collections.abc import Mapping
2
2
  from dataclasses import dataclass
3
3
  from pathlib import Path
4
4
  from typing import Optional
5
- import json
6
- import yaml
7
5
 
8
6
  import click
9
7
  import numpy as np
@@ -937,7 +935,6 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
937
935
  ccd: Mapping[str, Mol],
938
936
  mol_dir: Optional[Path] = None,
939
937
  boltz_2: bool = False,
940
- output_dir: Optional[Path] = None,
941
938
  ) -> Target:
942
939
  """Parse a Boltz input yaml / json.
943
940
 
@@ -989,8 +986,6 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
989
986
  Path to the directory containing the molecules.
990
987
  boltz2: bool
991
988
  Whether to parse the input for Boltz2.
992
- output_dir: Path, optional
993
- Path to the output directory. If provided, results will be saved in a subfolder named after the input file.
994
989
 
995
990
  Returns
996
991
  -------
@@ -998,14 +993,6 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
998
993
  The parsed target.
999
994
 
1000
995
  """
1001
- # Create output directory if specified
1002
- if output_dir is not None:
1003
- output_dir = Path(output_dir)
1004
- output_dir.mkdir(parents=True, exist_ok=True)
1005
- # Create subfolder based on input name
1006
- subfolder = output_dir / name
1007
- subfolder.mkdir(parents=True, exist_ok=True)
1008
-
1009
996
  # Assert version 1
1010
997
  version = schema.get("version", 1)
1011
998
  if version != 1:
@@ -1072,7 +1059,6 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1072
1059
 
1073
1060
  # Check if any affinity ligand is present
1074
1061
  affinity_ligands = set()
1075
- affinity_proteins = set()
1076
1062
  properties = schema.get("properties", [])
1077
1063
  if properties and not boltz_2:
1078
1064
  msg = "Affinity prediction is only supported for Boltz2!"
@@ -1083,6 +1069,7 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1083
1069
  if prop_type == "affinity":
1084
1070
  binder = prop["affinity"]["binder"]
1085
1071
  if not isinstance(binder, str):
1072
+ # TODO: support multi residue ligands and ccd's
1086
1073
  msg = "Binder must be a single chain."
1087
1074
  raise ValueError(msg)
1088
1075
 
@@ -1090,21 +1077,18 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1090
1077
  msg = f"Could not find binder with name {binder} in the input!"
1091
1078
  raise ValueError(msg)
1092
1079
 
1093
- # Allow both protein and ligand as binders
1094
- if chain_name_to_entity_type[binder] == "protein":
1095
- affinity_proteins.add(binder)
1096
- elif chain_name_to_entity_type[binder] == "ligand":
1097
- affinity_ligands.add(binder)
1098
- else:
1080
+ if chain_name_to_entity_type[binder] != "ligand":
1099
1081
  msg = (
1100
- f"Chain {binder} is not a protein or ligand! "
1101
- "Affinity is currently only supported for proteins and ligands."
1082
+ f"Chain {binder} is not a ligand! "
1083
+ "Affinity is currently only supported for ligands."
1102
1084
  )
1103
1085
  raise ValueError(msg)
1104
1086
 
1105
- # Check if any affinity binder is present
1106
- if len(affinity_proteins) + len(affinity_ligands) > 1:
1107
- msg = "Only one affinity binder is currently supported!"
1087
+ affinity_ligands.add(binder)
1088
+
1089
+ # Check only one affinity ligand is present
1090
+ if len(affinity_ligands) > 1:
1091
+ msg = "Only one affinity ligand is currently supported!"
1108
1092
  raise ValueError(msg)
1109
1093
 
1110
1094
  # Go through entities and parse them
@@ -1127,15 +1111,12 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1127
1111
  elif isinstance(item[entity_type]["id"], list):
1128
1112
  ids.extend(item[entity_type]["id"])
1129
1113
 
1130
- # Check if any affinity binder is present
1114
+ # Check if any affinity ligand is present
1131
1115
  if len(ids) == 1:
1132
- affinity = ids[0] in affinity_ligands or ids[0] in affinity_proteins
1116
+ affinity = ids[0] in affinity_ligands
1133
1117
  elif (len(ids) > 1) and any(x in affinity_ligands for x in ids):
1134
1118
  msg = "Cannot compute affinity for a ligand that has multiple copies!"
1135
1119
  raise ValueError(msg)
1136
- elif (len(ids) > 1) and any(x in affinity_proteins for x in ids):
1137
- # If binder is a protein, allow multiple ligands
1138
- affinity = True
1139
1120
  else:
1140
1121
  affinity = False
1141
1122
 
@@ -1234,7 +1215,7 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1234
1215
  )
1235
1216
 
1236
1217
  # Parse a non-polymer
1237
- elif (entity_type == "ligand") and "ccd" in (items[0][entity_type]):
1218
+ elif (entity_type == "ligand") and ("ccd" in items[0][entity_type]):
1238
1219
  seq = items[0][entity_type]["ccd"]
1239
1220
 
1240
1221
  if isinstance(seq, str):
@@ -1909,64 +1890,3 @@ def standardize(smiles: str) -> Optional[str]:
1909
1890
  raise ValueError("Molecule is broken")
1910
1891
 
1911
1892
  return smiles
1912
-
1913
-
1914
- def parse_boltz_directory(
1915
- input_dir: Path,
1916
- output_dir: Path,
1917
- ccd: Mapping[str, Mol],
1918
- mol_dir: Optional[Path] = None,
1919
- boltz_2: bool = False,
1920
- ) -> list[Target]:
1921
- """Parse all YAML files in a directory.
1922
-
1923
- Parameters
1924
- ----------
1925
- input_dir : Path
1926
- Path to the directory containing YAML files.
1927
- output_dir : Path
1928
- Path to the output directory where results will be saved.
1929
- ccd : Mapping[str, Mol]
1930
- Dictionary of CCD components.
1931
- mol_dir : Path, optional
1932
- Path to the directory containing the molecules.
1933
- boltz_2 : bool, optional
1934
- Whether to parse the input for Boltz2.
1935
-
1936
- Returns
1937
- -------
1938
- list[Target]
1939
- List of parsed targets.
1940
-
1941
- """
1942
- input_dir = Path(input_dir)
1943
- output_dir = Path(output_dir)
1944
- output_dir.mkdir(parents=True, exist_ok=True)
1945
-
1946
- targets = []
1947
- for yaml_file in input_dir.glob("*.yaml"):
1948
- # Skip hidden files and directories
1949
- if yaml_file.name.startswith('.') or any(part.startswith('.') for part in yaml_file.parts):
1950
- continue
1951
-
1952
- try:
1953
- # Load YAML file
1954
- with open(yaml_file, "r") as f:
1955
- schema = yaml.safe_load(f)
1956
-
1957
- # Parse schema
1958
- target = parse_boltz_schema(
1959
- name=yaml_file.stem,
1960
- schema=schema,
1961
- ccd=ccd,
1962
- mol_dir=mol_dir,
1963
- boltz_2=boltz_2,
1964
- output_dir=output_dir,
1965
- )
1966
- targets.append(target)
1967
-
1968
- except Exception as e:
1969
- print(f"Error processing {yaml_file}: {str(e)}")
1970
- continue
1971
-
1972
- return targets
boltz/data/parse/yaml.py CHANGED
@@ -1,10 +1,9 @@
1
1
  from pathlib import Path
2
- from typing import Union, List, Optional
3
2
 
4
3
  import yaml
5
4
  from rdkit.Chem.rdchem import Mol
6
5
 
7
- from boltz.data.parse.schema import parse_boltz_schema, parse_boltz_directory
6
+ from boltz.data.parse.schema import parse_boltz_schema
8
7
  from boltz.data.types import Target
9
8
 
10
9
 
@@ -13,9 +12,8 @@ def parse_yaml(
13
12
  ccd: dict[str, Mol],
14
13
  mol_dir: Path,
15
14
  boltz2: bool = False,
16
- output_dir: Optional[Path] = None,
17
- ) -> Union[Target, List[Target]]:
18
- """Parse a Boltz input yaml / json file or directory.
15
+ ) -> Target:
16
+ """Parse a Boltz input yaml / json.
19
17
 
20
18
  The input file should be a yaml file with the following format:
21
19
 
@@ -51,28 +49,20 @@ def parse_yaml(
51
49
  Parameters
52
50
  ----------
53
51
  path : Path
54
- Path to the YAML input file or directory containing YAML files.
55
- ccd : Dict
52
+ Path to the YAML input format.
53
+ components : Dict
56
54
  Dictionary of CCD components.
57
- mol_dir : Path
58
- Path to the directory containing molecules.
59
- boltz2 : bool, optional
55
+ boltz2 : bool
60
56
  Whether to parse the input for Boltz2.
61
- output_dir : Path, optional
62
- Path to the output directory where results will be saved.
63
57
 
64
58
  Returns
65
59
  -------
66
- Union[Target, List[Target]]
67
- The parsed target(s).
60
+ Target
61
+ The parsed target.
68
62
 
69
63
  """
70
- path = Path(path)
71
-
72
- if path.is_dir():
73
- return parse_boltz_directory(path, output_dir or path, ccd, mol_dir, boltz2)
74
- else:
75
- with path.open("r") as file:
76
- data = yaml.safe_load(file)
77
- name = path.stem
78
- return parse_boltz_schema(name, data, ccd, mol_dir, boltz2, output_dir)
64
+ with path.open("r") as file:
65
+ data = yaml.safe_load(file)
66
+
67
+ name = path.stem
68
+ return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)
boltz/main.py CHANGED
@@ -272,7 +272,7 @@ def get_cache_path() -> str:
272
272
 
273
273
 
274
274
  def check_inputs(data: Path) -> list[Path]:
275
- """Check the input data.
275
+ """Check the input data and output directory.
276
276
 
277
277
  Parameters
278
278
  ----------
@@ -282,21 +282,18 @@ def check_inputs(data: Path) -> list[Path]:
282
282
  Returns
283
283
  -------
284
284
  list[Path]
285
- The list of input files.
285
+ The list of input data.
286
286
 
287
287
  """
288
+ click.echo("Checking input data.")
289
+
288
290
  # Check if data is a directory
289
291
  if data.is_dir():
290
292
  data: list[Path] = list(data.glob("*"))
291
293
 
292
294
  # Filter out non .fasta or .yaml files, raise
293
295
  # an error on directory and other file types
294
- filtered_data = []
295
296
  for d in data:
296
- # Skip hidden files and directories
297
- if d.name.startswith('.') or any(part.startswith('.') for part in d.parts):
298
- continue
299
-
300
297
  if d.is_dir():
301
298
  msg = f"Found directory {d} instead of .fasta or .yaml."
302
299
  raise RuntimeError(msg)
@@ -306,8 +303,6 @@ def check_inputs(data: Path) -> list[Path]:
306
303
  "please provide a .fasta or .yaml file."
307
304
  )
308
305
  raise RuntimeError(msg)
309
- filtered_data.append(d)
310
- data = filtered_data
311
306
  else:
312
307
  data = [data]
313
308
 
@@ -498,25 +493,13 @@ def process_input( # noqa: C901, PLR0912, PLR0915, D103
498
493
  ) -> None:
499
494
  try:
500
495
  # Parse data
501
- if path.is_dir():
502
- # Process all YAML and FASTA files in the directory
503
- targets = []
504
- for file_path in path.glob("*"):
505
- if file_path.suffix in (".fa", ".fas", ".fasta"):
506
- target = parse_fasta(file_path, ccd, mol_dir, boltz2)
507
- targets.append(target)
508
- elif file_path.suffix in (".yml", ".yaml"):
509
- target = parse_yaml(file_path, ccd, mol_dir, boltz2)
510
- if not isinstance(target, list):
511
- target = [target]
512
- targets.extend(target)
513
- elif path.suffix in (".fa", ".fas", ".fasta"):
496
+ if path.suffix in (".fa", ".fas", ".fasta"):
514
497
  target = parse_fasta(path, ccd, mol_dir, boltz2)
515
- targets = [target]
516
498
  elif path.suffix in (".yml", ".yaml"):
517
- targets = parse_yaml(path, ccd, mol_dir, boltz2)
518
- if not isinstance(targets, list):
519
- targets = [targets]
499
+ target = parse_yaml(path, ccd, mol_dir, boltz2)
500
+ elif path.is_dir():
501
+ msg = f"Found directory {path} instead of .fasta or .yaml, skipping."
502
+ raise RuntimeError(msg) # noqa: TRY301
520
503
  else:
521
504
  msg = (
522
505
  f"Unable to parse filetype {path.suffix}, "
@@ -524,98 +507,96 @@ def process_input( # noqa: C901, PLR0912, PLR0915, D103
524
507
  )
525
508
  raise RuntimeError(msg) # noqa: TRY301
526
509
 
527
- # Process each target
528
- for target in targets:
529
- # Get target id
530
- target_id = target.record.id
531
-
532
- # Get all MSA ids and decide whether to generate MSA
533
- to_generate = {}
534
- prot_id = const.chain_type_ids["PROTEIN"]
535
- for chain in target.record.chains:
536
- # Add to generate list, assigning entity id
537
- if (chain.mol_type == prot_id) and (chain.msa_id == 0):
538
- entity_id = chain.entity_id
539
- msa_id = f"{target_id}_{entity_id}"
540
- to_generate[msa_id] = target.sequences[entity_id]
541
- chain.msa_id = msa_dir / f"{msa_id}.csv"
542
-
543
- # We do not support msa generation for non-protein chains
544
- elif chain.msa_id == 0:
545
- chain.msa_id = -1
546
-
547
- # Generate MSA
548
- if to_generate and not use_msa_server:
549
- msg = "Missing MSA's in input and --use_msa_server flag not set."
550
- raise RuntimeError(msg) # noqa: TRY301
551
-
552
- if to_generate:
553
- msg = f"Generating MSA for {path} with {len(to_generate)} protein entities."
554
- click.echo(msg)
555
- compute_msa(
556
- data=to_generate,
557
- target_id=target_id,
558
- msa_dir=msa_dir,
559
- msa_server_url=msa_server_url,
560
- msa_pairing_strategy=msa_pairing_strategy,
561
- )
510
+ # Get target id
511
+ target_id = target.record.id
512
+
513
+ # Get all MSA ids and decide whether to generate MSA
514
+ to_generate = {}
515
+ prot_id = const.chain_type_ids["PROTEIN"]
516
+ for chain in target.record.chains:
517
+ # Add to generate list, assigning entity id
518
+ if (chain.mol_type == prot_id) and (chain.msa_id == 0):
519
+ entity_id = chain.entity_id
520
+ msa_id = f"{target_id}_{entity_id}"
521
+ to_generate[msa_id] = target.sequences[entity_id]
522
+ chain.msa_id = msa_dir / f"{msa_id}.csv"
523
+
524
+ # We do not support msa generation for non-protein chains
525
+ elif chain.msa_id == 0:
526
+ chain.msa_id = -1
527
+
528
+ # Generate MSA
529
+ if to_generate and not use_msa_server:
530
+ msg = "Missing MSA's in input and --use_msa_server flag not set."
531
+ raise RuntimeError(msg) # noqa: TRY301
532
+
533
+ if to_generate:
534
+ msg = f"Generating MSA for {path} with {len(to_generate)} protein entities."
535
+ click.echo(msg)
536
+ compute_msa(
537
+ data=to_generate,
538
+ target_id=target_id,
539
+ msa_dir=msa_dir,
540
+ msa_server_url=msa_server_url,
541
+ msa_pairing_strategy=msa_pairing_strategy,
542
+ )
562
543
 
563
- # Parse MSA data
564
- msas = sorted({c.msa_id for c in target.record.chains if c.msa_id != -1})
565
- msa_id_map = {}
566
- for msa_idx, msa_id in enumerate(msas):
567
- # Check that raw MSA exists
568
- msa_path = Path(msa_id)
569
- if not msa_path.exists():
570
- msg = f"MSA file {msa_path} not found."
571
- raise FileNotFoundError(msg) # noqa: TRY301
572
-
573
- # Dump processed MSA
574
- processed = processed_msa_dir / f"{target_id}_{msa_idx}.npz"
575
- msa_id_map[msa_id] = f"{target_id}_{msa_idx}"
576
- if not processed.exists():
577
- # Parse A3M
578
- if msa_path.suffix == ".a3m":
579
- msa: MSA = parse_a3m(
580
- msa_path,
581
- taxonomy=None,
582
- max_seqs=max_msa_seqs,
583
- )
584
- elif msa_path.suffix == ".csv":
585
- msa: MSA = parse_csv(msa_path, max_seqs=max_msa_seqs)
586
- else:
587
- msg = f"MSA file {msa_path} not supported, only a3m or csv."
588
- raise RuntimeError(msg) # noqa: TRY301
589
-
590
- msa.dump(processed)
591
-
592
- # Modify records to point to processed MSA
593
- for c in target.record.chains:
594
- if (c.msa_id != -1) and (c.msa_id in msa_id_map):
595
- c.msa_id = msa_id_map[c.msa_id]
596
-
597
- # Dump templates
598
- for template_id, template in target.templates.items():
599
- name = f"{target.record.id}_{template_id}.npz"
600
- template_path = processed_templates_dir / name
601
- template.dump(template_path)
602
-
603
- # Dump constraints
604
- constraints_path = processed_constraints_dir / f"{target.record.id}.npz"
605
- target.residue_constraints.dump(constraints_path)
606
-
607
- # Dump extra molecules
608
- Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
609
- with (processed_mols_dir / f"{target.record.id}.pkl").open("wb") as f:
610
- pickle.dump(target.extra_mols, f)
611
-
612
- # Dump structure
613
- struct_path = structure_dir / f"{target.record.id}.npz"
614
- target.structure.dump(struct_path)
615
-
616
- # Dump record
617
- record_path = records_dir / f"{target.record.id}.json"
618
- target.record.dump(record_path)
544
+ # Parse MSA data
545
+ msas = sorted({c.msa_id for c in target.record.chains if c.msa_id != -1})
546
+ msa_id_map = {}
547
+ for msa_idx, msa_id in enumerate(msas):
548
+ # Check that raw MSA exists
549
+ msa_path = Path(msa_id)
550
+ if not msa_path.exists():
551
+ msg = f"MSA file {msa_path} not found."
552
+ raise FileNotFoundError(msg) # noqa: TRY301
553
+
554
+ # Dump processed MSA
555
+ processed = processed_msa_dir / f"{target_id}_{msa_idx}.npz"
556
+ msa_id_map[msa_id] = f"{target_id}_{msa_idx}"
557
+ if not processed.exists():
558
+ # Parse A3M
559
+ if msa_path.suffix == ".a3m":
560
+ msa: MSA = parse_a3m(
561
+ msa_path,
562
+ taxonomy=None,
563
+ max_seqs=max_msa_seqs,
564
+ )
565
+ elif msa_path.suffix == ".csv":
566
+ msa: MSA = parse_csv(msa_path, max_seqs=max_msa_seqs)
567
+ else:
568
+ msg = f"MSA file {msa_path} not supported, only a3m or csv."
569
+ raise RuntimeError(msg) # noqa: TRY301
570
+
571
+ msa.dump(processed)
572
+
573
+ # Modify records to point to processed MSA
574
+ for c in target.record.chains:
575
+ if (c.msa_id != -1) and (c.msa_id in msa_id_map):
576
+ c.msa_id = msa_id_map[c.msa_id]
577
+
578
+ # Dump templates
579
+ for template_id, template in target.templates.items():
580
+ name = f"{target.record.id}_{template_id}.npz"
581
+ template_path = processed_templates_dir / name
582
+ template.dump(template_path)
583
+
584
+ # Dump constraints
585
+ constraints_path = processed_constraints_dir / f"{target.record.id}.npz"
586
+ target.residue_constraints.dump(constraints_path)
587
+
588
+ # Dump extra molecules
589
+ Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
590
+ with (processed_mols_dir / f"{target.record.id}.pkl").open("wb") as f:
591
+ pickle.dump(target.extra_mols, f)
592
+
593
+ # Dump structure
594
+ struct_path = structure_dir / f"{target.record.id}.npz"
595
+ target.structure.dump(struct_path)
596
+
597
+ # Dump record
598
+ record_path = records_dir / f"{target.record.id}.json"
599
+ target.record.dump(record_path)
619
600
 
620
601
  except Exception as e: # noqa: BLE001
621
602
  import traceback
@@ -0,0 +1,63 @@
1
+ from pathlib import Path
2
+ from typing import Optional
3
+ from rdkit import Chem
4
+
5
def split_sdf_to_individuals(
    input_sdf: Path,
    output_dir: Path,
    prefix: str = "ligand_",
    start_index: int = 1,
) -> None:
    """Split a multi-molecule SDF file into one SDF file per molecule.

    Molecules are streamed from the input one at a time instead of being
    loaded into a list first, so memory use does not grow with the size of
    the input file. Entries the supplier yields as ``None`` are skipped and
    do not consume an output index, so output numbering stays contiguous.

    Parameters
    ----------
    input_sdf : Path
        Path to the input SDF file.
    output_dir : Path
        Directory where the individual files are written. It is created
        (including parents) if it does not exist.
    prefix : str, optional
        Prefix for output filenames, by default "ligand_".
    start_index : int, optional
        Index used for the first written molecule, by default 1.

    """
    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    supplier = Chem.SDMolSupplier(str(input_sdf))

    # Only molecules that are actually written advance the index.
    index = start_index
    for mol in supplier:
        if mol is None:
            continue

        output_file = output_dir / f"{prefix}{index}.sdf"

        # Close the writer even if writing fails so the file handle
        # is not leaked.
        writer = Chem.SDWriter(str(output_file))
        try:
            writer.write(mol)
        finally:
            writer.close()

        print(f"Created {output_file}")
        index += 1
43
+
44
def main():
    """Command-line entry point: parse arguments and split the given SDF file."""
    import argparse

    cli = argparse.ArgumentParser(description="Split a large SDF file into individual files")
    # Positional arguments
    cli.add_argument("input_sdf", type=str, help="Path to input SDF file")
    cli.add_argument("output_dir", type=str, help="Path to output directory")
    # Optional naming controls
    cli.add_argument("--prefix", type=str, default="ligand_", help="Prefix for output filenames")
    cli.add_argument("--start-index", type=int, default=1, help="Starting index for output filenames")

    opts = cli.parse_args()

    source_path = Path(opts.input_sdf)
    destination = Path(opts.output_dir)
    split_sdf_to_individuals(
        input_sdf=source_path,
        output_dir=destination,
        prefix=opts.prefix,
        start_index=opts.start_index,
    )
61
+
62
+ if __name__ == "__main__":
63
+ main()
@@ -0,0 +1,120 @@
1
+ from pathlib import Path
2
+ import yaml
3
+ from typing import Optional, Dict, Any
4
+ import string
5
+
6
def get_next_ligand_id(config: Dict[str, Any]) -> str:
    """Get the next available ligand ID based on existing IDs in the config.

    Candidate IDs are tried in order: "A".."Z", then "AA", "AB", ... "ZZ".

    Parameters
    ----------
    config : Dict[str, Any]
        The configuration dictionary. Must contain a "sequences" list whose
        items map an entity type to that entity's settings.

    Returns
    -------
    str
        The first candidate ID not used by any entity in the config.

    Raises
    ------
    ValueError
        If every one- and two-letter uppercase ID is already taken.

    """
    # Collect all existing IDs. An entity's "id" may be a single chain name
    # or a list of chain names (multiple copies of the same entity); a list
    # must be expanded because it cannot be stored in a set directly.
    existing_ids = set()
    for item in config["sequences"]:
        for entity in item.values():
            if "id" not in entity:
                continue
            entity_id = entity["id"]
            if isinstance(entity_id, (list, tuple)):
                existing_ids.update(entity_id)
            else:
                existing_ids.add(entity_id)

    # Find the first available letter
    for letter in string.ascii_uppercase:
        if letter not in existing_ids:
            return letter

    # If we run out of single letters, use AA, AB, etc.
    for first in string.ascii_uppercase:
        for second in string.ascii_uppercase:
            new_id = first + second
            if new_id not in existing_ids:
                return new_id

    raise ValueError("Ran out of available ligand IDs!")
39
+
40
def generate_yamls_from_sdfs(
    template_yaml: Path,
    sdf_dir: Path,
    output_dir: Path,
    yaml_prefix: str = "config_",
    start_index: int = 1,
) -> None:
    """Generate YAML files from a template and a folder of SDF files.

    One YAML file is written per ``*.sdf`` file found in ``sdf_dir`` (in
    sorted order). In each output, every "ligand" entry of the template gets
    the chosen ligand ID and the path of that SDF file, and every "affinity"
    property gets the same ID as its binder.

    Parameters
    ----------
    template_yaml : Path
        Path to the template YAML file.
    sdf_dir : Path
        Path to the directory containing SDF files.
    output_dir : Path
        Path to the output directory where YAML files will be saved.
    yaml_prefix : str, optional
        Prefix for output YAML filenames, by default "config_".
    start_index : int, optional
        Starting index for output filenames, by default 1.

    """
    # Local import, matching the file's style of importing helpers where used.
    import copy

    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load template YAML
    with open(template_yaml) as f:
        template = yaml.safe_load(f)

    # Get all SDF files
    sdf_files = sorted(sdf_dir.glob("*.sdf"))

    # The template is never mutated (each config below is a deep copy), so
    # the next free ID is the same for every output and is computed once.
    ligand_id = get_next_ligand_id(template)

    # Generate YAML for each SDF
    for i, sdf_file in enumerate(sdf_files):
        # A deep copy is required: the nested "sequences"/"properties"
        # entries are edited below, and a shallow copy would share them with
        # the template, leaking one file's edits into the next.
        config = copy.deepcopy(template)

        # Update ligand information
        for item in config["sequences"]:
            if "ligand" in item:
                item["ligand"]["id"] = ligand_id
                item["ligand"]["sdf"] = str(sdf_file)

        # Update affinity information if present; tolerate an empty
        # "properties:" key, which YAML loads as None.
        for prop in config.get("properties") or []:
            if "affinity" in prop:
                prop["affinity"]["binder"] = ligand_id

        # Write YAML file
        output_file = output_dir / f"{yaml_prefix}{start_index + i}.yaml"
        with open(output_file, "w") as f:
            yaml.dump(config, f, default_flow_style=False)

        print(f"Created {output_file} with ligand ID {ligand_id}")
98
+
99
def main():
    """Command-line entry point: parse arguments and generate the YAML files."""
    import argparse

    cli = argparse.ArgumentParser(description="Generate YAML files from a template and SDF files")
    # Positional arguments
    cli.add_argument("template_yaml", type=str, help="Path to template YAML file")
    cli.add_argument("sdf_dir", type=str, help="Path to directory containing SDF files")
    cli.add_argument("output_dir", type=str, help="Path to output directory")
    # Optional naming controls
    cli.add_argument("--yaml-prefix", type=str, default="config_", help="Prefix for output YAML filenames")
    cli.add_argument("--start-index", type=int, default=1, help="Starting index for output filenames")

    opts = cli.parse_args()

    template_path = Path(opts.template_yaml)
    ligands_dir = Path(opts.sdf_dir)
    destination = Path(opts.output_dir)
    generate_yamls_from_sdfs(
        template_yaml=template_path,
        sdf_dir=ligands_dir,
        output_dir=destination,
        yaml_prefix=opts.yaml_prefix,
        start_index=opts.start_index,
    )
118
+
119
+ if __name__ == "__main__":
120
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: boltz-vsynthes
3
- Version: 1.0.40
3
+ Version: 1.0.42
4
4
  Summary: Boltz for VSYNTHES
5
5
  Requires-Python: <3.13,>=3.10
6
6
  Description-Content-Type: text/markdown
@@ -1,5 +1,5 @@
1
1
  boltz/__init__.py,sha256=F_-so3S40iZrSZ89Ge4TS6aZqwWyZXq_H4AXGDlbA_g,187
2
- boltz/main.py,sha256=SHM-t-9wjwjTJmWR4N5SrAHxk2vgz7fTruz5shiixVc,40882
2
+ boltz/main.py,sha256=AMYdcqTLOL5Mbo8P2ix1KeNwTijH5fWNzKUnLHBNtn0,39735
3
3
  boltz/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  boltz/data/const.py,sha256=1M-88Z6HkfKY6MkNtqcj3b9P-oX9xEXluh3qM_u8dNU,26779
5
5
  boltz/data/mol.py,sha256=maOpPHEGX1VVXCIFY6pQNGF7gUBZPAfgSvuPf2QO1yc,34268
@@ -40,9 +40,9 @@ boltz/data/parse/mmcif.py,sha256=25kEXCkx-OuaawAs7cdz0fxdRu5_CCO0AV00u84PrjQ,368
40
40
  boltz/data/parse/mmcif_with_constraints.py,sha256=WHYZckSqUwu-Nb9vmVmxHmC7uxwVrF7AVUeVKsc5wGQ,51473
41
41
  boltz/data/parse/pdb.py,sha256=iybk4p2UgUy_ABGprDq_xxyPSdm1HAZsGTM0lhxVEwM,1654
42
42
  boltz/data/parse/pdb_download.py,sha256=wge-scX-lOatX0q83W1wOsaql99rYp-6uGWSHEc995M,2718
43
- boltz/data/parse/schema.py,sha256=p4KIAVzQAuApcxRLHc6-KKG7ICgLmEWVzE8Qqm6v04w,66402
43
+ boltz/data/parse/schema.py,sha256=kNu28U2_MGiecwWNlcxgaDH3WOcO0P-q2LdoSPSb66w,63826
44
44
  boltz/data/parse/sdf.py,sha256=fs3MQVClDcCzxJaeVYiDuoh-fUrYc8Tcd5Bz8ws3FKI,2052
45
- boltz/data/parse/yaml.py,sha256=M3dRQK2mMDue3bPSO_T2ThaVojSMrOV7rMY-KXQvaGQ,2047
45
+ boltz/data/parse/yaml.py,sha256=GRFRMtDD4PQ4PIpA_S1jj0vRaEu2LlZd_g4rN1zUrNo,1505
46
46
  boltz/data/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  boltz/data/sample/cluster.py,sha256=9Sx8qP7zGZOAyEspwYFtCTbGTBZnuN-zfCKFbbA_6oI,8175
48
48
  boltz/data/sample/distillation.py,sha256=ABzst2FBr_E54KqZWIHc1bYtKYr79lxRJM7PnS4ifK0,1789
@@ -107,9 +107,11 @@ boltz/model/optim/scheduler.py,sha256=nB4jz0CZ4pR4n08LQngExL_pNycIdYI8AXVoHPnZWQ
107
107
  boltz/model/potentials/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
108
  boltz/model/potentials/potentials.py,sha256=vev8Vjfs-ML1hyrdv_R8DynG4wSFahJ6nzPWp7CYQqw,17507
109
109
  boltz/model/potentials/schedules.py,sha256=m7XJjfuF9uTX3bR9VisXv1rvzJjxiD8PobXRpcBBu1c,968
110
- boltz_vsynthes-1.0.40.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
111
- boltz_vsynthes-1.0.40.dist-info/METADATA,sha256=z2kizv_5w3PrpKHsDV_GXjhzQDRxRCWWT2pOESvbcFU,7171
112
- boltz_vsynthes-1.0.40.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
113
- boltz_vsynthes-1.0.40.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
114
- boltz_vsynthes-1.0.40.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
115
- boltz_vsynthes-1.0.40.dist-info/RECORD,,
110
+ boltz/utils/sdf_splitter.py,sha256=ZHn_syOcmm-fDnJ3YEGyGv_vYz2IRzUW7vbbMSU2JBY,2108
111
+ boltz/utils/yaml_generator.py,sha256=0Lg7F5dQRX75_xR8jiFVokSprDmFzsNmhaITI4fHjao,3980
112
+ boltz_vsynthes-1.0.42.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
113
+ boltz_vsynthes-1.0.42.dist-info/METADATA,sha256=IKbpa2PuzcHwROQ7sbRRf2GXL7HmlWzHRg3NUHGrZ58,7171
114
+ boltz_vsynthes-1.0.42.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
115
+ boltz_vsynthes-1.0.42.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
116
+ boltz_vsynthes-1.0.42.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
117
+ boltz_vsynthes-1.0.42.dist-info/RECORD,,