boltz-vsynthes 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1005,46 +1005,31 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1005
1005
  # First group items that have the same type, sequence and modifications
1006
1006
  items_to_group = {}
1007
1007
  chain_name_to_entity_type = {}
1008
+
1009
+ # Keep track of ligand IDs
1010
+ ligand_id = 1
1011
+ ligand_id_map = {}
1008
1012
 
1013
+ # Parse sequences
1009
1014
  for item in schema["sequences"]:
1010
- # Get entity type
1011
- entity_type = next(iter(item.keys())).lower()
1012
- if entity_type not in {"protein", "dna", "rna", "ligand"}:
1013
- msg = f"Invalid entity type: {entity_type}"
1014
- raise ValueError(msg)
1015
+ entity_type = list(item.keys())[0]
1016
+ entity_id = item[entity_type]["id"]
1017
+ entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
1015
1018
 
1016
- # Get sequence or PDB
1017
- if entity_type in {"protein", "dna", "rna"}:
1019
+ # Get sequence
1020
+ if entity_type == "protein":
1018
1021
  if "sequence" in item[entity_type]:
1019
- seq = str(item[entity_type]["sequence"])
1022
+ seq = item[entity_type]["sequence"]
1020
1023
  elif "pdb" in item[entity_type]:
1021
- pdb_input = str(item[entity_type]["pdb"])
1022
- # Check if it's a PDB code (4 characters) or a file path
1023
- if len(pdb_input) == 4 and pdb_input.isalnum():
1024
- # It's a PDB code, check cache first
1025
- cache_dir = Path(os.environ.get("BOLTZ_CACHE", "~/.boltz")).expanduser()
1026
- pdb_cache_dir = cache_dir / "pdb"
1027
- pdb_cache_dir.mkdir(parents=True, exist_ok=True)
1028
-
1029
- pdb_cache_file = pdb_cache_dir / f"{pdb_input.lower()}.pdb"
1030
-
1031
- if pdb_cache_file.exists():
1032
- # Use cached file
1033
- with pdb_cache_file.open("r") as f:
1034
- pdb_data = f.read()
1035
- else:
1036
- # Download and cache
1037
- import urllib.request
1038
- pdb_url = f"https://files.rcsb.org/download/{pdb_input.lower()}.pdb"
1039
- try:
1040
- with urllib.request.urlopen(pdb_url) as response:
1041
- pdb_data = response.read().decode()
1042
- # Cache the downloaded data
1043
- with pdb_cache_file.open("w") as f:
1044
- f.write(pdb_data)
1045
- except Exception as e:
1046
- msg = f"Failed to download PDB {pdb_input}: {str(e)}"
1047
- raise RuntimeError(msg) from e
1024
+ pdb_input = item[entity_type]["pdb"]
1025
+ if pdb_input.startswith(("http://", "https://")):
1026
+ # It's a PDB ID
1027
+ import requests
1028
+ response = requests.get(f"https://files.rcsb.org/download/{pdb_input}.pdb")
1029
+ if response.status_code != 200:
1030
+ msg = f"Failed to download PDB file: {pdb_input}"
1031
+ raise FileNotFoundError(msg)
1032
+ pdb_data = response.text
1048
1033
  else:
1049
1034
  # It's a file path
1050
1035
  pdb_path = Path(pdb_input)
@@ -1076,8 +1061,15 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1076
1061
  assert "smiles" not in item[entity_type] or "ccd" not in item[entity_type]
1077
1062
  if "smiles" in item[entity_type]:
1078
1063
  seq = str(item[entity_type]["smiles"])
1064
+ # Map user-provided ID to internal LIG1, LIG2, etc.
1065
+ for id in entity_id:
1066
+ ligand_id_map[id] = f"LIG{ligand_id}"
1067
+ ligand_id += 1
1079
1068
  else:
1080
1069
  seq = str(item[entity_type]["ccd"])
1070
+ # For CCD ligands, use the CCD code as the internal ID
1071
+ for id in entity_id:
1072
+ ligand_id_map[id] = seq
1081
1073
 
1082
1074
  # Group items by entity
1083
1075
  items_to_group.setdefault((entity_type, seq), []).append(item)
@@ -1091,140 +1083,97 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1091
1083
  # Check if any affinity ligand is present
1092
1084
  affinity_ligands = set()
1093
1085
  properties = schema.get("properties", [])
1094
- if properties and not boltz_2:
1095
- msg = "Affinity prediction is only supported for Boltz2!"
1096
- raise ValueError(msg)
1097
-
1086
+
1087
+ # Get all ligands
1088
+ ligands = []
1089
+ for item in schema["sequences"]:
1090
+ entity_type = list(item.keys())[0]
1091
+ if entity_type == "ligand":
1092
+ entity_id = item[entity_type]["id"]
1093
+ entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
1094
+ ligands.extend(entity_id)
1095
+
1096
+ # Get user-specified binders
1097
+ specified_binders = set()
1098
1098
  for prop in properties:
1099
- prop_type = next(iter(prop.keys())).lower()
1100
- if prop_type == "affinity":
1099
+ if "affinity" in prop:
1101
1100
  binder = prop["affinity"]["binder"]
1102
- if not isinstance(binder, str):
1103
- # TODO: support multi residue ligands and ccd's
1104
- msg = "Binder must be a single chain."
1105
- raise ValueError(msg)
1106
-
1107
- if binder not in chain_name_to_entity_type:
1108
- msg = f"Could not find binder with name {binder} in the input!"
1109
- raise ValueError(msg)
1110
-
1111
- if chain_name_to_entity_type[binder] != "ligand":
1112
- msg = (
1113
- f"Chain {binder} is not a ligand! "
1114
- "Affinity is currently only supported for ligands."
1115
- )
1116
- raise ValueError(msg)
1117
-
1118
- affinity_ligands.add(binder)
1119
-
1120
- # Check only one affinity ligand is present
1121
- if len(affinity_ligands) > 1:
1122
- msg = "Only one affinity ligand is currently supported!"
1123
- raise ValueError(msg)
1124
-
1125
- # Go through entities and parse them
1126
- extra_mols: dict[str, Mol] = {}
1127
- chains: dict[str, ParsedChain] = {}
1128
- chain_to_msa: dict[str, str] = {}
1129
- entity_to_seq: dict[str, str] = {}
1130
- is_msa_custom = False
1131
- is_msa_auto = False
1132
- ligand_id = 1
1133
- for entity_id, items in enumerate(items_to_group.values()):
1134
- # Get entity type and sequence
1135
- entity_type = next(iter(items[0].keys())).lower()
1136
-
1137
- # Get ids
1138
- ids = []
1139
- for item in items:
1140
- if isinstance(item[entity_type]["id"], str):
1141
- ids.append(item[entity_type]["id"])
1142
- elif isinstance(item[entity_type]["id"], list):
1143
- ids.extend(item[entity_type]["id"])
1144
-
1145
- # Check if any affinity ligand is present
1146
- if len(ids) == 1:
1147
- affinity = ids[0] in affinity_ligands
1148
- elif (len(ids) > 1) and any(x in affinity_ligands for x in ids):
1149
- msg = "Cannot compute affinity for a ligand that has multiple copies!"
1150
- raise ValueError(msg)
1151
- else:
1152
- affinity = False
1153
-
1154
- # Ensure all the items share the same msa
1155
- msa = -1
1101
+ specified_binders.add(binder)
1102
+
1103
+ # If no binders specified, use all proteins
1104
+ if not specified_binders:
1105
+ for item in schema["sequences"]:
1106
+ entity_type = list(item.keys())[0]
1107
+ if entity_type == "protein":
1108
+ entity_id = item[entity_type]["id"]
1109
+ entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
1110
+ specified_binders.update(entity_id)
1111
+
1112
+ # Generate protein-ligand pairs for specified binders
1113
+ new_properties = []
1114
+ for binder in specified_binders:
1115
+ for ligand in ligands:
1116
+ if ligand in ligand_id_map:
1117
+ ligand = ligand_id_map[ligand] # Convert to internal LIG1, LIG2, etc.
1118
+ affinity_ligands.add(ligand)
1119
+ new_properties.append({
1120
+ "affinity": {
1121
+ "binder": binder,
1122
+ "ligand": ligand
1123
+ }
1124
+ })
1125
+
1126
+ # Update schema with generated properties
1127
+ schema["properties"] = new_properties
1128
+
1129
+ # Parse each group
1130
+ chains = []
1131
+ extra_mols = {}
1132
+ for (entity_type, seq), items in items_to_group.items():
1133
+ # Get entity id
1134
+ entity_id = items[0][entity_type]["id"]
1135
+ entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
1136
+
1137
+ # Check if this entity has affinity
1138
+ affinity = any(entity in affinity_ligands for entity in entity_id)
1139
+
1140
+ # Parse a protein
1156
1141
  if entity_type == "protein":
1157
- # Get the msa, default to 0, meaning auto-generated
1158
- msa = items[0][entity_type].get("msa", 0)
1159
- if (msa is None) or (msa == ""):
1160
- msa = 0
1161
-
1162
- # Check if all MSAs are the same within the same entity
1163
- for item in items:
1164
- item_msa = item[entity_type].get("msa", 0)
1165
- if (item_msa is None) or (item_msa == ""):
1166
- item_msa = 0
1167
-
1168
- if item_msa != msa:
1169
- msg = "All proteins with the same sequence must share the same MSA!"
1170
- raise ValueError(msg)
1171
-
1172
- # Set the MSA, warn if passed in single-sequence mode
1173
- if msa == "empty":
1174
- msa = -1
1175
- msg = (
1176
- "Found explicit empty MSA for some proteins, will run "
1177
- "these in single sequence mode. Keep in mind that the "
1178
- "model predictions will be suboptimal without an MSA."
1179
- )
1180
- click.echo(msg)
1181
-
1182
- if msa not in (0, -1):
1183
- is_msa_custom = True
1184
- elif msa == 0:
1185
- is_msa_auto = True
1186
-
1187
- # Parse a polymer
1188
- if entity_type in {"protein", "dna", "rna"}:
1189
- # Get token map
1190
- if entity_type == "rna":
1191
- token_map = const.rna_letter_to_token
1192
- elif entity_type == "dna":
1193
- token_map = const.dna_letter_to_token
1194
- elif entity_type == "protein":
1195
- token_map = const.prot_letter_to_token
1142
+ # Get MSA
1143
+ msa = items[0][entity_type].get("msa")
1144
+ if msa is not None:
1145
+ msa = Path(msa)
1146
+ if not msa.exists():
1147
+ msg = f"MSA file not found: {msa}"
1148
+ raise FileNotFoundError(msg)
1149
+ with msa.open("r") as f:
1150
+ msa_data = f.read()
1196
1151
  else:
1197
- msg = f"Unknown polymer type: {entity_type}"
1198
- raise ValueError(msg)
1199
-
1200
- # Get polymer info
1201
- chain_type = const.chain_type_ids[entity_type.upper()]
1202
- unk_token = const.unk_token[entity_type.upper()]
1203
-
1204
- # Extract sequence
1205
- raw_seq = items[0][entity_type]["sequence"]
1206
- entity_to_seq[entity_id] = raw_seq
1207
-
1208
- # Convert sequence to tokens
1209
- seq = [token_map.get(c, unk_token) for c in list(raw_seq)]
1152
+ msa_data = None
1210
1153
 
1211
- # Apply modifications
1212
- for mod in items[0][entity_type].get("modifications", []):
1213
- code = mod["ccd"]
1214
- idx = mod["position"] - 1 # 1-indexed
1215
- seq[idx] = code
1154
+ # Parse sequence
1155
+ residues = []
1156
+ for res_idx, code in enumerate(seq):
1157
+ # Get mol
1158
+ ref_mol = get_mol(code, ccd, mol_dir)
1216
1159
 
1217
- cyclic = items[0][entity_type].get("cyclic", False)
1160
+ # Parse residue
1161
+ residue = parse_ccd_residue(
1162
+ name=code,
1163
+ ref_mol=ref_mol,
1164
+ res_idx=res_idx,
1165
+ )
1166
+ residues.append(residue)
1218
1167
 
1219
- # Parse a polymer
1220
- parsed_chain = parse_polymer(
1221
- sequence=seq,
1222
- raw_sequence=raw_seq,
1168
+ # Create protein chain
1169
+ parsed_chain = ParsedChain(
1223
1170
  entity=entity_id,
1224
- chain_type=chain_type,
1225
- components=ccd,
1226
- cyclic=cyclic,
1227
- mol_dir=mol_dir,
1171
+ residues=residues,
1172
+ type=const.chain_type_ids["PROTEIN"],
1173
+ cyclic_period=0,
1174
+ sequence=seq,
1175
+ affinity=affinity,
1176
+ affinity_mw=None,
1228
1177
  )
1229
1178
 
1230
1179
  # Parse a non-polymer
@@ -1298,14 +1247,16 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1298
1247
 
1299
1248
  mol_no_h = AllChem.RemoveHs(mol, sanitize=False)
1300
1249
  affinity_mw = AllChem.Descriptors.MolWt(mol_no_h) if affinity else None
1301
- extra_mols[f"LIG{ligand_id}"] = mol_no_h
1250
+
1251
+ # Use the mapped internal ID (LIG1, LIG2, etc.)
1252
+ internal_id = ligand_id_map[entity_id[0]]
1253
+ extra_mols[internal_id] = mol_no_h
1302
1254
  residue = parse_ccd_residue(
1303
- name=f"LIG{ligand_id}",
1255
+ name=internal_id,
1304
1256
  ref_mol=mol,
1305
1257
  res_idx=0,
1306
1258
  )
1307
1259
 
1308
- ligand_id += 1
1309
1260
  parsed_chain = ParsedChain(
1310
1261
  entity=entity_id,
1311
1262
  residues=[residue],
@@ -1324,504 +1275,50 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
1324
1275
  msg = f"Invalid entity type: {entity_type}"
1325
1276
  raise ValueError(msg)
1326
1277
 
1327
- # Add as many chains as provided ids
1328
- for item in items:
1329
- ids = item[entity_type]["id"]
1330
- if isinstance(ids, str):
1331
- ids = [ids]
1332
- for chain_name in ids:
1333
- chains[chain_name] = parsed_chain
1334
- chain_to_msa[chain_name] = msa
1335
-
1336
- # Check if msa is custom or auto
1337
- if is_msa_custom and is_msa_auto:
1338
- msg = "Cannot mix custom and auto-generated MSAs in the same input!"
1339
- raise ValueError(msg)
1340
-
1341
- # If no chains parsed fail
1342
- if not chains:
1343
- msg = "No chains parsed!"
1344
- raise ValueError(msg)
1345
-
1346
- # Create tables
1347
- atom_data = []
1348
- bond_data = []
1349
- res_data = []
1350
- chain_data = []
1351
- protein_chains = set()
1352
- affinity_info = None
1353
-
1354
- rdkit_bounds_constraint_data = []
1355
- chiral_atom_constraint_data = []
1356
- stereo_bond_constraint_data = []
1357
- planar_bond_constraint_data = []
1358
- planar_ring_5_constraint_data = []
1359
- planar_ring_6_constraint_data = []
1360
-
1361
- # Convert parsed chains to tables
1362
- atom_idx = 0
1363
- res_idx = 0
1364
- asym_id = 0
1365
- sym_count = {}
1366
- chain_to_idx = {}
1367
-
1368
- # Keep a mapping of (chain_name, residue_idx, atom_name) to atom_idx
1369
- atom_idx_map = {}
1370
-
1371
- for asym_id, (chain_name, chain) in enumerate(chains.items()):
1372
- # Compute number of atoms and residues
1373
- res_num = len(chain.residues)
1374
- atom_num = sum(len(res.atoms) for res in chain.residues)
1375
-
1376
- # Save protein chains for later
1377
- if chain.type == const.chain_type_ids["PROTEIN"]:
1378
- protein_chains.add(chain_name)
1379
-
1380
- # Add affinity info
1381
- if chain.affinity and affinity_info is not None:
1382
- msg = "Cannot compute affinity for multiple ligands!"
1383
- raise ValueError(msg)
1384
-
1385
- if chain.affinity:
1386
- affinity_info = AffinityInfo(
1387
- chain_id=asym_id,
1388
- mw=chain.affinity_mw,
1389
- )
1390
-
1391
- # Find all copies of this chain in the assembly
1392
- entity_id = int(chain.entity)
1393
- sym_id = sym_count.get(entity_id, 0)
1394
- chain_data.append(
1395
- (
1396
- chain_name,
1397
- chain.type,
1398
- entity_id,
1399
- sym_id,
1400
- asym_id,
1401
- atom_idx,
1402
- atom_num,
1403
- res_idx,
1404
- res_num,
1405
- chain.cyclic_period,
1406
- )
1407
- )
1408
- chain_to_idx[chain_name] = asym_id
1409
- sym_count[entity_id] = sym_id + 1
1410
-
1411
- # Add residue, atom, bond, data
1412
- for res in chain.residues:
1413
- atom_center = atom_idx + res.atom_center
1414
- atom_disto = atom_idx + res.atom_disto
1415
- res_data.append(
1416
- (
1417
- res.name,
1418
- res.type,
1419
- res.idx,
1420
- atom_idx,
1421
- len(res.atoms),
1422
- atom_center,
1423
- atom_disto,
1424
- res.is_standard,
1425
- res.is_present,
1426
- )
1427
- )
1428
-
1429
- if res.rdkit_bounds_constraints is not None:
1430
- for constraint in res.rdkit_bounds_constraints:
1431
- rdkit_bounds_constraint_data.append( # noqa: PERF401
1432
- (
1433
- tuple(
1434
- c_atom_idx + atom_idx
1435
- for c_atom_idx in constraint.atom_idxs
1436
- ),
1437
- constraint.is_bond,
1438
- constraint.is_angle,
1439
- constraint.upper_bound,
1440
- constraint.lower_bound,
1441
- )
1442
- )
1443
- if res.chiral_atom_constraints is not None:
1444
- for constraint in res.chiral_atom_constraints:
1445
- chiral_atom_constraint_data.append( # noqa: PERF401
1446
- (
1447
- tuple(
1448
- c_atom_idx + atom_idx
1449
- for c_atom_idx in constraint.atom_idxs
1450
- ),
1451
- constraint.is_reference,
1452
- constraint.is_r,
1453
- )
1454
- )
1455
- if res.stereo_bond_constraints is not None:
1456
- for constraint in res.stereo_bond_constraints:
1457
- stereo_bond_constraint_data.append( # noqa: PERF401
1458
- (
1459
- tuple(
1460
- c_atom_idx + atom_idx
1461
- for c_atom_idx in constraint.atom_idxs
1462
- ),
1463
- constraint.is_check,
1464
- constraint.is_e,
1465
- )
1466
- )
1467
- if res.planar_bond_constraints is not None:
1468
- for constraint in res.planar_bond_constraints:
1469
- planar_bond_constraint_data.append( # noqa: PERF401
1470
- (
1471
- tuple(
1472
- c_atom_idx + atom_idx
1473
- for c_atom_idx in constraint.atom_idxs
1474
- ),
1475
- )
1476
- )
1477
- if res.planar_ring_5_constraints is not None:
1478
- for constraint in res.planar_ring_5_constraints:
1479
- planar_ring_5_constraint_data.append( # noqa: PERF401
1480
- (
1481
- tuple(
1482
- c_atom_idx + atom_idx
1483
- for c_atom_idx in constraint.atom_idxs
1484
- ),
1485
- )
1486
- )
1487
- if res.planar_ring_6_constraints is not None:
1488
- for constraint in res.planar_ring_6_constraints:
1489
- planar_ring_6_constraint_data.append( # noqa: PERF401
1490
- (
1491
- tuple(
1492
- c_atom_idx + atom_idx
1493
- for c_atom_idx in constraint.atom_idxs
1494
- ),
1495
- )
1496
- )
1497
-
1498
- for bond in res.bonds:
1499
- atom_1 = atom_idx + bond.atom_1
1500
- atom_2 = atom_idx + bond.atom_2
1501
- bond_data.append(
1502
- (
1503
- asym_id,
1504
- asym_id,
1505
- res_idx,
1506
- res_idx,
1507
- atom_1,
1508
- atom_2,
1509
- bond.type,
1510
- )
1511
- )
1512
-
1513
- for atom in res.atoms:
1514
- # Add atom to map
1515
- atom_idx_map[(chain_name, res.idx, atom.name)] = (
1516
- asym_id,
1517
- res_idx,
1518
- atom_idx,
1519
- )
1520
-
1521
- # Add atom to data
1522
- atom_data.append(
1523
- (
1524
- atom.name,
1525
- atom.element,
1526
- atom.charge,
1527
- atom.coords,
1528
- atom.conformer,
1529
- atom.is_present,
1530
- atom.chirality,
1531
- )
1532
- )
1533
- atom_idx += 1
1534
-
1535
- res_idx += 1
1278
+ chains.append(parsed_chain)
1536
1279
 
1537
1280
  # Parse constraints
1538
- connections = []
1539
- pocket_constraints = []
1540
- contact_constraints = []
1541
- constraints = schema.get("constraints", [])
1542
- for constraint in constraints:
1281
+ constraints = []
1282
+ for constraint in schema.get("constraints", []):
1543
1283
  if "bond" in constraint:
1544
- if "atom1" not in constraint["bond"] or "atom2" not in constraint["bond"]:
1545
- msg = f"Bond constraint was not properly specified"
1546
- raise ValueError(msg)
1547
-
1548
- c1, r1, a1 = tuple(constraint["bond"]["atom1"])
1549
- c2, r2, a2 = tuple(constraint["bond"]["atom2"])
1550
- c1, r1, a1 = atom_idx_map[(c1, r1 - 1, a1)] # 1-indexed
1551
- c2, r2, a2 = atom_idx_map[(c2, r2 - 1, a2)] # 1-indexed
1552
- connections.append((c1, c2, r1, r2, a1, a2))
1284
+ atom1 = constraint["bond"]["atom1"]
1285
+ atom2 = constraint["bond"]["atom2"]
1286
+ constraints.append(ParsedBond(atom1, atom2))
1553
1287
  elif "pocket" in constraint:
1554
- if (
1555
- "binder" not in constraint["pocket"]
1556
- or "contacts" not in constraint["pocket"]
1557
- ):
1558
- msg = f"Pocket constraint was not properly specified"
1559
- raise ValueError(msg)
1560
-
1561
- if len(pocket_constraints) > 0 and not boltz_2:
1562
- msg = f"Only one pocket binders is supported in Boltz-1!"
1563
- raise ValueError(msg)
1564
-
1565
- max_distance = constraint["pocket"].get("max_distance", 6.0)
1566
- if max_distance != 6.0 and not boltz_2:
1567
- msg = f"Max distance != 6.0 is not supported in Boltz-1!"
1568
- raise ValueError(msg)
1569
-
1570
1288
  binder = constraint["pocket"]["binder"]
1571
- binder = chain_to_idx[binder]
1572
-
1573
- contacts = []
1574
- for chain_name, residue_index_or_atom_name in constraint["pocket"][
1575
- "contacts"
1576
- ]:
1577
- if chains[chain_name].type == const.chain_type_ids["NONPOLYMER"]:
1578
- # Non-polymer chains are indexed by atom name
1579
- _, _, atom_idx = atom_idx_map[
1580
- (chain_name, 0, residue_index_or_atom_name)
1581
- ]
1582
- contact = (chain_to_idx[chain_name], atom_idx)
1583
- else:
1584
- # Polymer chains are indexed by residue index
1585
- contact = (chain_to_idx[chain_name], residue_index_or_atom_name - 1)
1586
- contacts.append(contact)
1587
-
1588
- pocket_constraints.append((binder, contacts, max_distance))
1289
+ if binder in ligand_id_map:
1290
+ binder = ligand_id_map[binder] # Convert to internal LIG1, LIG2, etc.
1291
+ contacts = constraint["pocket"]["contacts"]
1292
+ max_distance = constraint["pocket"].get("max_distance", 6.0)
1293
+ constraints.append(ParsedPocket(binder, contacts, max_distance))
1589
1294
  elif "contact" in constraint:
1590
- if (
1591
- "token1" not in constraint["contact"]
1592
- or "token2" not in constraint["contact"]
1593
- ):
1594
- msg = f"Contact constraint was not properly specified"
1595
- raise ValueError(msg)
1596
-
1597
- if not boltz_2:
1598
- msg = f"Contact constraint is not supported in Boltz-1!"
1599
- raise ValueError(msg)
1600
-
1295
+ token1 = constraint["contact"]["token1"]
1296
+ token2 = constraint["contact"]["token2"]
1601
1297
  max_distance = constraint["contact"].get("max_distance", 6.0)
1602
-
1603
- chain_name1, residue_index_or_atom_name1 = constraint["contact"]["token1"]
1604
- if chains[chain_name1].type == const.chain_type_ids["NONPOLYMER"]:
1605
- # Non-polymer chains are indexed by atom name
1606
- _, _, atom_idx = atom_idx_map[
1607
- (chain_name1, 0, residue_index_or_atom_name1)
1608
- ]
1609
- token1 = (chain_to_idx[chain_name1], atom_idx)
1610
- else:
1611
- # Polymer chains are indexed by residue index
1612
- token1 = (chain_to_idx[chain_name1], residue_index_or_atom_name1 - 1)
1613
-
1614
- pocket_constraints.append((binder, contacts, max_distance))
1298
+ constraints.append(ParsedContact(token1, token2, max_distance))
1615
1299
  else:
1616
- msg = f"Invalid constraint: {constraint}"
1300
+ msg = f"Invalid constraint type: {list(constraint.keys())[0]}"
1617
1301
  raise ValueError(msg)
1618
1302
 
1619
- # Get protein sequences in this YAML
1620
- protein_seqs = {name: chains[name].sequence for name in protein_chains}
1621
-
1622
1303
  # Parse templates
1623
- template_schema = schema.get("templates", [])
1624
- if template_schema and not boltz_2:
1625
- msg = "Templates are not supported in Boltz 1.0!"
1626
- raise ValueError(msg)
1627
-
1628
- templates = {}
1629
- template_records = []
1630
- for template in template_schema:
1631
- if "cif" not in template:
1632
- msg = "Template was not properly specified, missing CIF path!"
1633
- raise ValueError(msg)
1634
-
1635
- path = template["cif"]
1636
- template_id = Path(path).stem
1637
- chain_ids = template.get("chain_id", None)
1638
- template_chain_ids = template.get("template_id", None)
1639
-
1640
- # Check validity of input
1641
- matched = False
1642
-
1643
- if chain_ids is not None and not isinstance(chain_ids, list):
1644
- chain_ids = [chain_ids]
1645
- if template_chain_ids is not None and not isinstance(template_chain_ids, list):
1646
- template_chain_ids = [template_chain_ids]
1647
-
1648
- if (
1649
- template_chain_ids is not None
1650
- and chain_ids is not None
1651
- and len(template_chain_ids) != len(chain_ids)
1652
- ):
1653
- matched = True
1654
- if len(template_chain_ids) != len(chain_ids):
1655
- msg = (
1656
- "When providing both the chain_id and template_id, the number of"
1657
- "template_ids provided must match the number of chain_ids!"
1658
- )
1659
- raise ValueError(msg)
1660
-
1661
- # Get relevant chains ids
1662
- if chain_ids is None:
1663
- chain_ids = list(protein_chains)
1664
-
1665
- for chain_id in chain_ids:
1666
- if chain_id not in protein_chains:
1667
- msg = (
1668
- f"Chain {chain_id} assigned for template"
1669
- f"{template_id} is not one of the protein chains!"
1670
- )
1671
- raise ValueError(msg)
1672
-
1673
- # Get relevant template chain ids
1674
- parsed_template = parse_mmcif(
1675
- path,
1676
- mols=ccd,
1677
- moldir=mol_dir,
1678
- use_assembly=False,
1679
- compute_interfaces=False,
1680
- )
1681
- template_proteins = {
1682
- str(c["name"])
1683
- for c in parsed_template.data.chains
1684
- if c["mol_type"] == const.chain_type_ids["PROTEIN"]
1685
- }
1686
- if template_chain_ids is None:
1687
- template_chain_ids = list(template_proteins)
1688
-
1689
- for chain_id in template_chain_ids:
1690
- if chain_id not in template_proteins:
1691
- msg = (
1692
- f"Template chain {chain_id} assigned for template"
1693
- f"{template_id} is not one of the protein chains!"
1694
- )
1695
- raise ValueError(msg)
1696
-
1697
- # Compute template records
1698
- if matched:
1699
- template_records.extend(
1700
- get_template_records_from_matching(
1701
- template_id=template_id,
1702
- chain_ids=chain_ids,
1703
- sequences=protein_seqs,
1704
- template_chain_ids=template_chain_ids,
1705
- template_sequences=parsed_template.sequences,
1706
- )
1707
- )
1708
- else:
1709
- template_records.extend(
1710
- get_template_records_from_search(
1711
- template_id=template_id,
1712
- chain_ids=chain_ids,
1713
- sequences=protein_seqs,
1714
- template_chain_ids=template_chain_ids,
1715
- template_sequences=parsed_template.sequences,
1716
- )
1717
- )
1718
- # Save template
1719
- templates[template_id] = parsed_template.data
1720
-
1721
- # Convert into datatypes
1722
- residues = np.array(res_data, dtype=Residue)
1723
- chains = np.array(chain_data, dtype=Chain)
1724
- interfaces = np.array([], dtype=Interface)
1725
- mask = np.ones(len(chain_data), dtype=bool)
1726
- rdkit_bounds_constraints = np.array(
1727
- rdkit_bounds_constraint_data, dtype=RDKitBoundsConstraint
1728
- )
1729
- chiral_atom_constraints = np.array(
1730
- chiral_atom_constraint_data, dtype=ChiralAtomConstraint
1731
- )
1732
- stereo_bond_constraints = np.array(
1733
- stereo_bond_constraint_data, dtype=StereoBondConstraint
1734
- )
1735
- planar_bond_constraints = np.array(
1736
- planar_bond_constraint_data, dtype=PlanarBondConstraint
1737
- )
1738
- planar_ring_5_constraints = np.array(
1739
- planar_ring_5_constraint_data, dtype=PlanarRing5Constraint
1740
- )
1741
- planar_ring_6_constraints = np.array(
1742
- planar_ring_6_constraint_data, dtype=PlanarRing6Constraint
1743
- )
1744
-
1745
- if boltz_2:
1746
- atom_data = [(a[0], a[3], a[5], 0.0, 1.0) for a in atom_data]
1747
- connections = [(*c, const.bond_type_ids["COVALENT"]) for c in connections]
1748
- bond_data = bond_data + connections
1749
- atoms = np.array(atom_data, dtype=AtomV2)
1750
- bonds = np.array(bond_data, dtype=BondV2)
1751
- coords = [(x,) for x in atoms["coords"]]
1752
- coords = np.array(coords, Coords)
1753
- ensemble = np.array([(0, len(coords))], dtype=Ensemble)
1754
- data = StructureV2(
1755
- atoms=atoms,
1756
- bonds=bonds,
1757
- residues=residues,
1758
- chains=chains,
1759
- interfaces=interfaces,
1760
- mask=mask,
1761
- coords=coords,
1762
- ensemble=ensemble,
1763
- )
1764
- else:
1765
- bond_data = [(b[4], b[5], b[6]) for b in bond_data]
1766
- atom_data = [(convert_atom_name(a[0]), *a[1:]) for a in atom_data]
1767
- atoms = np.array(atom_data, dtype=Atom)
1768
- bonds = np.array(bond_data, dtype=Bond)
1769
- connections = np.array(connections, dtype=Connection)
1770
- data = Structure(
1771
- atoms=atoms,
1772
- bonds=bonds,
1773
- residues=residues,
1774
- chains=chains,
1775
- connections=connections,
1776
- interfaces=interfaces,
1777
- mask=mask,
1778
- )
1779
-
1780
- # Create metadata
1781
- struct_info = StructureInfo(num_chains=len(chains))
1782
- chain_infos = []
1783
- for chain in chains:
1784
- chain_info = ChainInfo(
1785
- chain_id=int(chain["asym_id"]),
1786
- chain_name=chain["name"],
1787
- mol_type=int(chain["mol_type"]),
1788
- cluster_id=-1,
1789
- msa_id=chain_to_msa[chain["name"]],
1790
- num_residues=int(chain["res_num"]),
1791
- valid=True,
1792
- entity_id=int(chain["entity_id"]),
1793
- )
1794
- chain_infos.append(chain_info)
1795
-
1796
- options = InferenceOptions(pocket_constraints=pocket_constraints)
1797
- record = Record(
1798
- id=name,
1799
- structure=struct_info,
1800
- chains=chain_infos,
1801
- interfaces=[],
1802
- inference_options=options,
1803
- templates=template_records,
1804
- affinity=affinity_info,
1805
- )
1806
-
1807
- residue_constraints = ResidueConstraints(
1808
- rdkit_bounds_constraints=rdkit_bounds_constraints,
1809
- chiral_atom_constraints=chiral_atom_constraints,
1810
- stereo_bond_constraints=stereo_bond_constraints,
1811
- planar_bond_constraints=planar_bond_constraints,
1812
- planar_ring_5_constraints=planar_ring_5_constraints,
1813
- planar_ring_6_constraints=planar_ring_6_constraints,
1814
- )
1815
-
1816
- return Target(
1817
- record=record,
1818
- structure=data,
1819
- sequences=entity_to_seq,
1820
- residue_constraints=residue_constraints,
1304
+ templates = []
1305
+ for template in schema.get("templates", []):
1306
+ cif = template["cif"]
1307
+ chain_id = template.get("chain_id")
1308
+ template_id = template.get("template_id")
1309
+ templates.append(ParsedTemplate(cif, chain_id, template_id))
1310
+
1311
+ # Create target
1312
+ target = Target(
1313
+ name=name,
1314
+ chains=chains,
1315
+ constraints=constraints,
1821
1316
  templates=templates,
1822
1317
  extra_mols=extra_mols,
1823
1318
  )
1824
1319
 
1320
+ return target
1321
+
1825
1322
 
1826
1323
  def standardize(smiles: str) -> Optional[str]:
1827
1324
  """Standardize a molecule and return its SMILES and a flag indicating whether the molecule is valid.
boltz/main.py CHANGED
@@ -742,7 +742,11 @@ def process_inputs(
742
742
 
743
743
  # Process this input file
744
744
  click.echo(f"Processing {input_file.name}")
745
- process_input_partial(input_file)
745
+ try:
746
+ process_input_partial(input_file)
747
+ except Exception as e:
748
+ click.echo(f"Error processing {input_file.name}: {str(e)}")
749
+ continue
746
750
 
747
751
  # Copy MSA files to central MSA directory
748
752
  for msa_file in file_processed_msa_dir.glob("*.npz"):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: boltz-vsynthes
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: Boltz for V-Synthes
5
5
  Requires-Python: <3.13,>=3.10
6
6
  Description-Content-Type: text/markdown
@@ -1,5 +1,5 @@
1
1
  boltz/__init__.py,sha256=F_-so3S40iZrSZ89Ge4TS6aZqwWyZXq_H4AXGDlbA_g,187
2
- boltz/main.py,sha256=sF_fNSzOElFhnlUBrnRidY1Dg_dduIHl23CREMo_ICc,41374
2
+ boltz/main.py,sha256=w7c8dpAR0_97HIS_u76wywC1lswL4XVg98CuCrrXLvQ,41515
3
3
  boltz/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  boltz/data/const.py,sha256=1M-88Z6HkfKY6MkNtqcj3b9P-oX9xEXluh3qM_u8dNU,26779
5
5
  boltz/data/mol.py,sha256=maOpPHEGX1VVXCIFY6pQNGF7gUBZPAfgSvuPf2QO1yc,34268
@@ -38,7 +38,7 @@ boltz/data/parse/csv.py,sha256=Hcq8rJW2njczahEr8jfd_o-zxLaNSgJ3YIoC9srIqpw,2518
38
38
  boltz/data/parse/fasta.py,sha256=taI4s_CqPtyF0XaLJAsVAJHCL0GXm2g1g8Qeccdxikk,3906
39
39
  boltz/data/parse/mmcif.py,sha256=25kEXCkx-OuaawAs7cdz0fxdRu5_CCO0AV00u84PrjQ,36822
40
40
  boltz/data/parse/mmcif_with_constraints.py,sha256=WHYZckSqUwu-Nb9vmVmxHmC7uxwVrF7AVUeVKsc5wGQ,51473
41
- boltz/data/parse/schema.py,sha256=6dpgtwlPBkMCEnB6Wd-8m1l69-hgapDuNBkTGBu-p-M,62363
41
+ boltz/data/parse/schema.py,sha256=DvMwh1Brn4ELzBuLEk89fdYv4XBx5bX3Fq2_TMeZ-08,43352
42
42
  boltz/data/parse/yaml.py,sha256=GRFRMtDD4PQ4PIpA_S1jj0vRaEu2LlZd_g4rN1zUrNo,1505
43
43
  boltz/data/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  boltz/data/sample/cluster.py,sha256=9Sx8qP7zGZOAyEspwYFtCTbGTBZnuN-zfCKFbbA_6oI,8175
@@ -104,9 +104,9 @@ boltz/model/optim/scheduler.py,sha256=nB4jz0CZ4pR4n08LQngExL_pNycIdYI8AXVoHPnZWQ
104
104
  boltz/model/potentials/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
105
  boltz/model/potentials/potentials.py,sha256=vev8Vjfs-ML1hyrdv_R8DynG4wSFahJ6nzPWp7CYQqw,17507
106
106
  boltz/model/potentials/schedules.py,sha256=m7XJjfuF9uTX3bR9VisXv1rvzJjxiD8PobXRpcBBu1c,968
107
- boltz_vsynthes-1.0.5.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
108
- boltz_vsynthes-1.0.5.dist-info/METADATA,sha256=-MCCHAI1TOA1tlDaX-X6npP-HgYgJZht77XK-J9CeAI,7171
109
- boltz_vsynthes-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
110
- boltz_vsynthes-1.0.5.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
111
- boltz_vsynthes-1.0.5.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
112
- boltz_vsynthes-1.0.5.dist-info/RECORD,,
107
+ boltz_vsynthes-1.0.7.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
108
+ boltz_vsynthes-1.0.7.dist-info/METADATA,sha256=AQB7KiKkpIvaBZ2aMiTw1wfHE8_Vm_4D7cbJMN80J2U,7171
109
+ boltz_vsynthes-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
110
+ boltz_vsynthes-1.0.7.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
111
+ boltz_vsynthes-1.0.7.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
112
+ boltz_vsynthes-1.0.7.dist-info/RECORD,,