boltz-vsynthes 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boltz/data/parse/schema.py +144 -647
- boltz/main.py +5 -1
- {boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/METADATA +1 -1
- {boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/RECORD +8 -8
- {boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/WHEEL +0 -0
- {boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/entry_points.txt +0 -0
- {boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/top_level.txt +0 -0
boltz/data/parse/schema.py
CHANGED
@@ -1005,46 +1005,31 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
     # First group items that have the same type, sequence and modifications
     items_to_group = {}
     chain_name_to_entity_type = {}
+
+    # Keep track of ligand IDs
+    ligand_id = 1
+    ligand_id_map = {}
 
+    # Parse sequences
     for item in schema["sequences"]:
-
-
-
-            msg = f"Invalid entity type: {entity_type}"
-            raise ValueError(msg)
+        entity_type = list(item.keys())[0]
+        entity_id = item[entity_type]["id"]
+        entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
 
-        # Get sequence
-        if entity_type
+        # Get sequence
+        if entity_type == "protein":
             if "sequence" in item[entity_type]:
-                seq =
+                seq = item[entity_type]["sequence"]
             elif "pdb" in item[entity_type]:
-                pdb_input =
-
-
-
-
-
-
-
-
-
-                if pdb_cache_file.exists():
-                    # Use cached file
-                    with pdb_cache_file.open("r") as f:
-                        pdb_data = f.read()
-                else:
-                    # Download and cache
-                    import urllib.request
-                    pdb_url = f"https://files.rcsb.org/download/{pdb_input.lower()}.pdb"
-                    try:
-                        with urllib.request.urlopen(pdb_url) as response:
-                            pdb_data = response.read().decode()
-                        # Cache the downloaded data
-                        with pdb_cache_file.open("w") as f:
-                            f.write(pdb_data)
-                    except Exception as e:
-                        msg = f"Failed to download PDB {pdb_input}: {str(e)}"
-                        raise RuntimeError(msg) from e
+                pdb_input = item[entity_type]["pdb"]
+                if pdb_input.startswith(("http://", "https://")):
+                    # It's a PDB ID
+                    import requests
+                    response = requests.get(f"https://files.rcsb.org/download/{pdb_input}.pdb")
+                    if response.status_code != 200:
+                        msg = f"Failed to download PDB file: {pdb_input}"
+                        raise FileNotFoundError(msg)
+                    pdb_data = response.text
                 else:
                     # It's a file path
                     pdb_path = Path(pdb_input)
@@ -1076,8 +1061,15 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
             assert "smiles" not in item[entity_type] or "ccd" not in item[entity_type]
             if "smiles" in item[entity_type]:
                 seq = str(item[entity_type]["smiles"])
+                # Map user-provided ID to internal LIG1, LIG2, etc.
+                for id in entity_id:
+                    ligand_id_map[id] = f"LIG{ligand_id}"
+                    ligand_id += 1
             else:
                 seq = str(item[entity_type]["ccd"])
+                # For CCD ligands, use the CCD code as the internal ID
+                for id in entity_id:
+                    ligand_id_map[id] = seq
 
             # Group items by entity
             items_to_group.setdefault((entity_type, seq), []).append(item)
@@ -1091,140 +1083,97 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
     # Check if any affinity ligand is present
     affinity_ligands = set()
     properties = schema.get("properties", [])
-
-
-
-
+
+    # Get all ligands
+    ligands = []
+    for item in schema["sequences"]:
+        entity_type = list(item.keys())[0]
+        if entity_type == "ligand":
+            entity_id = item[entity_type]["id"]
+            entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
+            ligands.extend(entity_id)
+
+    # Get user-specified binders
+    specified_binders = set()
     for prop in properties:
-
-        if prop_type == "affinity":
+        if "affinity" in prop:
             binder = prop["affinity"]["binder"]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    for
-        # Get entity
-
-
-
-
-        for
-
-
-            elif isinstance(item[entity_type]["id"], list):
-                ids.extend(item[entity_type]["id"])
-
-        # Check if any affinity ligand is present
-        if len(ids) == 1:
-            affinity = ids[0] in affinity_ligands
-        elif (len(ids) > 1) and any(x in affinity_ligands for x in ids):
-            msg = "Cannot compute affinity for a ligand that has multiple copies!"
-            raise ValueError(msg)
-        else:
-            affinity = False
-
-        # Ensure all the items share the same msa
-        msa = -1
+            specified_binders.add(binder)
+
+    # If no binders specified, use all proteins
+    if not specified_binders:
+        for item in schema["sequences"]:
+            entity_type = list(item.keys())[0]
+            if entity_type == "protein":
+                entity_id = item[entity_type]["id"]
+                entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
+                specified_binders.update(entity_id)
+
+    # Generate protein-ligand pairs for specified binders
+    new_properties = []
+    for binder in specified_binders:
+        for ligand in ligands:
+            if ligand in ligand_id_map:
+                ligand = ligand_id_map[ligand]  # Convert to internal LIG1, LIG2, etc.
+            affinity_ligands.add(ligand)
+            new_properties.append({
+                "affinity": {
+                    "binder": binder,
+                    "ligand": ligand
+                }
+            })
+
+    # Update schema with generated properties
+    schema["properties"] = new_properties
+
+    # Parse each group
+    chains = []
+    extra_mols = {}
+    for (entity_type, seq), items in items_to_group.items():
+        # Get entity id
+        entity_id = items[0][entity_type]["id"]
+        entity_id = [entity_id] if isinstance(entity_id, str) else entity_id
+
+        # Check if this entity has affinity
+        affinity = any(entity in affinity_ligands for entity in entity_id)
+
+        # Parse a protein
         if entity_type == "protein":
-            # Get
-            msa = items[0][entity_type].get("msa"
-            if
-                msa =
-
-
-
-
-
-                    item_msa = 0
-
-                if item_msa != msa:
-                    msg = "All proteins with the same sequence must share the same MSA!"
-                    raise ValueError(msg)
-
-            # Set the MSA, warn if passed in single-sequence mode
-            if msa == "empty":
-                msa = -1
-                msg = (
-                    "Found explicit empty MSA for some proteins, will run "
-                    "these in single sequence mode. Keep in mind that the "
-                    "model predictions will be suboptimal without an MSA."
-                )
-                click.echo(msg)
-
-            if msa not in (0, -1):
-                is_msa_custom = True
-            elif msa == 0:
-                is_msa_auto = True
-
-        # Parse a polymer
-        if entity_type in {"protein", "dna", "rna"}:
-            # Get token map
-            if entity_type == "rna":
-                token_map = const.rna_letter_to_token
-            elif entity_type == "dna":
-                token_map = const.dna_letter_to_token
-            elif entity_type == "protein":
-                token_map = const.prot_letter_to_token
+            # Get MSA
+            msa = items[0][entity_type].get("msa")
+            if msa is not None:
+                msa = Path(msa)
+                if not msa.exists():
+                    msg = f"MSA file not found: {msa}"
+                    raise FileNotFoundError(msg)
+                with msa.open("r") as f:
+                    msa_data = f.read()
             else:
-
-                raise ValueError(msg)
-
-            # Get polymer info
-            chain_type = const.chain_type_ids[entity_type.upper()]
-            unk_token = const.unk_token[entity_type.upper()]
-
-            # Extract sequence
-            raw_seq = items[0][entity_type]["sequence"]
-            entity_to_seq[entity_id] = raw_seq
-
-            # Convert sequence to tokens
-            seq = [token_map.get(c, unk_token) for c in list(raw_seq)]
+                msa_data = None
 
-            #
-
-
-
-
+            # Parse sequence
+            residues = []
+            for res_idx, code in enumerate(seq):
+                # Get mol
+                ref_mol = get_mol(code, ccd, mol_dir)
 
-
+                # Parse residue
+                residue = parse_ccd_residue(
+                    name=code,
+                    ref_mol=ref_mol,
+                    res_idx=res_idx,
+                )
+                residues.append(residue)
 
-            #
-            parsed_chain =
-                sequence=seq,
-                raw_sequence=raw_seq,
+            # Create protein chain
+            parsed_chain = ParsedChain(
                 entity=entity_id,
-
-
-
-
+                residues=residues,
+                type=const.chain_type_ids["PROTEIN"],
+                cyclic_period=0,
+                sequence=seq,
+                affinity=affinity,
+                affinity_mw=None,
             )
 
         # Parse a non-polymer
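Restated outside the parser, the new affinity handling collects every ligand, takes the user-specified binders (or every protein chain when none are given), and emits one affinity property per binder/ligand pair with ligand IDs rewritten through ligand_id_map. A self-contained sketch with a made-up two-entity schema:

schema = {
    "sequences": [
        {"protein": {"id": "A", "sequence": "MKTAYIAK"}},
        {"ligand": {"id": "L1", "smiles": "CCO"}},
    ],
    "properties": [],
}
ligand_id_map = {"L1": "LIG1"}

# Collect all ligand IDs
ligands = []
for item in schema["sequences"]:
    entity_type = list(item.keys())[0]
    if entity_type == "ligand":
        entity_id = item[entity_type]["id"]
        ligands.extend([entity_id] if isinstance(entity_id, str) else entity_id)

# Binders explicitly requested via affinity properties, else every protein chain
specified_binders = {
    prop["affinity"]["binder"] for prop in schema["properties"] if "affinity" in prop
}
if not specified_binders:
    for item in schema["sequences"]:
        entity_type = list(item.keys())[0]
        if entity_type == "protein":
            entity_id = item[entity_type]["id"]
            specified_binders.update([entity_id] if isinstance(entity_id, str) else entity_id)

# One affinity property per binder/ligand pair, ligands renamed to internal IDs
new_properties = [
    {"affinity": {"binder": binder, "ligand": ligand_id_map.get(ligand, ligand)}}
    for binder in specified_binders
    for ligand in ligands
]
print(new_properties)  # [{'affinity': {'binder': 'A', 'ligand': 'LIG1'}}]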
@@ -1298,14 +1247,16 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
 
             mol_no_h = AllChem.RemoveHs(mol, sanitize=False)
             affinity_mw = AllChem.Descriptors.MolWt(mol_no_h) if affinity else None
-
+
+            # Use the mapped internal ID (LIG1, LIG2, etc.)
+            internal_id = ligand_id_map[entity_id[0]]
+            extra_mols[internal_id] = mol_no_h
             residue = parse_ccd_residue(
-                name=
+                name=internal_id,
                 ref_mol=mol,
                 res_idx=0,
             )
 
-            ligand_id += 1
             parsed_chain = ParsedChain(
                 entity=entity_id,
                 residues=[residue],
@@ -1324,504 +1275,50 @@ def parse_boltz_schema( # noqa: C901, PLR0915, PLR0912
             msg = f"Invalid entity type: {entity_type}"
             raise ValueError(msg)
 
-
-        for item in items:
-            ids = item[entity_type]["id"]
-            if isinstance(ids, str):
-                ids = [ids]
-            for chain_name in ids:
-                chains[chain_name] = parsed_chain
-                chain_to_msa[chain_name] = msa
-
-    # Check if msa is custom or auto
-    if is_msa_custom and is_msa_auto:
-        msg = "Cannot mix custom and auto-generated MSAs in the same input!"
-        raise ValueError(msg)
-
-    # If no chains parsed fail
-    if not chains:
-        msg = "No chains parsed!"
-        raise ValueError(msg)
-
-    # Create tables
-    atom_data = []
-    bond_data = []
-    res_data = []
-    chain_data = []
-    protein_chains = set()
-    affinity_info = None
-
-    rdkit_bounds_constraint_data = []
-    chiral_atom_constraint_data = []
-    stereo_bond_constraint_data = []
-    planar_bond_constraint_data = []
-    planar_ring_5_constraint_data = []
-    planar_ring_6_constraint_data = []
-
-    # Convert parsed chains to tables
-    atom_idx = 0
-    res_idx = 0
-    asym_id = 0
-    sym_count = {}
-    chain_to_idx = {}
-
-    # Keep a mapping of (chain_name, residue_idx, atom_name) to atom_idx
-    atom_idx_map = {}
-
-    for asym_id, (chain_name, chain) in enumerate(chains.items()):
-        # Compute number of atoms and residues
-        res_num = len(chain.residues)
-        atom_num = sum(len(res.atoms) for res in chain.residues)
-
-        # Save protein chains for later
-        if chain.type == const.chain_type_ids["PROTEIN"]:
-            protein_chains.add(chain_name)
-
-        # Add affinity info
-        if chain.affinity and affinity_info is not None:
-            msg = "Cannot compute affinity for multiple ligands!"
-            raise ValueError(msg)
-
-        if chain.affinity:
-            affinity_info = AffinityInfo(
-                chain_id=asym_id,
-                mw=chain.affinity_mw,
-            )
-
-        # Find all copies of this chain in the assembly
-        entity_id = int(chain.entity)
-        sym_id = sym_count.get(entity_id, 0)
-        chain_data.append(
-            (
-                chain_name,
-                chain.type,
-                entity_id,
-                sym_id,
-                asym_id,
-                atom_idx,
-                atom_num,
-                res_idx,
-                res_num,
-                chain.cyclic_period,
-            )
-        )
-        chain_to_idx[chain_name] = asym_id
-        sym_count[entity_id] = sym_id + 1
-
-        # Add residue, atom, bond, data
-        for res in chain.residues:
-            atom_center = atom_idx + res.atom_center
-            atom_disto = atom_idx + res.atom_disto
-            res_data.append(
-                (
-                    res.name,
-                    res.type,
-                    res.idx,
-                    atom_idx,
-                    len(res.atoms),
-                    atom_center,
-                    atom_disto,
-                    res.is_standard,
-                    res.is_present,
-                )
-            )
-
-            if res.rdkit_bounds_constraints is not None:
-                for constraint in res.rdkit_bounds_constraints:
-                    rdkit_bounds_constraint_data.append( # noqa: PERF401
-                        (
-                            tuple(
-                                c_atom_idx + atom_idx
-                                for c_atom_idx in constraint.atom_idxs
-                            ),
-                            constraint.is_bond,
-                            constraint.is_angle,
-                            constraint.upper_bound,
-                            constraint.lower_bound,
-                        )
-                    )
-            if res.chiral_atom_constraints is not None:
-                for constraint in res.chiral_atom_constraints:
-                    chiral_atom_constraint_data.append( # noqa: PERF401
-                        (
-                            tuple(
-                                c_atom_idx + atom_idx
-                                for c_atom_idx in constraint.atom_idxs
-                            ),
-                            constraint.is_reference,
-                            constraint.is_r,
-                        )
-                    )
-            if res.stereo_bond_constraints is not None:
-                for constraint in res.stereo_bond_constraints:
-                    stereo_bond_constraint_data.append( # noqa: PERF401
-                        (
-                            tuple(
-                                c_atom_idx + atom_idx
-                                for c_atom_idx in constraint.atom_idxs
-                            ),
-                            constraint.is_check,
-                            constraint.is_e,
-                        )
-                    )
-            if res.planar_bond_constraints is not None:
-                for constraint in res.planar_bond_constraints:
-                    planar_bond_constraint_data.append( # noqa: PERF401
-                        (
-                            tuple(
-                                c_atom_idx + atom_idx
-                                for c_atom_idx in constraint.atom_idxs
-                            ),
-                        )
-                    )
-            if res.planar_ring_5_constraints is not None:
-                for constraint in res.planar_ring_5_constraints:
-                    planar_ring_5_constraint_data.append( # noqa: PERF401
-                        (
-                            tuple(
-                                c_atom_idx + atom_idx
-                                for c_atom_idx in constraint.atom_idxs
-                            ),
-                        )
-                    )
-            if res.planar_ring_6_constraints is not None:
-                for constraint in res.planar_ring_6_constraints:
-                    planar_ring_6_constraint_data.append( # noqa: PERF401
-                        (
-                            tuple(
-                                c_atom_idx + atom_idx
-                                for c_atom_idx in constraint.atom_idxs
-                            ),
-                        )
-                    )
-
-            for bond in res.bonds:
-                atom_1 = atom_idx + bond.atom_1
-                atom_2 = atom_idx + bond.atom_2
-                bond_data.append(
-                    (
-                        asym_id,
-                        asym_id,
-                        res_idx,
-                        res_idx,
-                        atom_1,
-                        atom_2,
-                        bond.type,
-                    )
-                )
-
-            for atom in res.atoms:
-                # Add atom to map
-                atom_idx_map[(chain_name, res.idx, atom.name)] = (
-                    asym_id,
-                    res_idx,
-                    atom_idx,
-                )
-
-                # Add atom to data
-                atom_data.append(
-                    (
-                        atom.name,
-                        atom.element,
-                        atom.charge,
-                        atom.coords,
-                        atom.conformer,
-                        atom.is_present,
-                        atom.chirality,
-                    )
-                )
-                atom_idx += 1
-
-            res_idx += 1
+        chains.append(parsed_chain)
 
     # Parse constraints
-
-
-    contact_constraints = []
-    constraints = schema.get("constraints", [])
-    for constraint in constraints:
+    constraints = []
+    for constraint in schema.get("constraints", []):
         if "bond" in constraint:
-
-
-
-
-            c1, r1, a1 = tuple(constraint["bond"]["atom1"])
-            c2, r2, a2 = tuple(constraint["bond"]["atom2"])
-            c1, r1, a1 = atom_idx_map[(c1, r1 - 1, a1)]  # 1-indexed
-            c2, r2, a2 = atom_idx_map[(c2, r2 - 1, a2)]  # 1-indexed
-            connections.append((c1, c2, r1, r2, a1, a2))
+            atom1 = constraint["bond"]["atom1"]
+            atom2 = constraint["bond"]["atom2"]
+            constraints.append(ParsedBond(atom1, atom2))
         elif "pocket" in constraint:
-            if (
-                "binder" not in constraint["pocket"]
-                or "contacts" not in constraint["pocket"]
-            ):
-                msg = f"Pocket constraint was not properly specified"
-                raise ValueError(msg)
-
-            if len(pocket_constraints) > 0 and not boltz_2:
-                msg = f"Only one pocket binders is supported in Boltz-1!"
-                raise ValueError(msg)
-
-            max_distance = constraint["pocket"].get("max_distance", 6.0)
-            if max_distance != 6.0 and not boltz_2:
-                msg = f"Max distance != 6.0 is not supported in Boltz-1!"
-                raise ValueError(msg)
-
             binder = constraint["pocket"]["binder"]
-            binder
-
-            contacts = []
-
-
-            ]:
-                if chains[chain_name].type == const.chain_type_ids["NONPOLYMER"]:
-                    # Non-polymer chains are indexed by atom name
-                    _, _, atom_idx = atom_idx_map[
-                        (chain_name, 0, residue_index_or_atom_name)
-                    ]
-                    contact = (chain_to_idx[chain_name], atom_idx)
-                else:
-                    # Polymer chains are indexed by residue index
-                    contact = (chain_to_idx[chain_name], residue_index_or_atom_name - 1)
-                contacts.append(contact)
-
-            pocket_constraints.append((binder, contacts, max_distance))
+            if binder in ligand_id_map:
+                binder = ligand_id_map[binder]  # Convert to internal LIG1, LIG2, etc.
+            contacts = constraint["pocket"]["contacts"]
+            max_distance = constraint["pocket"].get("max_distance", 6.0)
+            constraints.append(ParsedPocket(binder, contacts, max_distance))
         elif "contact" in constraint:
-
-
-                or "token2" not in constraint["contact"]
-            ):
-                msg = f"Contact constraint was not properly specified"
-                raise ValueError(msg)
-
-            if not boltz_2:
-                msg = f"Contact constraint is not supported in Boltz-1!"
-                raise ValueError(msg)
-
+            token1 = constraint["contact"]["token1"]
+            token2 = constraint["contact"]["token2"]
             max_distance = constraint["contact"].get("max_distance", 6.0)
-
-            chain_name1, residue_index_or_atom_name1 = constraint["contact"]["token1"]
-            if chains[chain_name1].type == const.chain_type_ids["NONPOLYMER"]:
-                # Non-polymer chains are indexed by atom name
-                _, _, atom_idx = atom_idx_map[
-                    (chain_name1, 0, residue_index_or_atom_name1)
-                ]
-                token1 = (chain_to_idx[chain_name1], atom_idx)
-            else:
-                # Polymer chains are indexed by residue index
-                token1 = (chain_to_idx[chain_name1], residue_index_or_atom_name1 - 1)
-
-            pocket_constraints.append((binder, contacts, max_distance))
+            constraints.append(ParsedContact(token1, token2, max_distance))
         else:
-            msg = f"Invalid constraint: {constraint}"
+            msg = f"Invalid constraint type: {list(constraint.keys())[0]}"
             raise ValueError(msg)
 
-    # Get protein sequences in this YAML
-    protein_seqs = {name: chains[name].sequence for name in protein_chains}
-
     # Parse templates
-
-
-
-
-
-
-
-
-
-
-
-
-        path = template["cif"]
-        template_id = Path(path).stem
-        chain_ids = template.get("chain_id", None)
-        template_chain_ids = template.get("template_id", None)
-
-        # Check validity of input
-        matched = False
-
-        if chain_ids is not None and not isinstance(chain_ids, list):
-            chain_ids = [chain_ids]
-        if template_chain_ids is not None and not isinstance(template_chain_ids, list):
-            template_chain_ids = [template_chain_ids]
-
-        if (
-            template_chain_ids is not None
-            and chain_ids is not None
-            and len(template_chain_ids) != len(chain_ids)
-        ):
-            matched = True
-            if len(template_chain_ids) != len(chain_ids):
-                msg = (
-                    "When providing both the chain_id and template_id, the number of"
-                    "template_ids provided must match the number of chain_ids!"
-                )
-                raise ValueError(msg)
-
-        # Get relevant chains ids
-        if chain_ids is None:
-            chain_ids = list(protein_chains)
-
-        for chain_id in chain_ids:
-            if chain_id not in protein_chains:
-                msg = (
-                    f"Chain {chain_id} assigned for template"
-                    f"{template_id} is not one of the protein chains!"
-                )
-                raise ValueError(msg)
-
-        # Get relevant template chain ids
-        parsed_template = parse_mmcif(
-            path,
-            mols=ccd,
-            moldir=mol_dir,
-            use_assembly=False,
-            compute_interfaces=False,
-        )
-        template_proteins = {
-            str(c["name"])
-            for c in parsed_template.data.chains
-            if c["mol_type"] == const.chain_type_ids["PROTEIN"]
-        }
-        if template_chain_ids is None:
-            template_chain_ids = list(template_proteins)
-
-        for chain_id in template_chain_ids:
-            if chain_id not in template_proteins:
-                msg = (
-                    f"Template chain {chain_id} assigned for template"
-                    f"{template_id} is not one of the protein chains!"
-                )
-                raise ValueError(msg)
-
-        # Compute template records
-        if matched:
-            template_records.extend(
-                get_template_records_from_matching(
-                    template_id=template_id,
-                    chain_ids=chain_ids,
-                    sequences=protein_seqs,
-                    template_chain_ids=template_chain_ids,
-                    template_sequences=parsed_template.sequences,
-                )
-            )
-        else:
-            template_records.extend(
-                get_template_records_from_search(
-                    template_id=template_id,
-                    chain_ids=chain_ids,
-                    sequences=protein_seqs,
-                    template_chain_ids=template_chain_ids,
-                    template_sequences=parsed_template.sequences,
-                )
-            )
-        # Save template
-        templates[template_id] = parsed_template.data
-
-    # Convert into datatypes
-    residues = np.array(res_data, dtype=Residue)
-    chains = np.array(chain_data, dtype=Chain)
-    interfaces = np.array([], dtype=Interface)
-    mask = np.ones(len(chain_data), dtype=bool)
-    rdkit_bounds_constraints = np.array(
-        rdkit_bounds_constraint_data, dtype=RDKitBoundsConstraint
-    )
-    chiral_atom_constraints = np.array(
-        chiral_atom_constraint_data, dtype=ChiralAtomConstraint
-    )
-    stereo_bond_constraints = np.array(
-        stereo_bond_constraint_data, dtype=StereoBondConstraint
-    )
-    planar_bond_constraints = np.array(
-        planar_bond_constraint_data, dtype=PlanarBondConstraint
-    )
-    planar_ring_5_constraints = np.array(
-        planar_ring_5_constraint_data, dtype=PlanarRing5Constraint
-    )
-    planar_ring_6_constraints = np.array(
-        planar_ring_6_constraint_data, dtype=PlanarRing6Constraint
-    )
-
-    if boltz_2:
-        atom_data = [(a[0], a[3], a[5], 0.0, 1.0) for a in atom_data]
-        connections = [(*c, const.bond_type_ids["COVALENT"]) for c in connections]
-        bond_data = bond_data + connections
-        atoms = np.array(atom_data, dtype=AtomV2)
-        bonds = np.array(bond_data, dtype=BondV2)
-        coords = [(x,) for x in atoms["coords"]]
-        coords = np.array(coords, Coords)
-        ensemble = np.array([(0, len(coords))], dtype=Ensemble)
-        data = StructureV2(
-            atoms=atoms,
-            bonds=bonds,
-            residues=residues,
-            chains=chains,
-            interfaces=interfaces,
-            mask=mask,
-            coords=coords,
-            ensemble=ensemble,
-        )
-    else:
-        bond_data = [(b[4], b[5], b[6]) for b in bond_data]
-        atom_data = [(convert_atom_name(a[0]), *a[1:]) for a in atom_data]
-        atoms = np.array(atom_data, dtype=Atom)
-        bonds = np.array(bond_data, dtype=Bond)
-        connections = np.array(connections, dtype=Connection)
-        data = Structure(
-            atoms=atoms,
-            bonds=bonds,
-            residues=residues,
-            chains=chains,
-            connections=connections,
-            interfaces=interfaces,
-            mask=mask,
-        )
-
-    # Create metadata
-    struct_info = StructureInfo(num_chains=len(chains))
-    chain_infos = []
-    for chain in chains:
-        chain_info = ChainInfo(
-            chain_id=int(chain["asym_id"]),
-            chain_name=chain["name"],
-            mol_type=int(chain["mol_type"]),
-            cluster_id=-1,
-            msa_id=chain_to_msa[chain["name"]],
-            num_residues=int(chain["res_num"]),
-            valid=True,
-            entity_id=int(chain["entity_id"]),
-        )
-        chain_infos.append(chain_info)
-
-    options = InferenceOptions(pocket_constraints=pocket_constraints)
-    record = Record(
-        id=name,
-        structure=struct_info,
-        chains=chain_infos,
-        interfaces=[],
-        inference_options=options,
-        templates=template_records,
-        affinity=affinity_info,
-    )
-
-    residue_constraints = ResidueConstraints(
-        rdkit_bounds_constraints=rdkit_bounds_constraints,
-        chiral_atom_constraints=chiral_atom_constraints,
-        stereo_bond_constraints=stereo_bond_constraints,
-        planar_bond_constraints=planar_bond_constraints,
-        planar_ring_5_constraints=planar_ring_5_constraints,
-        planar_ring_6_constraints=planar_ring_6_constraints,
-    )
-
-    return Target(
-        record=record,
-        structure=data,
-        sequences=entity_to_seq,
-        residue_constraints=residue_constraints,
+    templates = []
+    for template in schema.get("templates", []):
+        cif = template["cif"]
+        chain_id = template.get("chain_id")
+        template_id = template.get("template_id")
+        templates.append(ParsedTemplate(cif, chain_id, template_id))
+
+    # Create target
+    target = Target(
+        name=name,
+        chains=chains,
+        constraints=constraints,
         templates=templates,
         extra_mols=extra_mols,
     )
 
+    return target
+
 
 def standardize(smiles: str) -> Optional[str]:
     """Standardize a molecule and return its SMILES and a flag indicating whether the molecule is valid.
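The constraint handling is now a straight mapping from the YAML entries to small parsed objects, with pocket binders rewritten through ligand_id_map. The sketch below restates that loop with stand-in dataclasses; ParsedBond, ParsedPocket, and ParsedContact are assumed shapes here, and the real classes defined elsewhere in the package may carry different fields.

from dataclasses import dataclass
from typing import Any


@dataclass
class ParsedBond:  # stand-in, assumed shape
    atom1: Any
    atom2: Any


@dataclass
class ParsedPocket:  # stand-in, assumed shape
    binder: str
    contacts: list
    max_distance: float


@dataclass
class ParsedContact:  # stand-in, assumed shape
    token1: Any
    token2: Any
    max_distance: float


def parse_constraints(schema: dict, ligand_id_map: dict) -> list:
    # Restatement of the simplified loop in the new parser
    constraints = []
    for constraint in schema.get("constraints", []):
        if "bond" in constraint:
            constraints.append(
                ParsedBond(constraint["bond"]["atom1"], constraint["bond"]["atom2"])
            )
        elif "pocket" in constraint:
            binder = constraint["pocket"]["binder"]
            binder = ligand_id_map.get(binder, binder)  # user ID -> internal LIG name
            contacts = constraint["pocket"]["contacts"]
            max_distance = constraint["pocket"].get("max_distance", 6.0)
            constraints.append(ParsedPocket(binder, contacts, max_distance))
        elif "contact" in constraint:
            token1 = constraint["contact"]["token1"]
            token2 = constraint["contact"]["token2"]
            max_distance = constraint["contact"].get("max_distance", 6.0)
            constraints.append(ParsedContact(token1, token2, max_distance))
        else:
            msg = f"Invalid constraint type: {list(constraint.keys())[0]}"
            raise ValueError(msg)
    return constraints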
boltz/main.py
CHANGED
@@ -742,7 +742,11 @@ def process_inputs(
 
         # Process this input file
         click.echo(f"Processing {input_file.name}")
-
+        try:
+            process_input_partial(input_file)
+        except Exception as e:
+            click.echo(f"Error processing {input_file.name}: {str(e)}")
+            continue
 
         # Copy MSA files to central MSA directory
         for msa_file in file_processed_msa_dir.glob("*.npz"):
{boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 boltz/__init__.py,sha256=F_-so3S40iZrSZ89Ge4TS6aZqwWyZXq_H4AXGDlbA_g,187
-boltz/main.py,sha256=
+boltz/main.py,sha256=w7c8dpAR0_97HIS_u76wywC1lswL4XVg98CuCrrXLvQ,41515
 boltz/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 boltz/data/const.py,sha256=1M-88Z6HkfKY6MkNtqcj3b9P-oX9xEXluh3qM_u8dNU,26779
 boltz/data/mol.py,sha256=maOpPHEGX1VVXCIFY6pQNGF7gUBZPAfgSvuPf2QO1yc,34268
@@ -38,7 +38,7 @@ boltz/data/parse/csv.py,sha256=Hcq8rJW2njczahEr8jfd_o-zxLaNSgJ3YIoC9srIqpw,2518
 boltz/data/parse/fasta.py,sha256=taI4s_CqPtyF0XaLJAsVAJHCL0GXm2g1g8Qeccdxikk,3906
 boltz/data/parse/mmcif.py,sha256=25kEXCkx-OuaawAs7cdz0fxdRu5_CCO0AV00u84PrjQ,36822
 boltz/data/parse/mmcif_with_constraints.py,sha256=WHYZckSqUwu-Nb9vmVmxHmC7uxwVrF7AVUeVKsc5wGQ,51473
-boltz/data/parse/schema.py,sha256=
+boltz/data/parse/schema.py,sha256=DvMwh1Brn4ELzBuLEk89fdYv4XBx5bX3Fq2_TMeZ-08,43352
 boltz/data/parse/yaml.py,sha256=GRFRMtDD4PQ4PIpA_S1jj0vRaEu2LlZd_g4rN1zUrNo,1505
 boltz/data/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 boltz/data/sample/cluster.py,sha256=9Sx8qP7zGZOAyEspwYFtCTbGTBZnuN-zfCKFbbA_6oI,8175
@@ -104,9 +104,9 @@ boltz/model/optim/scheduler.py,sha256=nB4jz0CZ4pR4n08LQngExL_pNycIdYI8AXVoHPnZWQ
 boltz/model/potentials/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 boltz/model/potentials/potentials.py,sha256=vev8Vjfs-ML1hyrdv_R8DynG4wSFahJ6nzPWp7CYQqw,17507
 boltz/model/potentials/schedules.py,sha256=m7XJjfuF9uTX3bR9VisXv1rvzJjxiD8PobXRpcBBu1c,968
-boltz_vsynthes-1.0.
-boltz_vsynthes-1.0.
-boltz_vsynthes-1.0.
-boltz_vsynthes-1.0.
-boltz_vsynthes-1.0.
-boltz_vsynthes-1.0.
+boltz_vsynthes-1.0.7.dist-info/licenses/LICENSE,sha256=8GZ_1eZsUeG6jdqgJJxtciWzADfgLEV4LY8sKUOsJhc,1102
+boltz_vsynthes-1.0.7.dist-info/METADATA,sha256=AQB7KiKkpIvaBZ2aMiTw1wfHE8_Vm_4D7cbJMN80J2U,7171
+boltz_vsynthes-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+boltz_vsynthes-1.0.7.dist-info/entry_points.txt,sha256=n5a5I35ntu9lmyr16oZgHPFY0b0YxjiixY7m7nbMTLc,41
+boltz_vsynthes-1.0.7.dist-info/top_level.txt,sha256=MgU3Jfb-ctWm07YGMts68PMjSh9v26D0gfG3dFRmVFA,6
+boltz_vsynthes-1.0.7.dist-info/RECORD,,
{boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/WHEEL
File without changes
{boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/entry_points.txt
File without changes
{boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/licenses/LICENSE
File without changes
{boltz_vsynthes-1.0.5.dist-info → boltz_vsynthes-1.0.7.dist-info}/top_level.txt
File without changes