openprotein-python 0.8.2__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openprotein/__init__.py +164 -0
- openprotein/_version.py +48 -0
- openprotein/align/__init__.py +8 -0
- openprotein/align/align.py +395 -0
- openprotein/align/api.py +428 -0
- openprotein/align/future.py +55 -0
- openprotein/align/msa.py +129 -0
- openprotein/align/schemas.py +165 -0
- openprotein/base.py +181 -0
- openprotein/chains.py +88 -0
- openprotein/common/__init__.py +5 -0
- openprotein/common/features.py +7 -0
- openprotein/common/model_metadata.py +33 -0
- openprotein/common/reduction.py +8 -0
- openprotein/config.py +9 -0
- openprotein/csv.py +31 -0
- openprotein/data/__init__.py +9 -0
- openprotein/data/api.py +218 -0
- openprotein/data/assaydataset.py +178 -0
- openprotein/data/data.py +93 -0
- openprotein/data/schemas.py +27 -0
- openprotein/design/__init__.py +16 -0
- openprotein/design/api.py +259 -0
- openprotein/design/design.py +125 -0
- openprotein/design/future.py +146 -0
- openprotein/design/schemas.py +607 -0
- openprotein/embeddings/__init__.py +27 -0
- openprotein/embeddings/api.py +619 -0
- openprotein/embeddings/embeddings.py +151 -0
- openprotein/embeddings/esm.py +33 -0
- openprotein/embeddings/future.py +146 -0
- openprotein/embeddings/models.py +421 -0
- openprotein/embeddings/openprotein.py +21 -0
- openprotein/embeddings/poet.py +446 -0
- openprotein/embeddings/poet2.py +505 -0
- openprotein/embeddings/schemas.py +78 -0
- openprotein/errors.py +76 -0
- openprotein/fasta.py +92 -0
- openprotein/fold/__init__.py +21 -0
- openprotein/fold/alphafold2.py +131 -0
- openprotein/fold/api.py +287 -0
- openprotein/fold/boltz.py +691 -0
- openprotein/fold/esmfold.py +54 -0
- openprotein/fold/fold.py +107 -0
- openprotein/fold/future.py +509 -0
- openprotein/fold/models.py +139 -0
- openprotein/fold/schemas.py +39 -0
- openprotein/jobs/__init__.py +9 -0
- openprotein/jobs/api.py +71 -0
- openprotein/jobs/futures.py +746 -0
- openprotein/jobs/jobs.py +69 -0
- openprotein/jobs/schemas.py +135 -0
- openprotein/models/__init__.py +4 -0
- openprotein/models/base.py +63 -0
- openprotein/models/foundation/rfdiffusion.py +283 -0
- openprotein/models/models.py +33 -0
- openprotein/predictor/__init__.py +25 -0
- openprotein/predictor/api.py +384 -0
- openprotein/predictor/models.py +374 -0
- openprotein/predictor/prediction.py +79 -0
- openprotein/predictor/predictor.py +242 -0
- openprotein/predictor/schemas.py +113 -0
- openprotein/predictor/validate.py +40 -0
- openprotein/prompt/__init__.py +9 -0
- openprotein/prompt/api.py +505 -0
- openprotein/prompt/models.py +142 -0
- openprotein/prompt/prompt.py +130 -0
- openprotein/prompt/schemas.py +49 -0
- openprotein/protein.py +587 -0
- openprotein/svd/__init__.py +9 -0
- openprotein/svd/api.py +206 -0
- openprotein/svd/models.py +288 -0
- openprotein/svd/schemas.py +31 -0
- openprotein/svd/svd.py +134 -0
- openprotein/umap/__init__.py +9 -0
- openprotein/umap/api.py +259 -0
- openprotein/umap/models.py +211 -0
- openprotein/umap/schemas.py +35 -0
- openprotein/umap/umap.py +175 -0
- openprotein/utils/uuid.py +29 -0
- openprotein_python-0.8.2.dist-info/METADATA +176 -0
- openprotein_python-0.8.2.dist-info/RECORD +84 -0
- openprotein_python-0.8.2.dist-info/WHEEL +4 -0
- openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
openprotein/protein.py
ADDED
|
@@ -0,0 +1,587 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Literal
|
|
5
|
+
|
|
6
|
+
import gemmi
|
|
7
|
+
import numpy as np
|
|
8
|
+
import numpy.typing as npt
|
|
9
|
+
|
|
10
|
+
from . import fasta
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from openprotein.align import MSAFuture
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Heavy-atom names in the order used along the atom axis of coordinate arrays:
# the position of a name in this tuple is its index in that axis.
# fmt: off
_ATOM_TYPES = (
    "N", "CA", "C", "CB", "O", "CG", "CG1", "CG2", "OG", "OG1", "SG", "CD",
    "CD1", "CD2", "ND1", "ND2", "OD1", "OD2", "SD", "CE", "CE1", "CE2", "CE3",
    "NE", "NE1", "NE2", "OE1", "OE2", "CH2", "NH1", "NH2", "OH", "CZ", "CZ2",
    "CZ3", "NZ", "OXT",
)
# fmt: on

# Number of atom slots per residue.
_N_ATOM = len(_ATOM_TYPES)

# Reverse lookup: atom name -> index along the atom axis.
_ATOM_TYPE_TO_IDX = dict(zip(_ATOM_TYPES, range(_N_ATOM)))

_BACKBONE_ATOM_TYPES = ("N", "CA", "C")

# Sentinel written to the B-factor column when a residue's pLDDT is NaN, and
# mapped back to NaN when a structure is read.
_NAN_BFACTOR_VALUE = 9999.75  # can't/hard to use 9999.99 due to precision issues
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def calc_rmsd(
    xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
) -> tuple[float, npt.NDArray[np.floating]]:
    """
    Calculate the RMSD between two sets of atoms (L, 3) after superposition.

    Each point set is centered on its own centroid, then the rotation that best
    superimposes `xyz2` onto `xyz1` is derived from an SVD of their covariance.

    Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719

    Args:
        xyz1: reference coordinates of shape (L, 3).
        xyz2: coordinates of shape (L, 3) to superimpose onto `xyz1`.
        eps: small constant added under the square root.

    Returns:
        A tuple `(rmsd, U)` where `U` is the (3, 3) matrix applied as
        `centered_xyz2 @ U` to superimpose it onto the centered `xyz1`.
    """
    # Remove the translation component by centering each set on its centroid.
    reference = xyz1 - xyz1.mean(0)
    mobile = xyz2 - xyz2.mean(0)

    # Optimal rotation from the SVD of the covariance matrix.
    covariance = mobile.T @ reference
    left, _singular_values, right = np.linalg.svd(covariance)

    # Flip the last column when needed so the result is right-handed.
    handedness = np.ones([3, 3])
    handedness[:, -1] = np.sign(np.linalg.det(left) * np.linalg.det(right))
    rotation = (handedness * left) @ right

    # Rotate the mobile set and measure the residual deviation.
    rotated = mobile @ rotation
    n_atoms = rotated.shape[0]
    squared_error = (rotated - reference) * (rotated - reference)
    rmsd = np.sqrt(np.sum(squared_error, axis=(0, 1)) / n_atoms + eps)

    return rmsd, rotation
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Protein:
|
|
65
|
+
"""
|
|
66
|
+
Represents a protein with optional sequence, atomic coordinates, per-residue
|
|
67
|
+
confidence scores (pLDDT), and name.
|
|
68
|
+
|
|
69
|
+
This class supports partial or complete information: users may initialize a Protein
|
|
70
|
+
with only a sequence, only a structure, or both. The class ensures that all
|
|
71
|
+
provided fields have consistent residue-level lengths and provides convenient
|
|
72
|
+
methods for indexing, masking, and structural comparisons.
|
|
73
|
+
|
|
74
|
+
Attributes:
|
|
75
|
+
sequence: Amino acid sequence as bytes. Unknown or masked residues are
|
|
76
|
+
represented as b"X".
|
|
77
|
+
coordinates: an array containing the 3D coordinates of the heavy atoms of the
|
|
78
|
+
protein in atom37 format. It has shape `(L, 37, 3)`, where `L` is the
|
|
79
|
+
length of the protein, `37` is the number of heavy atoms, and `3` is the
|
|
80
|
+
number of coordinates (x, y, and z).
|
|
81
|
+
plddt: an array of shape `(L,)`. For predicted structures, this contains the
|
|
82
|
+
pLDDT of each residue, which is a measure of prediction confidence. For
|
|
83
|
+
experimental structures, this should be set to `100` if the coordinates of
|
|
84
|
+
the alpha carbon are known, and `NaN` otherwise.
|
|
85
|
+
name: Optional identifier for the protein as a string.
|
|
86
|
+
|
|
87
|
+
Conventions:
|
|
88
|
+
- Missing or unknown residues in the sequence are denoted by b"X".
|
|
89
|
+
- Missing structural data (coordinates or pLDDT) are represented by NaN.
|
|
90
|
+
- Residue indices are 1-based for user-facing methods (e.g., `mask_sequence_at`),
|
|
91
|
+
but internally stored as 0-based arrays.
|
|
92
|
+
|
|
93
|
+
Examples:
|
|
94
|
+
Create a Protein from sequence only:
|
|
95
|
+
Protein(sequence="ACDEFGHIK")
|
|
96
|
+
|
|
97
|
+
Create a Protein from sequence and name:
|
|
98
|
+
Protein(sequence="ACDEFGHIK", name="my_protein")
|
|
99
|
+
|
|
100
|
+
Create a Protein with sequence and structure:
|
|
101
|
+
Protein(sequence="ACD", coordinates=coords_array, plddt=plddt_array)
|
|
102
|
+
|
|
103
|
+
Raises:
|
|
104
|
+
ValueError: If sequence, coordinates, or pLDDT are specified with inconsistent lengths.
|
|
105
|
+
ValueError: If none of sequence, coordinates, or pLDDT are provided.
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
def __init__(
|
|
109
|
+
self,
|
|
110
|
+
sequence: bytes | str | None = None,
|
|
111
|
+
coordinates: npt.NDArray[np.float32] | None = None,
|
|
112
|
+
plddt: npt.NDArray[np.float32] | None = None,
|
|
113
|
+
name: bytes | str | None = None,
|
|
114
|
+
):
|
|
115
|
+
lengths = {len(x) for x in (sequence, coordinates, plddt) if x is not None}
|
|
116
|
+
if len(lengths) == 0:
|
|
117
|
+
raise ValueError(
|
|
118
|
+
"At least one of sequence, coordinates, or plddt must be specified."
|
|
119
|
+
)
|
|
120
|
+
elif len(lengths) > 1:
|
|
121
|
+
raise ValueError(
|
|
122
|
+
"Specified sequence, coordinates, and plddt must all have the same length."
|
|
123
|
+
)
|
|
124
|
+
length = next(iter(lengths))
|
|
125
|
+
if sequence is not None:
|
|
126
|
+
self._sequence = (
|
|
127
|
+
sequence.encode() if isinstance(sequence, str) else sequence
|
|
128
|
+
)
|
|
129
|
+
else:
|
|
130
|
+
self._sequence = b"X" * length
|
|
131
|
+
if coordinates is not None:
|
|
132
|
+
self._coordinates = coordinates
|
|
133
|
+
else:
|
|
134
|
+
self._coordinates = np.full((length, _N_ATOM, 3), np.nan, dtype=np.float32)
|
|
135
|
+
if plddt is not None:
|
|
136
|
+
self._plddt = plddt
|
|
137
|
+
else:
|
|
138
|
+
self._plddt = np.full((length,), np.nan, dtype=np.float32)
|
|
139
|
+
if name is not None:
|
|
140
|
+
self._name = name if isinstance(name, str) else name.decode()
|
|
141
|
+
else:
|
|
142
|
+
self._name = name
|
|
143
|
+
self._tags = {}
|
|
144
|
+
|
|
145
|
+
@property
|
|
146
|
+
def name(self) -> str | None:
|
|
147
|
+
return self._name
|
|
148
|
+
|
|
149
|
+
@name.setter
|
|
150
|
+
def name(self, x: bytes | str) -> None:
|
|
151
|
+
self._name = x if isinstance(x, str) else x.decode()
|
|
152
|
+
|
|
153
|
+
@property
|
|
154
|
+
def sequence(self) -> bytes:
|
|
155
|
+
return self._sequence
|
|
156
|
+
|
|
157
|
+
@sequence.setter
|
|
158
|
+
def sequence(self, x: bytes | str) -> None:
|
|
159
|
+
assert len(x) == len(self)
|
|
160
|
+
self._sequence = x.encode() if isinstance(x, str) else x
|
|
161
|
+
|
|
162
|
+
@property
|
|
163
|
+
def coordinates(self) -> npt.NDArray[np.float32]:
|
|
164
|
+
return self._coordinates
|
|
165
|
+
|
|
166
|
+
@coordinates.setter
|
|
167
|
+
def coordinates(self, x: npt.NDArray[np.float32]) -> None:
|
|
168
|
+
assert len(x) == len(self)
|
|
169
|
+
self._coordinates = x
|
|
170
|
+
|
|
171
|
+
@property
|
|
172
|
+
def plddt(self) -> npt.NDArray[np.float32]:
|
|
173
|
+
return self._plddt
|
|
174
|
+
|
|
175
|
+
@plddt.setter
|
|
176
|
+
def plddt(self, x: npt.NDArray[np.float32]) -> None:
|
|
177
|
+
assert len(x) == len(self)
|
|
178
|
+
self._plddt = x
|
|
179
|
+
|
|
180
|
+
@property
|
|
181
|
+
def chain_id(self) -> str | list[str] | None:
|
|
182
|
+
return self._tags.get("chain_id")
|
|
183
|
+
|
|
184
|
+
@chain_id.setter
|
|
185
|
+
def chain_id(self, chain_id: str | list[str]) -> None:
|
|
186
|
+
self._tags["chain_id"] = chain_id
|
|
187
|
+
|
|
188
|
+
@property
|
|
189
|
+
def cyclic(self) -> bool:
|
|
190
|
+
return self._tags.get("cyclic") or False
|
|
191
|
+
|
|
192
|
+
@cyclic.setter
|
|
193
|
+
def cyclic(self, cyclic: bool) -> None:
|
|
194
|
+
self._tags["cyclic"] = cyclic
|
|
195
|
+
|
|
196
|
+
class NullMSA: ...
|
|
197
|
+
|
|
198
|
+
single_sequence_mode = NullMSA
|
|
199
|
+
|
|
200
|
+
@property
|
|
201
|
+
def msa(self) -> "str | MSAFuture | None | NullMSA":
|
|
202
|
+
return self._tags.get("msa")
|
|
203
|
+
|
|
204
|
+
@msa.setter
|
|
205
|
+
def msa(self, msa: "str | MSAFuture | None | NullMSA") -> None:
|
|
206
|
+
self._tags["msa"] = msa
|
|
207
|
+
|
|
208
|
+
def __len__(self):
|
|
209
|
+
lengths = {
|
|
210
|
+
len(x)
|
|
211
|
+
for x in (self.sequence, self.coordinates, self.plddt)
|
|
212
|
+
if x is not None
|
|
213
|
+
}
|
|
214
|
+
assert len(lengths) == 1
|
|
215
|
+
return next(iter(lengths))
|
|
216
|
+
|
|
217
|
+
def __getitem__(
|
|
218
|
+
self, idx: int | list[int] | slice | npt.NDArray[np.integer]
|
|
219
|
+
) -> "Protein":
|
|
220
|
+
"""Return a new Protein object indexing into residues by `idx`."""
|
|
221
|
+
if isinstance(idx, int):
|
|
222
|
+
idx = np.array([idx], dtype=int)
|
|
223
|
+
return Protein(
|
|
224
|
+
sequence=np.frombuffer(self.sequence, dtype=np.uint8)[idx].tobytes(),
|
|
225
|
+
coordinates=self.coordinates[idx].copy(),
|
|
226
|
+
plddt=self.plddt[idx].copy(),
|
|
227
|
+
name=self.name,
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
def __add__(self, tgt: "Protein") -> "Protein":
|
|
231
|
+
"""Return a new Protein object by concatenating with another Protein."""
|
|
232
|
+
assert isinstance(tgt, Protein)
|
|
233
|
+
return Protein(
|
|
234
|
+
sequence=self.sequence + tgt.sequence,
|
|
235
|
+
coordinates=np.concatenate((self.coordinates, tgt.coordinates)),
|
|
236
|
+
plddt=np.concatenate((self.plddt, tgt.plddt)),
|
|
237
|
+
name=self.name if self.name == tgt.name else None,
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
def at(self, positions: Sequence[int] | npt.NDArray[np.integer]) -> "Protein":
|
|
241
|
+
"""
|
|
242
|
+
Return a new Protein object containing residues at given 1-indexed positions.
|
|
243
|
+
"""
|
|
244
|
+
if not isinstance(positions, np.ndarray):
|
|
245
|
+
positions = np.array(positions, dtype=int)
|
|
246
|
+
return self[positions - 1]
|
|
247
|
+
|
|
248
|
+
def mask_sequence_at(
|
|
249
|
+
self, positions: Sequence[int] | npt.NDArray[np.integer]
|
|
250
|
+
) -> "Protein":
|
|
251
|
+
"""Mask sequence at given 1-indexed positions."""
|
|
252
|
+
if not isinstance(positions, np.ndarray):
|
|
253
|
+
positions = np.array(positions, dtype=int)
|
|
254
|
+
idxs = positions - 1
|
|
255
|
+
sequence = np.frombuffer(self.sequence, dtype=np.uint8).copy()
|
|
256
|
+
sequence[idxs] = ord(b"X")
|
|
257
|
+
return Protein(
|
|
258
|
+
sequence=sequence.tobytes(),
|
|
259
|
+
coordinates=self.coordinates.copy(),
|
|
260
|
+
plddt=self.plddt.copy(),
|
|
261
|
+
name=self.name,
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
def mask_sequence_except_at(
|
|
265
|
+
self, positions: Sequence[int] | npt.NDArray[np.integer]
|
|
266
|
+
) -> "Protein":
|
|
267
|
+
"""Mask sequence at all positions except the given 1-indexed positions."""
|
|
268
|
+
if not isinstance(positions, np.ndarray):
|
|
269
|
+
positions = np.array(positions, dtype=int)
|
|
270
|
+
idxs = positions - 1
|
|
271
|
+
sequence = np.frombuffer(self.sequence, dtype=np.uint8).copy()
|
|
272
|
+
mask = np.ones_like(sequence, dtype=bool)
|
|
273
|
+
mask[idxs] = False
|
|
274
|
+
sequence[mask] = ord(b"X")
|
|
275
|
+
return Protein(
|
|
276
|
+
sequence=sequence.tobytes(),
|
|
277
|
+
coordinates=self.coordinates.copy(),
|
|
278
|
+
plddt=self.plddt.copy(),
|
|
279
|
+
name=self.name,
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
def mask_structure_at(
|
|
283
|
+
self, positions: Sequence[int] | npt.NDArray[np.integer]
|
|
284
|
+
) -> "Protein":
|
|
285
|
+
"""Mask structure at given 1-indexed positions."""
|
|
286
|
+
if not isinstance(positions, np.ndarray):
|
|
287
|
+
positions = np.array(positions, dtype=int)
|
|
288
|
+
idxs = positions - 1
|
|
289
|
+
coordinates, plddt = self.coordinates.copy(), self.plddt.copy()
|
|
290
|
+
coordinates[idxs], plddt[idxs] = np.nan, np.nan
|
|
291
|
+
return Protein(
|
|
292
|
+
sequence=self.sequence, coordinates=coordinates, plddt=plddt, name=self.name
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
def mask_structure_except_at(
|
|
296
|
+
self, positions: Sequence[int] | npt.NDArray[np.integer]
|
|
297
|
+
) -> "Protein":
|
|
298
|
+
"""Mask structure at all positions except the given 1-indexed positions."""
|
|
299
|
+
if not isinstance(positions, np.ndarray):
|
|
300
|
+
positions = np.array(positions, dtype=int)
|
|
301
|
+
idxs = positions - 1
|
|
302
|
+
mask = np.ones(len(self), dtype=bool)
|
|
303
|
+
mask[idxs] = False
|
|
304
|
+
coordinates, plddt = self.coordinates.copy(), self.plddt.copy()
|
|
305
|
+
coordinates[mask], plddt[mask] = np.nan, np.nan
|
|
306
|
+
return Protein(
|
|
307
|
+
sequence=self.sequence, coordinates=coordinates, plddt=plddt, name=self.name
|
|
308
|
+
)
|
|
309
|
+
|
|
310
|
+
@property
|
|
311
|
+
def has_structure(self) -> bool:
|
|
312
|
+
"""Whether or not the structure is known at any position in the protein."""
|
|
313
|
+
return (not np.isnan(self.coordinates).all()) or (
|
|
314
|
+
not np.isnan(self.plddt).all()
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
def rmsd(
|
|
318
|
+
self, tgt: "Protein", backbone_only: bool | str | Sequence[str] = False
|
|
319
|
+
) -> float:
|
|
320
|
+
"""
|
|
321
|
+
Compute the root-mean-square deviation (RMSD) between this Protein and a target
|
|
322
|
+
Protein.
|
|
323
|
+
|
|
324
|
+
Only atoms that are present (i.e., not NaN) in both structures are included in
|
|
325
|
+
the calculation.
|
|
326
|
+
|
|
327
|
+
Args:
|
|
328
|
+
tgt: The target Protein to compare against.
|
|
329
|
+
backbone_only: Specifies which atoms to include in the RMSD calculation.
|
|
330
|
+
- If False (default), all atom types are included.
|
|
331
|
+
- If True, only backbone atoms ("N", "CA", "C") are included.
|
|
332
|
+
- If a string, it must be a single atom type (e.g., "CA").
|
|
333
|
+
- If a sequence of strings, it must be a non-empty list of atom types
|
|
334
|
+
(e.g., ["CA", "CB", "O"]). All specified atom types must be valid.
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
The RMSD value between the aligned structures.
|
|
338
|
+
|
|
339
|
+
Notes:
|
|
340
|
+
This method assumes that residues in `self` and `tgt` are already aligned.
|
|
341
|
+
"""
|
|
342
|
+
if backbone_only is False:
|
|
343
|
+
atom_idxs = np.arange(len(_ATOM_TYPES))
|
|
344
|
+
elif backbone_only is True:
|
|
345
|
+
atom_idxs = np.arange(3)
|
|
346
|
+
elif isinstance(backbone_only, str):
|
|
347
|
+
atom_idxs = [_ATOM_TYPE_TO_IDX[backbone_only]]
|
|
348
|
+
elif isinstance(backbone_only, Sequence):
|
|
349
|
+
assert len(backbone_only) > 0 and isinstance(next(iter(backbone_only)), str)
|
|
350
|
+
atom_idxs = [_ATOM_TYPE_TO_IDX[x] for x in backbone_only]
|
|
351
|
+
else:
|
|
352
|
+
raise ValueError(backbone_only)
|
|
353
|
+
src_coords = self.coordinates[:, atom_idxs]
|
|
354
|
+
tgt_coords = tgt.coordinates[:, atom_idxs]
|
|
355
|
+
src_known_atoms = ~np.isnan(src_coords).any(axis=2)
|
|
356
|
+
tgt_known_atoms = ~np.isnan(tgt_coords).any(axis=2)
|
|
357
|
+
overlapping_known_atoms = src_known_atoms & tgt_known_atoms
|
|
358
|
+
src_coords = src_coords[overlapping_known_atoms]
|
|
359
|
+
tgt_coords = tgt_coords[overlapping_known_atoms]
|
|
360
|
+
rmsd, _ = calc_rmsd(src_coords, tgt_coords)
|
|
361
|
+
return rmsd
|
|
362
|
+
|
|
363
|
+
def make_cif_string(self) -> str:
|
|
364
|
+
# TODO: add note about _NAN_BFACTOR_VALUE
|
|
365
|
+
assert (
|
|
366
|
+
self.has_structure
|
|
367
|
+
), "cannot make cif string for protein with no structure data"
|
|
368
|
+
# Create an empty structure and add a model with a default chain.
|
|
369
|
+
structure = gemmi.Structure()
|
|
370
|
+
if self.name is not None:
|
|
371
|
+
structure.name = self.name
|
|
372
|
+
model = structure.add_model(gemmi.Model(1))
|
|
373
|
+
|
|
374
|
+
# Process the sequence.
|
|
375
|
+
resnames = gemmi.expand_one_letter_sequence(
|
|
376
|
+
self.sequence.decode(), gemmi.ResidueKind.AA
|
|
377
|
+
)
|
|
378
|
+
entity = gemmi.Entity("1")
|
|
379
|
+
entity.full_sequence = resnames
|
|
380
|
+
entity.entity_type = gemmi.EntityType.Polymer
|
|
381
|
+
entity.polymer_type = gemmi.PolymerType.PeptideL
|
|
382
|
+
entity.subchains = ["A"]
|
|
383
|
+
structure.entities.append(entity)
|
|
384
|
+
|
|
385
|
+
# Process the coordinates.
|
|
386
|
+
n_nan_coords = np.isnan(self.coordinates).sum(axis=2)
|
|
387
|
+
assert (
|
|
388
|
+
(n_nan_coords == 0) | (n_nan_coords == 3)
|
|
389
|
+
).all(), "either all coords of an atom must be nan, or none are"
|
|
390
|
+
# Process the plddt.
|
|
391
|
+
assert (
|
|
392
|
+
np.isnan(self.plddt) | (~np.isnan(self.plddt) & (n_nan_coords[:, 1] == 0))
|
|
393
|
+
).all(), "if plddt is known, coord of CA must be known"
|
|
394
|
+
|
|
395
|
+
# Write the chain
|
|
396
|
+
chain = model.add_chain(gemmi.Chain("A"))
|
|
397
|
+
for i in range(len(self)):
|
|
398
|
+
# Add a residue to the chain; note that residue numbering starts at 1.
|
|
399
|
+
residue = gemmi.Residue()
|
|
400
|
+
residue.entity_id = "1"
|
|
401
|
+
residue.entity_type = gemmi.EntityType.Polymer
|
|
402
|
+
residue.subchain = "A"
|
|
403
|
+
residue.name = resnames[i]
|
|
404
|
+
residue.label_seq = i + 1
|
|
405
|
+
residue = chain.add_residue(residue, i + 1)
|
|
406
|
+
# For each residue, add the atoms.
|
|
407
|
+
for j, atom_name in enumerate(_ATOM_TYPES):
|
|
408
|
+
if np.isnan(self.coordinates[i, j]).any():
|
|
409
|
+
continue
|
|
410
|
+
atom = gemmi.Atom()
|
|
411
|
+
atom.name = atom_name
|
|
412
|
+
atom.element = gemmi.Element(atom_name[0])
|
|
413
|
+
atom.pos = gemmi.Position(*self.coordinates[i, j])
|
|
414
|
+
if not np.isnan(self.plddt[i]):
|
|
415
|
+
atom.b_iso = self.plddt[i]
|
|
416
|
+
else:
|
|
417
|
+
atom.b_iso = _NAN_BFACTOR_VALUE
|
|
418
|
+
atom = residue.add_atom(atom)
|
|
419
|
+
block = structure.make_mmcif_block()
|
|
420
|
+
# NB: gemmi doesn't seem to write the _chem_comp category properly... it says
|
|
421
|
+
# the type is `.`, but is should be something like `L-PEPTIDE LINKING`...
|
|
422
|
+
block.find_mmcif_category("_chem_comp").erase() # ...so we remove it
|
|
423
|
+
return block.as_string()
|
|
424
|
+
|
|
425
|
+
def make_fasta_bytes(self) -> bytes:
|
|
426
|
+
assert self.name is not None
|
|
427
|
+
data = io.BytesIO()
|
|
428
|
+
data.write(b">")
|
|
429
|
+
data.write(self.name.encode())
|
|
430
|
+
data.write(b"\n")
|
|
431
|
+
data.write(
|
|
432
|
+
self.sequence.encode()
|
|
433
|
+
if not isinstance(self.sequence, bytes)
|
|
434
|
+
else self.sequence
|
|
435
|
+
)
|
|
436
|
+
data.write(b"\n")
|
|
437
|
+
return data.getvalue()
|
|
438
|
+
|
|
439
|
+
@staticmethod
|
|
440
|
+
def from_filepath(
|
|
441
|
+
path: str | Path,
|
|
442
|
+
chain_id: str,
|
|
443
|
+
use_bfactor_as_plddt: bool | None = None,
|
|
444
|
+
model_idx: int = 0,
|
|
445
|
+
verbose: bool = True,
|
|
446
|
+
) -> "Protein":
|
|
447
|
+
"""
|
|
448
|
+
Create a Protein from a structure file.
|
|
449
|
+
|
|
450
|
+
If the structure file has multiple conformers, the first conformer is always
|
|
451
|
+
used.
|
|
452
|
+
|
|
453
|
+
Args:
|
|
454
|
+
path: path to structure file (e.g. pdb or cif file)
|
|
455
|
+
chain_id: id of the chain in the structure file to use
|
|
456
|
+
use_bfactor_as_plddt: whether or not to use the bfactor of the CA atom as
|
|
457
|
+
the plddt of structure of its residue. If None, this will be set to
|
|
458
|
+
true only if the resolution of the structure is unspecified or zero.
|
|
459
|
+
model_idx: index of the model in the structure file to use
|
|
460
|
+
verbose: whether or not to print debugging information such as oddities in
|
|
461
|
+
the structure e.g. missing atoms
|
|
462
|
+
"""
|
|
463
|
+
structure = gemmi.read_structure(str(path))
|
|
464
|
+
structure.name = Path(path).stem
|
|
465
|
+
return Protein.from_structure(
|
|
466
|
+
structure=structure,
|
|
467
|
+
chain_id=chain_id,
|
|
468
|
+
use_bfactor_as_plddt=use_bfactor_as_plddt,
|
|
469
|
+
model_idx=model_idx,
|
|
470
|
+
verbose=verbose,
|
|
471
|
+
)
|
|
472
|
+
|
|
473
|
+
@staticmethod
|
|
474
|
+
def from_string(
|
|
475
|
+
filestring: bytes | str,
|
|
476
|
+
format: Literal["pdb", "cif"],
|
|
477
|
+
chain_id: str,
|
|
478
|
+
use_bfactor_as_plddt: bool | None = None,
|
|
479
|
+
model_idx: int = 0,
|
|
480
|
+
verbose: bool = True,
|
|
481
|
+
) -> "Protein":
|
|
482
|
+
filestring = filestring if isinstance(filestring, str) else filestring.decode()
|
|
483
|
+
if format == "pdb":
|
|
484
|
+
structure = gemmi.read_pdb_string(filestring)
|
|
485
|
+
elif format == "cif":
|
|
486
|
+
structure = gemmi.make_structure_from_block(
|
|
487
|
+
gemmi.cif.read_string(filestring).sole_block()
|
|
488
|
+
)
|
|
489
|
+
else:
|
|
490
|
+
raise ValueError(f"Unknown {format=}")
|
|
491
|
+
return Protein.from_structure(
|
|
492
|
+
structure=structure,
|
|
493
|
+
chain_id=chain_id,
|
|
494
|
+
use_bfactor_as_plddt=use_bfactor_as_plddt,
|
|
495
|
+
model_idx=model_idx,
|
|
496
|
+
verbose=verbose,
|
|
497
|
+
)
|
|
498
|
+
|
|
499
|
+
@staticmethod
|
|
500
|
+
def from_structure(
|
|
501
|
+
structure: gemmi.Structure,
|
|
502
|
+
chain_id: str,
|
|
503
|
+
use_bfactor_as_plddt: bool | None = None,
|
|
504
|
+
model_idx: int = 0,
|
|
505
|
+
verbose: bool = True,
|
|
506
|
+
) -> "Protein":
|
|
507
|
+
structure.setup_entities()
|
|
508
|
+
structure.assign_label_seq_id()
|
|
509
|
+
if use_bfactor_as_plddt is None:
|
|
510
|
+
use_bfactor_as_plddt = structure.resolution == 0.0
|
|
511
|
+
model = structure[model_idx]
|
|
512
|
+
chain = model.find_chain(chain_id)
|
|
513
|
+
assert chain is not None
|
|
514
|
+
polymer = chain.get_polymer()
|
|
515
|
+
|
|
516
|
+
# extract sequence
|
|
517
|
+
entity = structure.get_entity_of(polymer)
|
|
518
|
+
if len(entity.full_sequence) > 0:
|
|
519
|
+
chain_seq = entity.full_sequence
|
|
520
|
+
else:
|
|
521
|
+
chain_seq = [residue.name for residue in polymer]
|
|
522
|
+
chain_seq = [
|
|
523
|
+
gemmi.find_tabulated_residue(
|
|
524
|
+
# gemmi.Entity.first_mon extracts the first conformer
|
|
525
|
+
gemmi.Entity.first_mon(residue_name)
|
|
526
|
+
).one_letter_code
|
|
527
|
+
for residue_name in chain_seq
|
|
528
|
+
]
|
|
529
|
+
# for find_tabulated_residue: lowercase means nonstandard, " " means unknown
|
|
530
|
+
chain_seq = [c.upper() if c != " " else "X" for c in chain_seq]
|
|
531
|
+
# extract coordinates and plddt
|
|
532
|
+
coordinates = np.full((len(chain_seq), _N_ATOM, 3), np.nan, dtype=np.float32)
|
|
533
|
+
plddt = np.full(len(chain_seq), np.nan, dtype=np.float32)
|
|
534
|
+
for residue_idx, residue in enumerate(polymer):
|
|
535
|
+
i = residue.label_seq - 1 if residue.label_seq is not None else residue_idx
|
|
536
|
+
code = gemmi.find_tabulated_residue(residue.name).one_letter_code
|
|
537
|
+
code = code.upper() if code != " " else "X"
|
|
538
|
+
if code != chain_seq[i]:
|
|
539
|
+
if verbose:
|
|
540
|
+
# TODO: can this ever happen...? probably want to have this regardless i guess
|
|
541
|
+
# TODO: improve this message?
|
|
542
|
+
print(
|
|
543
|
+
f"Amino acid mismatch at position {i + 1}: SEQRES {chain_seq[i]} Structure {code}"
|
|
544
|
+
)
|
|
545
|
+
chain_seq[i] = code
|
|
546
|
+
if verbose and code == "X" and residue.name != "UNK":
|
|
547
|
+
print(f"Unknown amino acid at position {i + 1}: {residue.name}")
|
|
548
|
+
if verbose:
|
|
549
|
+
for j, atom_name in enumerate(_BACKBONE_ATOM_TYPES):
|
|
550
|
+
if atom_name not in residue:
|
|
551
|
+
print(
|
|
552
|
+
f"Residue at position {i + 1} missing backbone atom={atom_name}"
|
|
553
|
+
)
|
|
554
|
+
for atom in residue.first_conformer():
|
|
555
|
+
atom_name = atom.name
|
|
556
|
+
if residue.name == "MSE" and atom_name == "SE":
|
|
557
|
+
atom_name = "SD"
|
|
558
|
+
if (j := _ATOM_TYPE_TO_IDX.get(atom.name)) is None:
|
|
559
|
+
continue
|
|
560
|
+
coordinates[i, j] = atom.pos.tolist()
|
|
561
|
+
if use_bfactor_as_plddt and atom_name == "CA":
|
|
562
|
+
plddt[i] = (
|
|
563
|
+
atom.b_iso if atom.b_iso != _NAN_BFACTOR_VALUE else np.nan
|
|
564
|
+
)
|
|
565
|
+
# TODO: we should experiment and see if this is the behavior we want
|
|
566
|
+
if (
|
|
567
|
+
not use_bfactor_as_plddt
|
|
568
|
+
and np.isfinite(coordinates[i, _ATOM_TYPE_TO_IDX["CA"]]).all()
|
|
569
|
+
):
|
|
570
|
+
plddt[i] = 100.0
|
|
571
|
+
assert np.isnan(plddt).all() or (
|
|
572
|
+
(np.nanmin(plddt) >= 0) and (np.nanmax(plddt) <= 100)
|
|
573
|
+
)
|
|
574
|
+
return Protein(
|
|
575
|
+
sequence="".join(chain_seq),
|
|
576
|
+
coordinates=coordinates,
|
|
577
|
+
plddt=plddt,
|
|
578
|
+
name=structure.name if structure.name != "" else None,
|
|
579
|
+
)
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def parse_fasta_as_proteins(path: str | Path) -> list[Protein]:
    """
    Read a FASTA file and return one sequence-only Protein per parsed record.

    The file is opened in binary mode and handed to `fasta.parse_stream`; each
    `(name, sequence)` pair it yields becomes a Protein, in file order.
    """
    with open(path, "rb") as stream:
        return [
            Protein(name=record_name, sequence=record_sequence)
            for record_name, record_sequence in fasta.parse_stream(stream)
        ]
|