openprotein-python 0.8.2__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. openprotein/__init__.py +164 -0
  2. openprotein/_version.py +48 -0
  3. openprotein/align/__init__.py +8 -0
  4. openprotein/align/align.py +395 -0
  5. openprotein/align/api.py +428 -0
  6. openprotein/align/future.py +55 -0
  7. openprotein/align/msa.py +129 -0
  8. openprotein/align/schemas.py +165 -0
  9. openprotein/base.py +181 -0
  10. openprotein/chains.py +88 -0
  11. openprotein/common/__init__.py +5 -0
  12. openprotein/common/features.py +7 -0
  13. openprotein/common/model_metadata.py +33 -0
  14. openprotein/common/reduction.py +8 -0
  15. openprotein/config.py +9 -0
  16. openprotein/csv.py +31 -0
  17. openprotein/data/__init__.py +9 -0
  18. openprotein/data/api.py +218 -0
  19. openprotein/data/assaydataset.py +178 -0
  20. openprotein/data/data.py +93 -0
  21. openprotein/data/schemas.py +27 -0
  22. openprotein/design/__init__.py +16 -0
  23. openprotein/design/api.py +259 -0
  24. openprotein/design/design.py +125 -0
  25. openprotein/design/future.py +146 -0
  26. openprotein/design/schemas.py +607 -0
  27. openprotein/embeddings/__init__.py +27 -0
  28. openprotein/embeddings/api.py +619 -0
  29. openprotein/embeddings/embeddings.py +151 -0
  30. openprotein/embeddings/esm.py +33 -0
  31. openprotein/embeddings/future.py +146 -0
  32. openprotein/embeddings/models.py +421 -0
  33. openprotein/embeddings/openprotein.py +21 -0
  34. openprotein/embeddings/poet.py +446 -0
  35. openprotein/embeddings/poet2.py +505 -0
  36. openprotein/embeddings/schemas.py +78 -0
  37. openprotein/errors.py +76 -0
  38. openprotein/fasta.py +92 -0
  39. openprotein/fold/__init__.py +21 -0
  40. openprotein/fold/alphafold2.py +131 -0
  41. openprotein/fold/api.py +287 -0
  42. openprotein/fold/boltz.py +691 -0
  43. openprotein/fold/esmfold.py +54 -0
  44. openprotein/fold/fold.py +107 -0
  45. openprotein/fold/future.py +509 -0
  46. openprotein/fold/models.py +139 -0
  47. openprotein/fold/schemas.py +39 -0
  48. openprotein/jobs/__init__.py +9 -0
  49. openprotein/jobs/api.py +71 -0
  50. openprotein/jobs/futures.py +746 -0
  51. openprotein/jobs/jobs.py +69 -0
  52. openprotein/jobs/schemas.py +135 -0
  53. openprotein/models/__init__.py +4 -0
  54. openprotein/models/base.py +63 -0
  55. openprotein/models/foundation/rfdiffusion.py +283 -0
  56. openprotein/models/models.py +33 -0
  57. openprotein/predictor/__init__.py +25 -0
  58. openprotein/predictor/api.py +384 -0
  59. openprotein/predictor/models.py +374 -0
  60. openprotein/predictor/prediction.py +79 -0
  61. openprotein/predictor/predictor.py +242 -0
  62. openprotein/predictor/schemas.py +113 -0
  63. openprotein/predictor/validate.py +40 -0
  64. openprotein/prompt/__init__.py +9 -0
  65. openprotein/prompt/api.py +505 -0
  66. openprotein/prompt/models.py +142 -0
  67. openprotein/prompt/prompt.py +130 -0
  68. openprotein/prompt/schemas.py +49 -0
  69. openprotein/protein.py +587 -0
  70. openprotein/svd/__init__.py +9 -0
  71. openprotein/svd/api.py +206 -0
  72. openprotein/svd/models.py +288 -0
  73. openprotein/svd/schemas.py +31 -0
  74. openprotein/svd/svd.py +134 -0
  75. openprotein/umap/__init__.py +9 -0
  76. openprotein/umap/api.py +259 -0
  77. openprotein/umap/models.py +211 -0
  78. openprotein/umap/schemas.py +35 -0
  79. openprotein/umap/umap.py +175 -0
  80. openprotein/utils/uuid.py +29 -0
  81. openprotein_python-0.8.2.dist-info/METADATA +176 -0
  82. openprotein_python-0.8.2.dist-info/RECORD +84 -0
  83. openprotein_python-0.8.2.dist-info/WHEEL +4 -0
  84. openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
openprotein/protein.py ADDED
@@ -0,0 +1,587 @@
1
+ import io
2
+ from collections.abc import Sequence
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Literal
5
+
6
+ import gemmi
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+
10
+ from . import fasta
11
+
12
+ if TYPE_CHECKING:
13
+ from openprotein.align import MSAFuture
14
+
15
+
16
# Atom names of the atom37 layout, in slot order: the position of a name in
# this tuple is the index used along the atom axis of a coordinates array.
# fmt: off
_ATOM_TYPES = (
    "N", "CA", "C", "CB", "O", "CG", "CG1", "CG2", "OG", "OG1", "SG", "CD",
    "CD1", "CD2", "ND1", "ND2", "OD1", "OD2", "SD", "CE", "CE1", "CE2", "CE3",
    "NE", "NE1", "NE2", "OE1", "OE2", "CH2", "NH1", "NH2", "OH", "CZ", "CZ2",
    "CZ3", "NZ", "OXT",
)
# fmt: on
_N_ATOM = len(_ATOM_TYPES)
# Reverse lookup: atom name -> slot index.
_ATOM_TYPE_TO_IDX = dict(zip(_ATOM_TYPES, range(_N_ATOM)))

_BACKBONE_ATOM_TYPES = ("N", "CA", "C")

# B-factor written in place of a NaN pLDDT, and mapped back to NaN on read.
# can't/hard to use 9999.99 due to precision issues
_NAN_BFACTOR_VALUE = 9999.75
30
+
31
+
32
def calc_rmsd(
    xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
) -> tuple[float, npt.NDArray[np.floating]]:
    """
    Superimpose `xyz2` onto `xyz1` and return the RMSD after superposition.

    Both inputs are sets of atoms of shape (L, 3) whose rows correspond to each
    other. Each set is centered on its own centroid, then the rotation is obtained
    from the SVD of the covariance matrix.

    Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719

    Args:
        xyz1: reference coordinates, shape (L, 3).
        xyz2: coordinates to rotate onto `xyz1`, shape (L, 3).
        eps: constant added under the square root.

    Returns:
        A tuple `(rmsd, U)` where `U` is the 3x3 matrix applied as `xyz2 @ U`
        to the centered `xyz2`.
    """
    # Move both point sets to their own centroid.
    reference = xyz1 - xyz1.mean(0)
    mobile = xyz2 - xyz2.mean(0)

    # Compute the optimal rotation from the SVD of the covariance matrix.
    left, _, right = np.linalg.svd(mobile.T @ reference)

    # Flip the last column when needed so the result is a proper (right-handed)
    # rotation rather than a reflection.
    handedness = np.ones([3, 3])
    handedness[:, -1] = np.sign(np.linalg.det(left) * np.linalg.det(right))
    U = (handedness * left) @ right

    # Residual between the rotated mobile set and the reference.
    rotated = mobile @ U
    n_atoms = rotated.shape[0]
    squared_error = np.sum((rotated - reference) * (rotated - reference), axis=(0, 1))
    rmsd = np.sqrt(squared_error / n_atoms + eps)

    return rmsd, U
62
+
63
+
64
+ class Protein:
65
+ """
66
+ Represents a protein with optional sequence, atomic coordinates, per-residue
67
+ confidence scores (pLDDT), and name.
68
+
69
+ This class supports partial or complete information: users may initialize a Protein
70
+ with only a sequence, only a structure, or both. The class ensures that all
71
+ provided fields have consistent residue-level lengths and provides convenient
72
+ methods for indexing, masking, and structural comparisons.
73
+
74
+ Attributes:
75
+ sequence: Amino acid sequence as bytes. Unknown or masked residues are
76
+ represented as b"X".
77
+ coordinates: an array containing the 3D coordinates of the heavy atoms of the
78
+ protein in atom37 format. It has shape `(L, 37, 3)`, where `L` is the
79
+ length of the protein, `37` is the number of heavy atoms, and `3` is the
80
+ number of coordinates (x, y, and z).
81
+ plddt: an array of shape `(L,)`. For predicted structures, this contains the
82
+ pLDDT of each residue, which is a measure of prediction confidence. For
83
+ experimental structures, this should be set to `100` if the coordinates of
84
+ the alpha carbon are known, and `NaN` otherwise.
85
+ name: Optional identifier for the protein as a string.
86
+
87
+ Conventions:
88
+ - Missing or unknown residues in the sequence are denoted by b"X".
89
+ - Missing structural data (coordinates or pLDDT) are represented by NaN.
90
+ - Residue indices are 1-based for user-facing methods (e.g., `mask_sequence_at`),
91
+ but internally stored as 0-based arrays.
92
+
93
+ Examples:
94
+ Create a Protein from sequence only:
95
+ Protein(sequence="ACDEFGHIK")
96
+
97
+ Create a Protein from sequence and name:
98
+ Protein(sequence="ACDEFGHIK", name="my_protein")
99
+
100
+ Create a Protein with sequence and structure:
101
+ Protein(sequence="ACD", coordinates=coords_array, plddt=plddt_array)
102
+
103
+ Raises:
104
+ ValueError: If sequence, coordinates, or pLDDT are specified with inconsistent lengths.
105
+ ValueError: If none of sequence, coordinates, or pLDDT are provided.
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ sequence: bytes | str | None = None,
111
+ coordinates: npt.NDArray[np.float32] | None = None,
112
+ plddt: npt.NDArray[np.float32] | None = None,
113
+ name: bytes | str | None = None,
114
+ ):
115
+ lengths = {len(x) for x in (sequence, coordinates, plddt) if x is not None}
116
+ if len(lengths) == 0:
117
+ raise ValueError(
118
+ "At least one of sequence, coordinates, or plddt must be specified."
119
+ )
120
+ elif len(lengths) > 1:
121
+ raise ValueError(
122
+ "Specified sequence, coordinates, and plddt must all have the same length."
123
+ )
124
+ length = next(iter(lengths))
125
+ if sequence is not None:
126
+ self._sequence = (
127
+ sequence.encode() if isinstance(sequence, str) else sequence
128
+ )
129
+ else:
130
+ self._sequence = b"X" * length
131
+ if coordinates is not None:
132
+ self._coordinates = coordinates
133
+ else:
134
+ self._coordinates = np.full((length, _N_ATOM, 3), np.nan, dtype=np.float32)
135
+ if plddt is not None:
136
+ self._plddt = plddt
137
+ else:
138
+ self._plddt = np.full((length,), np.nan, dtype=np.float32)
139
+ if name is not None:
140
+ self._name = name if isinstance(name, str) else name.decode()
141
+ else:
142
+ self._name = name
143
+ self._tags = {}
144
+
145
+ @property
146
+ def name(self) -> str | None:
147
+ return self._name
148
+
149
+ @name.setter
150
+ def name(self, x: bytes | str) -> None:
151
+ self._name = x if isinstance(x, str) else x.decode()
152
+
153
+ @property
154
+ def sequence(self) -> bytes:
155
+ return self._sequence
156
+
157
+ @sequence.setter
158
+ def sequence(self, x: bytes | str) -> None:
159
+ assert len(x) == len(self)
160
+ self._sequence = x.encode() if isinstance(x, str) else x
161
+
162
+ @property
163
+ def coordinates(self) -> npt.NDArray[np.float32]:
164
+ return self._coordinates
165
+
166
+ @coordinates.setter
167
+ def coordinates(self, x: npt.NDArray[np.float32]) -> None:
168
+ assert len(x) == len(self)
169
+ self._coordinates = x
170
+
171
+ @property
172
+ def plddt(self) -> npt.NDArray[np.float32]:
173
+ return self._plddt
174
+
175
+ @plddt.setter
176
+ def plddt(self, x: npt.NDArray[np.float32]) -> None:
177
+ assert len(x) == len(self)
178
+ self._plddt = x
179
+
180
+ @property
181
+ def chain_id(self) -> str | list[str] | None:
182
+ return self._tags.get("chain_id")
183
+
184
+ @chain_id.setter
185
+ def chain_id(self, chain_id: str | list[str]) -> None:
186
+ self._tags["chain_id"] = chain_id
187
+
188
+ @property
189
+ def cyclic(self) -> bool:
190
+ return self._tags.get("cyclic") or False
191
+
192
+ @cyclic.setter
193
+ def cyclic(self, cyclic: bool) -> None:
194
+ self._tags["cyclic"] = cyclic
195
+
196
+ class NullMSA: ...
197
+
198
+ single_sequence_mode = NullMSA
199
+
200
+ @property
201
+ def msa(self) -> "str | MSAFuture | None | NullMSA":
202
+ return self._tags.get("msa")
203
+
204
+ @msa.setter
205
+ def msa(self, msa: "str | MSAFuture | None | NullMSA") -> None:
206
+ self._tags["msa"] = msa
207
+
208
+ def __len__(self):
209
+ lengths = {
210
+ len(x)
211
+ for x in (self.sequence, self.coordinates, self.plddt)
212
+ if x is not None
213
+ }
214
+ assert len(lengths) == 1
215
+ return next(iter(lengths))
216
+
217
+ def __getitem__(
218
+ self, idx: int | list[int] | slice | npt.NDArray[np.integer]
219
+ ) -> "Protein":
220
+ """Return a new Protein object indexing into residues by `idx`."""
221
+ if isinstance(idx, int):
222
+ idx = np.array([idx], dtype=int)
223
+ return Protein(
224
+ sequence=np.frombuffer(self.sequence, dtype=np.uint8)[idx].tobytes(),
225
+ coordinates=self.coordinates[idx].copy(),
226
+ plddt=self.plddt[idx].copy(),
227
+ name=self.name,
228
+ )
229
+
230
+ def __add__(self, tgt: "Protein") -> "Protein":
231
+ """Return a new Protein object by concatenating with another Protein."""
232
+ assert isinstance(tgt, Protein)
233
+ return Protein(
234
+ sequence=self.sequence + tgt.sequence,
235
+ coordinates=np.concatenate((self.coordinates, tgt.coordinates)),
236
+ plddt=np.concatenate((self.plddt, tgt.plddt)),
237
+ name=self.name if self.name == tgt.name else None,
238
+ )
239
+
240
+ def at(self, positions: Sequence[int] | npt.NDArray[np.integer]) -> "Protein":
241
+ """
242
+ Return a new Protein object containing residues at given 1-indexed positions.
243
+ """
244
+ if not isinstance(positions, np.ndarray):
245
+ positions = np.array(positions, dtype=int)
246
+ return self[positions - 1]
247
+
248
+ def mask_sequence_at(
249
+ self, positions: Sequence[int] | npt.NDArray[np.integer]
250
+ ) -> "Protein":
251
+ """Mask sequence at given 1-indexed positions."""
252
+ if not isinstance(positions, np.ndarray):
253
+ positions = np.array(positions, dtype=int)
254
+ idxs = positions - 1
255
+ sequence = np.frombuffer(self.sequence, dtype=np.uint8).copy()
256
+ sequence[idxs] = ord(b"X")
257
+ return Protein(
258
+ sequence=sequence.tobytes(),
259
+ coordinates=self.coordinates.copy(),
260
+ plddt=self.plddt.copy(),
261
+ name=self.name,
262
+ )
263
+
264
+ def mask_sequence_except_at(
265
+ self, positions: Sequence[int] | npt.NDArray[np.integer]
266
+ ) -> "Protein":
267
+ """Mask sequence at all positions except the given 1-indexed positions."""
268
+ if not isinstance(positions, np.ndarray):
269
+ positions = np.array(positions, dtype=int)
270
+ idxs = positions - 1
271
+ sequence = np.frombuffer(self.sequence, dtype=np.uint8).copy()
272
+ mask = np.ones_like(sequence, dtype=bool)
273
+ mask[idxs] = False
274
+ sequence[mask] = ord(b"X")
275
+ return Protein(
276
+ sequence=sequence.tobytes(),
277
+ coordinates=self.coordinates.copy(),
278
+ plddt=self.plddt.copy(),
279
+ name=self.name,
280
+ )
281
+
282
+ def mask_structure_at(
283
+ self, positions: Sequence[int] | npt.NDArray[np.integer]
284
+ ) -> "Protein":
285
+ """Mask structure at given 1-indexed positions."""
286
+ if not isinstance(positions, np.ndarray):
287
+ positions = np.array(positions, dtype=int)
288
+ idxs = positions - 1
289
+ coordinates, plddt = self.coordinates.copy(), self.plddt.copy()
290
+ coordinates[idxs], plddt[idxs] = np.nan, np.nan
291
+ return Protein(
292
+ sequence=self.sequence, coordinates=coordinates, plddt=plddt, name=self.name
293
+ )
294
+
295
+ def mask_structure_except_at(
296
+ self, positions: Sequence[int] | npt.NDArray[np.integer]
297
+ ) -> "Protein":
298
+ """Mask structure at all positions except the given 1-indexed positions."""
299
+ if not isinstance(positions, np.ndarray):
300
+ positions = np.array(positions, dtype=int)
301
+ idxs = positions - 1
302
+ mask = np.ones(len(self), dtype=bool)
303
+ mask[idxs] = False
304
+ coordinates, plddt = self.coordinates.copy(), self.plddt.copy()
305
+ coordinates[mask], plddt[mask] = np.nan, np.nan
306
+ return Protein(
307
+ sequence=self.sequence, coordinates=coordinates, plddt=plddt, name=self.name
308
+ )
309
+
310
+ @property
311
+ def has_structure(self) -> bool:
312
+ """Whether or not the structure is known at any position in the protein."""
313
+ return (not np.isnan(self.coordinates).all()) or (
314
+ not np.isnan(self.plddt).all()
315
+ )
316
+
317
def rmsd(
    self, tgt: "Protein", backbone_only: bool | str | Sequence[str] = False
) -> float:
    """
    Compute the root-mean-square deviation (RMSD) between this Protein and a
    target Protein.

    Only atoms whose coordinates are present (i.e., not NaN) in both structures
    take part in the calculation. The two atom sets are superimposed by
    `calc_rmsd` before the deviation is measured.

    Args:
        tgt: The target Protein to compare against.
        backbone_only: Selects the atom types to include.
            - False (default): all atom types.
            - True: only the backbone atoms ("N", "CA", "C").
            - a string: that single atom type (e.g., "CA").
            - a sequence of strings: a non-empty list of atom types
              (e.g., ["CA", "CB", "O"]).

    Returns:
        The RMSD value between the superimposed structures.

    Raises:
        KeyError: if a requested atom type is not a known atom type.
        ValueError: if `backbone_only` is none of the accepted kinds.

    Notes:
        This method assumes that residues in `self` and `tgt` are already aligned.
    """
    if backbone_only is False:
        atom_slots = np.arange(len(_ATOM_TYPES))
    elif backbone_only is True:
        # N, CA and C occupy the first three atom slots.
        atom_slots = np.arange(3)
    elif isinstance(backbone_only, str):
        atom_slots = [_ATOM_TYPE_TO_IDX[backbone_only]]
    elif isinstance(backbone_only, Sequence):
        assert len(backbone_only) > 0 and isinstance(next(iter(backbone_only)), str)
        atom_slots = [_ATOM_TYPE_TO_IDX[atom_type] for atom_type in backbone_only]
    else:
        raise ValueError(backbone_only)

    mine = self.coordinates[:, atom_slots]
    theirs = tgt.coordinates[:, atom_slots]
    # An atom counts as missing when any of its x/y/z values is NaN.
    missing_in_either = np.isnan(mine).any(axis=2) | np.isnan(theirs).any(axis=2)
    known_in_both = ~missing_in_either
    value, _ = calc_rmsd(mine[known_in_both], theirs[known_in_both])
    return value
362
+
363
def make_cif_string(self) -> str:
    """
    Serialize the protein as an mmCIF string containing a single chain "A".

    Residues are numbered from 1. Atoms whose coordinates are NaN are left out.
    The B-factor of every written atom is the pLDDT of its residue; when that
    pLDDT is NaN, `_NAN_BFACTOR_VALUE` is written instead.

    Raises:
        AssertionError: if the protein has no structure data, if an atom has
            only some of its coordinates set to NaN, or if a residue has a known
            pLDDT but no CA coordinates.
    """
    assert (
        self.has_structure
    ), "cannot make cif string for protein with no structure data"

    # Empty structure holding one model.
    structure = gemmi.Structure()
    if self.name is not None:
        structure.name = self.name
    model = structure.add_model(gemmi.Model(1))

    # Describe the sequence as a single L-peptide polymer entity.
    resnames = gemmi.expand_one_letter_sequence(
        self.sequence.decode(), gemmi.ResidueKind.AA
    )
    entity = gemmi.Entity("1")
    entity.full_sequence = resnames
    entity.entity_type = gemmi.EntityType.Polymer
    entity.polymer_type = gemmi.PolymerType.PeptideL
    entity.subchains = ["A"]
    structure.entities.append(entity)

    # Sanity checks on the coordinates and the pLDDT.
    coordinates, plddt = self.coordinates, self.plddt
    nan_count = np.isnan(coordinates).sum(axis=2)
    assert (
        (nan_count == 0) | (nan_count == 3)
    ).all(), "either all coords of an atom must be nan, or none are"
    plddt_unknown = np.isnan(plddt)
    ca_known = nan_count[:, 1] == 0  # atom slot 1 is CA
    assert (
        plddt_unknown | (~plddt_unknown & ca_known)
    ).all(), "if plddt is known, coord of CA must be known"

    # Write the chain, one residue at a time.
    chain = model.add_chain(gemmi.Chain("A"))
    for i in range(len(self)):
        seq_num = i + 1  # residue numbering starts at 1
        template = gemmi.Residue()
        template.entity_id = "1"
        template.entity_type = gemmi.EntityType.Polymer
        template.subchain = "A"
        template.name = resnames[i]
        template.label_seq = seq_num
        # Atoms are added to the residue object returned by the chain.
        residue = chain.add_residue(template, seq_num)
        b_iso = _NAN_BFACTOR_VALUE if np.isnan(plddt[i]) else plddt[i]
        for j, atom_name in enumerate(_ATOM_TYPES):
            xyz = coordinates[i, j]
            if np.isnan(xyz).any():
                continue
            atom = gemmi.Atom()
            atom.name = atom_name
            atom.element = gemmi.Element(atom_name[0])
            atom.pos = gemmi.Position(*xyz)
            atom.b_iso = b_iso
            residue.add_atom(atom)

    block = structure.make_mmcif_block()
    # NB: gemmi doesn't seem to write the _chem_comp category properly... it says
    # the type is `.`, but it should be something like `L-PEPTIDE LINKING`...
    block.find_mmcif_category("_chem_comp").erase()  # ...so we remove it
    return block.as_string()
424
+
425
+ def make_fasta_bytes(self) -> bytes:
426
+ assert self.name is not None
427
+ data = io.BytesIO()
428
+ data.write(b">")
429
+ data.write(self.name.encode())
430
+ data.write(b"\n")
431
+ data.write(
432
+ self.sequence.encode()
433
+ if not isinstance(self.sequence, bytes)
434
+ else self.sequence
435
+ )
436
+ data.write(b"\n")
437
+ return data.getvalue()
438
+
439
@staticmethod
def from_filepath(
    path: str | Path,
    chain_id: str,
    use_bfactor_as_plddt: bool | None = None,
    model_idx: int = 0,
    verbose: bool = True,
) -> "Protein":
    """
    Create a Protein from a structure file.

    The file is read with gemmi and the structure is named after the file's
    stem before being handed to `Protein.from_structure`. If the structure file
    has multiple conformers, the first conformer is always used.

    Args:
        path: path to structure file (e.g. pdb or cif file)
        chain_id: id of the chain in the structure file to use
        use_bfactor_as_plddt: whether or not to use the bfactor of the CA atom as
            the plddt of its residue. If None, `Protein.from_structure` decides.
        model_idx: index of the model in the structure file to use
        verbose: whether or not to print debugging information such as oddities in
            the structure e.g. missing atoms
    """
    parsed = gemmi.read_structure(str(path))
    parsed.name = Path(path).stem
    options = {
        "chain_id": chain_id,
        "use_bfactor_as_plddt": use_bfactor_as_plddt,
        "model_idx": model_idx,
        "verbose": verbose,
    }
    return Protein.from_structure(structure=parsed, **options)
472
+
473
@staticmethod
def from_string(
    filestring: bytes | str,
    format: Literal["pdb", "cif"],
    chain_id: str,
    use_bfactor_as_plddt: bool | None = None,
    model_idx: int = 0,
    verbose: bool = True,
) -> "Protein":
    """
    Create a Protein from the text of a structure file.

    Args:
        filestring: contents of the structure file; bytes are decoded first.
        format: "pdb" or "cif", selecting the parser.
        chain_id: id of the chain to use
        use_bfactor_as_plddt: passed through to `Protein.from_structure`.
        model_idx: index of the model to use
        verbose: passed through to `Protein.from_structure`.

    Raises:
        ValueError: if `format` is neither "pdb" nor "cif".
    """
    if not isinstance(filestring, str):
        filestring = filestring.decode()
    if format == "pdb":
        parsed = gemmi.read_pdb_string(filestring)
    elif format == "cif":
        cif_block = gemmi.cif.read_string(filestring).sole_block()
        parsed = gemmi.make_structure_from_block(cif_block)
    else:
        raise ValueError(f"Unknown {format=}")
    options = {
        "chain_id": chain_id,
        "use_bfactor_as_plddt": use_bfactor_as_plddt,
        "model_idx": model_idx,
        "verbose": verbose,
    }
    return Protein.from_structure(structure=parsed, **options)
498
+
499
@staticmethod
def from_structure(
    structure: gemmi.Structure,
    chain_id: str,
    use_bfactor_as_plddt: bool | None = None,
    model_idx: int = 0,
    verbose: bool = True,
) -> "Protein":
    """
    Create a Protein from one chain of a parsed gemmi structure.

    The sequence comes from the entity's full sequence when it has one, and from
    the residues of the polymer otherwise. Residues unknown to gemmi become "X".
    Where a modelled residue disagrees with that sequence, the modelled residue
    wins. Only the first conformer of each residue is read, and atoms that have
    no slot in `_ATOM_TYPES` are skipped. The selenium atom "SE" of
    selenomethionine ("MSE") is stored in the "SD" slot.

    Args:
        structure: the parsed structure. It is modified in place by
            `setup_entities()` and `assign_label_seq_id()`.
        chain_id: id of the chain to use
        use_bfactor_as_plddt: if true, the B-factor of the CA atom becomes the
            pLDDT of its residue (a B-factor equal to `_NAN_BFACTOR_VALUE` is
            read as NaN). If false, the pLDDT is 100 for residues whose CA
            coordinates are known and NaN otherwise. If None, it is treated as
            true exactly when `structure.resolution == 0.0`.
        model_idx: index of the model to use
        verbose: whether or not to print oddities found in the structure, such
            as sequence mismatches, unknown residues and missing backbone atoms

    Raises:
        AssertionError: if the chain is not found, or if a resulting pLDDT lies
            outside [0, 100].
    """
    structure.setup_entities()
    structure.assign_label_seq_id()
    if use_bfactor_as_plddt is None:
        use_bfactor_as_plddt = structure.resolution == 0.0
    model = structure[model_idx]
    chain = model.find_chain(chain_id)
    assert chain is not None, f"chain {chain_id!r} not found in model {model_idx}"
    polymer = chain.get_polymer()

    # extract sequence
    entity = structure.get_entity_of(polymer)
    if len(entity.full_sequence) > 0:
        chain_seq = entity.full_sequence
    else:
        chain_seq = [residue.name for residue in polymer]
    chain_seq = [
        gemmi.find_tabulated_residue(
            # gemmi.Entity.first_mon extracts the first conformer
            gemmi.Entity.first_mon(residue_name)
        ).one_letter_code
        for residue_name in chain_seq
    ]
    # for find_tabulated_residue: lowercase means nonstandard, " " means unknown
    chain_seq = [c.upper() if c != " " else "X" for c in chain_seq]

    # extract coordinates and plddt
    coordinates = np.full((len(chain_seq), _N_ATOM, 3), np.nan, dtype=np.float32)
    plddt = np.full(len(chain_seq), np.nan, dtype=np.float32)
    ca_idx = _ATOM_TYPE_TO_IDX["CA"]
    for residue_idx, residue in enumerate(polymer):
        i = residue.label_seq - 1 if residue.label_seq is not None else residue_idx
        code = gemmi.find_tabulated_residue(residue.name).one_letter_code
        code = code.upper() if code != " " else "X"
        if code != chain_seq[i]:
            if verbose:
                # TODO: can this ever happen...? probably want to have this regardless i guess
                # TODO: improve this message?
                print(
                    f"Amino acid mismatch at position {i + 1}: SEQRES {chain_seq[i]} Structure {code}"
                )
            chain_seq[i] = code
        if verbose and code == "X" and residue.name != "UNK":
            print(f"Unknown amino acid at position {i + 1}: {residue.name}")
        if verbose:
            for atom_name in _BACKBONE_ATOM_TYPES:
                if atom_name not in residue:
                    print(
                        f"Residue at position {i + 1} missing backbone atom={atom_name}"
                    )
        for atom in residue.first_conformer():
            atom_name = atom.name
            if residue.name == "MSE" and atom_name == "SE":
                atom_name = "SD"
            # Look up the remapped name: "SE" itself has no slot, so using
            # atom.name here would always skip the selenium atom.
            if (j := _ATOM_TYPE_TO_IDX.get(atom_name)) is None:
                continue
            coordinates[i, j] = atom.pos.tolist()
            if use_bfactor_as_plddt and atom_name == "CA":
                plddt[i] = (
                    atom.b_iso if atom.b_iso != _NAN_BFACTOR_VALUE else np.nan
                )
        # TODO: we should experiment and see if this is the behavior we want
        if not use_bfactor_as_plddt and np.isfinite(coordinates[i, ca_idx]).all():
            plddt[i] = 100.0
    assert np.isnan(plddt).all() or (
        (np.nanmin(plddt) >= 0) and (np.nanmax(plddt) <= 100)
    )
    return Protein(
        sequence="".join(chain_seq),
        coordinates=coordinates,
        plddt=plddt,
        name=structure.name if structure.name != "" else None,
    )
580
+
581
+
582
def parse_fasta_as_proteins(path: str | Path) -> list[Protein]:
    """
    Read a FASTA file and return its entries as sequence-only Proteins.

    The file is opened in binary mode, and one Protein is built for every
    (name, sequence) pair yielded by `fasta.parse_stream`, in the order yielded.
    """
    with open(path, "rb") as stream:
        return [
            Protein(name=name, sequence=sequence)
            for name, sequence in fasta.parse_stream(stream)
        ]
@@ -0,0 +1,9 @@
1
+ """
2
+ SVD module for OpenProtein for reducing embeddings.
3
+
4
+ isort:skip_file
5
+ """
6
+
7
+ from .schemas import SVDMetadata, SVDFitJob, SVDEmbeddingsJob
8
+ from .models import SVDModel, SVDEmbeddingsResultFuture
9
+ from .svd import SVDAPI