openprotein-python 0.8.2__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. openprotein/__init__.py +164 -0
  2. openprotein/_version.py +48 -0
  3. openprotein/align/__init__.py +8 -0
  4. openprotein/align/align.py +395 -0
  5. openprotein/align/api.py +428 -0
  6. openprotein/align/future.py +55 -0
  7. openprotein/align/msa.py +129 -0
  8. openprotein/align/schemas.py +165 -0
  9. openprotein/base.py +181 -0
  10. openprotein/chains.py +88 -0
  11. openprotein/common/__init__.py +5 -0
  12. openprotein/common/features.py +7 -0
  13. openprotein/common/model_metadata.py +33 -0
  14. openprotein/common/reduction.py +8 -0
  15. openprotein/config.py +9 -0
  16. openprotein/csv.py +31 -0
  17. openprotein/data/__init__.py +9 -0
  18. openprotein/data/api.py +218 -0
  19. openprotein/data/assaydataset.py +178 -0
  20. openprotein/data/data.py +93 -0
  21. openprotein/data/schemas.py +27 -0
  22. openprotein/design/__init__.py +16 -0
  23. openprotein/design/api.py +259 -0
  24. openprotein/design/design.py +125 -0
  25. openprotein/design/future.py +146 -0
  26. openprotein/design/schemas.py +607 -0
  27. openprotein/embeddings/__init__.py +27 -0
  28. openprotein/embeddings/api.py +619 -0
  29. openprotein/embeddings/embeddings.py +151 -0
  30. openprotein/embeddings/esm.py +33 -0
  31. openprotein/embeddings/future.py +146 -0
  32. openprotein/embeddings/models.py +421 -0
  33. openprotein/embeddings/openprotein.py +21 -0
  34. openprotein/embeddings/poet.py +446 -0
  35. openprotein/embeddings/poet2.py +505 -0
  36. openprotein/embeddings/schemas.py +78 -0
  37. openprotein/errors.py +76 -0
  38. openprotein/fasta.py +92 -0
  39. openprotein/fold/__init__.py +21 -0
  40. openprotein/fold/alphafold2.py +131 -0
  41. openprotein/fold/api.py +287 -0
  42. openprotein/fold/boltz.py +691 -0
  43. openprotein/fold/esmfold.py +54 -0
  44. openprotein/fold/fold.py +107 -0
  45. openprotein/fold/future.py +509 -0
  46. openprotein/fold/models.py +139 -0
  47. openprotein/fold/schemas.py +39 -0
  48. openprotein/jobs/__init__.py +9 -0
  49. openprotein/jobs/api.py +71 -0
  50. openprotein/jobs/futures.py +746 -0
  51. openprotein/jobs/jobs.py +69 -0
  52. openprotein/jobs/schemas.py +135 -0
  53. openprotein/models/__init__.py +4 -0
  54. openprotein/models/base.py +63 -0
  55. openprotein/models/foundation/rfdiffusion.py +283 -0
  56. openprotein/models/models.py +33 -0
  57. openprotein/predictor/__init__.py +25 -0
  58. openprotein/predictor/api.py +384 -0
  59. openprotein/predictor/models.py +374 -0
  60. openprotein/predictor/prediction.py +79 -0
  61. openprotein/predictor/predictor.py +242 -0
  62. openprotein/predictor/schemas.py +113 -0
  63. openprotein/predictor/validate.py +40 -0
  64. openprotein/prompt/__init__.py +9 -0
  65. openprotein/prompt/api.py +505 -0
  66. openprotein/prompt/models.py +142 -0
  67. openprotein/prompt/prompt.py +130 -0
  68. openprotein/prompt/schemas.py +49 -0
  69. openprotein/protein.py +587 -0
  70. openprotein/svd/__init__.py +9 -0
  71. openprotein/svd/api.py +206 -0
  72. openprotein/svd/models.py +288 -0
  73. openprotein/svd/schemas.py +31 -0
  74. openprotein/svd/svd.py +134 -0
  75. openprotein/umap/__init__.py +9 -0
  76. openprotein/umap/api.py +259 -0
  77. openprotein/umap/models.py +211 -0
  78. openprotein/umap/schemas.py +35 -0
  79. openprotein/umap/umap.py +175 -0
  80. openprotein/utils/uuid.py +29 -0
  81. openprotein_python-0.8.2.dist-info/METADATA +176 -0
  82. openprotein_python-0.8.2.dist-info/RECORD +84 -0
  83. openprotein_python-0.8.2.dist-info/WHEEL +4 -0
  84. openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
@@ -0,0 +1,446 @@
1
+ """Original PoET model handling various protein engineering tasks."""
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from openprotein.base import APISession
6
+ from openprotein.common import ModelMetadata, ReductionType
7
+ from openprotein.data import AssayDataset, AssayMetadata
8
+ from openprotein.prompt import Prompt
9
+
10
+ from . import api
11
+ from .future import (
12
+ EmbeddingsGenerateFuture,
13
+ EmbeddingsResultFuture,
14
+ EmbeddingsScoreFuture,
15
+ )
16
+ from .models import EmbeddingModel
17
+
18
+ if TYPE_CHECKING:
19
+ from openprotein.predictor import PredictorModel
20
+ from openprotein.svd import SVDModel
21
+ from openprotein.umap import UMAPModel
22
+
23
+
24
class PoETModel(EmbeddingModel):
    """
    Class for OpenProtein's foundation model PoET.

    Note
    ----
    PoET functions are dependent on a prompt supplied via the prompt endpoints.
    Every method that takes ``prompt`` accepts either the prompt id as a
    string or a ``Prompt`` object, whose ``id`` attribute is used.

    Examples
    --------
    View specific model details (including supported tokens) with the `?` operator.

    >>> import openprotein
    >>> session = openprotein.connect(username="user", password="password")
    >>> session.embedding.poet.<embeddings_method>
    """

    model_id = "poet"

    # TODO - Add model to explicitly require prompt_id
    def __init__(
        self,
        session: APISession,
        model_id: str,
        metadata: ModelMetadata | None = None,
    ):
        """
        Initialize the PoET model by delegating to the parent initializer.

        Parameters
        ----------
        session : APISession
            Session forwarded to the parent class.
        model_id : str
            Model identifier forwarded to the parent class.
        metadata : ModelMetadata or None, optional
            Model metadata forwarded to the parent class. Default is None.
        """
        super().__init__(session=session, model_id=model_id, metadata=metadata)

    @staticmethod
    def _resolve_prompt_id(prompt: str | Prompt | None) -> str | None:
        """
        Normalize a prompt argument to a prompt id.

        Parameters
        ----------
        prompt : str or Prompt or None
            Prompt id, prompt object, or None.

        Returns
        -------
        str or None
            ``prompt`` itself when it is None or already a string, otherwise
            ``prompt.id``.
        """
        if prompt is None or isinstance(prompt, str):
            return prompt
        return prompt.id

    def embed(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        reduction: ReductionType | None = ReductionType.MEAN,
        **kwargs,
    ) -> EmbeddingsResultFuture:
        """
        Embed sequences using the PoET model.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to embed.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g., mean). Default is ReductionType.MEAN.
        **kwargs
            Additional keyword arguments forwarded to the parent ``embed``.

        Returns
        -------
        EmbeddingsResultFuture
            Future object that returns the embeddings of the submitted sequences.
        """
        return super().embed(
            sequences=sequences,
            reduction=reduction,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def logits(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> EmbeddingsResultFuture:
        """
        Compute logits for sequences using the PoET model.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the parent ``logits``.

        Returns
        -------
        EmbeddingsResultFuture
            Future object that returns the logits of the submitted sequences.
        """
        return super().logits(
            sequences=sequences,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def attn(self, *args, **kwargs):
        """
        Attention is not available for PoET.

        Any positional or keyword arguments are accepted and ignored, so the
        documented ValueError is raised however the method is called
        (presumably other embedding models take sequences here - confirm
        against the parent class).

        Raises
        ------
        ValueError
            Always raised, as attention is not supported for PoET.

        :meta private:
        """
        raise ValueError("Attn not yet supported for poet")

    def score(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score query sequences using the specified prompt.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to score.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the score request.

        Returns
        -------
        EmbeddingsScoreFuture
            Future object that returns the scores of the submitted sequences.
        """
        job = api.request_score_post(
            session=self.session,
            model_id=self.id,
            prompt_id=self._resolve_prompt_id(prompt),
            sequences=sequences,
            **kwargs,
        )
        return EmbeddingsScoreFuture.create(session=self.session, job=job)

    def indel(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        insert: str | None = None,
        delete: list[int] | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score all indels of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        insert : str or None, optional
            Insertion fragment at each site.
        delete : list of int or None, optional
            Range of size of fragment to delete at each site.
        **kwargs
            Additional keyword arguments forwarded to the indel score request.

        Returns
        -------
        EmbeddingsScoreFuture
            Future object that returns the scores of the indel-ed sequence.

        Raises
        ------
        ValueError
            If neither insert nor delete is provided (an empty string or an
            empty list counts as not provided).
        """
        # Validate before resolving the prompt or issuing any request.
        if not insert and not delete:
            raise ValueError("Expected insert and/or delete to be provided")
        job = api.request_score_indel_post(
            session=self.session,
            model_id=self.id,
            base_sequence=sequence,
            prompt_id=self._resolve_prompt_id(prompt),
            insert=insert,
            delete=delete,
            **kwargs,
        )
        return EmbeddingsScoreFuture.create(session=self.session, job=job)

    def single_site(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score all single substitutions of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the single-site score request.

        Returns
        -------
        EmbeddingsScoreFuture
            Future object that returns the scores of the mutated sequence.
        """
        job = api.request_score_single_site_post(
            session=self.session,
            model_id=self.id,
            base_sequence=sequence,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )
        return EmbeddingsScoreFuture.create(session=self.session, job=job)

    def generate(
        self,
        prompt: str | Prompt,
        num_samples: int = 100,
        temperature: float = 1.0,
        topk: float | None = None,
        topp: float | None = None,
        max_length: int = 1000,
        seed: int | None = None,
        **kwargs,
    ) -> EmbeddingsGenerateFuture:
        """
        Generate protein sequences conditioned on a prompt.

        Parameters
        ----------
        prompt : str or Prompt
            Prompt from an align workflow to condition the PoET model.
        num_samples : int, optional
            Number of samples to generate. Default is 100.
        temperature : float, optional
            Temperature for sampling. Higher values produce more random outputs. Default is 1.0.
        topk : float or None, optional
            Number of top-k residues to consider during sampling. Default is None.
        topp : float or None, optional
            Cumulative probability threshold for top-p sampling. Default is None.
        max_length : int, optional
            Maximum length of generated proteins. Default is 1000.
        seed : int or None, optional
            Seed for random number generation, sent to the API as ``random_seed``.
            Default is None.
        **kwargs
            Additional keyword arguments forwarded to the generate request.

        Returns
        -------
        EmbeddingsGenerateFuture
            Future object representing the status and information about the generation job.
        """
        # The prompt is required here, so None is deliberately not
        # special-cased the way it is in the other methods.
        prompt_id = prompt if isinstance(prompt, str) else prompt.id
        job = api.request_generate_post(
            session=self.session,
            model_id=self.id,
            num_samples=num_samples,
            temperature=temperature,
            topk=topk,
            topp=topp,
            max_length=max_length,
            random_seed=seed,
            prompt_id=prompt_id,
            **kwargs,
        )
        return EmbeddingsGenerateFuture.create(session=self.session, job=job)

    def fit_svd(
        self,
        prompt: str | Prompt | None = None,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 1024,
        reduction: ReductionType | None = None,
        **kwargs,
    ) -> "SVDModel":
        """
        Fit an SVD on the embedding results of PoET.

        This function creates an SVDModel based on the embeddings from this model
        as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        sequences : list of bytes or list of str or None, optional
            Sequences to use for SVD.
        assay : AssayDataset or None, optional
            Assay dataset to use for SVD.
        n_components : int, optional
            Number of components in SVD. Determines output shapes. Default is 1024.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g., mean). Default is None.
        **kwargs
            Additional keyword arguments forwarded to the parent ``fit_svd``.

        Returns
        -------
        SVDModel
            Future that represents the fitted SVD model.
        """
        return super().fit_svd(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def fit_umap(
        self,
        prompt: str | Prompt | None = None,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 2,
        reduction: ReductionType | None = ReductionType.MEAN,
        **kwargs,
    ) -> "UMAPModel":
        """
        Fit a UMAP on assay using PoET and hyperparameters.

        This function creates a UMAP based on the embeddings from this PoET model
        as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        sequences : list of bytes or list of str or None, optional
            Optional sequences to fit UMAP with. Either use sequences or assay. Sequences is preferred.
        assay : AssayDataset or None, optional
            Optional assay containing sequences to fit UMAP with. Either use sequences or assay. Ignored if sequences are provided.
        n_components : int, optional
            Number of components in UMAP fit. Determines output shapes. Default is 2.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g., mean). Default is ReductionType.MEAN.
        **kwargs
            Additional keyword arguments forwarded to the parent ``fit_umap``.

        Returns
        -------
        UMAPModel
            Future that represents the fitted UMAP model.
        """
        return super().fit_umap(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def fit_gp(
        self,
        assay: AssayMetadata | AssayDataset | str,
        properties: list[str],
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> "PredictorModel":
        """
        Fit a Gaussian Process (GP) on assay using this embedding model and hyperparameters.

        Parameters
        ----------
        assay : AssayMetadata or AssayDataset or str
            Assay to fit GP on.
        properties : list of str
            Properties in the assay to fit the GP on.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the parent ``fit_gp``.

        Returns
        -------
        PredictorModel
            Future that represents the trained predictor model.
        """
        return super().fit_gp(
            assay=assay,
            properties=properties,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )