openprotein-python 0.8.2__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openprotein/__init__.py +164 -0
- openprotein/_version.py +48 -0
- openprotein/align/__init__.py +8 -0
- openprotein/align/align.py +395 -0
- openprotein/align/api.py +428 -0
- openprotein/align/future.py +55 -0
- openprotein/align/msa.py +129 -0
- openprotein/align/schemas.py +165 -0
- openprotein/base.py +181 -0
- openprotein/chains.py +88 -0
- openprotein/common/__init__.py +5 -0
- openprotein/common/features.py +7 -0
- openprotein/common/model_metadata.py +33 -0
- openprotein/common/reduction.py +8 -0
- openprotein/config.py +9 -0
- openprotein/csv.py +31 -0
- openprotein/data/__init__.py +9 -0
- openprotein/data/api.py +218 -0
- openprotein/data/assaydataset.py +178 -0
- openprotein/data/data.py +93 -0
- openprotein/data/schemas.py +27 -0
- openprotein/design/__init__.py +16 -0
- openprotein/design/api.py +259 -0
- openprotein/design/design.py +125 -0
- openprotein/design/future.py +146 -0
- openprotein/design/schemas.py +607 -0
- openprotein/embeddings/__init__.py +27 -0
- openprotein/embeddings/api.py +619 -0
- openprotein/embeddings/embeddings.py +151 -0
- openprotein/embeddings/esm.py +33 -0
- openprotein/embeddings/future.py +146 -0
- openprotein/embeddings/models.py +421 -0
- openprotein/embeddings/openprotein.py +21 -0
- openprotein/embeddings/poet.py +446 -0
- openprotein/embeddings/poet2.py +505 -0
- openprotein/embeddings/schemas.py +78 -0
- openprotein/errors.py +76 -0
- openprotein/fasta.py +92 -0
- openprotein/fold/__init__.py +21 -0
- openprotein/fold/alphafold2.py +131 -0
- openprotein/fold/api.py +287 -0
- openprotein/fold/boltz.py +691 -0
- openprotein/fold/esmfold.py +54 -0
- openprotein/fold/fold.py +107 -0
- openprotein/fold/future.py +509 -0
- openprotein/fold/models.py +139 -0
- openprotein/fold/schemas.py +39 -0
- openprotein/jobs/__init__.py +9 -0
- openprotein/jobs/api.py +71 -0
- openprotein/jobs/futures.py +746 -0
- openprotein/jobs/jobs.py +69 -0
- openprotein/jobs/schemas.py +135 -0
- openprotein/models/__init__.py +4 -0
- openprotein/models/base.py +63 -0
- openprotein/models/foundation/rfdiffusion.py +283 -0
- openprotein/models/models.py +33 -0
- openprotein/predictor/__init__.py +25 -0
- openprotein/predictor/api.py +384 -0
- openprotein/predictor/models.py +374 -0
- openprotein/predictor/prediction.py +79 -0
- openprotein/predictor/predictor.py +242 -0
- openprotein/predictor/schemas.py +113 -0
- openprotein/predictor/validate.py +40 -0
- openprotein/prompt/__init__.py +9 -0
- openprotein/prompt/api.py +505 -0
- openprotein/prompt/models.py +142 -0
- openprotein/prompt/prompt.py +130 -0
- openprotein/prompt/schemas.py +49 -0
- openprotein/protein.py +587 -0
- openprotein/svd/__init__.py +9 -0
- openprotein/svd/api.py +206 -0
- openprotein/svd/models.py +288 -0
- openprotein/svd/schemas.py +31 -0
- openprotein/svd/svd.py +134 -0
- openprotein/umap/__init__.py +9 -0
- openprotein/umap/api.py +259 -0
- openprotein/umap/models.py +211 -0
- openprotein/umap/schemas.py +35 -0
- openprotein/umap/umap.py +175 -0
- openprotein/utils/uuid.py +29 -0
- openprotein_python-0.8.2.dist-info/METADATA +176 -0
- openprotein_python-0.8.2.dist-info/RECORD +84 -0
- openprotein_python-0.8.2.dist-info/WHEEL +4 -0
- openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
"""Proprietary PoET-2 model providing top-class performance on protein engineering tasks."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from typing import TYPE_CHECKING, Literal
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from openprotein.base import APISession
|
|
9
|
+
from openprotein.common import ModelMetadata, ReductionType
|
|
10
|
+
from openprotein.data import AssayDataset, AssayMetadata
|
|
11
|
+
from openprotein.prompt import Prompt, PromptAPI, Query
|
|
12
|
+
from openprotein.protein import Protein
|
|
13
|
+
from openprotein.utils import uuid
|
|
14
|
+
|
|
15
|
+
from .future import (
|
|
16
|
+
EmbeddingsGenerateFuture,
|
|
17
|
+
EmbeddingsResultFuture,
|
|
18
|
+
EmbeddingsScoreFuture,
|
|
19
|
+
)
|
|
20
|
+
from .models import EmbeddingModel
|
|
21
|
+
from .poet import PoETModel
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from openprotein.predictor import PredictorModel
|
|
25
|
+
from openprotein.svd import SVDModel
|
|
26
|
+
from openprotein.umap import UMAPModel
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PoET2Model(PoETModel, EmbeddingModel):
    """
    Class for OpenProtein's foundation model PoET 2.

    PoET functions are dependent on a prompt supplied via the prompt endpoints.

    Examples
    --------
    View specific model details (including supported tokens) with the `?` operator.

    .. code-block:: python

        >>> import openprotein
        >>> session = openprotein.connect(username="user", password="password")
        >>> session.embedding.poet2?
    """

    model_id = "poet-2"

    # TODO - Add model to explicitly require prompt_id
    def __init__(
        self,
        session: APISession,
        model_id: str,
        metadata: ModelMetadata | None = None,
    ):
        """
        Initialize the model by delegating to the parent constructor.

        Parameters
        ----------
        session : APISession
            Session passed through to the parent class.
        model_id : str
            Identifier of the model.
        metadata : ModelMetadata or None, optional
            Model metadata passed through to the parent class. Default is None.
        """
        super().__init__(session=session, model_id=model_id, metadata=metadata)

    def __resolve_query(
        self,
        query: str | bytes | Protein | Query | None = None,
    ) -> str | None:
        """
        Turn a user-supplied query into a query id.

        Parameters
        ----------
        query : str or bytes or Protein or Query or None, optional
            ``None`` is returned unchanged. A ``Protein``, ``bytes`` or a string
            that is not a valid UUID is registered through the session's prompt
            API and the id of the created query is returned. A string that is a
            valid UUID is returned as-is; any other object contributes its
            ``id`` attribute.

        Returns
        -------
        str or None
            The resolved query id, or None when no query was supplied.
        """
        if query is None:
            return None
        needs_creation = (
            isinstance(query, Protein)
            or isinstance(query, bytes)
            or (isinstance(query, str) and not uuid.is_valid_uuid(query))
        )
        if needs_creation:
            prompt_api = getattr(self.session, "prompt", None)
            assert isinstance(prompt_api, PromptAPI)
            return prompt_api.create_query(query=query).id
        # Either an existing query id (UUID string) or an object exposing ``id``.
        return query if isinstance(query, str) else query.id

    def embed(
        self,
        sequences: list[bytes],
        reduction: ReductionType | None = ReductionType.MEAN,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        decoder_type: Literal["mlm", "clm"] | None = None,
    ) -> EmbeddingsResultFuture:
        """
        Embed sequences using this model.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to embed.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g. mean). Default is ReductionType.MEAN.
        prompt : str or Prompt or None, optional
            Prompt or prompt_id or prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        decoder_type : {'mlm', 'clm'} or None, optional
            Decoder type. Default is None.

        Returns
        -------
        EmbeddingsResultFuture
            A future object that returns the embeddings of the submitted sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().embed(
            sequences=sequences,
            reduction=reduction,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            decoder_type=decoder_type,
        )

    def logits(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        decoder_type: Literal["mlm", "clm"] | None = None,
    ) -> EmbeddingsResultFuture:
        """
        Compute logit embeddings for sequences using this model.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to analyze.
        prompt : str or Prompt or None, optional
            Prompt or prompt_id or prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        decoder_type : {'mlm', 'clm'} or None, optional
            Decoder type. Default is None.

        Returns
        -------
        EmbeddingsResultFuture
            A future object that returns the logits of the submitted sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().logits(
            sequences=sequences,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            decoder_type=decoder_type,
        )

    def score(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        decoder_type: Literal["mlm", "clm"] | None = None,
    ) -> EmbeddingsScoreFuture:
        """
        Score query sequences using the specified prompt.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to score.
        prompt : str or Prompt or None, optional
            Prompt or prompt_id or prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        decoder_type : {'mlm', 'clm'} or None, optional
            Decoder type. Default is None.

        Returns
        -------
        EmbeddingsScoreFuture
            A future object that returns the scores of the submitted sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().score(
            sequences=sequences,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            decoder_type=decoder_type,
        )

    def indel(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        decoder_type: Literal["mlm", "clm"] | None = None,
        insert: str | None = None,
        delete: list[int] | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score all indels of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        decoder_type : {'mlm', 'clm'} or None, optional
            Decoder type. Default is None.
        insert : str or None, optional
            Insertion fragment at each site.
        delete : list of int or None, optional
            Range of size of fragment to delete at each site.
        **kwargs
            Additional keyword arguments, forwarded to the parent implementation.

        Returns
        -------
        EmbeddingsScoreFuture
            A future object that returns the scores of the indel-ed sequence.

        Raises
        ------
        ValueError
            If neither insert nor delete is provided.
        """
        query_id = self.__resolve_query(query=query)
        return super().indel(
            sequence=sequence,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            decoder_type=decoder_type,
            insert=insert,
            delete=delete,
            # Forward the documented extra keywords instead of silently
            # discarding them (mirrors how fit_gp treats its **kwargs).
            **kwargs,
        )

    def single_site(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        decoder_type: Literal["mlm", "clm"] | None = None,
    ) -> EmbeddingsScoreFuture:
        """
        Score all single substitutions of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyze.
        prompt : str or Prompt or None, optional
            Prompt or prompt_id or prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        decoder_type : {'mlm', 'clm'} or None, optional
            Decoder type. Default is None.

        Returns
        -------
        EmbeddingsScoreFuture
            A future object that returns the scores of the mutated sequence.
        """
        query_id = self.__resolve_query(query=query)
        return super().single_site(
            sequence=sequence,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            decoder_type=decoder_type,
        )

    def generate(
        self,
        prompt: str | Prompt,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        num_samples: int = 100,
        temperature: float = 1.0,
        topk: float | None = None,
        topp: float | None = None,
        max_length: int = 1000,
        seed: int | None = None,
        ensemble_weights: Sequence[float] | None = None,
        ensemble_method: Literal["arithmetic", "geometric"] | None = None,
    ) -> EmbeddingsGenerateFuture:
        """
        Generate protein sequences conditioned on a prompt.

        Parameters
        ----------
        prompt : str or Prompt
            Prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        num_samples : int, optional
            The number of samples to generate. Default is 100.
        temperature : float, optional
            The temperature for sampling. Higher values produce more random outputs. Default is 1.0.
        topk : float or None, optional
            The number of top-k residues to consider during sampling. Default is None.
        topp : float or None, optional
            The cumulative probability threshold for top-p sampling. Default is None.
        max_length : int, optional
            The maximum length of generated proteins. Default is 1000.
        seed : int or None, optional
            Seed for random number generation. Default is None.
        ensemble_weights : Sequence of float or None, optional
            Weights for combining likelihoods from multiple prompts in the ensemble.
            The length of this sequence must match the number of prompts.
            All weights must be finite. If ensemble_method is "arithmetic", then weights
            must also be non-negative, and have a non-zero sum.
        ensemble_method : {'arithmetic', 'geometric'} or None, optional
            Method used to combine likelihoods from multiple prompts in the ensemble.
            If "arithmetic", the weighted mean is used; if "geometric", the weighted
            geometric mean is used. If None (default), the method defaults to
            "arithmetic", but this behavior may change in the future.

        Returns
        -------
        EmbeddingsGenerateFuture
            A future object representing the status and information about the generation job.

        Raises
        ------
        AssertionError
            If ensemble_weights violates one of the constraints described above.
        """
        query_id = self.__resolve_query(query=query)
        if ensemble_weights is not None:
            # Checks stay as asserts so that failures keep surfacing as
            # AssertionError, the exception type this method has always used.
            # Finiteness is verified first so NaN/inf get the precise message.
            assert np.isfinite(
                np.array(ensemble_weights)
            ).all(), "All ensemble weights must be finite"
            # NB: for now, ensemble_method is None -> ensemble_method == "arithmetic"
            if ensemble_method is None or ensemble_method == "arithmetic":
                assert all(
                    w >= 0 for w in ensemble_weights
                ), "Ensemble weights must be non-negative for the arithmetic method"
                # With non-negative weights ``>= 0`` can never fail; the
                # documented requirement is a non-zero sum, hence strict ``>``.
                assert (
                    sum(ensemble_weights) > 0
                ), "Ensemble weights must have a non-zero sum for the arithmetic method"
            if isinstance(prompt, Prompt):
                assert len(ensemble_weights) == prompt.num_replicates, (
                    f"Number of ensemble weights ({len(ensemble_weights)}) must be "
                    f"equal to the number of prompts ({prompt.num_replicates})"
                )
        return super().generate(
            prompt=prompt,
            num_samples=num_samples,
            temperature=temperature,
            topk=topk,
            topp=topp,
            max_length=max_length,
            seed=seed,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            ensemble_weights=ensemble_weights,
            ensemble_method=ensemble_method,
        )

    def fit_svd(
        self,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 1024,
        reduction: ReductionType | None = None,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> "SVDModel":
        """
        Fit an SVD on the embedding results of PoET.

        This function will create an SVDModel based on the embeddings from this model
        as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        sequences : list of bytes or list of str or None, optional
            Sequences to fit SVD. If None, assay must be provided.
        assay : AssayDataset or None, optional
            Assay containing sequences to fit SVD. Ignored if sequences are provided.
        n_components : int, optional
            Number of components in SVD. Determines output shapes. Default is 1024.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g. mean).
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.

        Returns
        -------
        SVDModel
            A future that represents the fitted SVD model.
        """
        query_id = self.__resolve_query(query=query)
        return super().fit_svd(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )

    def fit_umap(
        self,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 2,
        reduction: ReductionType | None = ReductionType.MEAN,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> "UMAPModel":
        """
        Fit a UMAP on assay using PoET and hyperparameters.

        This function will create a UMAP based on the embeddings from this PoET model
        as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        sequences : list of bytes or list of str or None, optional
            Sequences to fit UMAP. If None, assay must be provided.
        assay : AssayDataset or None, optional
            Assay containing sequences to fit UMAP. Ignored if sequences are provided.
        n_components : int, optional
            Number of components in UMAP fit. Determines output shapes. Default is 2.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g. mean). Default is ReductionType.MEAN.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.

        Returns
        -------
        UMAPModel
            A future that represents the fitted UMAP model.
        """
        query_id = self.__resolve_query(query=query)
        return super().fit_umap(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )

    def fit_gp(
        self,
        assay: AssayMetadata | AssayDataset | str,
        properties: list[str],
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        **kwargs,
    ) -> "PredictorModel":
        """
        Fit a Gaussian Process (GP) on assay using this embedding model and hyperparameters.

        Parameters
        ----------
        assay : AssayMetadata or AssayDataset or str
            Assay to fit GP on.
        properties : list of str
            Properties in the assay to fit the GP on.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition PoET model.
        query : str or bytes or Protein or Query or None, optional
            Query to use with prompt.
        use_query_structure_in_decoder : bool, optional
            Whether to use query structure in decoder. Default is True.
        **kwargs
            Additional keyword arguments, forwarded to the parent implementation.

        Returns
        -------
        PredictorModel
            A future that represents the trained predictor model.
        """
        query_id = self.__resolve_query(query=query)
        return super().fit_gp(
            assay=assay,
            properties=properties,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            **kwargs,
        )
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Schemas for OpenProtein embeddings system."""
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
from openprotein.jobs import BatchJob, Job, JobType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EmbeddedSequence(BaseModel):
    """
    Representation of an embedded sequence created from our models.

    Represented as an iterable yielding the sequence followed by the embedding.
    """

    sequence: bytes
    embedding: np.ndarray

    # np.ndarray is not a pydantic-native type, so arbitrary types are enabled.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __iter__(self):
        """Yield the sequence, then the embedding, so instances unpack as a pair."""
        yield from (self.sequence, self.embedding)

    def __len__(self):
        """Return 2: the object always holds exactly a sequence and an embedding."""
        return 2

    def __getitem__(self, i):
        """Return the sequence for index 0 and the embedding for index 1.

        Raises
        ------
        IndexError
            For any other index.
        """
        if i == 0:
            return self.sequence
        if i == 1:
            return self.embedding
        raise IndexError("Index out of range")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class EmbeddingsJob(Job, BatchJob):
    # ``job_type`` is restricted to the two embeddings job types (plain and
    # reduced); the plain embed type is used when no value is supplied.
    job_type: Literal[
        JobType.embeddings_embed, JobType.embeddings_embed_reduced
    ] = Field(default=JobType.embeddings_embed)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AttnJob(Job, BatchJob):
    # ``job_type`` is pinned to the single attention job type, which is also
    # the default.
    job_type: Literal[JobType.embeddings_attn] = Field(
        default=JobType.embeddings_attn
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class LogitsJob(Job, BatchJob):
    # ``job_type`` is pinned to the single logits job type, which is also the
    # default.
    job_type: Literal[
        JobType.embeddings_logits
    ] = Field(default=JobType.embeddings_logits)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class ScoreJob(Job, BatchJob):
    # ``job_type`` is pinned to the PoET score job type, which is also the
    # default.
    job_type: Literal[JobType.poet_score] = Field(
        default=JobType.poet_score
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class ScoreIndelJob(Job, BatchJob):
    # ``job_type`` is pinned to the PoET indel-score job type, which is also
    # the default.
    job_type: Literal[
        JobType.poet_score_indel
    ] = Field(default=JobType.poet_score_indel)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ScoreSingleSiteJob(Job, BatchJob):
    # ``job_type`` is pinned to the PoET single-site job type, which is also
    # the default.
    job_type: Literal[
        JobType.poet_single_site
    ] = Field(default=JobType.poet_single_site)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class GenerateJob(Job, BatchJob):
    # ``job_type`` is pinned to the PoET generate job type, which is also the
    # default.
    job_type: Literal[JobType.poet_generate] = Field(
        default=JobType.poet_generate
    )
|
openprotein/errors.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
from requests import Response
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Errors for OpenProtein
|
|
6
|
+
class InvalidParameterError(Exception):
    """Error signalling an invalid parameter; the text is kept on ``message``."""

    def __init__(self, message="Invalid parameter"):
        # Hand the text to Exception (for str()/args) and keep it on the instance.
        super().__init__(message)
        self.message = message
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MissingParameterError(Exception):
    """Error signalling a missing required parameter; the text is kept on ``message``."""

    def __init__(self, message="Required parameter is missing"):
        # Hand the text to Exception (for str()/args) and keep it on the instance.
        super().__init__(message)
        self.message = message
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Pydantic model with a single required string field, ``detail``.
# NOTE(review): presumably mirrors the JSON error body returned by the API - confirm against callers.
class RawAPIError(BaseModel):
|
|
23
|
+
|
|
24
|
+
detail: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class APIError(Exception):
    """Base API error carrying a required, caller-supplied ``message``."""

    def __init__(self, message: str):
        # Hand the text to Exception (for str()/args) and keep it on the instance.
        super().__init__(message)
        self.message = message
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class HTTPError(APIError):
    """API error built from an HTTP response.

    Keeps the response object plus its status code, body text and URL as
    attributes, and composes the error message from those three values.
    """

    def __init__(self, response: Response):
        # Read the response fields once, in the same order as they are stored.
        status_code = response.status_code
        text = response.text
        url = response.url
        super().__init__(
            f"Status code {status_code}\non resource: {url}\n{text}"
        )
        self.response = response
        self.status_code = status_code
        self.text = text
        self.url = url
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class AuthError(Exception):
    """Error signalling invalid authorization; the text is kept on ``message``."""

    def __init__(self, message="Invalid authorization"):
        self.message = message
        super().__init__(self.message)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class InvalidJob(Exception):
    """Error signalling that a job does not exist; the text is kept on ``message``."""

    def __init__(self, message="No such job"):
        self.message = message
        super().__init__(self.message)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class TimeoutException(Exception):
    """Error signalling that a request timed out; the text is kept on ``message``."""

    def __init__(self, message="Request timed out!"):
        self.message = message
        super().__init__(self.message)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class DeprecationError(Exception):
    """Error telling the user that an interface is deprecated and must not be used."""

    def __init__(self, message="This API is deprecated and no longer supported"):
        # Hand the text to Exception (for str()/args) and keep it on the instance.
        super().__init__(message)
        self.message = message
|