openprotein-python 0.8.2__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. openprotein/__init__.py +164 -0
  2. openprotein/_version.py +48 -0
  3. openprotein/align/__init__.py +8 -0
  4. openprotein/align/align.py +395 -0
  5. openprotein/align/api.py +428 -0
  6. openprotein/align/future.py +55 -0
  7. openprotein/align/msa.py +129 -0
  8. openprotein/align/schemas.py +165 -0
  9. openprotein/base.py +181 -0
  10. openprotein/chains.py +88 -0
  11. openprotein/common/__init__.py +5 -0
  12. openprotein/common/features.py +7 -0
  13. openprotein/common/model_metadata.py +33 -0
  14. openprotein/common/reduction.py +8 -0
  15. openprotein/config.py +9 -0
  16. openprotein/csv.py +31 -0
  17. openprotein/data/__init__.py +9 -0
  18. openprotein/data/api.py +218 -0
  19. openprotein/data/assaydataset.py +178 -0
  20. openprotein/data/data.py +93 -0
  21. openprotein/data/schemas.py +27 -0
  22. openprotein/design/__init__.py +16 -0
  23. openprotein/design/api.py +259 -0
  24. openprotein/design/design.py +125 -0
  25. openprotein/design/future.py +146 -0
  26. openprotein/design/schemas.py +607 -0
  27. openprotein/embeddings/__init__.py +27 -0
  28. openprotein/embeddings/api.py +619 -0
  29. openprotein/embeddings/embeddings.py +151 -0
  30. openprotein/embeddings/esm.py +33 -0
  31. openprotein/embeddings/future.py +146 -0
  32. openprotein/embeddings/models.py +421 -0
  33. openprotein/embeddings/openprotein.py +21 -0
  34. openprotein/embeddings/poet.py +446 -0
  35. openprotein/embeddings/poet2.py +505 -0
  36. openprotein/embeddings/schemas.py +78 -0
  37. openprotein/errors.py +76 -0
  38. openprotein/fasta.py +92 -0
  39. openprotein/fold/__init__.py +21 -0
  40. openprotein/fold/alphafold2.py +131 -0
  41. openprotein/fold/api.py +287 -0
  42. openprotein/fold/boltz.py +691 -0
  43. openprotein/fold/esmfold.py +54 -0
  44. openprotein/fold/fold.py +107 -0
  45. openprotein/fold/future.py +509 -0
  46. openprotein/fold/models.py +139 -0
  47. openprotein/fold/schemas.py +39 -0
  48. openprotein/jobs/__init__.py +9 -0
  49. openprotein/jobs/api.py +71 -0
  50. openprotein/jobs/futures.py +746 -0
  51. openprotein/jobs/jobs.py +69 -0
  52. openprotein/jobs/schemas.py +135 -0
  53. openprotein/models/__init__.py +4 -0
  54. openprotein/models/base.py +63 -0
  55. openprotein/models/foundation/rfdiffusion.py +283 -0
  56. openprotein/models/models.py +33 -0
  57. openprotein/predictor/__init__.py +25 -0
  58. openprotein/predictor/api.py +384 -0
  59. openprotein/predictor/models.py +374 -0
  60. openprotein/predictor/prediction.py +79 -0
  61. openprotein/predictor/predictor.py +242 -0
  62. openprotein/predictor/schemas.py +113 -0
  63. openprotein/predictor/validate.py +40 -0
  64. openprotein/prompt/__init__.py +9 -0
  65. openprotein/prompt/api.py +505 -0
  66. openprotein/prompt/models.py +142 -0
  67. openprotein/prompt/prompt.py +130 -0
  68. openprotein/prompt/schemas.py +49 -0
  69. openprotein/protein.py +587 -0
  70. openprotein/svd/__init__.py +9 -0
  71. openprotein/svd/api.py +206 -0
  72. openprotein/svd/models.py +288 -0
  73. openprotein/svd/schemas.py +31 -0
  74. openprotein/svd/svd.py +134 -0
  75. openprotein/umap/__init__.py +9 -0
  76. openprotein/umap/api.py +259 -0
  77. openprotein/umap/models.py +211 -0
  78. openprotein/umap/schemas.py +35 -0
  79. openprotein/umap/umap.py +175 -0
  80. openprotein/utils/uuid.py +29 -0
  81. openprotein_python-0.8.2.dist-info/METADATA +176 -0
  82. openprotein_python-0.8.2.dist-info/RECORD +84 -0
  83. openprotein_python-0.8.2.dist-info/WHEEL +4 -0
  84. openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
@@ -0,0 +1,421 @@
1
+ """Embeddings model representations which can be used directly for creating embeddings."""
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from openprotein.base import APISession
6
+ from openprotein.common import FeatureType, ModelMetadata, ReductionType
7
+ from openprotein.data import AssayDataset, AssayMetadata, DataAPI
8
+ from openprotein.errors import InvalidParameterError
9
+
10
+ from . import api
11
+ from .future import EmbeddingsResultFuture
12
+
13
+ if TYPE_CHECKING:
14
+ from openprotein.predictor import PredictorModel
15
+ from openprotein.svd import SVDModel
16
+ from openprotein.umap import UMAPModel
17
+
18
+
19
class EmbeddingModel:
    """
    Base embeddings model used to understand and provide embeddings from sequences.

    An instance wraps a single remote model id on an API session and exposes
    helpers to request embeddings and logits and to fit SVD, UMAP and GP models
    on top of this model's embeddings.
    """

    # Overridden by subclasses with the model_id(s) they handle. `create` matches
    # the requested model_id against this value to pick the class to instantiate.
    model_id: list[str] | str = "protembed"

    def __init__(
        self,
        session: APISession,
        model_id: str,
        metadata: ModelMetadata | None = None,
    ):
        """
        Initialize the model wrapper.

        Parameters
        ----------
        session : APISession
            The API session used for all requests made by this model.
        model_id : str
            Identifier of the model; stored as ``self.id``.
        metadata : ModelMetadata or None, optional
            Pre-fetched metadata. When None, it is retrieved through
            `get_metadata` the first time `metadata` is read.
        """
        self.session = session
        self.id = model_id
        self._metadata = metadata
        # The instance docstring is built from the metadata, so this reads
        # `self.metadata` and triggers the metadata lookup when none was given.
        self.__doc__ = self.__fmt_doc()

    def __fmt_doc(self) -> str:
        """Build the per-instance docstring from the model metadata."""
        summary = str(self.metadata.description.summary)
        return f"""\t{summary}
        \t max_sequence_length = {self.metadata.max_sequence_length}
        \t supported outputs = {self.metadata.output_types}
        \t supported tokens = {self.metadata.input_tokens}
        """

    def __str__(self) -> str:
        return self.id

    def __repr__(self) -> str:
        return self.id

    @classmethod
    def get_model(cls) -> list[str]:
        """
        Get the model_id(s) for this EmbeddingModel subclass.

        Returns
        -------
        list of str
            List of model_id strings associated with this class. A single
            string `model_id` is wrapped in a one-element list.
        """
        if isinstance(cls.model_id, str):
            return [cls.model_id]
        return cls.model_id

    @classmethod
    def create(
        cls,
        session: APISession,
        model_id: str,
        default: type["EmbeddingModel"] | None = None,
        **kwargs,
    ):
        """
        Create and return an instance of the appropriate EmbeddingModel subclass based on the model_id.

        Parameters
        ----------
        session : APISession
            The API session to use.
        model_id : str
            The model identifier.
        default : type variable of EmbeddingModel or None, optional
            Default EmbeddingModel subclass to use if no match is found.
        kwargs :
            Additional keyword arguments to pass to the model constructor.

        Returns
        -------
        EmbeddingModel
            An instance of the appropriate EmbeddingModel subclass.

        Raises
        ------
        ValueError
            If no suitable EmbeddingModel subclass is found and no default is
            provided, or if constructing the default fails (the original error
            is attached as the cause).
        """
        # `__subclasses__()` only lists direct subclasses of EmbeddingModel that
        # have been defined at the time of the call.
        model_classes = EmbeddingModel.__subclasses__()

        # Use the first subclass that declares this model_id.
        for model_class in model_classes:
            if model_id in model_class.get_model():
                return model_class(session=session, model_id=model_id, **kwargs)

        # No subclass claims the model_id: fall back to the caller-supplied default.
        if default is not None:
            try:
                return default(session=session, model_id=model_id, **kwargs)
            except Exception as err:
                # Report a failed fallback as an unsupported model, but keep the
                # cause. KeyboardInterrupt/SystemExit are deliberately not
                # caught so they still propagate.
                raise ValueError(f"Unsupported model_id type: {model_id}") from err
        raise ValueError(f"Unsupported model_id type: {model_id}")

    @property
    def metadata(self):
        """
        ModelMetadata for this model.

        The metadata is fetched with `get_metadata` on first access and then
        cached on the instance.

        Returns
        -------
        ModelMetadata
            The metadata associated with this model.
        """
        if self._metadata is None:
            self._metadata = self.get_metadata()
        return self._metadata

    def get_metadata(self) -> ModelMetadata:
        """
        Get model metadata for this model.

        Returns
        -------
        ModelMetadata
            The metadata associated with this model.
        """
        return api.get_model(self.session, self.id)

    def embed(
        self,
        sequences: list[bytes] | list[str],
        reduction: ReductionType | None = ReductionType.MEAN,
        **kwargs,
    ) -> EmbeddingsResultFuture:
        """
        Embed sequences using this model.

        Parameters
        ----------
        sequences : list of bytes or list of str
            Sequences to embed.
        reduction : ReductionType or None, optional
            Reduction to use (e.g. mean). Defaults to mean embedding.
        kwargs:
            Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.

        Returns
        -------
        EmbeddingsResultFuture
            Future object representing the embedding result.
        """
        return EmbeddingsResultFuture.create(
            session=self.session,
            job=api.request_post(
                session=self.session,
                model_id=self.id,
                sequences=sequences,
                reduction=reduction,
                **kwargs,
            ),
            sequences=sequences,
        )

    def logits(
        self, sequences: list[bytes] | list[str], **kwargs
    ) -> EmbeddingsResultFuture:
        """
        Compute logit embeddings for sequences using this model.

        Parameters
        ----------
        sequences : list of bytes or list of str
            Sequences to compute logits for.
        kwargs :
            Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.

        Returns
        -------
        EmbeddingsResultFuture
            Future object representing the logits result.
        """
        return EmbeddingsResultFuture.create(
            session=self.session,
            job=api.request_logits_post(
                session=self.session, model_id=self.id, sequences=sequences, **kwargs
            ),
            sequences=sequences,
        )

    def fit_svd(
        self,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 1024,
        reduction: ReductionType | None = None,
        **kwargs,
    ) -> "SVDModel":
        """
        Fit an SVD on the embedding results of this model.

        This function will create an SVDModel based on the embeddings from this model
        as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        sequences : list of bytes or list of str or None, optional
            Sequences to fit SVD on. Provide exactly one of `sequences` and `assay`.
        assay : AssayDataset or None, optional
            Assay containing sequences to fit SVD on. Provide exactly one of
            `sequences` and `assay`.
        n_components : int, optional
            Number of components in SVD. Determines output shapes. Default is 1024.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g. mean).
        kwargs :
            Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.

        Returns
        -------
        SVDModel
            The fitted SVD model.

        Raises
        ------
        InvalidParameterError
            If neither or both of `assay` and `sequences` are provided.
        """
        # local import for cyclic dep
        from openprotein.svd import SVDAPI

        svd_api = getattr(self.session, "svd", None)
        assert isinstance(svd_api, SVDAPI)

        # Exactly one of assay / sequences must be given: both None or both
        # set makes the two `is None` checks equal.
        if (assay is None) == (sequences is None):
            raise InvalidParameterError(
                "Expected either assay or sequences to fit SVD on!"
            )
        return svd_api.fit_svd(
            model_id=self.id,
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            **kwargs,
        )

    def fit_umap(
        self,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 2,
        reduction: ReductionType | None = ReductionType.MEAN,
        **kwargs,
    ) -> "UMAPModel":
        """
        Fit a UMAP on the embedding results of this model.

        This function will create a UMAPModel based on the embeddings from this model
        as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        sequences : list of bytes or list of str or None, optional
            Sequences to fit UMAP with. Provide exactly one of `sequences` and `assay`.
        assay : AssayDataset or None, optional
            Assay containing sequences to fit UMAP with. Provide exactly one of
            `sequences` and `assay`; passing both is an error.
        n_components : int, optional
            Number of components in UMAP fit. Determines output shapes. Default is 2.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g. mean). Defaults to MEAN.
        kwargs :
            Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.

        Returns
        -------
        UMAPModel
            The fitted UMAP model.

        Raises
        ------
        InvalidParameterError
            If neither or both of `assay` and `sequences` are provided.
        """
        # local import for cyclic dep
        from openprotein.umap import UMAPAPI

        umap_api = getattr(self.session, "umap", None)
        assert isinstance(umap_api, UMAPAPI)

        # Exactly one of assay / sequences must be given: both None or both
        # set makes the two `is None` checks equal.
        if (assay is None) == (sequences is None):
            raise InvalidParameterError(
                "Expected either assay or sequences to fit UMAP on!"
            )
        return umap_api.fit_umap(
            model_id=self.id,
            feature_type=FeatureType.PLM,
            sequences=sequences,
            # The UMAP API is given the assay's id rather than the dataset object.
            assay_id=assay.id if assay is not None else None,
            n_components=n_components,
            reduction=reduction,
            **kwargs,
        )

    def fit_gp(
        self,
        assay: AssayMetadata | AssayDataset | str,
        properties: list[str],
        reduction: ReductionType,
        name: str | None = None,
        description: str | None = None,
        **kwargs,
    ) -> "PredictorModel":
        """
        Fit a Gaussian Process (GP) on an assay using this embedding model and hyperparameters.

        Parameters
        ----------
        assay : AssayMetadata, AssayDataset, or str
            Assay to fit GP on. A str is treated as an assay id and looked up
            through the session's data API.
        properties : list of str
            Properties in the assay to fit the GP on.
        reduction : ReductionType
            Type of embedding reduction to use for computing features. PLM must use reduction.
        name : str or None, optional
            Optional name for the predictor model.
        description : str or None, optional
            Optional description for the predictor model.
        kwargs :
            Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.

        Returns
        -------
        PredictorModel
            The fitted predictor model.

        Raises
        ------
        InvalidParameterError
            If no properties are provided, properties are not a subset of assay measurements,
            or multitask GP is requested.
        """
        # local import to resolve cyclic
        from openprotein.predictor import PredictorAPI

        data_api = getattr(self.session, "data", None)
        assert isinstance(data_api, DataAPI)
        predictor_api = getattr(self.session, "predictor", None)
        assert isinstance(predictor_api, PredictorAPI)

        # Resolve an assay id into the assay object so its measurements can be checked.
        assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
        if len(properties) == 0:
            raise InvalidParameterError("Expected (at-least) 1 property to train")
        if not set(properties) <= set(assay.measurement_names):
            raise InvalidParameterError(
                f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
            )
        # TODO - support multitask
        if len(properties) > 1:
            raise InvalidParameterError(
                "Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
            )

        # Delegate to the predictor API, passing this model as the feature source.
        return predictor_api.fit_gp(
            assay=assay,
            properties=properties,
            feature_type=FeatureType.PLM,
            model=self,
            reduction=reduction,
            name=name,
            description=description,
            **kwargs,
        )
392
+
393
+
394
class AttnModel(EmbeddingModel):
    """Embeddings model that additionally exposes an attention request."""

    def attn(
        self, sequences: list[bytes] | list[str], **kwargs
    ) -> EmbeddingsResultFuture:
        """
        Request attention embeddings for the given sequences from this model.

        Parameters
        ----------
        sequences : list of bytes or list of str
            Sequences to compute attention embeddings for.
        kwargs :
            Extra keyword arguments forwarded unchanged to the attention request.

        Returns
        -------
        EmbeddingsResultFuture
            Future wrapping the submitted attention job.
        """
        # Submit the attention job first, then wrap it in a future for the caller.
        attn_job = api.request_attn_post(
            session=self.session,
            model_id=self.id,
            sequences=sequences,
            **kwargs,
        )
        return EmbeddingsResultFuture.create(
            session=self.session,
            job=attn_job,
            sequences=sequences,
        )
@@ -0,0 +1,21 @@
1
+ """OpenProtein-proprietary models."""
2
+
3
+ from .models import AttnModel, EmbeddingModel
4
+
5
+
6
# NOTE(review): EmbeddingModel is listed as a base in addition to AttnModel.
# EmbeddingModel.create looks classes up via EmbeddingModel.__subclasses__(),
# which only returns direct subclasses, so the explicit base presumably keeps
# this class discoverable by the factory - confirm before simplifying.
class OpenProteinModel(AttnModel, EmbeddingModel):
    """
    Proprietary protein embedding models served by OpenProtein.

    Handles the model ids listed in `model_id`.

    Examples
    --------
    View specific model details (inc supported tokens) with the `?` operator.

    .. code-block:: python

        >>> import openprotein
        >>> session = openprotein.connect(username="user", password="password")
        >>> session.embedding.prot_seq?
    """

    # Model ids that resolve to this class.
    model_id = [
        "prot-seq",
        "rotaprot-large-uniref50w",
        "rotaprot_large_uniref90_ft",
    ]