openprotein-python 0.8.4__tar.gz → 0.8.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/PKG-INFO +9 -9
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/README.md +8 -8
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/common/__init__.py +2 -2
- openprotein_python-0.8.6/openprotein/common/features.py +15 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/common/model_metadata.py +1 -1
- openprotein_python-0.8.6/openprotein/common/reduction.py +14 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/data/api.py +13 -2
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/data/data.py +9 -2
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/models.py +37 -28
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/alphafold2.py +7 -4
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/future.py +59 -33
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/api.py +2 -2
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/predictor.py +40 -12
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/schemas.py +2 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/protein.py +53 -36
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/svd/svd.py +6 -4
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/umap/umap.py +43 -14
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/pyproject.toml +1 -18
- openprotein_python-0.8.4/openprotein/common/features.py +0 -7
- openprotein_python-0.8.4/openprotein/common/reduction.py +0 -8
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/.gitignore +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/LICENSE.txt +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/_version.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/align/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/align/align.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/align/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/align/future.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/align/msa.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/align/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/base.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/chains.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/config.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/csv.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/data/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/data/assaydataset.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/data/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/design/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/design/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/design/design.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/design/future.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/design/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/embeddings.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/esm.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/future.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/openprotein.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/poet.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/poet2.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/embeddings/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/errors.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fasta.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/boltz.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/esmfold.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/fold.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/models.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/fold/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/jobs/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/jobs/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/jobs/futures.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/jobs/jobs.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/jobs/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/models/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/models/base.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/models/foundation/rfdiffusion.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/models/models.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/models.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/prediction.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/predictor/validate.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/prompt/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/prompt/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/prompt/models.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/prompt/prompt.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/prompt/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/svd/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/svd/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/svd/models.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/svd/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/umap/__init__.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/umap/api.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/umap/models.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/umap/schemas.py +0 -0
- {openprotein_python-0.8.4 → openprotein_python-0.8.6}/openprotein/utils/uuid.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openprotein-python
|
|
3
|
-
Version: 0.8.4
|
|
3
|
+
Version: 0.8.6
|
|
4
4
|
Summary: OpenProtein Python interface.
|
|
5
5
|
Author-email: Mark Gee <markgee@ne47.bio>, "Timothy Truong Jr." <ttruong@ne47.bio>, Tristan Bepler <tbepler@ne47.bio>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -28,14 +28,14 @@ The OpenProtein.AI Python Interface provides a user-friendly library to interact
|
|
|
28
28
|
|
|
29
29
|
# Table of Contents
|
|
30
30
|
|
|
31
|
-
| | Workflow
|
|
32
|
-
|
|
33
|
-
| 0 | [`Quick start`](#Quick-start)
|
|
34
|
-
| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html)
|
|
35
|
-
| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html)
|
|
36
|
-
| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html)
|
|
37
|
-
| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation.
|
|
38
|
-
| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html)
|
|
31
|
+
| | Workflow | Description |
|
|
32
|
+
|---|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|
|
|
33
|
+
| 0 | [`Quick start`](#Quick-start) | Quick start guide |
|
|
34
|
+
| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html) | Install guide for pip and conda. |
|
|
35
|
+
| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html) | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
|
|
36
|
+
| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
|
|
37
|
+
| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
|
|
38
|
+
| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
# Quick-start
|
|
@@ -10,14 +10,14 @@ The OpenProtein.AI Python Interface provides a user-friendly library to interact
|
|
|
10
10
|
|
|
11
11
|
# Table of Contents
|
|
12
12
|
|
|
13
|
-
| | Workflow
|
|
14
|
-
|
|
15
|
-
| 0 | [`Quick start`](#Quick-start)
|
|
16
|
-
| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html)
|
|
17
|
-
| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html)
|
|
18
|
-
| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html)
|
|
19
|
-
| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation.
|
|
20
|
-
| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html)
|
|
13
|
+
| | Workflow | Description |
|
|
14
|
+
|---|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|
|
|
15
|
+
| 0 | [`Quick start`](#Quick-start) | Quick start guide |
|
|
16
|
+
| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html) | Install guide for pip and conda. |
|
|
17
|
+
| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html) | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
|
|
18
|
+
| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
|
|
19
|
+
| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
|
|
20
|
+
| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
# Quick-start
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""Common classes and utilities for OpenProtein."""
|
|
2
2
|
|
|
3
|
-
from .features import FeatureType
|
|
3
|
+
from .features import Feature, FeatureType
|
|
4
4
|
from .model_metadata import ModelDescription, ModelMetadata, TokenInfo
|
|
5
|
-
from .reduction import ReductionType
|
|
5
|
+
from .reduction import Reduction, ReductionType
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Feature types used in OpenProtein."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FeatureType(str, Enum):
|
|
8
|
+
|
|
9
|
+
PLM = "PLM"
|
|
10
|
+
SVD = "SVD"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# NOTE: only works with python 3.12+
|
|
14
|
+
# Feature = Literal[*tuple([r.value for r in FeatureType])]
|
|
15
|
+
Feature = Literal["PLM", "SVD"]
|
|
@@ -28,6 +28,6 @@ class ModelMetadata(BaseModel):
|
|
|
28
28
|
max_sequence_length: int | None = None
|
|
29
29
|
dimension: int
|
|
30
30
|
output_types: list[str]
|
|
31
|
-
input_tokens: list[str]
|
|
31
|
+
input_tokens: list[str] | None
|
|
32
32
|
output_tokens: list[str] | None = None
|
|
33
33
|
token_descriptions: list[list[TokenInfo]]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Reduction types used in OpenProtein."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ReductionType(str, Enum):
|
|
8
|
+
MEAN = "MEAN"
|
|
9
|
+
SUM = "SUM"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# NOTE: only works with python 3.12+
|
|
13
|
+
# Reduction = Literal[*tuple([r.value for r in ReductionType])]
|
|
14
|
+
Reduction = Literal["MEAN", "SUM"]
|
|
@@ -64,7 +64,9 @@ def assaydata_post(
|
|
|
64
64
|
raise APIError(f"Unable to post assay data: {response.text}")
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def assaydata_list(session: APISession) -> list[AssayMetadata]:
|
|
67
|
+
def assaydata_list(
|
|
68
|
+
session: APISession, limit: int | None = None, offset: int | None = None
|
|
69
|
+
) -> list[AssayMetadata]:
|
|
68
70
|
"""
|
|
69
71
|
Get a list of all assay metadata.
|
|
70
72
|
|
|
@@ -72,6 +74,10 @@ def assaydata_list(session: APISession) -> list[AssayMetadata]:
|
|
|
72
74
|
----------
|
|
73
75
|
session : APISession
|
|
74
76
|
Session object for API communication.
|
|
77
|
+
limit : int, optional
|
|
78
|
+
Limit the number of assays to return.
|
|
79
|
+
offset : int, optional
|
|
80
|
+
Offset of assays to retrieve. Useful with limit.
|
|
75
81
|
|
|
76
82
|
Returns
|
|
77
83
|
-------
|
|
@@ -84,7 +90,12 @@ def assaydata_list(session: APISession) -> list[AssayMetadata]:
|
|
|
84
90
|
If an error occurs during the API request.
|
|
85
91
|
"""
|
|
86
92
|
endpoint = "v1/assaydata"
|
|
87
|
-
|
|
93
|
+
params = {}
|
|
94
|
+
if limit is not None:
|
|
95
|
+
params["limit"] = limit
|
|
96
|
+
if offset is not None:
|
|
97
|
+
params["offset"] = offset
|
|
98
|
+
response = session.get(endpoint, params=params)
|
|
88
99
|
if response.status_code == 200:
|
|
89
100
|
return TypeAdapter(list[AssayMetadata]).validate_python(response.json())
|
|
90
101
|
else:
|
|
@@ -14,16 +14,23 @@ class DataAPI:
|
|
|
14
14
|
def __init__(self, session: APISession):
|
|
15
15
|
self.session = session
|
|
16
16
|
|
|
17
|
-
def list(self) -> list[AssayDataset]:
|
|
17
|
+
def list(
|
|
18
|
+
self, limit: int | None = None, offset: int | None = None
|
|
19
|
+
) -> list[AssayDataset]:
|
|
18
20
|
"""
|
|
19
21
|
List all assay datasets.
|
|
20
22
|
|
|
23
|
+
limit : int, optional
|
|
24
|
+
Limit the number of assays to return.
|
|
25
|
+
offset : int, optional
|
|
26
|
+
Offset of assays to retrieve. Useful with limit.
|
|
27
|
+
|
|
21
28
|
Returns
|
|
22
29
|
-------
|
|
23
30
|
List[AssayDataset]
|
|
24
31
|
List of all assay datasets.
|
|
25
32
|
"""
|
|
26
|
-
metadata = api.assaydata_list(self.session)
|
|
33
|
+
metadata = api.assaydata_list(session=self.session, limit=limit, offset=offset)
|
|
27
34
|
return [AssayDataset(self.session, x) for x in metadata]
|
|
28
35
|
|
|
29
36
|
def create(
|
|
@@ -3,7 +3,13 @@
|
|
|
3
3
|
from typing import TYPE_CHECKING
|
|
4
4
|
|
|
5
5
|
from openprotein.base import APISession
|
|
6
|
-
from openprotein.common import FeatureType, ModelMetadata, ReductionType
|
|
6
|
+
from openprotein.common import (
|
|
7
|
+
Feature,
|
|
8
|
+
FeatureType,
|
|
9
|
+
ModelMetadata,
|
|
10
|
+
Reduction,
|
|
11
|
+
ReductionType,
|
|
12
|
+
)
|
|
7
13
|
from openprotein.data import AssayDataset, AssayMetadata, DataAPI
|
|
8
14
|
from openprotein.errors import InvalidParameterError
|
|
9
15
|
|
|
@@ -199,9 +205,9 @@ class EmbeddingModel:
|
|
|
199
205
|
def fit_svd(
|
|
200
206
|
self,
|
|
201
207
|
sequences: list[bytes] | list[str] | None = None,
|
|
202
|
-
assay: AssayDataset | None = None,
|
|
208
|
+
assay: AssayDataset | AssayMetadata | None = None,
|
|
203
209
|
n_components: int = 1024,
|
|
204
|
-
reduction: ReductionType | None = None,
|
|
210
|
+
reduction: Reduction | ReductionType | None = None,
|
|
205
211
|
**kwargs,
|
|
206
212
|
) -> "SVDModel":
|
|
207
213
|
"""
|
|
@@ -236,6 +242,11 @@ class EmbeddingModel:
|
|
|
236
242
|
# local import for cyclic dep
|
|
237
243
|
from openprotein.svd import SVDAPI
|
|
238
244
|
|
|
245
|
+
# runtime check on value
|
|
246
|
+
if isinstance(reduction, str):
|
|
247
|
+
reduction = ReductionType(reduction)
|
|
248
|
+
reduction = reduction.value
|
|
249
|
+
|
|
239
250
|
svd_api = getattr(self.session, "svd", None)
|
|
240
251
|
assert isinstance(svd_api, SVDAPI)
|
|
241
252
|
|
|
@@ -246,9 +257,8 @@ class EmbeddingModel:
|
|
|
246
257
|
raise InvalidParameterError(
|
|
247
258
|
"Expected either assay or sequences to fit SVD on!"
|
|
248
259
|
)
|
|
249
|
-
model_id = self.id
|
|
250
260
|
return svd_api.fit_svd(
|
|
251
|
-
|
|
261
|
+
model=self,
|
|
252
262
|
sequences=sequences,
|
|
253
263
|
assay=assay,
|
|
254
264
|
n_components=n_components,
|
|
@@ -259,9 +269,9 @@ class EmbeddingModel:
|
|
|
259
269
|
def fit_umap(
|
|
260
270
|
self,
|
|
261
271
|
sequences: list[bytes] | list[str] | None = None,
|
|
262
|
-
assay: AssayDataset | None = None,
|
|
272
|
+
assay: AssayDataset | AssayMetadata | None = None,
|
|
263
273
|
n_components: int = 2,
|
|
264
|
-
reduction:
|
|
274
|
+
reduction: Reduction | ReductionType = "MEAN",
|
|
265
275
|
**kwargs,
|
|
266
276
|
) -> "UMAPModel":
|
|
267
277
|
"""
|
|
@@ -274,11 +284,11 @@ class EmbeddingModel:
|
|
|
274
284
|
----------
|
|
275
285
|
sequences : list of bytes or list of str or None, optional
|
|
276
286
|
Optional sequences to fit UMAP with. Either use sequences or assay. Sequences is preferred.
|
|
277
|
-
assay : AssayDataset or None, optional
|
|
287
|
+
assay : AssayDataset or AssayMetadata or None, optional
|
|
278
288
|
Optional assay containing sequences to fit UMAP with. Either use sequences or assay. Ignored if sequences are provided.
|
|
279
289
|
n_components : int, optional
|
|
280
290
|
Number of components in UMAP fit. Determines output shapes. Default is 2.
|
|
281
|
-
reduction : ReductionType or None, optional
|
|
291
|
+
reduction : Reduction or ReductionType or None, optional
|
|
282
292
|
Embeddings reduction to use (e.g. mean). Defaults to MEAN.
|
|
283
293
|
kwargs :
|
|
284
294
|
Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.
|
|
@@ -296,6 +306,16 @@ class EmbeddingModel:
|
|
|
296
306
|
# local import for cyclic dep
|
|
297
307
|
from openprotein.umap import UMAPAPI
|
|
298
308
|
|
|
309
|
+
if reduction is None:
|
|
310
|
+
raise InvalidParameterError(
|
|
311
|
+
"Expected reduction if using EmbeddingModel to fit UMAP"
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
# runtime check on value
|
|
315
|
+
if isinstance(reduction, str):
|
|
316
|
+
reduction = ReductionType(reduction)
|
|
317
|
+
reduction = reduction.value
|
|
318
|
+
|
|
299
319
|
umap_api = getattr(self.session, "umap", None)
|
|
300
320
|
assert isinstance(umap_api, UMAPAPI)
|
|
301
321
|
|
|
@@ -306,12 +326,18 @@ class EmbeddingModel:
|
|
|
306
326
|
raise InvalidParameterError(
|
|
307
327
|
"Expected either assay or sequences to fit UMAP on!"
|
|
308
328
|
)
|
|
329
|
+
# get assay_id
|
|
330
|
+
assay_id = (
|
|
331
|
+
assay.assay_id
|
|
332
|
+
if isinstance(assay, AssayMetadata)
|
|
333
|
+
else assay.id if isinstance(assay, AssayDataset) else assay
|
|
334
|
+
)
|
|
309
335
|
model_id = self.id
|
|
310
336
|
return umap_api.fit_umap(
|
|
311
337
|
model_id=model_id,
|
|
312
338
|
feature_type=FeatureType.PLM,
|
|
313
339
|
sequences=sequences,
|
|
314
|
-
assay_id=
|
|
340
|
+
assay_id=assay_id,
|
|
315
341
|
n_components=n_components,
|
|
316
342
|
reduction=reduction,
|
|
317
343
|
**kwargs,
|
|
@@ -319,7 +345,7 @@ class EmbeddingModel:
|
|
|
319
345
|
|
|
320
346
|
def fit_gp(
|
|
321
347
|
self,
|
|
322
|
-
assay:
|
|
348
|
+
assay: AssayDataset | AssayMetadata | str,
|
|
323
349
|
properties: list[str],
|
|
324
350
|
reduction: ReductionType,
|
|
325
351
|
name: str | None = None,
|
|
@@ -358,26 +384,9 @@ class EmbeddingModel:
|
|
|
358
384
|
# local import to resolve cyclic
|
|
359
385
|
from openprotein.predictor import PredictorAPI
|
|
360
386
|
|
|
361
|
-
data_api = getattr(self.session, "data", None)
|
|
362
|
-
assert isinstance(data_api, DataAPI)
|
|
363
387
|
predictor_api = getattr(self.session, "predictor", None)
|
|
364
388
|
assert isinstance(predictor_api, PredictorAPI)
|
|
365
389
|
|
|
366
|
-
# get assay if str
|
|
367
|
-
assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
|
|
368
|
-
# extract assay_id
|
|
369
|
-
if len(properties) == 0:
|
|
370
|
-
raise InvalidParameterError("Expected (at-least) 1 property to train")
|
|
371
|
-
if not set(properties) <= set(assay.measurement_names):
|
|
372
|
-
raise InvalidParameterError(
|
|
373
|
-
f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
|
|
374
|
-
)
|
|
375
|
-
# TODO - support multitask
|
|
376
|
-
if len(properties) > 1:
|
|
377
|
-
raise InvalidParameterError(
|
|
378
|
-
"Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
|
|
379
|
-
)
|
|
380
|
-
|
|
381
390
|
# inject into predictor api
|
|
382
391
|
return predictor_api.fit_gp(
|
|
383
392
|
assay=assay,
|
|
@@ -47,10 +47,8 @@ class AlphaFold2Model(FoldModel):
|
|
|
47
47
|
number of times to recycle models
|
|
48
48
|
num_models : int
|
|
49
49
|
number of models to train - best model will be used
|
|
50
|
-
|
|
51
|
-
maximum number of
|
|
52
|
-
relax_max_iterations : int
|
|
53
|
-
maximum number of iterations
|
|
50
|
+
num_relax : int
|
|
51
|
+
maximum number of iterations for relax
|
|
54
52
|
|
|
55
53
|
Returns
|
|
56
54
|
-------
|
|
@@ -61,6 +59,7 @@ class AlphaFold2Model(FoldModel):
|
|
|
61
59
|
"Inputs to AlphaFold 2 have been updated. 'msa' should be supplied as 'proteins' argument. Support will be dropped in the future."
|
|
62
60
|
)
|
|
63
61
|
proteins = kwargs["msa"]
|
|
62
|
+
assert isinstance(proteins, MSAFuture), "Expected msa to be an MSAFuture"
|
|
64
63
|
if "ligands" in kwargs or "dnas" in kwargs or "rnas" in kwargs:
|
|
65
64
|
with warnings.catch_warnings():
|
|
66
65
|
warnings.simplefilter("always") # Force warning to always show
|
|
@@ -73,6 +72,10 @@ class AlphaFold2Model(FoldModel):
|
|
|
73
72
|
msa_to_seed: dict[str, Counter] = dict()
|
|
74
73
|
for protein in proteins:
|
|
75
74
|
if (msa := protein.msa) is not None:
|
|
75
|
+
if isinstance(msa, Protein.NullMSA):
|
|
76
|
+
raise ValueError(
|
|
77
|
+
"AlphaFold 2 expects MSA and does not support single sequence mode"
|
|
78
|
+
)
|
|
76
79
|
msa_id = msa.id if isinstance(msa, MSAFuture) else msa
|
|
77
80
|
if msa_id in msa_to_seed:
|
|
78
81
|
seeds = msa_to_seed[msa_id]
|
|
@@ -9,11 +9,11 @@ from typing_extensions import Self
|
|
|
9
9
|
from openprotein import config
|
|
10
10
|
from openprotein.base import APISession
|
|
11
11
|
from openprotein.chains import DNA, RNA, Ligand
|
|
12
|
-
from openprotein.jobs import Future, MappedFuture
|
|
12
|
+
from openprotein.jobs import Future, JobsAPI, MappedFuture
|
|
13
13
|
from openprotein.protein import Protein
|
|
14
14
|
|
|
15
15
|
from . import api
|
|
16
|
-
from .schemas import FoldJob
|
|
16
|
+
from .schemas import FoldJob, FoldMetadata
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from .boltz import BoltzAffinity, BoltzConfidence
|
|
@@ -34,34 +34,39 @@ class FoldResultFuture(MappedFuture, Future):
|
|
|
34
34
|
def __init__(
|
|
35
35
|
self,
|
|
36
36
|
session: APISession,
|
|
37
|
-
job: FoldJob,
|
|
37
|
+
job: FoldJob | None = None,
|
|
38
|
+
metadata: FoldMetadata | None = None,
|
|
38
39
|
sequences: list[bytes] | None = None,
|
|
39
40
|
max_workers: int = config.MAX_CONCURRENT_WORKERS,
|
|
40
41
|
):
|
|
41
42
|
"""
|
|
42
43
|
Initialize a FoldResultFuture instance.
|
|
43
44
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
The API session to use for requests.
|
|
48
|
-
job : FoldJob
|
|
49
|
-
The fold job associated with this future.
|
|
50
|
-
sequences : list[bytes], optional
|
|
51
|
-
List of sequences submitted for the fold request. If None, sequences will be fetched.
|
|
52
|
-
max_workers : int, optional
|
|
53
|
-
Maximum number of concurrent workers. Default is config.MAX_CONCURRENT_WORKERS.
|
|
45
|
+
Takes in either a fold job, or the fold job metadata.
|
|
46
|
+
|
|
47
|
+
:meta private:
|
|
54
48
|
"""
|
|
55
|
-
|
|
49
|
+
# initialize the fold job metadata
|
|
50
|
+
if metadata is None:
|
|
51
|
+
if job is None or job.job_id is None:
|
|
52
|
+
raise ValueError("Expected fold metadata or job")
|
|
53
|
+
metadata = api.fold_get(session, job.job_id)
|
|
54
|
+
self._metadata = metadata
|
|
55
|
+
if job is None:
|
|
56
|
+
jobs_api = getattr(session, "jobs", None)
|
|
57
|
+
assert isinstance(jobs_api, JobsAPI)
|
|
58
|
+
job = FoldJob.create(jobs_api.get_job(job_id=metadata.job_id))
|
|
56
59
|
if sequences is None:
|
|
57
60
|
sequences = api.fold_get_sequences(self.session, job_id=job.job_id)
|
|
58
61
|
self._sequences = sequences
|
|
62
|
+
super().__init__(session, job, max_workers)
|
|
59
63
|
|
|
60
64
|
@classmethod
|
|
61
65
|
def create(
|
|
62
66
|
cls: type[Self],
|
|
63
67
|
session: APISession,
|
|
64
|
-
job: FoldJob,
|
|
68
|
+
job: FoldJob | None = None,
|
|
69
|
+
metadata: FoldMetadata | None = None,
|
|
65
70
|
**kwargs,
|
|
66
71
|
) -> "Self | FoldComplexResultFuture":
|
|
67
72
|
"""
|
|
@@ -81,7 +86,13 @@ class FoldResultFuture(MappedFuture, Future):
|
|
|
81
86
|
FoldResultFuture or FoldComplexResultFuture
|
|
82
87
|
An instance of FoldResultFuture or FoldComplexResultFuture depending on the model.
|
|
83
88
|
"""
|
|
84
|
-
|
|
89
|
+
if job is not None:
|
|
90
|
+
job_id = job.job_id
|
|
91
|
+
elif metadata is not None:
|
|
92
|
+
job_id = metadata.job_id
|
|
93
|
+
else:
|
|
94
|
+
raise ValueError("Expected fold metadata or job")
|
|
95
|
+
model_id = api.fold_get(session=session, job_id=job_id).model_id
|
|
85
96
|
if model_id.startswith("boltz") or model_id.startswith("alphafold"):
|
|
86
97
|
return FoldComplexResultFuture(session=session, job=job, **kwargs)
|
|
87
98
|
else:
|
|
@@ -101,22 +112,6 @@ class FoldResultFuture(MappedFuture, Future):
|
|
|
101
112
|
self._sequences = api.fold_get_sequences(self.session, self.job.job_id)
|
|
102
113
|
return self._sequences
|
|
103
114
|
|
|
104
|
-
@property
|
|
105
|
-
def model_id(self) -> str:
|
|
106
|
-
"""
|
|
107
|
-
Get the model ID used for the fold request.
|
|
108
|
-
|
|
109
|
-
Returns
|
|
110
|
-
-------
|
|
111
|
-
str
|
|
112
|
-
Model ID.
|
|
113
|
-
"""
|
|
114
|
-
if self._model_id is None:
|
|
115
|
-
self._model_id = api.fold_get(
|
|
116
|
-
session=self.session, job_id=self.job.job_id
|
|
117
|
-
).model_id
|
|
118
|
-
return self._model_id
|
|
119
|
-
|
|
120
115
|
@property
|
|
121
116
|
def id(self):
|
|
122
117
|
"""
|
|
@@ -129,6 +124,17 @@ class FoldResultFuture(MappedFuture, Future):
|
|
|
129
124
|
"""
|
|
130
125
|
return self.job.job_id
|
|
131
126
|
|
|
127
|
+
|
|
128
|
+
@property
|
|
129
|
+
def metadata(self) -> FoldMetadata:
|
|
130
|
+
"""The fold metadata."""
|
|
131
|
+
return self._metadata
|
|
132
|
+
|
|
133
|
+
@property
|
|
134
|
+
def model_id(self) -> str:
|
|
135
|
+
"""The fold model used."""
|
|
136
|
+
return self._metadata.model_id
|
|
137
|
+
|
|
132
138
|
def __keys__(self):
|
|
133
139
|
"""
|
|
134
140
|
Get the list of sequences submitted for the fold request.
|
|
@@ -189,7 +195,8 @@ class FoldComplexResultFuture(Future):
|
|
|
189
195
|
def __init__(
|
|
190
196
|
self,
|
|
191
197
|
session: APISession,
|
|
192
|
-
job: FoldJob,
|
|
198
|
+
job: FoldJob | None = None,
|
|
199
|
+
metadata: FoldMetadata | None = None,
|
|
193
200
|
model_id: str | None = None,
|
|
194
201
|
proteins: list[Protein] | None = None,
|
|
195
202
|
ligands: list[Ligand] | None = None,
|
|
@@ -216,6 +223,16 @@ class FoldComplexResultFuture(Future):
|
|
|
216
223
|
rnas : list[RNA], optional
|
|
217
224
|
List of RNAs submitted for fold request.
|
|
218
225
|
"""
|
|
226
|
+
# initialize the fold job metadata
|
|
227
|
+
if metadata is None:
|
|
228
|
+
if job is None or job.job_id is None:
|
|
229
|
+
raise ValueError("Expected fold metadata or job")
|
|
230
|
+
metadata = api.fold_get(session, job.job_id)
|
|
231
|
+
self._metadata = metadata
|
|
232
|
+
if job is None:
|
|
233
|
+
jobs_api = getattr(session, "jobs", None)
|
|
234
|
+
assert isinstance(jobs_api, JobsAPI)
|
|
235
|
+
job = FoldJob.create(jobs_api.get_job(job_id=metadata.job_id))
|
|
219
236
|
super().__init__(session, job)
|
|
220
237
|
self._model_id = model_id
|
|
221
238
|
self._proteins = proteins
|
|
@@ -229,6 +246,11 @@ class FoldComplexResultFuture(Future):
|
|
|
229
246
|
self._confidence: list["BoltzConfidence"] | None = None
|
|
230
247
|
self._affinity: "BoltzAffinity | None" = None
|
|
231
248
|
|
|
249
|
+
@property
|
|
250
|
+
def metadata(self) -> FoldMetadata:
|
|
251
|
+
"""The fold metadata."""
|
|
252
|
+
return self._metadata
|
|
253
|
+
|
|
232
254
|
@property
|
|
233
255
|
def model_id(self) -> str:
|
|
234
256
|
"""
|
|
@@ -433,6 +455,8 @@ class FoldComplexResultFuture(Future):
|
|
|
433
455
|
AttributeError
|
|
434
456
|
If confidence is not supported for the model.
|
|
435
457
|
"""
|
|
458
|
+
from .boltz import BoltzConfidence
|
|
459
|
+
|
|
436
460
|
if self.model_id not in {"boltz-1", "boltz-1x", "boltz-2"}:
|
|
437
461
|
raise AttributeError("confidence not supported for non-Boltz model")
|
|
438
462
|
if self._confidence is None:
|
|
@@ -464,6 +488,8 @@ class FoldComplexResultFuture(Future):
|
|
|
464
488
|
AttributeError
|
|
465
489
|
If affinity is not supported for the model.
|
|
466
490
|
"""
|
|
491
|
+
from .boltz import BoltzAffinity
|
|
492
|
+
|
|
467
493
|
if self.model_id not in {"boltz-1", "boltz-1x", "boltz-2"}:
|
|
468
494
|
raise AttributeError("affinity not supported for non-Boltz model")
|
|
469
495
|
if self._affinity is None:
|
|
@@ -162,8 +162,8 @@ def predictor_fit_gp_post(
|
|
|
162
162
|
body["name"] = name
|
|
163
163
|
if description is not None:
|
|
164
164
|
body["description"] = description
|
|
165
|
-
# add kwargs for embeddings kwargs
|
|
166
|
-
body.update(kwargs)
|
|
165
|
+
# add kwargs for embeddings kwargs to features
|
|
166
|
+
body["features"].update(kwargs)
|
|
167
167
|
|
|
168
168
|
response = session.post(endpoint, json=body)
|
|
169
169
|
return PredictorTrainJob.model_validate(response.json())
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
"""Predictor API providing the interface to train and predict predictors."""
|
|
2
2
|
|
|
3
3
|
from openprotein.base import APISession
|
|
4
|
-
from openprotein.common import FeatureType, ReductionType
|
|
4
|
+
from openprotein.common import Feature, FeatureType, Reduction, ReductionType
|
|
5
5
|
from openprotein.data import (
|
|
6
6
|
AssayDataset,
|
|
7
7
|
AssayMetadata,
|
|
8
|
+
DataAPI,
|
|
8
9
|
)
|
|
9
10
|
from openprotein.embeddings import EmbeddingModel, EmbeddingsAPI
|
|
10
11
|
from openprotein.errors import InvalidParameterError
|
|
@@ -120,8 +121,8 @@ class PredictorAPI:
|
|
|
120
121
|
assay: AssayDataset | AssayMetadata | str,
|
|
121
122
|
properties: list[str],
|
|
122
123
|
model: EmbeddingModel | SVDModel | str,
|
|
123
|
-
feature_type: FeatureType | None = None,
|
|
124
|
-
reduction: ReductionType | None = None,
|
|
124
|
+
feature_type: Feature | FeatureType | None = None,
|
|
125
|
+
reduction: Reduction | ReductionType | None = None,
|
|
125
126
|
name: str | None = None,
|
|
126
127
|
description: str | None = None,
|
|
127
128
|
**kwargs,
|
|
@@ -139,10 +140,10 @@ class PredictorAPI:
|
|
|
139
140
|
Instance of either EmbeddingModel or SVDModel to use depending
|
|
140
141
|
on feature type. Can also be a str specifying the model id,
|
|
141
142
|
but then feature_type would have to be specified.
|
|
142
|
-
feature_type : FeatureType or None
|
|
143
|
+
feature_type : Feature or FeatureType or None
|
|
143
144
|
Type of features to use for encoding sequences. "SVD" or "PLM".
|
|
144
145
|
None would require model to be EmbeddingModel or SVDModel.
|
|
145
|
-
reduction :
|
|
146
|
+
reduction : Reduction or ReductionType or None, optional
|
|
146
147
|
Type of embedding reduction to use for computing features.
|
|
147
148
|
E.g. "MEAN" or "SUM". Used only if using EmbeddingModel, and
|
|
148
149
|
must be non-nil if using an EmbeddingModel. Defaults to None.
|
|
@@ -154,6 +155,29 @@ class PredictorAPI:
|
|
|
154
155
|
PredictorModel
|
|
155
156
|
The GP model being fit.
|
|
156
157
|
"""
|
|
158
|
+
data_api = getattr(self.session, "data", None)
|
|
159
|
+
assert isinstance(data_api, DataAPI)
|
|
160
|
+
# 1. Check assay data input
|
|
161
|
+
# get assay if str
|
|
162
|
+
assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
|
|
163
|
+
# extract assay_id
|
|
164
|
+
assay_id = (
|
|
165
|
+
assay.assay_id
|
|
166
|
+
if isinstance(assay, AssayMetadata)
|
|
167
|
+
else assay.id if isinstance(assay, AssayDataset) else assay
|
|
168
|
+
)
|
|
169
|
+
if len(properties) == 0:
|
|
170
|
+
raise InvalidParameterError("Expected (at-least) 1 property to train")
|
|
171
|
+
if not set(properties) <= set(assay.measurement_names):
|
|
172
|
+
raise InvalidParameterError(
|
|
173
|
+
f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
|
|
174
|
+
)
|
|
175
|
+
# TODO - support multitask
|
|
176
|
+
if len(properties) > 1:
|
|
177
|
+
raise InvalidParameterError(
|
|
178
|
+
"Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
|
|
179
|
+
)
|
|
180
|
+
# 2. Check features input
|
|
157
181
|
# extract feature type
|
|
158
182
|
feature_type = (
|
|
159
183
|
FeatureType.PLM
|
|
@@ -164,6 +188,15 @@ class PredictorAPI:
|
|
|
164
188
|
raise InvalidParameterError(
|
|
165
189
|
"Expected feature_type to be provided if passing str model_id as model"
|
|
166
190
|
)
|
|
191
|
+
# runtime check on value
|
|
192
|
+
if isinstance(feature_type, str):
|
|
193
|
+
feature_type = FeatureType(feature_type)
|
|
194
|
+
|
|
195
|
+
# 3. Check reduction
|
|
196
|
+
if isinstance(reduction, str):
|
|
197
|
+
reduction = ReductionType(reduction)
|
|
198
|
+
reduction = reduction.value
|
|
199
|
+
|
|
167
200
|
# get model if model_id
|
|
168
201
|
if feature_type == FeatureType.PLM:
|
|
169
202
|
if reduction is None:
|
|
@@ -183,19 +216,14 @@ class PredictorAPI:
|
|
|
183
216
|
model = svd_api.get_svd(model)
|
|
184
217
|
assert isinstance(model, SVDModel), "Expected SVDModel"
|
|
185
218
|
model_id = model.id
|
|
186
|
-
|
|
187
|
-
assay_id = (
|
|
188
|
-
assay.assay_id
|
|
189
|
-
if isinstance(assay, AssayMetadata)
|
|
190
|
-
else assay.id if isinstance(assay, AssayDataset) else assay
|
|
191
|
-
)
|
|
219
|
+
|
|
192
220
|
return PredictorModel(
|
|
193
221
|
session=self.session,
|
|
194
222
|
job=api.predictor_fit_gp_post(
|
|
195
223
|
session=self.session,
|
|
196
224
|
assay_id=assay_id,
|
|
197
225
|
properties=properties,
|
|
198
|
-
feature_type=feature_type,
|
|
226
|
+
feature_type=feature_type.value,
|
|
199
227
|
model_id=model_id,
|
|
200
228
|
reduction=reduction,
|
|
201
229
|
name=name,
|