openprotein-python 0.8.2__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openprotein/__init__.py +164 -0
- openprotein/_version.py +48 -0
- openprotein/align/__init__.py +8 -0
- openprotein/align/align.py +395 -0
- openprotein/align/api.py +428 -0
- openprotein/align/future.py +55 -0
- openprotein/align/msa.py +129 -0
- openprotein/align/schemas.py +165 -0
- openprotein/base.py +181 -0
- openprotein/chains.py +88 -0
- openprotein/common/__init__.py +5 -0
- openprotein/common/features.py +7 -0
- openprotein/common/model_metadata.py +33 -0
- openprotein/common/reduction.py +8 -0
- openprotein/config.py +9 -0
- openprotein/csv.py +31 -0
- openprotein/data/__init__.py +9 -0
- openprotein/data/api.py +218 -0
- openprotein/data/assaydataset.py +178 -0
- openprotein/data/data.py +93 -0
- openprotein/data/schemas.py +27 -0
- openprotein/design/__init__.py +16 -0
- openprotein/design/api.py +259 -0
- openprotein/design/design.py +125 -0
- openprotein/design/future.py +146 -0
- openprotein/design/schemas.py +607 -0
- openprotein/embeddings/__init__.py +27 -0
- openprotein/embeddings/api.py +619 -0
- openprotein/embeddings/embeddings.py +151 -0
- openprotein/embeddings/esm.py +33 -0
- openprotein/embeddings/future.py +146 -0
- openprotein/embeddings/models.py +421 -0
- openprotein/embeddings/openprotein.py +21 -0
- openprotein/embeddings/poet.py +446 -0
- openprotein/embeddings/poet2.py +505 -0
- openprotein/embeddings/schemas.py +78 -0
- openprotein/errors.py +76 -0
- openprotein/fasta.py +92 -0
- openprotein/fold/__init__.py +21 -0
- openprotein/fold/alphafold2.py +131 -0
- openprotein/fold/api.py +287 -0
- openprotein/fold/boltz.py +691 -0
- openprotein/fold/esmfold.py +54 -0
- openprotein/fold/fold.py +107 -0
- openprotein/fold/future.py +509 -0
- openprotein/fold/models.py +139 -0
- openprotein/fold/schemas.py +39 -0
- openprotein/jobs/__init__.py +9 -0
- openprotein/jobs/api.py +71 -0
- openprotein/jobs/futures.py +746 -0
- openprotein/jobs/jobs.py +69 -0
- openprotein/jobs/schemas.py +135 -0
- openprotein/models/__init__.py +4 -0
- openprotein/models/base.py +63 -0
- openprotein/models/foundation/rfdiffusion.py +283 -0
- openprotein/models/models.py +33 -0
- openprotein/predictor/__init__.py +25 -0
- openprotein/predictor/api.py +384 -0
- openprotein/predictor/models.py +374 -0
- openprotein/predictor/prediction.py +79 -0
- openprotein/predictor/predictor.py +242 -0
- openprotein/predictor/schemas.py +113 -0
- openprotein/predictor/validate.py +40 -0
- openprotein/prompt/__init__.py +9 -0
- openprotein/prompt/api.py +505 -0
- openprotein/prompt/models.py +142 -0
- openprotein/prompt/prompt.py +130 -0
- openprotein/prompt/schemas.py +49 -0
- openprotein/protein.py +587 -0
- openprotein/svd/__init__.py +9 -0
- openprotein/svd/api.py +206 -0
- openprotein/svd/models.py +288 -0
- openprotein/svd/schemas.py +31 -0
- openprotein/svd/svd.py +134 -0
- openprotein/umap/__init__.py +9 -0
- openprotein/umap/api.py +259 -0
- openprotein/umap/models.py +211 -0
- openprotein/umap/schemas.py +35 -0
- openprotein/umap/umap.py +175 -0
- openprotein/utils/uuid.py +29 -0
- openprotein_python-0.8.2.dist-info/METADATA +176 -0
- openprotein_python-0.8.2.dist-info/RECORD +84 -0
- openprotein_python-0.8.2.dist-info/WHEEL +4 -0
- openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Embeddings API providing the interface for creating embeddings using protein language models."""
|
|
2
|
+
|
|
3
|
+
from openprotein.base import APISession
|
|
4
|
+
|
|
5
|
+
from . import api
|
|
6
|
+
from .esm import ESMModel
|
|
7
|
+
from .future import EmbeddingsResultFuture
|
|
8
|
+
from .models import EmbeddingModel
|
|
9
|
+
from .openprotein import OpenProteinModel
|
|
10
|
+
from .poet import PoETModel
|
|
11
|
+
from .poet2 import PoET2Model
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EmbeddingsAPI:
    """
    Embeddings API providing the interface for creating embeddings using protein language models.

    Each model returned by :meth:`list_models` is attached to this object as an
    attribute named after the model's ID with hyphens replaced by underscores.
    A model can therefore be reached either through :meth:`get_model` or
    directly as an attribute, e.g. ``session.embedding.prot_seq.attn()``.

    Examples
    --------
    Accessing a model's method:

    .. code-block:: python

        # To call the attention method on the protein sequence model:
        import openprotein
        session = openprotein.connect(username="user", password="password")
        session.embedding.prot_seq.attn()

    Using the `get_model` method:

    .. code-block:: python

        # Get a model instance by name:
        import openprotein
        session = openprotein.connect(username="user", password="password")
        # list available models:
        print(session.embedding.list_models())
        # init model by name
        model = session.embedding.get_model('prot-seq')
    """

    # The annotations below exist only for static typing / autocomplete
    # (e.g. pylance). They create no attributes by themselves; the real
    # attributes are assigned per instance by ``_load_models``.

    #: PoET-2 model
    poet2: PoET2Model
    #: PoET model
    poet: PoETModel
    #: Prot-seq model
    prot_seq: OpenProteinModel
    #: Rotaprot model trained on UniRef50
    rotaprot_large_uniref50w: OpenProteinModel
    #: Rotaprot model trained on UniRef90
    rotaprot_large_uniref90_ft: OpenProteinModel
    #: PoET-2 model under its underscore name (``poet2`` is aliased to it)
    poet_2: PoET2Model

    #: ESM1b model
    esm1b: ESMModel  # alias
    esm1b_t33_650M_UR50S: ESMModel

    #: ESM1v model
    esm1v: ESMModel  # alias
    esm1v_t33_650M_UR90S_1: ESMModel
    esm1v_t33_650M_UR90S_2: ESMModel
    esm1v_t33_650M_UR90S_3: ESMModel
    esm1v_t33_650M_UR90S_4: ESMModel
    esm1v_t33_650M_UR90S_5: ESMModel

    #: ESM2 model
    esm2: ESMModel  # alias
    esm2_t12_35M_UR50D: ESMModel
    esm2_t30_150M_UR50D: ESMModel
    esm2_t33_650M_UR50D: ESMModel
    esm2_t36_3B_UR50D: ESMModel
    esm2_t6_8M_UR50D: ESMModel

    def __init__(self, session: APISession):
        """Keep the session and attach every listed model as an attribute."""
        self.session = session
        # Attributes come from the API listing rather than a hard-coded table.
        self._load_models()

    def _load_models(self):
        """Attach each listed model, then wire up the short alias names."""
        for listed in self.list_models():
            # Hyphens are not valid in attribute names, so swap them out.
            setattr(self, listed.id.replace("-", "_"), listed)

        # (alias, target attribute). An alias is only set when its target is
        # present and truthy, so a missing model never raises here.
        alias_table = (
            ("esm1b", "esm1b_t33_650M_UR50S"),
            ("esm1v", "esm1v_t33_650M_UR90S_1"),
            ("esm2", "esm2_t33_650M_UR50D"),
            ("poet2", "poet_2"),
        )
        for alias, target in alias_table:
            if getattr(self, target, None):
                setattr(self, alias, getattr(self, target))

    def list_models(self) -> list[EmbeddingModel]:
        """list models available for creating embeddings of your sequences"""
        return [
            EmbeddingModel.create(
                session=self.session, model_id=listed_id, default=EmbeddingModel
            )
            for listed_id in api.list_models(self.session)
        ]

    def get_model(self, name: str) -> EmbeddingModel:
        """
        Get a model by its model ID.

        Hyphens in ``name`` are replaced with underscores and the result is
        looked up as an attribute of this object.

        Parameters
        ----------
        name : str
            The model identifier, e.g. ``'prot-seq'``.

        Returns
        -------
        EmbeddingModel
            The attribute stored under the normalised name.

        Raises
        ------
        AttributeError
            If this object has no attribute with the normalised name.
        """
        return getattr(self, name.replace("-", "_"))

    def __get_results(self, job) -> EmbeddingsResultFuture:
        """
        Retrieves the results of an embedding job.

        Parameters
        ----------
        job : Job
            The embedding job whose results are to be retrieved.

        Returns
        -------
        EmbeddingsResultFuture
            A future built from ``job`` and this object's session.
        """
        future = EmbeddingsResultFuture(job=job, session=self.session)
        return future
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Community-based ESM models."""
|
|
2
|
+
|
|
3
|
+
from .models import AttnModel, EmbeddingModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ESMModel(AttnModel, EmbeddingModel):
    """
    Class providing inference endpoints for Facebook's ESM protein language models.

    ``model_id`` below lists the ESM model identifiers (ESM1b, ESM1v and ESM2
    variants) associated with this class.

    Examples
    --------
    View specific model details (inc supported tokens) with the `?` operator.

    .. code-block:: python

        >>> import openprotein
        >>> session = openprotein.connect(username="user", password="password")
        >>> session.embedding.esm2_t12_35M_UR50D?
    """

    # Identifiers are kept verbatim and in their original order.
    model_id = [
        # ESM1b
        "esm1b_t33_650M_UR50S",
        # ESM1v
        "esm1v_t33_650M_UR90S_1",
        "esm1v_t33_650M_UR90S_2",
        "esm1v_t33_650M_UR90S_3",
        "esm1v_t33_650M_UR90S_4",
        "esm1v_t33_650M_UR90S_5",
        # ESM2
        "esm2_t12_35M_UR50D",
        "esm2_t30_150M_UR50D",
        "esm2_t33_650M_UR50D",
        "esm2_t36_3B_UR50D",
        "esm2_t6_8M_UR50D",
    ]
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Future for embeddings-related jobs."""
|
|
2
|
+
|
|
3
|
+
from collections import namedtuple
|
|
4
|
+
from typing import Generator
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from openprotein import config
|
|
9
|
+
|
|
10
|
+
"""Embeddings results represented as futures."""
|
|
11
|
+
|
|
12
|
+
from openprotein.base import APISession
|
|
13
|
+
from openprotein.jobs import Future, MappedFuture, StreamingFuture
|
|
14
|
+
|
|
15
|
+
from . import api
|
|
16
|
+
from .schemas import (
|
|
17
|
+
AttnJob,
|
|
18
|
+
EmbeddingsJob,
|
|
19
|
+
GenerateJob,
|
|
20
|
+
JobType,
|
|
21
|
+
LogitsJob,
|
|
22
|
+
ScoreIndelJob,
|
|
23
|
+
ScoreJob,
|
|
24
|
+
ScoreSingleSiteJob,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class EmbeddingsResultFuture(MappedFuture, Future):
    """Future for manipulating results for embeddings-related requests.

    Results are keyed by the submitted sequences: ``__keys__`` returns the
    sequences and ``get_item`` fetches the result for one of them.
    """

    job: EmbeddingsJob | AttnJob | LogitsJob

    def __init__(
        self,
        session: APISession,
        job: EmbeddingsJob | AttnJob | LogitsJob,
        sequences: list[bytes] | list[str] | None = None,
        max_workers: int = config.MAX_CONCURRENT_WORKERS,
    ):
        """
        Create the future.

        Parameters
        ----------
        session : APISession
            Session used for all API requests.
        job : EmbeddingsJob | AttnJob | LogitsJob
            The job whose results this future exposes.
        sequences : list[bytes] | list[str] | None
            Sequences submitted with the job. ``str`` entries are encoded to
            ``bytes``. When ``None`` they are fetched lazily on first access
            of :attr:`sequences`.
        max_workers : int
            Forwarded to the parent initializer.
        """
        super().__init__(session=session, job=job, max_workers=max_workers)
        if sequences is None:
            normalized = None
        else:
            # Store everything as bytes, whatever mix was passed in.
            normalized = [
                item.encode() if isinstance(item, str) else item
                for item in sequences
            ]
        self._sequences = normalized

    def stream(self):
        """Return the embeddings stream for this job as provided by the API module."""
        job_id = self.id
        return api.request_get_embeddings_stream(session=self.session, job_id=job_id)

    def get(self, verbose=False) -> list[np.ndarray]:
        """Delegate to the parent ``get``, forwarding ``verbose``."""
        results = super().get(verbose=verbose)
        return results

    @property
    def sequences(self) -> list[bytes] | list[str]:
        """Sequences for this job, requested from the API once and then cached."""
        cached = self._sequences
        if cached is None:
            job = self.job
            cached = api.get_request_sequences(
                session=self.session, job_id=job.job_id, job_type=job.job_type
            )
            self._sequences = cached
        return cached

    @property
    def id(self):
        """The job ID of the underlying job."""
        return self.job.job_id

    def __keys__(self):
        """
        Get the list of sequences submitted for the embed request.

        Returns
        -------
        list of bytes
            List of sequences.
        """
        return self.sequences

    def get_item(self, sequence: bytes) -> np.ndarray:
        """
        Get embedding results for specified sequence.

        Parameters
        ----------
        sequence : bytes
            Sequence to fetch results for.

        Returns
        -------
        np.ndarray
            The decoded result for ``sequence``.
        """
        job = self.job
        raw = api.request_get_sequence_result(
            session=self.session,
            job_id=job.job_id,
            sequence=sequence,
            job_type=job.job_type,
        )
        return api.result_decode(raw)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class EmbeddingsScoreFuture(StreamingFuture, Future):
    """Future for manipulating results for embeddings score-related requests.

    Results are streamed row by row. Each row is yielded as a ``Score``
    namedtuple whose leading fields are the non-score columns of the header
    and whose last field, ``score``, is a numpy array of the score columns.
    """

    job: ScoreJob | ScoreIndelJob | ScoreSingleSiteJob

    def __init__(
        self,
        session: APISession,
        # Includes ScoreIndelJob to agree with the class-level ``job``
        # annotation; GenerateJob is accepted for the generate subclass.
        job: ScoreJob | ScoreIndelJob | ScoreSingleSiteJob | GenerateJob,
        sequences: list[bytes] | list[str] | None = None,
    ):
        """
        Create the future.

        Parameters
        ----------
        session : APISession
            Session used for all API requests.
        job : ScoreJob | ScoreIndelJob | ScoreSingleSiteJob | GenerateJob
            The job whose results this future exposes.
        sequences : list[bytes] | list[str] | None
            Sequences submitted with the job. Stored as given; when ``None``
            they are fetched lazily on first access of :attr:`sequences`.
        """
        super().__init__(session=session, job=job)
        self._sequences = sequences

    @property
    def sequences(self) -> list[bytes] | list[str]:
        """Sequences for this job, requested from the API once and then cached."""
        if self._sequences is None:
            # NOTE(review): no job_type is passed here, unlike
            # EmbeddingsResultFuture.sequences - confirm the API default
            # is correct for score jobs.
            self._sequences = api.get_request_sequences(self.session, self.job.job_id)
        return self._sequences

    def stream(self) -> Generator:
        """
        Stream the job results as ``Score`` namedtuples.

        The first row of the underlying stream is treated as the header. The
        first column whose name starts with ``"score"`` marks where the score
        columns begin; that column and all after it are converted to floats
        and packed into one numpy array.

        Yields
        ------
        Score
            Namedtuple of the leading non-score columns plus ``score``.
            Nothing is yielded when the underlying stream is empty.
        """
        if self.job_type == JobType.poet_generate:
            stream = api.request_get_generate_result(
                session=self.session, job_id=self.id
            )
        else:
            stream = api.request_get_score_result(session=self.session, job_id=self.id)

        # Header columns: mut_code, ... for ssp; name, sequence, ... for score.
        # A bare ``next(stream)`` would raise StopIteration inside this
        # generator on an empty stream, which PEP 479 turns into RuntimeError.
        header = next(stream, None)
        if header is None:
            return

        # Index of the first score column, or 0 when none is found.
        score_start_index = next(
            (i for i, col_name in enumerate(header) if col_name.startswith("score")),
            0,
        )
        Score = namedtuple("Score", [*header[:score_start_index], "score"])
        for line in stream:
            # combine scores into numpy array
            scores = np.array([float(s) for s in line[score_start_index:]])
            yield Score(*line[:score_start_index], scores)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class EmbeddingsGenerateFuture(EmbeddingsScoreFuture, StreamingFuture, Future):
    """Future for manipulating results for embeddings generate-related requests.

    Overrides :attr:`sequences` so that accessing it always raises.
    """

    job: GenerateJob

    @property
    def sequences(self):
        """Always raises: sequences cannot be listed for a generate job.

        Raises
        ------
        Exception
            On every access.
        """
        message = "generate job does not support listing sequences"
        raise Exception(message)
|