openprotein-python 0.8.2__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openprotein/__init__.py +164 -0
- openprotein/_version.py +48 -0
- openprotein/align/__init__.py +8 -0
- openprotein/align/align.py +395 -0
- openprotein/align/api.py +428 -0
- openprotein/align/future.py +55 -0
- openprotein/align/msa.py +129 -0
- openprotein/align/schemas.py +165 -0
- openprotein/base.py +181 -0
- openprotein/chains.py +88 -0
- openprotein/common/__init__.py +5 -0
- openprotein/common/features.py +7 -0
- openprotein/common/model_metadata.py +33 -0
- openprotein/common/reduction.py +8 -0
- openprotein/config.py +9 -0
- openprotein/csv.py +31 -0
- openprotein/data/__init__.py +9 -0
- openprotein/data/api.py +218 -0
- openprotein/data/assaydataset.py +178 -0
- openprotein/data/data.py +93 -0
- openprotein/data/schemas.py +27 -0
- openprotein/design/__init__.py +16 -0
- openprotein/design/api.py +259 -0
- openprotein/design/design.py +125 -0
- openprotein/design/future.py +146 -0
- openprotein/design/schemas.py +607 -0
- openprotein/embeddings/__init__.py +27 -0
- openprotein/embeddings/api.py +619 -0
- openprotein/embeddings/embeddings.py +151 -0
- openprotein/embeddings/esm.py +33 -0
- openprotein/embeddings/future.py +146 -0
- openprotein/embeddings/models.py +421 -0
- openprotein/embeddings/openprotein.py +21 -0
- openprotein/embeddings/poet.py +446 -0
- openprotein/embeddings/poet2.py +505 -0
- openprotein/embeddings/schemas.py +78 -0
- openprotein/errors.py +76 -0
- openprotein/fasta.py +92 -0
- openprotein/fold/__init__.py +21 -0
- openprotein/fold/alphafold2.py +131 -0
- openprotein/fold/api.py +287 -0
- openprotein/fold/boltz.py +691 -0
- openprotein/fold/esmfold.py +54 -0
- openprotein/fold/fold.py +107 -0
- openprotein/fold/future.py +509 -0
- openprotein/fold/models.py +139 -0
- openprotein/fold/schemas.py +39 -0
- openprotein/jobs/__init__.py +9 -0
- openprotein/jobs/api.py +71 -0
- openprotein/jobs/futures.py +746 -0
- openprotein/jobs/jobs.py +69 -0
- openprotein/jobs/schemas.py +135 -0
- openprotein/models/__init__.py +4 -0
- openprotein/models/base.py +63 -0
- openprotein/models/foundation/rfdiffusion.py +283 -0
- openprotein/models/models.py +33 -0
- openprotein/predictor/__init__.py +25 -0
- openprotein/predictor/api.py +384 -0
- openprotein/predictor/models.py +374 -0
- openprotein/predictor/prediction.py +79 -0
- openprotein/predictor/predictor.py +242 -0
- openprotein/predictor/schemas.py +113 -0
- openprotein/predictor/validate.py +40 -0
- openprotein/prompt/__init__.py +9 -0
- openprotein/prompt/api.py +505 -0
- openprotein/prompt/models.py +142 -0
- openprotein/prompt/prompt.py +130 -0
- openprotein/prompt/schemas.py +49 -0
- openprotein/protein.py +587 -0
- openprotein/svd/__init__.py +9 -0
- openprotein/svd/api.py +206 -0
- openprotein/svd/models.py +288 -0
- openprotein/svd/schemas.py +31 -0
- openprotein/svd/svd.py +134 -0
- openprotein/umap/__init__.py +9 -0
- openprotein/umap/api.py +259 -0
- openprotein/umap/models.py +211 -0
- openprotein/umap/schemas.py +35 -0
- openprotein/umap/umap.py +175 -0
- openprotein/utils/uuid.py +29 -0
- openprotein_python-0.8.2.dist-info/METADATA +176 -0
- openprotein_python-0.8.2.dist-info/RECORD +84 -0
- openprotein_python-0.8.2.dist-info/WHEEL +4 -0
- openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
openprotein/umap/api.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""UMAP REST API for making HTTP calls to our UMAP backend."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from pydantic import TypeAdapter
|
|
8
|
+
|
|
9
|
+
from openprotein.base import APISession
|
|
10
|
+
from openprotein.errors import APIError, InvalidParameterError
|
|
11
|
+
|
|
12
|
+
from .schemas import FeatureType, UMAPEmbeddingsJob, UMAPFitJob, UMAPMetadata
|
|
13
|
+
|
|
14
|
+
PATH_PREFIX = "v1/umap"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def umap_list_get(session: APISession) -> list[UMAPMetadata]:
    """
    List the metadata of all UMAPs returned by the UMAP endpoint.

    Parameters
    ----------
    session : APISession
        Session object for API communication.

    Returns
    -------
    list[UMAPMetadata]
        JSON payload of the ``GET`` on ``PATH_PREFIX`` validated as a list
        of metadata records.
    """
    response = session.get(PATH_PREFIX)
    adapter = TypeAdapter(list[UMAPMetadata])
    return adapter.validate_python(response.json())
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def umap_get(session: APISession, umap_id: str) -> UMAPMetadata:
    """
    Fetch the metadata of a single UMAP.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    umap_id : str
        ID of the UMAP whose metadata to fetch.

    Returns
    -------
    UMAPMetadata
        JSON payload of the response validated as UMAP metadata.
    """
    response = session.get(f"{PATH_PREFIX}/{umap_id}")
    return UMAPMetadata.model_validate(response.json())
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def umap_get_sequences(session: APISession, umap_id: str) -> list[bytes]:
    """
    Fetch the sequences that were used to fit a UMAP.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    umap_id : str
        ID of the UMAP whose sequences to fetch.

    Returns
    -------
    list[bytes]
        JSON payload of the response validated as a list of bytes.
    """
    response = session.get(f"{PATH_PREFIX}/{umap_id}/sequences")
    adapter = TypeAdapter(list[bytes])
    return adapter.validate_python(response.json())
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def embed_get_sequence_result(
    session: APISession, job_id: str, sequence: str | bytes
) -> bytes:
    """
    Fetch the encoded UMAP embedding of one sequence from an embed job.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    job_id : str
        ID of the job to retrieve the result from.
    sequence : str or bytes
        Sequence to retrieve the result for. Bytes are decoded to text
        before being placed in the request path.

    Returns
    -------
    bytes
        Raw content of the response; see `embed_decode` for decoding.
    """
    seq_text = sequence.decode() if isinstance(sequence, bytes) else sequence
    url = f"{PATH_PREFIX}/embed/{job_id}/{seq_text}"
    return session.get(url).content
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def embed_get_batch_result(session: APISession, job_id: str) -> bytes:
    """
    Fetch the encoded UMAP embeddings of a whole job as one CSV payload.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    job_id : str
        ID of the job to retrieve results from.

    Returns
    -------
    bytes
        Raw content of the response; see `embed_batch_decode` for decoding.
    """
    url = f"{PATH_PREFIX}/embed/{job_id}/csv"
    return session.get(url).content
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def embed_decode(data: bytes) -> np.ndarray:
    """
    Decode a single embedding into a numpy array.

    Parameters
    ----------
    data : bytes
        Raw bytes encoding the array as received over the API, in a format
        readable by `numpy.load`.

    Returns
    -------
    np.ndarray
        The decoded array.
    """
    # Pickled payloads are refused, so only plain array data is accepted.
    return np.load(io.BytesIO(data), allow_pickle=False)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def embed_batch_decode(data: bytes) -> np.ndarray:
    """
    Decode a batched CSV result into an array of UMAP embeddings.

    Parameters
    ----------
    data : bytes
        Raw CSV bytes received over the API. The first row is parsed as a
        header and the first column is discarded (presumably it holds the
        sequence - confirm against the backend format).

    Returns
    -------
    np.ndarray
        Values of every column after the first, one row per CSV data row.
    """
    frame = pd.read_csv(io.BytesIO(data))
    # Drop the leading column and keep only the embedding values.
    return frame.iloc[:, 1:].values
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def umap_delete(session: APISession, umap_id: str) -> bool:
    """
    Delete a UMAP model.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    umap_id : str
        ID of the UMAP model to delete.

    Returns
    -------
    bool
        True when the response status code is in the 2xx range.

    Raises
    ------
    APIError
        If the response status code is outside the 2xx range; the error
        carries the response text.
    """
    response = session.delete(f"{PATH_PREFIX}/{umap_id}")
    if not 200 <= response.status_code < 300:
        raise APIError(response.text)
    return True
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def umap_fit_post(
    session: APISession,
    model_id: str,
    feature_type: str,
    sequences: list[bytes] | list[str] | None = None,
    assay_id: str | None = None,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    reduction: str | None = None,
    **kwargs,
) -> UMAPFitJob:
    """
    Create a UMAP fit job.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    model_id : str
        Model to use. Can be either svd_id or id of a foundational model.
    feature_type : str
        Type of feature to use for fitting UMAP. Either PLM or SVD.
    sequences : list[bytes] | list[str] | None, optional
        Sequences to fit the UMAP with. Exactly one of `sequences` and
        `assay_id` must be provided.
    assay_id : str | None, optional
        ID of an assay containing the sequences to fit the UMAP with.
        Exactly one of `sequences` and `assay_id` must be provided.
    n_components : int
        Number of UMAP components to fit. Defaults to 2.
    n_neighbors : int
        Number of neighbors to use for fitting. Defaults to 15.
    min_dist : float
        Minimum distance in UMAP fitting. Defaults to 0.1.
    reduction : str | None
        Embedding reduction to use for fitting the UMAP. Only sent when
        not None. Defaults to None.
    kwargs :
        Additional keyword arguments merged into the request body, e.g.
        prompt_id for PoET models.

    Returns
    -------
    UMAPFitJob
        The created job, validated from the response JSON.

    Raises
    ------
    InvalidParameterError
        If both or neither of `sequences` and `assay_id` are provided.
    """
    has_sequences = sequences is not None
    has_assay = assay_id is not None
    if has_sequences and has_assay:
        raise InvalidParameterError("Expected only either sequences or assay_id")
    if not has_sequences and not has_assay:
        raise InvalidParameterError("Expected either sequences or assay_id")

    payload = {
        "model_id": model_id,
        "feature_type": feature_type,
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
    }
    if reduction is not None:
        payload["reduction"] = reduction
    if sequences is not None:
        # The JSON body carries text, so non-str sequences are decoded.
        payload["sequences"] = [
            (seq if isinstance(seq, str) else seq.decode()) for seq in sequences
        ]
    else:
        payload["assay_id"] = assay_id
    # Extra keyword arguments are forwarded verbatim in the request body.
    payload.update(kwargs)

    response = session.post(PATH_PREFIX, json=payload)
    return UMAPFitJob.model_validate(response.json())
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def umap_embed_post(
    session: APISession, umap_id: str, sequences: list[bytes] | list[str]
) -> UMAPEmbeddingsJob:
    """
    POST a request for embeddings from the given UMAP model.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    umap_id : str
        ID of the UMAP model to use.
    sequences : list[bytes] | list[str]
        Sequences to project; non-str entries are decoded to text first.

    Returns
    -------
    UMAPEmbeddingsJob
        The created job, validated from the response JSON.
    """
    payload = {
        "sequences": [
            (seq if isinstance(seq, str) else seq.decode()) for seq in sequences
        ],
    }
    response = session.post(f"{PATH_PREFIX}/{umap_id}/embed", json=payload)
    return UMAPEmbeddingsJob.model_validate(response.json())
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""UMAP models on the OpenProtein system which can be used directly to create projected embeddings useful for visualization."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from openprotein import config
|
|
6
|
+
from openprotein.base import APISession
|
|
7
|
+
from openprotein.embeddings import EmbeddingModel, EmbeddingsResultFuture
|
|
8
|
+
from openprotein.jobs import Future, JobsAPI
|
|
9
|
+
|
|
10
|
+
from . import api
|
|
11
|
+
from .schemas import UMAPEmbeddingsJob, UMAPFitJob, UMAPMetadata
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class UMAPModel(Future):
    """
    UMAP model that can be used to create projected embeddings.

    The model is also implemented as a `Future` to allow waiting for a fit job.
    The projected embeddings of the sequences used to fit the UMAP can be
    accessed using `embeddings`.
    """

    job: UMAPFitJob

    def __init__(
        self,
        session: APISession,
        job: UMAPFitJob | None = None,
        metadata: UMAPMetadata | None = None,
    ):
        """
        Build the model from a fit job, from UMAP metadata, or from both.

        Whichever of the two is missing is fetched through the session.

        Parameters
        ----------
        session : APISession
            Session object for API communication.
        job : UMAPFitJob or None, optional
            Fit job of the UMAP. Fetched from the jobs API when omitted.
        metadata : UMAPMetadata or None, optional
            Metadata of the UMAP. Fetched using the job ID when omitted.

        Raises
        ------
        ValueError
            If neither `job` nor `metadata` is provided.
        """
        if metadata is None:
            # use job to fetch metadata
            if job is None:
                raise ValueError("Expected umap metadata or job")
            metadata = api.umap_get(session, job.job_id)
        self._metadata = metadata
        if job is None:
            # use metadata to fetch the job
            jobs_api = getattr(session, "jobs", None)
            assert isinstance(jobs_api, JobsAPI)
            job = UMAPFitJob.create(jobs_api.get_job(job_id=metadata.id))
        # Lazily populated caches for the `sequences` and `embeddings` properties.
        self._sequences = None
        self._embeddings = None
        super().__init__(session, job)

    def __str__(self) -> str:
        return str(self.metadata)

    def __repr__(self) -> str:
        return repr(self.metadata)

    @property
    def id(self):
        """UMAP unique identifier."""
        return self._metadata.id

    @property
    def n_components(self):
        """Number of components specified for the UMAP."""
        return self._metadata.n_components

    @property
    def n_neighbors(self):
        """Number of neighbors specified for the UMAP."""
        return self._metadata.n_neighbors

    @property
    def min_dist(self):
        """Minimum distance specified for the UMAP."""
        return self._metadata.min_dist

    @property
    def sequence_length(self):
        """Sequence length constraint of the UMAP."""
        return self._metadata.sequence_length

    @property
    def reduction(self):
        """Reduction used to fit the UMAP."""
        return self._metadata.reduction

    @property
    def metadata(self):
        """Metadata of the UMAP, re-fetched first while the UMAP is not done."""
        self._refresh_metadata()
        return self._metadata

    @property
    def sequences(self):
        """The sequences used to fit the UMAP (fetched once, then cached)."""
        if self._sequences is None:
            self._sequences = self.get_inputs()
        return self._sequences

    @property
    def embeddings(self):
        """
        The projected embeddings of the sequences used to fit the UMAP.

        Returns a list of ``(sequence, embedding)`` pairs built by zipping
        `sequences` with the rows of the decoded batch result. The list is
        fetched once and then cached.
        """
        if self._embeddings is None:
            data = api.embed_get_batch_result(session=self.session, job_id=self.id)
            self._embeddings = list(
                zip(self.sequences, api.embed_batch_decode(data))
            )
        return self._embeddings

    def _refresh_metadata(self):
        """Re-fetch the metadata unless it already reports a done status."""
        if not self._metadata.is_done():
            self._metadata = api.umap_get(self.session, self._metadata.id)

    def get_model(self) -> EmbeddingModel:
        """
        Create the base embeddings model this UMAP was fitted with.

        Returns
        -------
        EmbeddingModel
            Model created from the `model_id` recorded in the UMAP metadata.
        """
        # `metadata.id` identifies the UMAP itself; the model it was fitted
        # with is recorded separately in `metadata.model_id`.
        # NOTE(review): for UMAPs fitted on SVD features `model_id` presumably
        # refers to an SVD rather than a foundation model - confirm that
        # EmbeddingModel.create handles that case.
        return EmbeddingModel.create(
            session=self.session, model_id=self._metadata.model_id
        )

    @property
    def model(self) -> EmbeddingModel:
        """Base embeddings model used for the UMAP."""
        return self.get_model()

    def delete(self) -> bool:
        """
        Delete this UMAP model.
        """
        return api.umap_delete(self.session, self.id)

    def get(self, verbose: bool = False):
        """Retrieve this UMAP model itself."""
        return self

    def get_inputs(self) -> list[bytes]:
        """
        Get sequences used for umap job.

        Returns
        -------
        list[bytes]
            list of sequences
        """
        return api.umap_get_sequences(session=self.session, umap_id=self.id)

    def embed(
        self, sequences: list[bytes] | list[str], **kwargs
    ) -> "UMAPEmbeddingsResultFuture":
        """
        Use this UMAP model to get projected embeddings from input sequences.

        Parameters
        ----------
        sequences : list[bytes] | list[str]
            List of protein sequences.
        kwargs :
            Forwarded to `api.umap_embed_post`.
            NOTE(review): `umap_embed_post` in this package accepts no extra
            keyword arguments, so passing any raises TypeError - confirm
            whether they should be dropped.

        Returns
        -------
        UMAPEmbeddingsResultFuture
            Future result containing the projected embeddings.
        """
        return UMAPEmbeddingsResultFuture.create(
            session=self.session,
            job=api.umap_embed_post(
                session=self.session, umap_id=self.id, sequences=sequences, **kwargs
            ),
            sequences=sequences,
        )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class UMAPEmbeddingsResultFuture(EmbeddingsResultFuture, Future):
    """Future holding the result of a UMAP embeddings job."""

    job: UMAPEmbeddingsJob

    def wait(
        self,
        interval: int = config.POLLING_INTERVAL,
        timeout: int | None = None,
        verbose: bool = False,
    ) -> list[np.ndarray]:
        """
        Wait for the UMAP embeddings job and retrieve the embeddings.

        Delegates to the parent implementation with the same arguments.
        """
        return super().wait(interval, timeout, verbose)

    def get(self, verbose=False) -> list[np.ndarray]:
        """
        Get all the UMAP projected embeddings from the job.

        Delegates to the parent implementation with the same argument.
        """
        return super().get(verbose)

    def get_item(self, sequence: bytes) -> np.ndarray:
        """
        Get the UMAP embedding of one sequence of this job.

        Parameters
        ----------
        sequence : bytes
            Sequence to fetch UMAP embeddings for.

        Returns
        -------
        np.ndarray
            UMAP embeddings represented as a numpy array.
        """
        encoded = api.embed_get_sequence_result(
            session=self.session, job_id=self.job.job_id, sequence=sequence
        )
        return api.embed_decode(encoded)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Schemas for OpenProtein UMAP system."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict
|
|
7
|
+
|
|
8
|
+
from openprotein.common import FeatureType
|
|
9
|
+
from openprotein.jobs import BatchJob, Job, JobStatus, JobType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class UMAPMetadata(BaseModel):
    """Metadata record describing a UMAP and the parameters it was fitted with."""

    # Empty protected namespaces so that the `model_id` field does not
    # collide with pydantic's reserved `model_` prefix.
    model_config = ConfigDict(protected_namespaces=())

    id: str
    status: JobStatus
    created_date: datetime | None = None
    model_id: str
    feature_type: FeatureType
    n_components: int = 2
    n_neighbors: int = 15
    min_dist: float = 0.1
    reduction: str | None = None
    sequence_length: int | None = None

    def is_done(self):
        """Return whether `status` reports the UMAP as done."""
        return self.status.done()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class UMAPFitJob(Job):
    """Job whose `job_type` is restricted to `JobType.umap_fit`."""

    job_type: Literal[JobType.umap_fit]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class UMAPEmbeddingsJob(Job, BatchJob):
    """Batch job whose `job_type` is restricted to `JobType.umap_embed`."""

    job_type: Literal[JobType.umap_embed]
|
openprotein/umap/umap.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""UMAP API providing the interface to fit and run UMAP visualizations."""
|
|
2
|
+
|
|
3
|
+
from openprotein.base import APISession
|
|
4
|
+
from openprotein.common import FeatureType, ReductionType
|
|
5
|
+
from openprotein.data import AssayDataset, AssayMetadata
|
|
6
|
+
from openprotein.embeddings import EmbeddingModel, EmbeddingsAPI
|
|
7
|
+
from openprotein.errors import InvalidParameterError
|
|
8
|
+
from openprotein.jobs import JobsAPI
|
|
9
|
+
from openprotein.svd import SVDAPI, SVDModel
|
|
10
|
+
|
|
11
|
+
from . import api
|
|
12
|
+
from .models import UMAPModel
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UMAPAPI:
    """UMAP API providing the interface to fit and run UMAP visualizations."""

    def __init__(
        self,
        session: APISession,
    ):
        """Store the session used for all UMAP requests."""
        self.session = session

    def fit_umap(
        self,
        model: EmbeddingModel | SVDModel | str,
        feature_type: FeatureType | None = None,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayMetadata | AssayDataset | str | None = None,
        n_components: int = 2,
        n_neighbors: int = 15,
        min_dist: float = 0.1,
        reduction: ReductionType | None = None,
        **kwargs,
    ) -> UMAPModel:
        """
        Fit an UMAP on the sequences with the specified model and hyperparameters.

        Parameters
        ----------
        model : EmbeddingModel or SVDModel or str
            Instance of either EmbeddingModel or SVDModel to use depending
            on feature type. Can also be a str specifying the model id,
            but then feature_type would have to be specified.
        feature_type : FeatureType or None, optional
            Type of features to use for encoding sequences. "SVD" or "PLM".
            None would require model to be EmbeddingModel or SVDModel. When
            a model instance is passed, its type overrides this argument.
        sequences : list of bytes or str, or None, optional
            Optional sequences to fit UMAP with. Either use sequences or
            assay; providing both or neither is rejected by the request
            helper.
        assay : AssayMetadata or AssayDataset or str or None, optional
            Optional assay containing sequences to fit UMAP with,
            or its assay_id. Either use sequences or assay.
        n_components : int, optional
            Number of UMAP components to fit. Defaults to 2.
        n_neighbors : int, optional
            Number of neighbors to use for fitting. Defaults to 15.
        min_dist : float, optional
            Minimum distance in UMAP fitting. Defaults to 0.1.
        reduction : str or None, optional
            Type of embedding reduction to use for computing features.
            E.g. "MEAN" or "SUM". Useful when dealing with variable length
            sequence. Required when the feature type is PLM. Defaults to None.
        kwargs :
            Additional keyword arguments to be passed to foundational models,
            e.g. prompt_id for PoET models.

        Returns
        -------
        UMAPModel
            The UMAP model being fit.

        Raises
        ------
        InvalidParameterError
            If the feature type cannot be determined, is neither PLM nor SVD,
            or is PLM without a reduction.
        """
        # A model instance determines the feature type, overriding the argument.
        if isinstance(model, EmbeddingModel):
            feature_type = FeatureType.PLM
        elif isinstance(model, SVDModel):
            feature_type = FeatureType.SVD
        if feature_type is None:
            raise InvalidParameterError(
                "Expected feature_type to be provided if passing str model_id as model"
            )

        # Resolve a str model id to a model instance and extract its id.
        if feature_type == FeatureType.PLM:
            if reduction is None:
                raise InvalidParameterError(
                    "Expected reduction if using EmbeddingModel"
                )
            if isinstance(model, str):
                embeddings_api = getattr(self.session, "embedding", None)
                assert isinstance(embeddings_api, EmbeddingsAPI)
                model = embeddings_api.get_model(model)
            assert isinstance(model, EmbeddingModel), "Expected EmbeddingModel"
            model_id = model.id
        elif feature_type == FeatureType.SVD:
            if isinstance(model, str):
                svd_api = getattr(self.session, "svd", None)
                assert isinstance(svd_api, SVDAPI)
                model = svd_api.get_svd(model)
            assert isinstance(model, SVDModel), "Expected SVDModel"
            model_id = model.id
        else:
            # Without this branch `model_id` would be unbound below and the
            # caller would see an unrelated UnboundLocalError.
            raise InvalidParameterError(
                f"Expected feature_type to be PLM or SVD, got {feature_type!r}"
            )

        # Reduce the assay argument to its id (a str or None passes through).
        if isinstance(assay, AssayMetadata):
            assay_id = assay.assay_id
        elif isinstance(assay, AssayDataset):
            assay_id = assay.id
        else:
            assay_id = assay

        fit_job = api.umap_fit_post(
            session=self.session,
            model_id=model_id,
            feature_type=feature_type,
            sequences=sequences,
            assay_id=assay_id,
            n_components=n_components,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            reduction=reduction,
            **kwargs,
        )
        return UMAPModel(session=self.session, job=fit_job)

    def get_umap(self, umap_id: str) -> UMAPModel:
        """
        Get a UMAP model by ID.

        The metadata is fetched first and the model is built from it.

        Parameters
        ----------
        umap_id : str
            The ID of the UMAP job.

        Returns
        -------
        UMAPModel
            The model with the UMAP fit.
        """
        metadata = api.umap_get(self.session, umap_id)
        return UMAPModel(session=self.session, metadata=metadata)

    def __delete_umap(self, umap_id: str) -> bool:
        """
        Delete UMAP model.

        Parameters
        ----------
        umap_id : str
            The ID of the UMAP job.

        Returns
        -------
        bool
            True: successful deletion
        """
        return api.umap_delete(self.session, umap_id)

    def list_umap(self) -> list[UMAPModel]:
        """
        List UMAP models made by user.

        Takes no args.

        Returns
        -------
        list[UMAPModel]
            One model per metadata record returned by the API.
        """
        # Checked up front: building a UMAPModel from metadata alone looks up
        # the fit job through the session's jobs API.
        jobs_api = getattr(self.session, "jobs", None)
        assert isinstance(jobs_api, JobsAPI)
        return [
            UMAPModel(session=self.session, metadata=metadata)
            for metadata in api.umap_list_get(self.session)
        ]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from uuid import UUID
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def is_valid_uuid(uuid_to_test: str, version=4):
    """
    Check if uuid_to_test is a valid UUID.

    The string must already be in the canonical form produced by
    ``str(UUID(...))`` (lowercase, hyphenated) for the requested version;
    anything else, including non-string input, is reported as invalid.

    Parameters
    ----------
    uuid_to_test : str
    version : {1, 2, 3, 4}

    Returns
    -------
    `True` if uuid_to_test is a valid UUID, otherwise `False`.

    Examples
    --------
    >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a')
    True
    >>> is_valid_uuid('c9bf9e58')
    False
    """
    # Non-string input would otherwise escape as TypeError/AttributeError
    # from the UUID constructor instead of yielding False.
    if not isinstance(uuid_to_test, str):
        return False
    try:
        uuid_obj = UUID(uuid_to_test, version=version)
    except ValueError:
        return False
    # UUID() overwrites the version bits with `version`, so a string of a
    # different version or in a non-canonical form no longer round-trips.
    return str(uuid_obj) == uuid_to_test
|