openprotein-python 0.8.2__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openprotein/__init__.py +164 -0
- openprotein/_version.py +48 -0
- openprotein/align/__init__.py +8 -0
- openprotein/align/align.py +395 -0
- openprotein/align/api.py +428 -0
- openprotein/align/future.py +55 -0
- openprotein/align/msa.py +129 -0
- openprotein/align/schemas.py +165 -0
- openprotein/base.py +181 -0
- openprotein/chains.py +88 -0
- openprotein/common/__init__.py +5 -0
- openprotein/common/features.py +7 -0
- openprotein/common/model_metadata.py +33 -0
- openprotein/common/reduction.py +8 -0
- openprotein/config.py +9 -0
- openprotein/csv.py +31 -0
- openprotein/data/__init__.py +9 -0
- openprotein/data/api.py +218 -0
- openprotein/data/assaydataset.py +178 -0
- openprotein/data/data.py +93 -0
- openprotein/data/schemas.py +27 -0
- openprotein/design/__init__.py +16 -0
- openprotein/design/api.py +259 -0
- openprotein/design/design.py +125 -0
- openprotein/design/future.py +146 -0
- openprotein/design/schemas.py +607 -0
- openprotein/embeddings/__init__.py +27 -0
- openprotein/embeddings/api.py +619 -0
- openprotein/embeddings/embeddings.py +151 -0
- openprotein/embeddings/esm.py +33 -0
- openprotein/embeddings/future.py +146 -0
- openprotein/embeddings/models.py +421 -0
- openprotein/embeddings/openprotein.py +21 -0
- openprotein/embeddings/poet.py +446 -0
- openprotein/embeddings/poet2.py +505 -0
- openprotein/embeddings/schemas.py +78 -0
- openprotein/errors.py +76 -0
- openprotein/fasta.py +92 -0
- openprotein/fold/__init__.py +21 -0
- openprotein/fold/alphafold2.py +131 -0
- openprotein/fold/api.py +287 -0
- openprotein/fold/boltz.py +691 -0
- openprotein/fold/esmfold.py +54 -0
- openprotein/fold/fold.py +107 -0
- openprotein/fold/future.py +509 -0
- openprotein/fold/models.py +139 -0
- openprotein/fold/schemas.py +39 -0
- openprotein/jobs/__init__.py +9 -0
- openprotein/jobs/api.py +71 -0
- openprotein/jobs/futures.py +746 -0
- openprotein/jobs/jobs.py +69 -0
- openprotein/jobs/schemas.py +135 -0
- openprotein/models/__init__.py +4 -0
- openprotein/models/base.py +63 -0
- openprotein/models/foundation/rfdiffusion.py +283 -0
- openprotein/models/models.py +33 -0
- openprotein/predictor/__init__.py +25 -0
- openprotein/predictor/api.py +384 -0
- openprotein/predictor/models.py +374 -0
- openprotein/predictor/prediction.py +79 -0
- openprotein/predictor/predictor.py +242 -0
- openprotein/predictor/schemas.py +113 -0
- openprotein/predictor/validate.py +40 -0
- openprotein/prompt/__init__.py +9 -0
- openprotein/prompt/api.py +505 -0
- openprotein/prompt/models.py +142 -0
- openprotein/prompt/prompt.py +130 -0
- openprotein/prompt/schemas.py +49 -0
- openprotein/protein.py +587 -0
- openprotein/svd/__init__.py +9 -0
- openprotein/svd/api.py +206 -0
- openprotein/svd/models.py +288 -0
- openprotein/svd/schemas.py +31 -0
- openprotein/svd/svd.py +134 -0
- openprotein/umap/__init__.py +9 -0
- openprotein/umap/api.py +259 -0
- openprotein/umap/models.py +211 -0
- openprotein/umap/schemas.py +35 -0
- openprotein/umap/umap.py +175 -0
- openprotein/utils/uuid.py +29 -0
- openprotein_python-0.8.2.dist-info/METADATA +176 -0
- openprotein_python-0.8.2.dist-info/RECORD +84 -0
- openprotein_python-0.8.2.dist-info/WHEEL +4 -0
- openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
openprotein/align/api.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
"""Align REST API interface for making HTTP calls to our align backend."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import random
|
|
5
|
+
from typing import BinaryIO, Iterator
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from openprotein import csv, fasta
|
|
10
|
+
from openprotein.base import APISession
|
|
11
|
+
from openprotein.errors import APIError, InvalidParameterError, MissingParameterError
|
|
12
|
+
from openprotein.jobs import Job
|
|
13
|
+
|
|
14
|
+
from .schemas import AbNumberScheme, AlignType, MSASamplingMethod
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_align_job_inputs(
    session: APISession,
    job_id: str,
    input_type: AlignType,
    prompt_index: int | None = None,
) -> requests.Response:
    """
    Issue the streaming GET request for the inputs of an alignment job.

    The request goes to ``v1/align/inputs`` with ``job_id`` and ``msa_type``
    query parameters. A ``replicate`` parameter is added only when
    `prompt_index` is given.

    Parameters
    ----------
    session : APISession
        The API session used to send the request.
    job_id : str
        The job identifier.
    input_type : AlignType
        Which kind of alignment data to fetch (sent as ``msa_type``).
    prompt_index : int or None, optional
        Replicate number of the prompt to fetch (sent as ``replicate``).
        Omitted from the request when None.

    Returns
    -------
    requests.Response
        The streamed response, returned as-is; the body is not read here.
    """
    query = {"job_id": job_id, "msa_type": input_type}
    if prompt_index is not None:
        query["replicate"] = prompt_index

    # stream=True defers downloading the body so callers can iterate lines.
    return session.get("v1/align/inputs", params=query, stream=True)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_input(
    session: APISession,
    job_id: str,
    input_type: AlignType,
    prompt_index: int | None = None,
) -> Iterator[tuple[str, str]]:
    """
    Retrieve input data for a given alignment job.

    The response of `get_align_job_inputs` is parsed lazily, line by line.
    A ``text/x-fasta`` response is handed to ``fasta.parse_stream``; anything
    else is parsed with ``csv.parse_stream``, keeping only the first two
    columns of each row.

    Parameters
    ----------
    session : APISession
        The API session.
    job_id : str
        The job identifier.
    input_type : AlignType
        The type of MSA data to retrieve.
    prompt_index : int or None, optional
        The replicate number for the prompt (only used if `input_type` is PROMPT).

    Returns
    -------
    Iterator[tuple[str, str]]
        An iterator over the name, sequence of the response.
    """
    response = get_align_job_inputs(
        session=session, job_id=job_id, input_type=input_type, prompt_index=prompt_index
    )

    # Compare only the media type: the header value is case-insensitive and
    # may carry parameters (e.g. "text/x-fasta; charset=utf-8"), which an
    # exact string comparison would wrongly route to the CSV parser.
    content_type = response.headers.get("Content-Type") or ""
    media_type = content_type.split(";", 1)[0].strip().lower()

    lines = response.iter_lines(decode_unicode=True)
    if media_type == "text/x-fasta":
        return fasta.parse_stream(lines)

    # take first two columns only
    return ((row[0], row[1]) for row in csv.parse_stream(lines))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_seed(session: APISession, job_id: str) -> str:
    """
    Fetch the seed sequence of an MSA job.

    The job's ``AlignType.INPUT`` data is requested through `get_input` and
    the second field of its first record is returned.

    Parameters
    ----------
    session : APISession
        The API session.
    job_id : str
        The job identifier.

    Returns
    -------
    str
        The seed sequence.

    Raises
    ------
    StopIteration
        If the input data contains no records.
    """
    # HACK for some reason this returns a csv
    records = get_input(session=session, job_id=job_id, input_type=AlignType.INPUT)
    first_record = next(records)
    return first_record[1]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_msa(session: APISession, job_id: str) -> Iterator[tuple[str, str]]:
    """
    Fetch the generated MSA (Multiple Sequence Alignment) of a job.

    This is `get_input` called with ``AlignType.MSA``.

    Parameters
    ----------
    session : APISession
        The API session.
    job_id : str
        The job identifier.

    Returns
    -------
    Iterator[tuple[str, str]]
        An iterator over the name, sequence pairs of the MSA.
    """
    msa_records = get_input(
        session=session,
        job_id=job_id,
        input_type=AlignType.MSA,
    )
    return msa_records
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def msa_post(
    session: APISession,
    msa_file: BinaryIO | None = None,
    seed: str | bytes | None = None,
) -> Job:
    """
    Submit an MSA job.

    Exactly one of `seed` and `msa_file` must be given. A seed is wrapped in
    an in-memory file of the form ``>seed\\n<sequence>`` and uploaded with
    ``is_seed=True``; a ready-made MSA file is uploaded unchanged with
    ``is_seed=False``.

    Parameters
    ----------
    session : APISession
        The API session.
    msa_file : BinaryIO or None, optional
        Ready-made MSA file. Defaults to None.
    seed : str or bytes or None, optional
        Seed sequence to trigger MSA job. A ``str`` is encoded to bytes
        before upload. Defaults to None.

    Raises
    ------
    MissingParameterError
        If neither or both of `msa_file` and `seed` are provided.

    Returns
    -------
    Job
        Job details, validated from the JSON response.
    """
    # The two inputs are mutually exclusive: reject "both" and "neither".
    if (msa_file is None) == (seed is None):
        raise MissingParameterError("seed OR msa_file must be provided.")

    is_seed = seed is not None
    if seed is not None:
        if isinstance(seed, str):
            seed = seed.encode()
        msa_file = io.BytesIO(b"\n".join([b">seed", seed]))

    response = session.post(
        "v1/align/msa",
        files={"msa_file": msa_file},
        params={"is_seed": is_seed},
    )
    return Job.model_validate(response.json())
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def mafft_post(
    session: APISession,
    sequence_file: BinaryIO,
    auto: bool = True,
    ep: float | None = None,
    op: float | None = None,
) -> Job:
    """
    Submit a MAFFT alignment job.

    Sequences can be provided as FASTA or CSV formats. If CSV, the file must
    be headerless with either a single sequence column or name, sequence
    columns. `auto` is always sent; `ep` and `op` are sent only when they are
    not None, so that the system defaults apply otherwise.

    Parameters
    ----------
    session : APISession
        The API session.
    sequence_file : BinaryIO
        Sequences to align in FASTA or CSV format.
    auto : bool, optional
        Set to True to automatically set algorithm parameters. Default is True.
    ep : float or None, optional
        MAFFT parameter. Default is None.
    op : float or None, optional
        MAFFT parameter. Default is None.

    Returns
    -------
    Job
        Job details, validated from the JSON response.
    """
    params: dict = {"auto": auto}
    # Only forward the tuning parameters the caller actually set.
    params.update(
        {name: value for name, value in (("ep", ep), ("op", op)) if value is not None}
    )

    response = session.post(
        "v1/align/mafft",
        files={"file": sequence_file},
        params=params,
    )
    return Job.model_validate(response.json())
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def clustalo_post(
    session: APISession,
    sequence_file: BinaryIO,
    clustersize: int | None = None,
    iterations: int | None = None,
) -> Job:
    """
    Submit a Clustal Omega alignment job.

    Sequences can be provided as FASTA or CSV formats. If CSV, the file must
    be headerless with either a single sequence column or name, sequence
    columns. Parameters left as None are not sent, so the system defaults
    apply.

    Parameters
    ----------
    session : APISession
        The API session.
    sequence_file : BinaryIO
        Sequences to align in FASTA or CSV format.
    clustersize : int or None, optional
        Clustal Omega parameter. Default is None.
    iterations : int or None, optional
        Clustal Omega parameter. Default is None.

    Returns
    -------
    Job
        Job details, validated from the JSON response.
    """
    candidates = {"clustersize": clustersize, "iterations": iterations}
    # Only forward the tuning parameters the caller actually set.
    params = {name: value for name, value in candidates.items() if value is not None}

    response = session.post(
        "v1/align/clustalo",
        files={"file": sequence_file},
        params=params,
    )
    return Job.model_validate(response.json())
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def abnumber_post(
    session: APISession,
    sequence_file: BinaryIO,
    scheme: AbNumberScheme | str = AbNumberScheme.IMGT,
) -> Job:
    """
    Submit an AbNumber antibody alignment job.

    Sequences can be provided as FASTA or CSV formats. If CSV, the file must
    be headerless with either a single sequence column or name, sequence
    columns.

    Parameters
    ----------
    session : APISession
        The API session.
    sequence_file : BinaryIO
        Sequences to align in FASTA or CSV format.
    scheme : AbNumberScheme or str, optional
        Antibody numbering scheme. A string must equal the value of one of
        the `AbNumberScheme` members. Default is IMGT.

    Raises
    ------
    InvalidParameterError
        If `scheme` is a string that matches no `AbNumberScheme` value.

    Returns
    -------
    Job
        Job details, validated from the JSON response.
    """
    if isinstance(scheme, str):
        known_schemes = {member.value for member in AbNumberScheme}
        if scheme not in known_schemes:
            raise InvalidParameterError(f"Antibody numbering {scheme} not recognized")
        scheme_param = scheme
    else:
        scheme_param = scheme.value

    response = session.post(
        "v1/align/abnumber",
        files={"file": sequence_file},
        params={"scheme": scheme_param},
    )
    return Job.model_validate(response.json())
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def antibody_schema_get(session: APISession, job_id: str):
    """
    Placeholder for retrieving the antibody numbering of an AbNumber job.

    Parameters
    ----------
    session : APISession
        The API session (currently unused).
    job_id : str
        The job identifier (currently unused).

    Raises
    ------
    NotImplementedError
        Always; this function is not yet implemented.
    """
    raise NotImplementedError
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def prompt_post(
    session: APISession,
    msa_id: str,
    num_sequences: int | None = None,
    num_residues: int | None = None,
    method: MSASamplingMethod = MSASamplingMethod.NEIGHBORS_NONGAP_NORM_NO_LIMIT,
    homology_level: float = 0.8,
    max_similarity: float = 1.0,
    min_similarity: float = 0.0,
    always_include_seed_sequence: bool = False,
    num_ensemble_prompts: int = 1,
    random_seed: int | None = None,
) -> Job:
    """
    Create a protein sequence prompt from a linked MSA (Multiple Sequence Alignment) for PoET jobs.

    The MSA is specified by `msa_id` and created in `msa_post`.

    At most one of `num_sequences` and `num_residues` may be given. When
    neither is given, `num_residues` defaults to 12288.

    Parameters
    ----------
    session : APISession
        The API session.
    msa_id : str
        The ID of the Multiple Sequence Alignment to use for the prompt.
    num_sequences : int or None, optional
        Maximum number of sequences in the prompt. Must satisfy
        ``0 <= num_sequences < 100``.
    num_residues : int or None, optional
        Maximum number of residues (tokens) in the prompt. Must satisfy
        ``0 <= num_residues < 24577``. Defaults to 12288 when `num_sequences`
        is also None.
    method : MSASamplingMethod, optional
        Method to use for MSA sampling. Default is NEIGHBORS_NONGAP_NORM_NO_LIMIT.
    homology_level : float, optional
        Level of homology for sequences in the MSA (neighbors methods only). Must be between 0 and 1. Default is 0.8.
    max_similarity : float, optional
        Maximum similarity between sequences in the MSA and the seed. Must be between 0 and 1. Default is 1.0.
    min_similarity : float, optional
        Minimum similarity between sequences in the MSA and the seed. Must be between 0 and 1. Default is 0.0.
    always_include_seed_sequence : bool, optional
        Whether to always include the seed sequence in the MSA. Default is False.
    num_ensemble_prompts : int, optional
        Number of ensemble jobs to run. Default is 1.
    random_seed : int or None, optional
        Seed for random number generation. Default is a random number between 0 and 2**32-1.

    Raises
    ------
    InvalidParameterError
        If provided parameter values are not in the allowed range.
    MissingParameterError
        If both `num_sequences` and `num_residues` are specified.

    Returns
    -------
    Job
        Job details, validated from the JSON response.
    """
    endpoint = "v1/align/prompt"

    if not (0 <= homology_level <= 1):
        raise InvalidParameterError("The 'homology_level' must be between 0 and 1.")
    if not (0 <= max_similarity <= 1):
        raise InvalidParameterError("The 'max_similarity' must be between 0 and 1.")
    if not (0 <= min_similarity <= 1):
        raise InvalidParameterError("The 'min_similarity' must be between 0 and 1.")

    # The two size limits are mutually exclusive. The "neither given" case is
    # not an error: it falls back to a residue budget below.
    if num_sequences is not None and num_residues is not None:
        raise MissingParameterError(
            "Either 'num_sequences' or 'num_residues' must be set, but not both."
        )
    if num_sequences is None and num_residues is None:
        num_residues = 12288

    if num_sequences is not None and not (0 <= num_sequences < 100):
        raise InvalidParameterError("The 'num_sequences' must be between 0 and 100.")

    if num_residues is not None and not (0 <= num_residues < 24577):
        raise InvalidParameterError("The 'num_residues' must be between 0 and 24577.")

    if random_seed is None:
        random_seed = random.randrange(2**32)

    params = {
        "msa_id": msa_id,
        "msa_method": method,
        "homology_level": homology_level,
        "max_similarity": max_similarity,
        "min_similarity": min_similarity,
        "force_include_first": always_include_seed_sequence,
        "replicates": num_ensemble_prompts,
        "seed": random_seed,
    }
    # Exactly one of the two limits is set at this point.
    if num_sequences is not None:
        params["max_msa_sequences"] = num_sequences
    if num_residues is not None:
        params["max_msa_tokens"] = num_residues

    response = session.post(endpoint, params=params)
    return Job.model_validate(response.json())
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Align results represented as futures."""
|
|
2
|
+
|
|
3
|
+
from openprotein.base import APISession
|
|
4
|
+
from openprotein.jobs import Job
|
|
5
|
+
|
|
6
|
+
from . import api
|
|
7
|
+
from .schemas import AlignType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AlignFuture:
    """A future object representing an alignment job, providing methods to retrieve job inputs and seed sequences."""

    # Both attributes are only declared here; nothing in this class assigns them.
    session: APISession
    job: Job

    def get_input(self, input_type: AlignType, prompt_index: int | None = None):
        """
        Retrieve input data for this alignment job.

        Parameters
        ----------
        input_type : AlignType
            The type of input data to retrieve.
        prompt_index : int or None, optional
            The replicate number for the prompt, forwarded to
            `api.get_input`. Defaults to None.

        Returns
        -------
        Iterator[tuple[str, str]]
            An iterator over the name, sequence pairs of the input data.
        """
        return api.get_input(
            session=self.session,
            job_id=self.job.job_id,
            input_type=input_type,
            prompt_index=prompt_index,
        )

    def get_seed(self) -> str:
        """
        Retrieve the seed sequence for this alignment job.

        Returns
        -------
        str
            The seed sequence.
        """
        return api.get_seed(session=self.session, job_id=self.job.job_id)

    @property
    def id(self) -> str:
        """
        The job ID for this alignment job.

        Returns
        -------
        str
            The job ID.
        """
        return self.job.job_id
|
openprotein/align/msa.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""MSA results represented as a future."""
|
|
2
|
+
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
from openprotein import config
|
|
6
|
+
from openprotein.base import APISession
|
|
7
|
+
from openprotein.jobs import Future, JobType
|
|
8
|
+
from openprotein.prompt import Prompt
|
|
9
|
+
|
|
10
|
+
from . import api
|
|
11
|
+
from .future import AlignFuture
|
|
12
|
+
from .schemas import (
|
|
13
|
+
AbNumberJob,
|
|
14
|
+
ClustalOJob,
|
|
15
|
+
MafftJob,
|
|
16
|
+
MSAJob,
|
|
17
|
+
MSASamplingMethod,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# TODO - AbNumber should probably be a different subclass, because it supports an additional `get` API for the antibody numbering
|
|
22
|
+
class MSAFuture(AlignFuture, Future):
    """
    Represents a future for MSA (Multiple Sequence Alignment) results.
    """

    job: MSAJob | MafftJob | ClustalOJob | AbNumberJob

    def __init__(
        self,
        session: APISession,
        job: MSAJob | MafftJob | ClustalOJob | AbNumberJob,
        page_size: int = config.POET_PAGE_SIZE,
    ):
        """
        Initialize an MSAFuture instance.

        Parameters
        ----------
        session : APISession
            An instance of APISession for API interactions.
        job : MSAJob or MafftJob or ClustalOJob or AbNumberJob
            The alignment job backing this future.
        page_size : int, optional
            The number of results to fetch in a single page. Defaults to config.POET_PAGE_SIZE.
        """
        super().__init__(session, job)
        self.page_size = page_size
        # The job ID doubles as the MSA ID when sampling prompts.
        self.msa_id = self.job.job_id

    def get(self, verbose: bool = False) -> Iterator[tuple[str, str]]:
        """
        Retrieve the MSA of the job.

        Parameters
        ----------
        verbose : bool, optional
            Accepted but not used by this method. Defaults to False.

        Returns
        -------
        Iterator[tuple[str, str]]
            An iterator over names and sequences of the MSA data.
        """
        return api.get_msa(session=self.session, job_id=self.job.job_id)

    def sample_prompt(
        self,
        num_sequences: int | None = None,
        num_residues: int | None = None,
        method: MSASamplingMethod = MSASamplingMethod.NEIGHBORS_NONGAP_NORM_NO_LIMIT,
        homology_level: float = 0.8,
        max_similarity: float = 1.0,
        min_similarity: float = 0.0,
        always_include_seed_sequence: bool = False,
        num_ensemble_prompts: int = 1,
        random_seed: int | None = None,
    ) -> Prompt:
        """
        Create a protein sequence prompt from the linked MSA for PoET Jobs.

        All arguments are forwarded unchanged to `api.prompt_post`, which
        performs the validation.

        Parameters
        ----------
        num_sequences : int, optional
            Maximum number of sequences in the prompt. Must be less than 100.
        num_residues : int, optional
            Maximum number of residues (tokens) in the prompt. Must be less than 24577.
        method : MSASamplingMethod, optional
            Method to use for MSA sampling. Defaults to NEIGHBORS_NONGAP_NORM_NO_LIMIT.
        homology_level : float, optional
            Level of homology for sequences in the MSA (neighbors methods only). Must be between 0 and 1. Defaults to 0.8.
        max_similarity : float, optional
            Maximum similarity between sequences in the MSA and the seed. Must be between 0 and 1. Defaults to 1.0.
        min_similarity : float, optional
            Minimum similarity between sequences in the MSA and the seed. Must be between 0 and 1. Defaults to 0.0.
        always_include_seed_sequence : bool, optional
            Whether to always include the seed sequence in the MSA. Defaults to False.
        num_ensemble_prompts : int, optional
            Number of ensemble jobs to run. Defaults to 1.
        random_seed : int, optional
            Seed for random number generation. Defaults to a random number between 0 and 2**32-1.

        Raises
        ------
        InvalidParameterError
            If provided parameter values are not in the allowed range.
        MissingParameterError
            If 'num_sequences' and 'num_residues' are not a valid combination
            (raised by `api.prompt_post`).

        Returns
        -------
        Prompt
            A Prompt instance for the created prompt job.
        """
        job = api.prompt_post(
            self.session,
            msa_id=self.msa_id,
            num_sequences=num_sequences,
            num_residues=num_residues,
            method=method,
            homology_level=homology_level,
            max_similarity=max_similarity,
            min_similarity=min_similarity,
            always_include_seed_sequence=always_include_seed_sequence,
            num_ensemble_prompts=num_ensemble_prompts,
            random_seed=random_seed,
        )
        return Prompt.create(
            session=self.session, job=job, num_replicates=num_ensemble_prompts
        )
|