openprotein-python 0.8.2__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. openprotein/__init__.py +164 -0
  2. openprotein/_version.py +48 -0
  3. openprotein/align/__init__.py +8 -0
  4. openprotein/align/align.py +395 -0
  5. openprotein/align/api.py +428 -0
  6. openprotein/align/future.py +55 -0
  7. openprotein/align/msa.py +129 -0
  8. openprotein/align/schemas.py +165 -0
  9. openprotein/base.py +181 -0
  10. openprotein/chains.py +88 -0
  11. openprotein/common/__init__.py +5 -0
  12. openprotein/common/features.py +7 -0
  13. openprotein/common/model_metadata.py +33 -0
  14. openprotein/common/reduction.py +8 -0
  15. openprotein/config.py +9 -0
  16. openprotein/csv.py +31 -0
  17. openprotein/data/__init__.py +9 -0
  18. openprotein/data/api.py +218 -0
  19. openprotein/data/assaydataset.py +178 -0
  20. openprotein/data/data.py +93 -0
  21. openprotein/data/schemas.py +27 -0
  22. openprotein/design/__init__.py +16 -0
  23. openprotein/design/api.py +259 -0
  24. openprotein/design/design.py +125 -0
  25. openprotein/design/future.py +146 -0
  26. openprotein/design/schemas.py +607 -0
  27. openprotein/embeddings/__init__.py +27 -0
  28. openprotein/embeddings/api.py +619 -0
  29. openprotein/embeddings/embeddings.py +151 -0
  30. openprotein/embeddings/esm.py +33 -0
  31. openprotein/embeddings/future.py +146 -0
  32. openprotein/embeddings/models.py +421 -0
  33. openprotein/embeddings/openprotein.py +21 -0
  34. openprotein/embeddings/poet.py +446 -0
  35. openprotein/embeddings/poet2.py +505 -0
  36. openprotein/embeddings/schemas.py +78 -0
  37. openprotein/errors.py +76 -0
  38. openprotein/fasta.py +92 -0
  39. openprotein/fold/__init__.py +21 -0
  40. openprotein/fold/alphafold2.py +131 -0
  41. openprotein/fold/api.py +287 -0
  42. openprotein/fold/boltz.py +691 -0
  43. openprotein/fold/esmfold.py +54 -0
  44. openprotein/fold/fold.py +107 -0
  45. openprotein/fold/future.py +509 -0
  46. openprotein/fold/models.py +139 -0
  47. openprotein/fold/schemas.py +39 -0
  48. openprotein/jobs/__init__.py +9 -0
  49. openprotein/jobs/api.py +71 -0
  50. openprotein/jobs/futures.py +746 -0
  51. openprotein/jobs/jobs.py +69 -0
  52. openprotein/jobs/schemas.py +135 -0
  53. openprotein/models/__init__.py +4 -0
  54. openprotein/models/base.py +63 -0
  55. openprotein/models/foundation/rfdiffusion.py +283 -0
  56. openprotein/models/models.py +33 -0
  57. openprotein/predictor/__init__.py +25 -0
  58. openprotein/predictor/api.py +384 -0
  59. openprotein/predictor/models.py +374 -0
  60. openprotein/predictor/prediction.py +79 -0
  61. openprotein/predictor/predictor.py +242 -0
  62. openprotein/predictor/schemas.py +113 -0
  63. openprotein/predictor/validate.py +40 -0
  64. openprotein/prompt/__init__.py +9 -0
  65. openprotein/prompt/api.py +505 -0
  66. openprotein/prompt/models.py +142 -0
  67. openprotein/prompt/prompt.py +130 -0
  68. openprotein/prompt/schemas.py +49 -0
  69. openprotein/protein.py +587 -0
  70. openprotein/svd/__init__.py +9 -0
  71. openprotein/svd/api.py +206 -0
  72. openprotein/svd/models.py +288 -0
  73. openprotein/svd/schemas.py +31 -0
  74. openprotein/svd/svd.py +134 -0
  75. openprotein/umap/__init__.py +9 -0
  76. openprotein/umap/api.py +259 -0
  77. openprotein/umap/models.py +211 -0
  78. openprotein/umap/schemas.py +35 -0
  79. openprotein/umap/umap.py +175 -0
  80. openprotein/utils/uuid.py +29 -0
  81. openprotein_python-0.8.2.dist-info/METADATA +176 -0
  82. openprotein_python-0.8.2.dist-info/RECORD +84 -0
  83. openprotein_python-0.8.2.dist-info/WHEEL +4 -0
  84. openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0
@@ -0,0 +1,259 @@
1
+ """Design REST API for making HTTP calls to our design backend."""
2
+
3
+ from typing import Iterator
4
+
5
+ import numpy as np
6
+ from pydantic import TypeAdapter
7
+
8
+ from openprotein import csv
9
+ from openprotein.base import APISession
10
+
11
+ from .schemas import (
12
+ Criteria,
13
+ Criterion,
14
+ Design,
15
+ DesignConstraint,
16
+ DesignJob,
17
+ DesignResult,
18
+ Job,
19
+ Subcriterion,
20
+ )
21
+
22
+ PATH_PREFIX = "v1/designer/design"
23
+
24
+
25
def designs_list(session: APISession) -> list[Design]:
    """
    Fetch every design visible to the session.

    Parameters
    ----------
    session : APISession
        Session object used to issue the GET request.

    Returns
    -------
    list[Design]
        Design metadata records parsed from the JSON response.
    """
    payload = session.get(PATH_PREFIX).json()
    # Validate the raw JSON list into typed Design models in one pass.
    return TypeAdapter(list[Design]).validate_python(payload)
42
+
43
+
44
def design_get(session: APISession, design_id: str) -> Design:
    """
    Fetch the metadata of a single design.

    Parameters
    ----------
    session : APISession
        Session object used to issue the GET request.
    design_id : str
        ID of the design to look up.

    Returns
    -------
    Design
        Design metadata parsed from the JSON response.
    """
    payload = session.get(f"{PATH_PREFIX}/{design_id}").json()
    return TypeAdapter(Design).validate_python(payload)
63
+
64
+
65
def designer_create_genetic_algorithm(
    session: APISession,
    assay_id: str,
    criteria: Criteria | Subcriterion | Criterion,
    num_steps: int = 25,
    pop_size: int = 1024,  # TODO - rename to library_size
    n_offsprings: int = 5120,
    crossover_prob: float = 1.0,
    crossover_prob_pointwise: float = 0.2,
    mutation_average_mutations_per_seq: int = 1,
    allowed_tokens: DesignConstraint | dict[int, list[str]] | None = None,
) -> Job:
    """
    Create a design using the genetic algorithm.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    assay_id : str
        ID of the assay the design is based on.
    criteria : Criteria or Subcriterion or Criterion
        Design criteria. A bare ``Subcriterion`` or ``Criterion`` is wrapped
        into a full ``Criteria`` before being sent.
    num_steps : int, optional
        The number of steps in the genetic algorithm. Default is 25.
    pop_size : int, optional
        The population size for the genetic algorithm. Default is 1024.
    n_offsprings : int, optional
        The number of offspring for the genetic algorithm. Default is 5120.
    crossover_prob : float, optional
        The crossover probability for the genetic algorithm. Default is 1.0.
    crossover_prob_pointwise : float, optional
        The pointwise crossover probability for the genetic algorithm.
        Default is 0.2.
    mutation_average_mutations_per_seq : int, optional
        The average number of mutations per sequence. Default is 1.
    allowed_tokens : DesignConstraint or dict[int, list[str]] or None, optional
        Mapping of positions to allowed tokens (e.g. ``{1: ['G', 'L']}``)
        designating how mutations may occur. A ``DesignConstraint`` is
        converted with ``as_dict()``. ``None`` (the default) is sent as an
        empty dict.

    Returns
    -------
    DesignJob
        Job parsed from the response of the create request.
    """
    # Normalise the shorthand forms into a full Criteria object.
    if isinstance(criteria, Subcriterion):
        criteria = Criteria([Criterion([criteria])])
    elif isinstance(criteria, Criterion):
        criteria = Criteria([criteria])

    # None sentinel instead of a shared mutable `{}` default in the signature.
    if allowed_tokens is None:
        allowed_tokens = {}
    elif isinstance(allowed_tokens, DesignConstraint):
        allowed_tokens = allowed_tokens.as_dict()

    endpoint = PATH_PREFIX + "/genetic-algorithm"

    body = {
        "assay_id": assay_id,
        "criteria": criteria.model_dump(),
        "num_steps": num_steps,
        "pop_size": pop_size,
        "n_offsprings": n_offsprings,
        "crossover_prob": crossover_prob,
        "crossover_prob_pointwise": crossover_prob_pointwise,
        "mutation_average_mutations_per_seq": mutation_average_mutations_per_seq,
        "allowed_tokens": allowed_tokens,
    }
    response = session.post(endpoint, json=body)
    return DesignJob.model_validate(response.json())
130
+
131
+
132
def design_delete(session: APISession, design_id: str):
    """
    Delete a design (not implemented yet).

    Parameters
    ----------
    session : APISession
        Session object for API communication (currently unused).
    design_id : str
        ID of the design to delete (currently unused).

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError
134
+
135
+
136
def designer_get_design_results(
    session: APISession,
    design_id: str,
    step: int | None = None,
) -> Iterator[list[str]]:
    """
    Stream the CSV-encoded results of a design.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    design_id : str
        Design ID to retrieve results from.
    step : int or None, optional
        Step of the design whose results to fetch. ``-1`` is forwarded
        unchanged; any other value is decremented by one before being sent
        (the decoder adds one back to the step column). When ``None`` (the
        default) no ``step`` query parameter is sent.

    Returns
    -------
    Iterator[list[str]]
        Rows produced by ``csv.parse_stream`` over the streamed response lines.
    """
    query = {}
    if step is not None:
        # -1 is passed through untouched; everything else is shifted down by one.
        query["step"] = step if step == -1 else step - 1
    response = session.get(
        f"{PATH_PREFIX}/{design_id}/results", params=query, stream=True
    )
    return csv.parse_stream(response.iter_lines())
165
+
166
+
167
def decode_design_result(
    row: list[str],
    score_start_index: int,
    subscore_start_index: int,
    pred_start_index: int,
) -> DesignResult:
    """
    Decode one string-encoded CSV row into a ``DesignResult``.

    Parameters
    ----------
    row : list[str]
        Row of string values: step, sample index and sequence in the first
        three columns, followed by scores, subscores and predictions.
    score_start_index : int
        Index of the first score column.
    subscore_start_index : int
        Index of the first subscore column (also the end of the scores).
    pred_start_index : int
        Index of the first prediction column (also the end of the subscores).

    Returns
    -------
    DesignResult
        Decoded result. ``step`` and ``sample_index`` are the row values plus
        one. Prediction columns are interleaved: even offsets become
        ``means`` and odd offsets become ``vars``.
    """

    def _as_floats(cells: list[str]) -> np.ndarray:
        # Parse a run of string cells into a 1-D float array.
        return np.array(list(map(float, cells)))

    step_cell, sample_cell, sequence = row[0], row[1], row[2]
    predictions = _as_floats(row[pred_start_index:])
    return DesignResult(
        step=int(step_cell) + 1,
        sample_index=int(sample_cell) + 1,
        sequence=sequence,
        scores=_as_floats(row[score_start_index:subscore_start_index]),
        subscores=_as_floats(row[subscore_start_index:pred_start_index]),
        means=predictions[::2],
        vars=predictions[1::2],
    )
201
+
202
+
203
+ def decode_design_results_stream(
204
+ data: Iterator[list[str]], header: list[str] | None = None
205
+ ) -> Iterator[DesignResult]:
206
+ """
207
+ Decode design results.
208
+
209
+ Args:
210
+ data: Iterator[list[str]]
211
+ Data in the form of an iterator of list of string-encoded values
212
+ header: list[str] | None, optional
213
+ Headers describing the data. Should be same length as each row returned from the data iterator.
214
+ Defaults to None, which means the first row in the iterator should be header.
215
+
216
+ Returns:
217
+ step: int
218
+ Step index of the design.
219
+ sample_index: int
220
+ Index of the sample in the overall design.
221
+ sequence: str
222
+ Output designed sequence.
223
+ scores: np.ndarray[float]
224
+ M array of scores based on provided criteria (M groups of subcriteria).
225
+ subscores: np.ndarray[float]
226
+ N array of subscores based on provided criteria (flattened N subcriteria).
227
+ means: np.ndarray[float]
228
+ K array of means for each model subscriterion.
229
+ vars: np.ndarray[float]
230
+ K array of variances for each model subscriterion.
231
+ vars (np.ndarray): decoded array of variances
232
+ """
233
+ if header is None:
234
+ header = next(data)
235
+ if header[0].isnumeric():
236
+ raise ValueError(
237
+ "Expected first row in data to be header of 'step','sample_index',..."
238
+ )
239
+ score_start_index = subscore_start_index = pred_start_index = len(header)
240
+ # first start indices
241
+ for i, col_name in enumerate(header):
242
+ if col_name.startswith("score"):
243
+ score_start_index = i
244
+ break
245
+ for i, col_name in enumerate(header[score_start_index:]):
246
+ if col_name.endswith("score"):
247
+ subscore_start_index = score_start_index + i
248
+ break
249
+ for i, col_name in enumerate(header[subscore_start_index:]):
250
+ if col_name.endswith("y_mu"):
251
+ pred_start_index = subscore_start_index + i
252
+ break
253
+ for row in data:
254
+ yield decode_design_result(
255
+ row=row,
256
+ score_start_index=score_start_index,
257
+ subscore_start_index=subscore_start_index,
258
+ pred_start_index=pred_start_index,
259
+ )
@@ -0,0 +1,125 @@
1
+ """Design API providing the interface to design novel proteins based on your design criteria."""
2
+
3
+ from openprotein.base import APISession
4
+ from openprotein.data import AssayDataset, DataAPI
5
+ from openprotein.jobs import JobsAPI
6
+
7
+ from . import api
8
+ from .future import DesignFuture
9
+ from .schemas import Criteria, Criterion, DesignConstraint, Subcriterion
10
+
11
+
12
class DesignAPI:
    """Design API providing the interface to design novel proteins based on your design criteria."""

    def __init__(
        self,
        session: APISession,
    ):
        """
        Bind the API to a session.

        Parameters
        ----------
        session : APISession
            Session object used for all requests made through this API.
        """
        self.session = session

    def list_designs(self) -> list[DesignFuture]:
        """
        List all designs.

        Returns
        -------
        list of DesignFuture
            A list of DesignFuture objects representing all designs.
        """
        return [
            DesignFuture(
                session=self.session,
                metadata=m,
            )
            for m in api.designs_list(session=self.session)
        ]

    def get_design(self, design_id: str) -> DesignFuture:
        """
        Retrieve a specific design by its ID.

        Parameters
        ----------
        design_id : str
            ID of the design to retrieve.

        Returns
        -------
        DesignFuture
            A future object representing the design job and its results.
        """
        return DesignFuture(
            session=self.session,
            metadata=api.design_get(session=self.session, design_id=design_id),
        )

    def create_genetic_algorithm_design(
        self,
        assay: AssayDataset,
        criteria: Criteria | Subcriterion | Criterion,
        num_steps: int = 25,
        pop_size: int = 1024,
        n_offsprings: int = 5120,
        crossover_prob: float = 1.0,
        crossover_prob_pointwise: float = 0.2,
        mutation_average_mutations_per_seq: int = 1,
        allowed_tokens: DesignConstraint | dict[int, list[str]] | None = None,
    ) -> DesignFuture:
        """
        Start a protein design job using a genetic algorithm based on assay data, a trained ML model, and specified criteria.

        Parameters
        ----------
        assay : AssayDataset
            The AssayDataset to design from.
        criteria : Criteria or Subcriterion or Criterion
            Criteria for evaluating the design.
        num_steps : int, optional
            The number of steps in the genetic algorithm. Default is 25.
        pop_size : int, optional
            The population size for the genetic algorithm. Default is 1024.
        n_offsprings : int, optional
            The number of offspring for the genetic algorithm. Default is 5120.
        crossover_prob : float, optional
            The crossover probability for the genetic algorithm. Default is 1.0.
        crossover_prob_pointwise : float, optional
            The pointwise crossover probability for the genetic algorithm. Default is 0.2.
        mutation_average_mutations_per_seq : int, optional
            The average number of mutations per sequence. Default is 1.
        allowed_tokens : DesignConstraint or dict of int to list of str or None, optional
            A dict of positions and allowed tokens (e.g. {1: ['G', 'L']}) designating how mutations may occur.
            Defaults to None, which is forwarded as an empty dict.

        Returns
        -------
        DesignFuture
            A future object representing the design job and its results.
        """
        # None sentinel avoids a shared mutable `{}` default; always forward a
        # dict or DesignConstraint so the request body is unchanged.
        if allowed_tokens is None:
            allowed_tokens = {}
        return DesignFuture.create(
            session=self.session,
            job=api.designer_create_genetic_algorithm(
                self.session,
                assay_id=assay.id,
                criteria=criteria,
                num_steps=num_steps,
                pop_size=pop_size,
                n_offsprings=n_offsprings,
                crossover_prob=crossover_prob,
                crossover_prob_pointwise=crossover_prob_pointwise,
                mutation_average_mutations_per_seq=mutation_average_mutations_per_seq,
                allowed_tokens=allowed_tokens,
            ),
        )

    def create_design_job(
        self,
        *args,
        **kwargs,
    ):
        """
        Removed entry point of the deprecated design interface.

        Accepts any positional or keyword arguments so that legacy calls get
        the migration message rather than a signature ``TypeError``.

        Raises
        ------
        AttributeError
            Always; points to ``create_genetic_algorithm_design``.
        """
        raise AttributeError(
            "create_design_job belongs to the deprecated design interface. Use create_genetic_algorithm_design instead in the new design interface."
        )

    def get_design_results(self, *args, **kwargs):
        """
        Removed entry point of the deprecated design interface.

        Accepts any positional or keyword arguments so that legacy calls get
        the migration message rather than a signature ``TypeError``.

        Raises
        ------
        AttributeError
            Always; points to ``get_design`` followed by waiting on the future.
        """
        raise AttributeError(
            "get_design_results belongs to the deprecated design interface. Use get_design and wait instead in the new design interface."
        )
@@ -0,0 +1,146 @@
1
+ """Design results represented as futures."""
2
+
3
+ from typing import Iterator
4
+
5
+ from openprotein.base import APISession
6
+ from openprotein.data import AssayDataset, DataAPI
7
+ from openprotein.jobs import Future, JobsAPI, StreamingFuture
8
+
9
+ from . import api
10
+ from .schemas import Criteria, Design, DesignAlgorithm, DesignJob, DesignResult
11
+
12
+
13
class DesignFuture(StreamingFuture, Future):
    """Future for a design job, exposing its metadata and streamed results."""

    job: DesignJob

    def __init__(
        self,
        session: APISession,
        job: DesignJob | None = None,
        metadata: Design | None = None,
    ):
        """
        Build a future from a design job, its metadata, or both.

        Whichever of the two is missing is fetched through the session; at
        least one must be supplied.

        :meta private:
        """
        # Assay is resolved lazily by the `assay` property.
        self._design_assay = None
        if metadata is None and job is None:
            raise ValueError("Expected design metadata or job")
        if metadata is None:
            # Only the job is known: look the design up by the job's ID.
            metadata = api.design_get(session=session, design_id=job.job_id)
        self._metadata = metadata
        if job is None:
            # Only the metadata is known: fetch the job through the jobs API
            # attached to the session.
            jobs_api = getattr(session, "jobs", None)
            assert isinstance(jobs_api, JobsAPI)
            job = DesignJob.create(jobs_api.get_job(job_id=metadata.id))
        super().__init__(session, job)

    @property
    def id(self):
        """Design ID, taken from the metadata."""
        return self._metadata.id

    @property
    def assay(self) -> AssayDataset:
        """Assay dataset behind the design; fetched on first access, then cached."""
        cached = self._design_assay
        if cached is None:
            cached = self._design_assay = self.get_assay()
        return cached

    @property
    def algorithm(self) -> DesignAlgorithm:
        """Design algorithm recorded in the metadata."""
        return self._metadata.algorithm

    @property
    def criteria(self) -> Criteria:
        """Design criteria recorded in the metadata."""
        return self._metadata.criteria

    @property
    def num_steps(self):
        """Number of steps recorded in the metadata."""
        return self._metadata.num_steps

    @property
    def num_rows(self):
        """Number of rows in the total design output (across all steps)."""
        return self._metadata.num_rows

    @property
    def allowed_tokens(self) -> dict[str, list[str]] | None:
        """Allowed tokens recorded in the metadata, if any."""
        return self._metadata.allowed_tokens

    @property
    def pop_size(self) -> int:
        """Population size recorded in the metadata."""
        return self._metadata.pop_size

    @property
    def n_offsprings(self) -> int:
        """Number of offsprings recorded in the metadata."""
        return self._metadata.n_offsprings

    @property
    def crossover_prob(self) -> float:
        """Crossover probability recorded in the metadata."""
        return self._metadata.crossover_prob

    @property
    def crossover_prob_pointwise(self) -> float:
        """Pointwise crossover probability recorded in the metadata."""
        return self._metadata.crossover_prob_pointwise

    @property
    def mutation_average_mutations_per_seq(self) -> int:
        """Average mutations per sequence recorded in the metadata."""
        return self._metadata.mutation_average_mutations_per_seq

    @property
    def metadata(self):
        """Design metadata, re-fetched first unless it is already marked done."""
        self._refresh_metadata()
        return self._metadata

    def _refresh_metadata(self):
        """Re-fetch the metadata from the API unless it reports being done."""
        if self._metadata.is_done():
            return
        self._metadata = api.design_get(
            session=self.session, design_id=self._metadata.id
        )

    def __delete(self) -> bool:
        """
        Delete this design.

        TODO - implementation
        """
        return api.design_delete(session=self.session, design_id=self.id)

    def stream(self, step: int | None = None) -> Iterator[DesignResult]:
        """
        Stream decoded design results.

        Parameters
        ----------
        step : int or None, optional
            Step forwarded to ``api.designer_get_design_results``.

        Returns
        -------
        Iterator[DesignResult]
            Results decoded lazily from the raw result rows.
        """
        raw_rows = api.designer_get_design_results(
            session=self.session, design_id=self.id, step=step
        )
        return api.decode_design_results_stream(data=raw_rows)

    def get(self, verbose: bool = False, **kwargs) -> list[DesignResult]:
        """Delegate to the parent ``get``; overridden to narrow the return type."""
        return super().get(verbose, **kwargs)

    def get_assay(self) -> AssayDataset:
        """
        Fetch the assay dataset the design was created from.

        Returns
        -------
        AssayDataset
            Assay dataset looked up by the ``assay_id`` in the metadata.
        """
        data_api = getattr(self.session, "data", None)
        assert isinstance(data_api, DataAPI)
        return data_api.get(self._metadata.assay_id)