openprotein-python 0.8.4__tar.gz → 0.8.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/PKG-INFO +9 -9
  2. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/README.md +8 -8
  3. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/common/__init__.py +2 -2
  4. openprotein_python-0.8.5/openprotein/common/features.py +15 -0
  5. openprotein_python-0.8.5/openprotein/common/reduction.py +14 -0
  6. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/api.py +13 -2
  7. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/data.py +9 -2
  8. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/models.py +37 -28
  9. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/future.py +2 -0
  10. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/api.py +2 -2
  11. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/predictor.py +40 -12
  12. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/schemas.py +2 -0
  13. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/protein.py +53 -36
  14. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/svd/svd.py +6 -4
  15. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/umap/umap.py +43 -14
  16. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/pyproject.toml +1 -18
  17. openprotein_python-0.8.4/openprotein/common/features.py +0 -7
  18. openprotein_python-0.8.4/openprotein/common/reduction.py +0 -8
  19. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/.gitignore +0 -0
  20. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/LICENSE.txt +0 -0
  21. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/__init__.py +0 -0
  22. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/_version.py +0 -0
  23. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/align/__init__.py +0 -0
  24. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/align/align.py +0 -0
  25. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/align/api.py +0 -0
  26. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/align/future.py +0 -0
  27. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/align/msa.py +0 -0
  28. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/align/schemas.py +0 -0
  29. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/base.py +0 -0
  30. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/chains.py +0 -0
  31. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/common/model_metadata.py +0 -0
  32. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/config.py +0 -0
  33. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/csv.py +0 -0
  34. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/__init__.py +0 -0
  35. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/assaydataset.py +0 -0
  36. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/schemas.py +0 -0
  37. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/design/__init__.py +0 -0
  38. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/design/api.py +0 -0
  39. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/design/design.py +0 -0
  40. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/design/future.py +0 -0
  41. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/design/schemas.py +0 -0
  42. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/__init__.py +0 -0
  43. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/api.py +0 -0
  44. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/embeddings.py +0 -0
  45. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/esm.py +0 -0
  46. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/future.py +0 -0
  47. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/openprotein.py +0 -0
  48. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/poet.py +0 -0
  49. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/poet2.py +0 -0
  50. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/schemas.py +0 -0
  51. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/errors.py +0 -0
  52. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fasta.py +0 -0
  53. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/__init__.py +0 -0
  54. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/alphafold2.py +0 -0
  55. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/api.py +0 -0
  56. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/boltz.py +0 -0
  57. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/esmfold.py +0 -0
  58. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/fold.py +0 -0
  59. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/models.py +0 -0
  60. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/schemas.py +0 -0
  61. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/jobs/__init__.py +0 -0
  62. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/jobs/api.py +0 -0
  63. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/jobs/futures.py +0 -0
  64. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/jobs/jobs.py +0 -0
  65. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/jobs/schemas.py +0 -0
  66. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/models/__init__.py +0 -0
  67. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/models/base.py +0 -0
  68. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/models/foundation/rfdiffusion.py +0 -0
  69. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/models/models.py +0 -0
  70. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/__init__.py +0 -0
  71. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/models.py +0 -0
  72. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/prediction.py +0 -0
  73. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/validate.py +0 -0
  74. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/prompt/__init__.py +0 -0
  75. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/prompt/api.py +0 -0
  76. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/prompt/models.py +0 -0
  77. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/prompt/prompt.py +0 -0
  78. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/prompt/schemas.py +0 -0
  79. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/svd/__init__.py +0 -0
  80. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/svd/api.py +0 -0
  81. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/svd/models.py +0 -0
  82. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/svd/schemas.py +0 -0
  83. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/umap/__init__.py +0 -0
  84. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/umap/api.py +0 -0
  85. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/umap/models.py +0 -0
  86. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/umap/schemas.py +0 -0
  87. {openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/utils/uuid.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openprotein-python
3
- Version: 0.8.4
3
+ Version: 0.8.5
4
4
  Summary: OpenProtein Python interface.
5
5
  Author-email: Mark Gee <markgee@ne47.bio>, "Timothy Truong Jr." <ttruong@ne47.bio>, Tristan Bepler <tbepler@ne47.bio>
6
6
  License-Expression: MIT
@@ -28,14 +28,14 @@ The OpenProtein.AI Python Interface provides a user-friendly library to interact
28
28
 
29
29
  # Table of Contents
30
30
 
31
- | | Workflow | Description |
32
- |---|----------------------------------------------------|------------------------------------------------------|
33
- | 0 | [`Quick start`](#Quick-start) | Quick start guide |
34
- | 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html) | Install guide for pip and conda. |
35
- | 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html) | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
36
- | 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
37
- | 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
38
- | 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
31
+ | | Workflow | Description |
32
+ |---|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|
33
+ | 0 | [`Quick start`](#Quick-start) | Quick start guide |
34
+ | 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html) | Install guide for pip and conda. |
35
+ | 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html) | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
36
+ | 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
37
+ | 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
38
+ | 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
39
39
 
40
40
 
41
41
  # Quick-start
@@ -10,14 +10,14 @@ The OpenProtein.AI Python Interface provides a user-friendly library to interact
10
10
 
11
11
  # Table of Contents
12
12
 
13
- | | Workflow | Description |
14
- |---|----------------------------------------------------|------------------------------------------------------|
15
- | 0 | [`Quick start`](#Quick-start) | Quick start guide |
16
- | 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html) | Install guide for pip and conda. |
17
- | 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html) | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
18
- | 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
19
- | 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
20
- | 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
13
+ | | Workflow | Description |
14
+ |---|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|
15
+ | 0 | [`Quick start`](#Quick-start) | Quick start guide |
16
+ | 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html) | Install guide for pip and conda. |
17
+ | 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html) | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
18
+ | 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
19
+ | 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
20
+ | 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
21
21
 
22
22
 
23
23
  # Quick-start
@@ -1,5 +1,5 @@
1
1
  """Common classes and utilities for OpenProtein."""
2
2
 
3
- from .features import FeatureType
3
+ from .features import Feature, FeatureType
4
4
  from .model_metadata import ModelDescription, ModelMetadata, TokenInfo
5
- from .reduction import ReductionType
5
+ from .reduction import Reduction, ReductionType
@@ -0,0 +1,15 @@
1
+ """Feature types used in OpenProtein."""
2
+
3
+ from enum import Enum
4
+ from typing import Literal
5
+
6
+
7
+ class FeatureType(str, Enum):
8
+
9
+ PLM = "PLM"
10
+ SVD = "SVD"
11
+
12
+
13
+ # NOTE: only works with python 3.12+
14
+ # Feature = Literal[*tuple([r.value for r in FeatureType])]
15
+ Feature = Literal["PLM", "SVD"]
@@ -0,0 +1,14 @@
1
+ """Reduction types used in OpenProtein."""
2
+
3
+ from enum import Enum
4
+ from typing import Literal
5
+
6
+
7
+ class ReductionType(str, Enum):
8
+ MEAN = "MEAN"
9
+ SUM = "SUM"
10
+
11
+
12
+ # NOTE: only works with python 3.12+
13
+ # Reduction = Literal[*tuple([r.value for r in ReductionType])]
14
+ Reduction = Literal["MEAN", "SUM"]
@@ -64,7 +64,9 @@ def assaydata_post(
64
64
  raise APIError(f"Unable to post assay data: {response.text}")
65
65
 
66
66
 
67
- def assaydata_list(session: APISession) -> list[AssayMetadata]:
67
+ def assaydata_list(
68
+ session: APISession, limit: int | None = None, offset: int | None = None
69
+ ) -> list[AssayMetadata]:
68
70
  """
69
71
  Get a list of all assay metadata.
70
72
 
@@ -72,6 +74,10 @@ def assaydata_list(session: APISession) -> list[AssayMetadata]:
72
74
  ----------
73
75
  session : APISession
74
76
  Session object for API communication.
77
+ limit : int, optional
78
+ Limit the number of assays to return.
79
+ offset : int, optional
80
+ Offset of assays to retrieve. Useful with limit.
75
81
 
76
82
  Returns
77
83
  -------
@@ -84,7 +90,12 @@ def assaydata_list(session: APISession) -> list[AssayMetadata]:
84
90
  If an error occurs during the API request.
85
91
  """
86
92
  endpoint = "v1/assaydata"
87
- response = session.get(endpoint)
93
+ params = {}
94
+ if limit is not None:
95
+ params["limit"] = limit
96
+ if offset is not None:
97
+ params["offset"] = offset
98
+ response = session.get(endpoint, params=params)
88
99
  if response.status_code == 200:
89
100
  return TypeAdapter(list[AssayMetadata]).validate_python(response.json())
90
101
  else:
@@ -14,16 +14,23 @@ class DataAPI:
14
14
  def __init__(self, session: APISession):
15
15
  self.session = session
16
16
 
17
- def list(self) -> list[AssayDataset]:
17
+ def list(
18
+ self, limit: int | None = None, offset: int | None = None
19
+ ) -> list[AssayDataset]:
18
20
  """
19
21
  List all assay datasets.
20
22
 
23
+ limit : int, optional
24
+ Limit the number of assays to return.
25
+ offset : int, optional
26
+ Offset of assays to retrieve. Useful with limit.
27
+
21
28
  Returns
22
29
  -------
23
30
  List[AssayDataset]
24
31
  List of all assay datasets.
25
32
  """
26
- metadata = api.assaydata_list(self.session)
33
+ metadata = api.assaydata_list(session=self.session, limit=limit, offset=offset)
27
34
  return [AssayDataset(self.session, x) for x in metadata]
28
35
 
29
36
  def create(
@@ -3,7 +3,13 @@
3
3
  from typing import TYPE_CHECKING
4
4
 
5
5
  from openprotein.base import APISession
6
- from openprotein.common import FeatureType, ModelMetadata, ReductionType
6
+ from openprotein.common import (
7
+ Feature,
8
+ FeatureType,
9
+ ModelMetadata,
10
+ Reduction,
11
+ ReductionType,
12
+ )
7
13
  from openprotein.data import AssayDataset, AssayMetadata, DataAPI
8
14
  from openprotein.errors import InvalidParameterError
9
15
 
@@ -199,9 +205,9 @@ class EmbeddingModel:
199
205
  def fit_svd(
200
206
  self,
201
207
  sequences: list[bytes] | list[str] | None = None,
202
- assay: AssayDataset | None = None,
208
+ assay: AssayDataset | AssayMetadata | None = None,
203
209
  n_components: int = 1024,
204
- reduction: ReductionType | None = None,
210
+ reduction: Reduction | ReductionType | None = None,
205
211
  **kwargs,
206
212
  ) -> "SVDModel":
207
213
  """
@@ -236,6 +242,11 @@ class EmbeddingModel:
236
242
  # local import for cyclic dep
237
243
  from openprotein.svd import SVDAPI
238
244
 
245
+ # runtime check on value
246
+ if isinstance(reduction, str):
247
+ reduction = ReductionType(reduction)
248
+ reduction = reduction.value
249
+
239
250
  svd_api = getattr(self.session, "svd", None)
240
251
  assert isinstance(svd_api, SVDAPI)
241
252
 
@@ -246,9 +257,8 @@ class EmbeddingModel:
246
257
  raise InvalidParameterError(
247
258
  "Expected either assay or sequences to fit SVD on!"
248
259
  )
249
- model_id = self.id
250
260
  return svd_api.fit_svd(
251
- model_id=model_id,
261
+ model=self,
252
262
  sequences=sequences,
253
263
  assay=assay,
254
264
  n_components=n_components,
@@ -259,9 +269,9 @@ class EmbeddingModel:
259
269
  def fit_umap(
260
270
  self,
261
271
  sequences: list[bytes] | list[str] | None = None,
262
- assay: AssayDataset | None = None,
272
+ assay: AssayDataset | AssayMetadata | None = None,
263
273
  n_components: int = 2,
264
- reduction: ReductionType | None = ReductionType.MEAN,
274
+ reduction: Reduction | ReductionType = "MEAN",
265
275
  **kwargs,
266
276
  ) -> "UMAPModel":
267
277
  """
@@ -274,11 +284,11 @@ class EmbeddingModel:
274
284
  ----------
275
285
  sequences : list of bytes or list of str or None, optional
276
286
  Optional sequences to fit UMAP with. Either use sequences or assay. Sequences is preferred.
277
- assay : AssayDataset or None, optional
287
+ assay : AssayDataset or AssayMetadata or None, optional
278
288
  Optional assay containing sequences to fit UMAP with. Either use sequences or assay. Ignored if sequences are provided.
279
289
  n_components : int, optional
280
290
  Number of components in UMAP fit. Determines output shapes. Default is 2.
281
- reduction : ReductionType or None, optional
291
+ reduction : Reduction or ReductionType or None, optional
282
292
  Embeddings reduction to use (e.g. mean). Defaults to MEAN.
283
293
  kwargs :
284
294
  Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.
@@ -296,6 +306,16 @@ class EmbeddingModel:
296
306
  # local import for cyclic dep
297
307
  from openprotein.umap import UMAPAPI
298
308
 
309
+ if reduction is None:
310
+ raise InvalidParameterError(
311
+ "Expected reduction if using EmbeddingModel to fit UMAP"
312
+ )
313
+
314
+ # runtime check on value
315
+ if isinstance(reduction, str):
316
+ reduction = ReductionType(reduction)
317
+ reduction = reduction.value
318
+
299
319
  umap_api = getattr(self.session, "umap", None)
300
320
  assert isinstance(umap_api, UMAPAPI)
301
321
 
@@ -306,12 +326,18 @@ class EmbeddingModel:
306
326
  raise InvalidParameterError(
307
327
  "Expected either assay or sequences to fit UMAP on!"
308
328
  )
329
+ # get assay_id
330
+ assay_id = (
331
+ assay.assay_id
332
+ if isinstance(assay, AssayMetadata)
333
+ else assay.id if isinstance(assay, AssayDataset) else assay
334
+ )
309
335
  model_id = self.id
310
336
  return umap_api.fit_umap(
311
337
  model_id=model_id,
312
338
  feature_type=FeatureType.PLM,
313
339
  sequences=sequences,
314
- assay_id=assay.id if assay is not None else None,
340
+ assay_id=assay_id,
315
341
  n_components=n_components,
316
342
  reduction=reduction,
317
343
  **kwargs,
@@ -319,7 +345,7 @@ class EmbeddingModel:
319
345
 
320
346
  def fit_gp(
321
347
  self,
322
- assay: AssayMetadata | AssayDataset | str,
348
+ assay: AssayDataset | AssayMetadata | str,
323
349
  properties: list[str],
324
350
  reduction: ReductionType,
325
351
  name: str | None = None,
@@ -358,26 +384,9 @@ class EmbeddingModel:
358
384
  # local import to resolve cyclic
359
385
  from openprotein.predictor import PredictorAPI
360
386
 
361
- data_api = getattr(self.session, "data", None)
362
- assert isinstance(data_api, DataAPI)
363
387
  predictor_api = getattr(self.session, "predictor", None)
364
388
  assert isinstance(predictor_api, PredictorAPI)
365
389
 
366
- # get assay if str
367
- assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
368
- # extract assay_id
369
- if len(properties) == 0:
370
- raise InvalidParameterError("Expected (at-least) 1 property to train")
371
- if not set(properties) <= set(assay.measurement_names):
372
- raise InvalidParameterError(
373
- f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
374
- )
375
- # TODO - support multitask
376
- if len(properties) > 1:
377
- raise InvalidParameterError(
378
- "Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
379
- )
380
-
381
390
  # inject into predictor api
382
391
  return predictor_api.fit_gp(
383
392
  assay=assay,
@@ -464,6 +464,8 @@ class FoldComplexResultFuture(Future):
464
464
  AttributeError
465
465
  If affinity is not supported for the model.
466
466
  """
467
+ from .boltz import BoltzAffinity
468
+
467
469
  if self.model_id not in {"boltz-1", "boltz-1x", "boltz-2"}:
468
470
  raise AttributeError("affinity not supported for non-Boltz model")
469
471
  if self._affinity is None:
@@ -162,8 +162,8 @@ def predictor_fit_gp_post(
162
162
  body["name"] = name
163
163
  if description is not None:
164
164
  body["description"] = description
165
- # add kwargs for embeddings kwargs
166
- body.update(kwargs)
165
+ # add kwargs for embeddings kwargs to features
166
+ body["features"].update(kwargs)
167
167
 
168
168
  response = session.post(endpoint, json=body)
169
169
  return PredictorTrainJob.model_validate(response.json())
@@ -1,10 +1,11 @@
1
1
  """Predictor API providing the interface to train and predict predictors."""
2
2
 
3
3
  from openprotein.base import APISession
4
- from openprotein.common import FeatureType, ReductionType
4
+ from openprotein.common import Feature, FeatureType, Reduction, ReductionType
5
5
  from openprotein.data import (
6
6
  AssayDataset,
7
7
  AssayMetadata,
8
+ DataAPI,
8
9
  )
9
10
  from openprotein.embeddings import EmbeddingModel, EmbeddingsAPI
10
11
  from openprotein.errors import InvalidParameterError
@@ -120,8 +121,8 @@ class PredictorAPI:
120
121
  assay: AssayDataset | AssayMetadata | str,
121
122
  properties: list[str],
122
123
  model: EmbeddingModel | SVDModel | str,
123
- feature_type: FeatureType | None = None,
124
- reduction: ReductionType | None = None,
124
+ feature_type: Feature | FeatureType | None = None,
125
+ reduction: Reduction | ReductionType | None = None,
125
126
  name: str | None = None,
126
127
  description: str | None = None,
127
128
  **kwargs,
@@ -139,10 +140,10 @@ class PredictorAPI:
139
140
  Instance of either EmbeddingModel or SVDModel to use depending
140
141
  on feature type. Can also be a str specifying the model id,
141
142
  but then feature_type would have to be specified.
142
- feature_type : FeatureType or None
143
+ feature_type : Feature or FeatureType or None
143
144
  Type of features to use for encoding sequences. "SVD" or "PLM".
144
145
  None would require model to be EmbeddingModel or SVDModel.
145
- reduction : str or None, optional
146
+ reduction : Reduction or ReductionType or None, optional
146
147
  Type of embedding reduction to use for computing features.
147
148
  E.g. "MEAN" or "SUM". Used only if using EmbeddingModel, and
148
149
  must be non-nil if using an EmbeddingModel. Defaults to None.
@@ -154,6 +155,29 @@ class PredictorAPI:
154
155
  PredictorModel
155
156
  The GP model being fit.
156
157
  """
158
+ data_api = getattr(self.session, "data", None)
159
+ assert isinstance(data_api, DataAPI)
160
+ # 1. Check assay data input
161
+ # get assay if str
162
+ assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
163
+ # extract assay_id
164
+ assay_id = (
165
+ assay.assay_id
166
+ if isinstance(assay, AssayMetadata)
167
+ else assay.id if isinstance(assay, AssayDataset) else assay
168
+ )
169
+ if len(properties) == 0:
170
+ raise InvalidParameterError("Expected (at-least) 1 property to train")
171
+ if not set(properties) <= set(assay.measurement_names):
172
+ raise InvalidParameterError(
173
+ f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
174
+ )
175
+ # TODO - support multitask
176
+ if len(properties) > 1:
177
+ raise InvalidParameterError(
178
+ "Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
179
+ )
180
+ # 2. Check features input
157
181
  # extract feature type
158
182
  feature_type = (
159
183
  FeatureType.PLM
@@ -164,6 +188,15 @@ class PredictorAPI:
164
188
  raise InvalidParameterError(
165
189
  "Expected feature_type to be provided if passing str model_id as model"
166
190
  )
191
+ # runtime check on value
192
+ if isinstance(feature_type, str):
193
+ feature_type = FeatureType(feature_type)
194
+
195
+ # 3. Check reduction
196
+ if isinstance(reduction, str):
197
+ reduction = ReductionType(reduction)
198
+ reduction = reduction.value
199
+
167
200
  # get model if model_id
168
201
  if feature_type == FeatureType.PLM:
169
202
  if reduction is None:
@@ -183,19 +216,14 @@ class PredictorAPI:
183
216
  model = svd_api.get_svd(model)
184
217
  assert isinstance(model, SVDModel), "Expected SVDModel"
185
218
  model_id = model.id
186
- # get assay_id
187
- assay_id = (
188
- assay.assay_id
189
- if isinstance(assay, AssayMetadata)
190
- else assay.id if isinstance(assay, AssayDataset) else assay
191
- )
219
+
192
220
  return PredictorModel(
193
221
  session=self.session,
194
222
  job=api.predictor_fit_gp_post(
195
223
  session=self.session,
196
224
  assay_id=assay_id,
197
225
  properties=properties,
198
- feature_type=feature_type,
226
+ feature_type=feature_type.value,
199
227
  model_id=model_id,
200
228
  reduction=reduction,
201
229
  name=name,
@@ -29,6 +29,8 @@ class Features(BaseModel):
29
29
  model_id: str | None = None
30
30
  reduction: str | None = None
31
31
 
32
+ # TODO: model extra kwargs
33
+
32
34
  model_config = ConfigDict(protected_namespaces=())
33
35
 
34
36
 
@@ -29,38 +29,6 @@ _BACKBONE_ATOM_TYPES = ("N", "CA", "C")
29
29
  _NAN_BFACTOR_VALUE = 9999.75 # can't/hard to use 9999.99 due to precision issues
30
30
 
31
31
 
32
- def calc_rmsd(
33
- xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
34
- ) -> tuple[float, npt.NDArray[np.floating]]:
35
- """
36
- Calculates RMSD between two sets of atoms (L, 3)
37
- Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719
38
- """
39
- # center to CA centroid
40
- xyz1 = xyz1 - xyz1.mean(0)
41
- xyz2 = xyz2 - xyz2.mean(0)
42
-
43
- # Computation of the covariance matrix
44
- C = xyz2.T @ xyz1
45
-
46
- # Compute otimal rotation matrix using SVD
47
- V, S, W = np.linalg.svd(C)
48
-
49
- # get sign to ensure right-handedness
50
- d = np.ones([3, 3])
51
- d[:, -1] = np.sign(np.linalg.det(V) * np.linalg.det(W))
52
-
53
- # Rotation matrix U
54
- U = (d * V) @ W
55
-
56
- # Rotate xyz2
57
- xyz2_ = xyz2 @ U
58
- L = xyz2_.shape[0]
59
- rmsd = np.sqrt(np.sum((xyz2_ - xyz1) * (xyz2_ - xyz1), axis=(0, 1)) / L + eps)
60
-
61
- return rmsd, U
62
-
63
-
64
32
  class Protein:
65
33
  """
66
34
  Represents a protein with optional sequence, atomic coordinates, per-residue
@@ -416,10 +384,12 @@ class Protein:
416
384
  else:
417
385
  atom.b_iso = _NAN_BFACTOR_VALUE
418
386
  atom = residue.add_atom(atom)
419
- block = structure.make_mmcif_block()
420
387
  # NB: gemmi doesn't seem to write the _chem_comp category properly... it says
421
388
  # the type is `.`, but is should be something like `L-PEPTIDE LINKING`...
422
- block.find_mmcif_category("_chem_comp").erase() # ...so we remove it
389
+ # see also: https://github.com/project-gemmi/gemmi/discussions/362
390
+ block = structure.make_mmcif_block(
391
+ groups=gemmi.MmcifOutputGroups(True, chem_comp=False)
392
+ )
423
393
  return block.as_string()
424
394
 
425
395
  def make_fasta_bytes(self) -> bytes:
@@ -479,7 +449,6 @@ class Protein:
479
449
  model_idx: int = 0,
480
450
  verbose: bool = True,
481
451
  ) -> "Protein":
482
- filestring = filestring if isinstance(filestring, str) else filestring.decode()
483
452
  if format == "pdb":
484
453
  structure = gemmi.read_pdb_string(filestring)
485
454
  elif format == "cif":
@@ -507,7 +476,7 @@ class Protein:
507
476
  structure.setup_entities()
508
477
  structure.assign_label_seq_id()
509
478
  if use_bfactor_as_plddt is None:
510
- use_bfactor_as_plddt = structure.resolution == 0.0
479
+ use_bfactor_as_plddt = _use_bfactor_as_plddt(structure=structure)
511
480
  model = structure[model_idx]
512
481
  chain = model.find_chain(chain_id)
513
482
  assert chain is not None
@@ -585,3 +554,51 @@ def parse_fasta_as_proteins(path: str | Path) -> list[Protein]:
585
554
  for name, sequence in fasta.parse_stream(fp):
586
555
  proteins.append(Protein(name=name, sequence=sequence))
587
556
  return proteins
557
+
558
+
559
+ def _use_bfactor_as_plddt(structure: gemmi.Structure) -> bool:
560
+ """
561
+ This heuristic decides whether to use B-factor as pLDDT.
562
+ It uses B-factor as pLDDT when all of the following fields are *not* set:
563
+ - structure resolution
564
+ - _pdbx_database_status.recvd_initial_deposition_date
565
+ This heuristic may be changed in the future.
566
+ """
567
+ return (structure.resolution == 0.0) and (
568
+ structure.make_mmcif_block(
569
+ groups=gemmi.MmcifOutputGroups(False, database_status=True)
570
+ ).find_value("_pdbx_database_status.recvd_initial_deposition_date")
571
+ is None
572
+ )
573
+
574
+
575
+ def calc_rmsd(
576
+ xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
577
+ ) -> tuple[float, npt.NDArray[np.floating]]:
578
+ """
579
+ Calculates RMSD between two sets of atoms (L, 3)
580
+ Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719
581
+ """
582
+ # center to CA centroid
583
+ xyz1 = xyz1 - xyz1.mean(0)
584
+ xyz2 = xyz2 - xyz2.mean(0)
585
+
586
+ # Computation of the covariance matrix
587
+ C = xyz2.T @ xyz1
588
+
589
+ # Compute otimal rotation matrix using SVD
590
+ V, S, W = np.linalg.svd(C)
591
+
592
+ # get sign to ensure right-handedness
593
+ d = np.ones([3, 3])
594
+ d[:, -1] = np.sign(np.linalg.det(V) * np.linalg.det(W))
595
+
596
+ # Rotation matrix U
597
+ U = (d * V) @ W
598
+
599
+ # Rotate xyz2
600
+ xyz2_ = xyz2 @ U
601
+ L = xyz2_.shape[0]
602
+ rmsd = np.sqrt(np.sum((xyz2_ - xyz1) * (xyz2_ - xyz1), axis=(0, 1)) / L + eps)
603
+
604
+ return rmsd, U
@@ -1,5 +1,7 @@
1
1
  """SVD API providing the interface for creating and using SVD models."""
2
2
 
3
+ from typing import Literal
4
+
3
5
  from openprotein.base import APISession
4
6
  from openprotein.common import ReductionType
5
7
  from openprotein.data import AssayDataset, AssayMetadata
@@ -20,11 +22,11 @@ class SVDAPI:
20
22
 
21
23
  def fit_svd(
22
24
  self,
23
- model_id: str,
25
+ model_id: str | EmbeddingModel,
24
26
  sequences: list[bytes] | list[str] | None = None,
25
27
  assay: AssayMetadata | AssayDataset | str | None = None,
26
28
  n_components: int = 1024,
27
- reduction: ReductionType | None = None,
29
+ reduction: Literal["MEAN", "SUM"] | None = None,
28
30
  **kwargs,
29
31
  ) -> SVDModel:
30
32
  """
@@ -32,7 +34,7 @@ class SVDAPI:
32
34
 
33
35
  Parameters
34
36
  ----------
35
- model_id : str
37
+ model_id : str or EmbeddingModel
36
38
  ID of embeddings model to use.
37
39
  sequences : list of bytes or None, optional
38
40
  Optional sequences to fit SVD with. Either use sequences or
@@ -43,7 +45,7 @@ class SVDAPI:
43
45
  Ignored if sequences are provided.
44
46
  n_components : int, optional
45
47
  The number of components for the SVD. Defaults to 1024.
46
- reduction : str or None, optional
48
+ reduction : str or ReductionType or None, optional
47
49
  Type of embedding reduction to use for computing features.
48
50
  E.g. "MEAN" or "SUM". Useful when dealing with variable length
49
51
  sequence. Defaults to None.
@@ -1,7 +1,10 @@
1
1
  """UMAP API providing the interface to fit and run UMAP visualizations."""
2
2
 
3
+ import typing
4
+ from typing import Literal
5
+
3
6
  from openprotein.base import APISession
4
- from openprotein.common import FeatureType, ReductionType
7
+ from openprotein.common import Feature, FeatureType, Reduction, ReductionType
5
8
  from openprotein.data import AssayDataset, AssayMetadata
6
9
  from openprotein.embeddings import EmbeddingModel, EmbeddingsAPI
7
10
  from openprotein.errors import InvalidParameterError
@@ -21,16 +24,35 @@ class UMAPAPI:
21
24
  ):
22
25
  self.session = session
23
26
 
27
+ @typing.overload
28
+ def fit_umap(
29
+ self,
30
+ model: EmbeddingModel,
31
+ reduction: Reduction | ReductionType,
32
+ feature_type: Literal["PLM"] = "PLM",
33
+ sequences: list[bytes] | list[str] | None = None,
34
+ assay: AssayDataset | AssayMetadata | str | None = None,
35
+ n_components: int = 2,
36
+ n_neighbors: int = 15,
37
+ min_dist: float = 0.1,
38
+ ) -> UMAPModel: ...
39
+
40
+ @typing.overload
41
+ def fit_umap(
42
+ self,
43
+ model: EmbeddingModel,
44
+ ) -> UMAPModel: ...
45
+
24
46
  def fit_umap(
25
47
  self,
26
48
  model: EmbeddingModel | SVDModel | str,
27
- feature_type: FeatureType | None = None,
49
+ reduction: Reduction | ReductionType | None = None,
50
+ feature_type: Feature | FeatureType | None = None,
28
51
  sequences: list[bytes] | list[str] | None = None,
29
- assay: AssayMetadata | AssayDataset | str | None = None,
52
+ assay: AssayDataset | AssayMetadata | str | None = None,
30
53
  n_components: int = 2,
31
54
  n_neighbors: int = 15,
32
55
  min_dist: float = 0.1,
33
- reduction: ReductionType | None = None,
34
56
  **kwargs,
35
57
  ) -> UMAPModel:
36
58
  """
@@ -42,14 +64,14 @@ class UMAPAPI:
42
64
  Optional sequences to fit UMAP with. Either use sequences or
43
65
  assay_id. sequences is preferred.
44
66
  assay : AssayMetadata or AssayDataset or str or None, optional
45
- Optional assay containing sequences to fit SVD with.
67
+ Optional assay containing sequences to fit UMAP with.
46
68
  Or its assay_id. Either use sequences or assay.
47
69
  Ignored if sequences are provided.
48
70
  model : EmbeddingModel or SVDModel or str
49
71
  Instance of either EmbeddingModel or SVDModel to use depending
50
72
  on feature type. Can also be a str specifying the model id,
51
73
  but then feature_type would have to be specified.
52
- feature_type : FeatureType or None, optional
74
+ feature_type : str or FeatureType or None, optional
53
75
  Type of features to use for encoding sequences. "SVD" or "PLM".
54
76
  None would require model to be EmbeddingModel or SVDModel.
55
77
  n_components : int, optional
@@ -58,7 +80,7 @@ class UMAPAPI:
58
80
  Number of neighbors to use for fitting. Defaults to 15.
59
81
  min_dist : float, optional
60
82
  Minimum distance in UMAP fitting. Defaults to 0.1.
61
- reduction : str or None, optional
83
+ reduction : str or ReductionType or None, optional
62
84
  Type of embedding reduction to use for computing features.
63
85
  E.g. "MEAN" or "SUM". Useful when dealing with variable length
64
86
  sequence. Defaults to None.
@@ -70,6 +92,13 @@ class UMAPAPI:
70
92
  UMAPModel
71
93
  The UMAP model being fit.
72
94
  """
95
+ # 1. Check assay data input - just need the id
96
+ # get assay_id
97
+ assay_id = (
98
+ assay.assay_id
99
+ if isinstance(assay, AssayMetadata)
100
+ else assay.id if isinstance(assay, AssayDataset) else assay
101
+ )
73
102
  # extract feature type
74
103
  feature_type = (
75
104
  FeatureType.PLM
@@ -80,11 +109,15 @@ class UMAPAPI:
80
109
  raise InvalidParameterError(
81
110
  "Expected feature_type to be provided if passing str model_id as model"
82
111
  )
112
+ if isinstance(feature_type, str):
113
+ feature_type = FeatureType(feature_type)
114
+ if isinstance(reduction, str):
115
+ reduction = ReductionType(reduction)
83
116
  # get model if model_id
84
117
  if feature_type == FeatureType.PLM:
85
118
  if reduction is None:
86
119
  raise InvalidParameterError(
87
- "Expected reduction if using EmbeddingModel"
120
+ "Expected reduction if using embedding model"
88
121
  )
89
122
  if isinstance(model, str):
90
123
  embeddings_api = getattr(self.session, "embedding", None)
@@ -93,18 +126,14 @@ class UMAPAPI:
93
126
  assert isinstance(model, EmbeddingModel), "Expected EmbeddingModel"
94
127
  model_id = model.id
95
128
  elif feature_type == FeatureType.SVD:
129
+ if reduction is not None:
130
+ raise InvalidParameterError("Unexpected reduction when using SVD model")
96
131
  if isinstance(model, str):
97
132
  svd_api = getattr(self.session, "svd", None)
98
133
  assert isinstance(svd_api, SVDAPI)
99
134
  model = svd_api.get_svd(model)
100
135
  assert isinstance(model, SVDModel), "Expected SVDModel"
101
136
  model_id = model.id
102
- # get assay_id
103
- assay_id = (
104
- assay.assay_id
105
- if isinstance(assay, AssayMetadata)
106
- else assay.id if isinstance(assay, AssayDataset) else assay
107
- )
108
137
  return UMAPModel(
109
138
  session=self.session,
110
139
  job=api.umap_fit_post(
@@ -35,6 +35,7 @@ dev = [
35
35
  "matplotlib>=3.9.2,<4",
36
36
  "scipy>=1.14.1,<2",
37
37
  "hatchling>=1.26.1",
38
+ "hatch-vcs>=0.5,<1",
38
39
  "editables>=0.5,<0.6",
39
40
  "seaborn>=0.13.2,<0.14",
40
41
  "jupyterlab>=4.4.1,<5",
@@ -55,24 +56,6 @@ jupyterinstall = "python -m ipykernel install --user --name=openprotein-python"
55
56
  [tool.pixi.environments]
56
57
  dev = ["dev"]
57
58
 
58
- [tool.pixi.package]
59
- name = "openprotein-python"
60
-
61
- [tool.pixi.package.build]
62
- backend = { name = "pixi-build-python", version = "0.1.*" }
63
- channels = ["conda-forge"]
64
-
65
- [tool.pixi.package.host-dependencies]
66
- hatchling = "*"
67
-
68
- [tool.pixi.package.run-dependencies]
69
- requests = ">=2.32.3,<3"
70
- pydantic = ">=2.5,<3"
71
- tqdm = ">=4.66.5,<5"
72
- pandas = ">=2.2.2,<3"
73
- numpy = ">=1.9,<3"
74
- gemmi = ">=0.7.0,<0.8"
75
-
76
59
  [build-system]
77
60
  requires = ["hatchling>=1.26.1", "hatch-vcs>=0.5.0"]
78
61
  build-backend = "hatchling.build"
@@ -1,7 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
- class FeatureType(str, Enum):
5
-
6
- PLM = "PLM"
7
- SVD = "SVD"
@@ -1,8 +0,0 @@
1
- """Reduction types used in OpenProtein."""
2
-
3
- from enum import Enum
4
-
5
-
6
- class ReductionType(str, Enum):
7
- MEAN = "MEAN"
8
- SUM = "SUM"