PyPI - openprotein-python - Versions diffs - 0.8.4__tar.gz → 0.8.5__tar.gz - Mend

openprotein-python 0.8.4.tar.gz → 0.8.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openprotein-python
-Version: 0.8.4
+Version: 0.8.5
 Summary: OpenProtein Python interface.
 Author-email: Mark Gee <markgee@ne47.bio>, "Timothy Truong Jr." <ttruong@ne47.bio>, Tristan Bepler <tbepler@ne47.bio>
 License-Expression: MIT
@@ -28,14 +28,14 @@ The OpenProtein.AI Python Interface provides a user-friendly library to interact
 # Table of Contents
-|   | Workflow                                           | Description                                          |
-|---|----------------------------------------------------|------------------------------------------------------|
-| 0 | [`Quick start`](#Quick-start)                    | Quick start guide                     |
-| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html)                    | Install guide for pip and conda.                     |
-| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html)        | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
-| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
-| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
-| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
+|   | Workflow                                                                                                     | Description                                                                              |
+|---|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|
+| 0 | [`Quick start`](#Quick-start)                                                                                | Quick start guide                                                                        |
+| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html)                                   | Install guide for pip and conda.                                                         |
+| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html)                                 | An overview of the OpenProtein Python Client & the asynchronous jobs system.             |
+| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html)                | Covers core tasks such as data upload, model training & prediction, and sequence design. |
+| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation.        |
+| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html)     | Covers methods for creating sequence embeddings with proprietary & open-source models.   |
 # Quick-start

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/README.md RENAMED Viewed

@@ -10,14 +10,14 @@ The OpenProtein.AI Python Interface provides a user-friendly library to interact
 # Table of Contents
-|   | Workflow                                           | Description                                          |
-|---|----------------------------------------------------|------------------------------------------------------|
-| 0 | [`Quick start`](#Quick-start)                    | Quick start guide                     |
-| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html)                    | Install guide for pip and conda.                     |
-| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html)        | An overview of the OpenProtein Python Client & the asynchronous jobs system. |
-| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html) | Covers core tasks such as data upload, model training & prediction, and sequence design. |
-| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation. |
-| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html) | Covers methods for creating sequence embeddings with proprietary & open-source models. |
+|   | Workflow                                                                                                     | Description                                                                              |
+|---|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|
+| 0 | [`Quick start`](#Quick-start)                                                                                | Quick start guide                                                                        |
+| 1 | [`Installation`](https://docs.openprotein.ai/api-python/installation.html)                                   | Install guide for pip and conda.                                                         |
+| 2 | [`Session management`](https://docs.openprotein.ai/api-python/overview.html)                                 | An overview of the OpenProtein Python Client & the asynchronous jobs system.             |
+| 3 | [`Asssay-based Sequence Learning`](https://docs.openprotein.ai/api-python/core_workflow.html)                | Covers core tasks such as data upload, model training & prediction, and sequence design. |
+| 4 | [`De Novo prediction & generative models (PoET)`](https://docs.openprotein.ai/api-python/poet_workflow.html) | Covers PoET, a protein LLM for *de novo* scoring, as well as sequence generation.        |
+| 5 | [`Protein Language Models & Embeddings`](https://docs.openprotein.ai/api-python/embedding_workflow.html)     | Covers methods for creating sequence embeddings with proprietary & open-source models.   |
 # Quick-start

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/common/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """Common classes and utilities for OpenProtein."""
-from .features import FeatureType
+from .features import Feature, FeatureType
 from .model_metadata import ModelDescription, ModelMetadata, TokenInfo
-from .reduction import ReductionType
+from .reduction import Reduction, ReductionType

openprotein_python-0.8.5/openprotein/common/features.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Feature types used in OpenProtein."""
+from enum import Enum
+from typing import Literal
+class FeatureType(str, Enum):
+    PLM = "PLM"
+    SVD = "SVD"
+# NOTE: only works with python 3.12+
+# Feature = Literal[*tuple([r.value for r in FeatureType])]
+Feature = Literal["PLM", "SVD"]

openprotein_python-0.8.5/openprotein/common/reduction.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Reduction types used in OpenProtein."""
+from enum import Enum
+from typing import Literal
+class ReductionType(str, Enum):
+    MEAN = "MEAN"
+    SUM = "SUM"
+# NOTE: only works with python 3.12+
+# Reduction = Literal[*tuple([r.value for r in ReductionType])]
+Reduction = Literal["MEAN", "SUM"]

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/api.py RENAMED Viewed

@@ -64,7 +64,9 @@ def assaydata_post(
         raise APIError(f"Unable to post assay data: {response.text}")
-def assaydata_list(session: APISession) -> list[AssayMetadata]:
+def assaydata_list(
+    session: APISession, limit: int | None = None, offset: int | None = None
+) -> list[AssayMetadata]:
     """
     Get a list of all assay metadata.
@@ -72,6 +74,10 @@ def assaydata_list(session: APISession) -> list[AssayMetadata]:
     ----------
     session : APISession
         Session object for API communication.
+    limit : int, optional
+        Limit the number of assays to return.
+    offset : int, optional
+        Offset of assays to retrieve. Useful with limit.
     Returns
     -------
@@ -84,7 +90,12 @@ def assaydata_list(session: APISession) -> list[AssayMetadata]:
         If an error occurs during the API request.
     """
     endpoint = "v1/assaydata"
-    response = session.get(endpoint)
+    params = {}
+    if limit is not None:
+        params["limit"] = limit
+    if offset is not None:
+        params["offset"] = offset
+    response = session.get(endpoint, params=params)
     if response.status_code == 200:
         return TypeAdapter(list[AssayMetadata]).validate_python(response.json())
     else:

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/data/data.py RENAMED Viewed

@@ -14,16 +14,23 @@ class DataAPI:
     def __init__(self, session: APISession):
         self.session = session
-    def list(self) -> list[AssayDataset]:
+    def list(
+        self, limit: int | None = None, offset: int | None = None
+    ) -> list[AssayDataset]:
         """
         List all assay datasets.
+        limit : int, optional
+            Limit the number of assays to return.
+        offset : int, optional
+            Offset of assays to retrieve. Useful with limit.
         Returns
         -------
         List[AssayDataset]
             List of all assay datasets.
         """
-        metadata = api.assaydata_list(self.session)
+        metadata = api.assaydata_list(session=self.session, limit=limit, offset=offset)
         return [AssayDataset(self.session, x) for x in metadata]
     def create(

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/embeddings/models.py RENAMED Viewed

@@ -3,7 +3,13 @@
 from typing import TYPE_CHECKING
 from openprotein.base import APISession
-from openprotein.common import FeatureType, ModelMetadata, ReductionType
+from openprotein.common import (
+    Feature,
+    FeatureType,
+    ModelMetadata,
+    Reduction,
+    ReductionType,
+)
 from openprotein.data import AssayDataset, AssayMetadata, DataAPI
 from openprotein.errors import InvalidParameterError
@@ -199,9 +205,9 @@ class EmbeddingModel:
     def fit_svd(
         self,
         sequences: list[bytes] | list[str] | None = None,
-        assay: AssayDataset | None = None,
+        assay: AssayDataset | AssayMetadata | None = None,
         n_components: int = 1024,
-        reduction: ReductionType | None = None,
+        reduction: Reduction | ReductionType | None = None,
         **kwargs,
     ) -> "SVDModel":
         """
@@ -236,6 +242,11 @@ class EmbeddingModel:
         # local import for cyclic dep
         from openprotein.svd import SVDAPI
+        # runtime check on value
+        if isinstance(reduction, str):
+            reduction = ReductionType(reduction)
+            reduction = reduction.value
         svd_api = getattr(self.session, "svd", None)
         assert isinstance(svd_api, SVDAPI)
@@ -246,9 +257,8 @@ class EmbeddingModel:
             raise InvalidParameterError(
                 "Expected either assay or sequences to fit SVD on!"
             )
-        model_id = self.id
         return svd_api.fit_svd(
-            model_id=model_id,
+            model=self,
             sequences=sequences,
             assay=assay,
             n_components=n_components,
@@ -259,9 +269,9 @@ class EmbeddingModel:
     def fit_umap(
         self,
         sequences: list[bytes] | list[str] | None = None,
-        assay: AssayDataset | None = None,
+        assay: AssayDataset | AssayMetadata | None = None,
         n_components: int = 2,
-        reduction: ReductionType | None = ReductionType.MEAN,
+        reduction: Reduction | ReductionType = "MEAN",
         **kwargs,
     ) -> "UMAPModel":
         """
@@ -274,11 +284,11 @@ class EmbeddingModel:
         ----------
         sequences : list of bytes or list of str or None, optional
             Optional sequences to fit UMAP with. Either use sequences or assay. Sequences is preferred.
-        assay : AssayDataset or None, optional
+        assay : AssayDataset or AssayMetadata or None, optional
             Optional assay containing sequences to fit UMAP with. Either use sequences or assay. Ignored if sequences are provided.
         n_components : int, optional
             Number of components in UMAP fit. Determines output shapes. Default is 2.
-        reduction : ReductionType or None, optional
+        reduction : Reduction or ReductionType or None, optional
             Embeddings reduction to use (e.g. mean). Defaults to MEAN.
         kwargs :
             Additional keyword arguments to be used from foundational models, e.g. prompt_id for PoET models.
@@ -296,6 +306,16 @@ class EmbeddingModel:
         # local import for cyclic dep
         from openprotein.umap import UMAPAPI
+        if reduction is None:
+            raise InvalidParameterError(
+                "Expected reduction if using EmbeddingModel to fit UMAP"
+            )
+        # runtime check on value
+        if isinstance(reduction, str):
+            reduction = ReductionType(reduction)
+            reduction = reduction.value
         umap_api = getattr(self.session, "umap", None)
         assert isinstance(umap_api, UMAPAPI)
@@ -306,12 +326,18 @@ class EmbeddingModel:
             raise InvalidParameterError(
                 "Expected either assay or sequences to fit UMAP on!"
             )
+        # get assay_id
+        assay_id = (
+            assay.assay_id
+            if isinstance(assay, AssayMetadata)
+            else assay.id if isinstance(assay, AssayDataset) else assay
+        )
         model_id = self.id
         return umap_api.fit_umap(
             model_id=model_id,
             feature_type=FeatureType.PLM,
             sequences=sequences,
-            assay_id=assay.id if assay is not None else None,
+            assay_id=assay_id,
             n_components=n_components,
             reduction=reduction,
             **kwargs,
@@ -319,7 +345,7 @@ class EmbeddingModel:
     def fit_gp(
         self,
-        assay: AssayMetadata | AssayDataset | str,
+        assay: AssayDataset | AssayMetadata | str,
         properties: list[str],
         reduction: ReductionType,
         name: str | None = None,
@@ -358,26 +384,9 @@ class EmbeddingModel:
         # local import to resolve cyclic
         from openprotein.predictor import PredictorAPI
-        data_api = getattr(self.session, "data", None)
-        assert isinstance(data_api, DataAPI)
         predictor_api = getattr(self.session, "predictor", None)
         assert isinstance(predictor_api, PredictorAPI)
-        # get assay if str
-        assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
-        # extract assay_id
-        if len(properties) == 0:
-            raise InvalidParameterError("Expected (at-least) 1 property to train")
-        if not set(properties) <= set(assay.measurement_names):
-            raise InvalidParameterError(
-                f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
-            )
-        # TODO - support multitask
-        if len(properties) > 1:
-            raise InvalidParameterError(
-                "Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
-            )
         # inject into predictor api
         return predictor_api.fit_gp(
             assay=assay,

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/fold/future.py RENAMED Viewed

@@ -464,6 +464,8 @@ class FoldComplexResultFuture(Future):
         AttributeError
             If affinity is not supported for the model.
         """
+        from .boltz import BoltzAffinity
         if self.model_id not in {"boltz-1", "boltz-1x", "boltz-2"}:
             raise AttributeError("affinity not supported for non-Boltz model")
         if self._affinity is None:

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/api.py RENAMED Viewed

@@ -162,8 +162,8 @@ def predictor_fit_gp_post(
         body["name"] = name
     if description is not None:
         body["description"] = description
-    # add kwargs for embeddings kwargs
-    body.update(kwargs)
+    # add kwargs for embeddings kwargs to features
+    body["features"].update(kwargs)
     response = session.post(endpoint, json=body)
     return PredictorTrainJob.model_validate(response.json())

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/predictor.py RENAMED Viewed

@@ -1,10 +1,11 @@
 """Predictor API providing the interface to train and predict predictors."""
 from openprotein.base import APISession
-from openprotein.common import FeatureType, ReductionType
+from openprotein.common import Feature, FeatureType, Reduction, ReductionType
 from openprotein.data import (
     AssayDataset,
     AssayMetadata,
+    DataAPI,
 )
 from openprotein.embeddings import EmbeddingModel, EmbeddingsAPI
 from openprotein.errors import InvalidParameterError
@@ -120,8 +121,8 @@ class PredictorAPI:
         assay: AssayDataset | AssayMetadata | str,
         properties: list[str],
         model: EmbeddingModel | SVDModel | str,
-        feature_type: FeatureType | None = None,
-        reduction: ReductionType | None = None,
+        feature_type: Feature | FeatureType | None = None,
+        reduction: Reduction | ReductionType | None = None,
         name: str | None = None,
         description: str | None = None,
         **kwargs,
@@ -139,10 +140,10 @@ class PredictorAPI:
             Instance of either EmbeddingModel or SVDModel to use depending
             on feature type. Can also be a str specifying the model id,
             but then feature_type would have to be specified.
-        feature_type : FeatureType or None
+        feature_type : Feature or FeatureType or None
             Type of features to use for encoding sequences. "SVD" or "PLM".
             None would require model to be EmbeddingModel or SVDModel.
-        reduction  : str or None, optional
+        reduction  : Reduction or ReductionType or None, optional
             Type of embedding reduction to use for computing features.
             E.g. "MEAN" or "SUM". Used only if using EmbeddingModel, and
             must be non-nil if using an EmbeddingModel. Defaults to None.
@@ -154,6 +155,29 @@ class PredictorAPI:
         PredictorModel
             The GP model being fit.
         """
+        data_api = getattr(self.session, "data", None)
+        assert isinstance(data_api, DataAPI)
+        # 1. Check assay data input
+        # get assay if str
+        assay = data_api.get(assay_id=assay) if isinstance(assay, str) else assay
+        # extract assay_id
+        assay_id = (
+            assay.assay_id
+            if isinstance(assay, AssayMetadata)
+            else assay.id if isinstance(assay, AssayDataset) else assay
+        )
+        if len(properties) == 0:
+            raise InvalidParameterError("Expected (at-least) 1 property to train")
+        if not set(properties) <= set(assay.measurement_names):
+            raise InvalidParameterError(
+                f"Expected all provided properties to be a subset of assay's measurements: {assay.measurement_names}"
+            )
+        # TODO - support multitask
+        if len(properties) > 1:
+            raise InvalidParameterError(
+                "Training a multitask GP is not yet supported (i.e. number of properties should only be 1 for now)"
+            )
+        # 2. Check features input
         # extract feature type
         feature_type = (
             FeatureType.PLM
@@ -164,6 +188,15 @@ class PredictorAPI:
             raise InvalidParameterError(
                 "Expected feature_type to be provided if passing str model_id as model"
             )
+        # runtime check on value
+        if isinstance(feature_type, str):
+            feature_type = FeatureType(feature_type)
+        # 3. Check reduction
+        if isinstance(reduction, str):
+            reduction = ReductionType(reduction)
+            reduction = reduction.value
         # get model if model_id
         if feature_type == FeatureType.PLM:
             if reduction is None:
@@ -183,19 +216,14 @@ class PredictorAPI:
                 model = svd_api.get_svd(model)
             assert isinstance(model, SVDModel), "Expected SVDModel"
             model_id = model.id
-        # get assay_id
-        assay_id = (
-            assay.assay_id
-            if isinstance(assay, AssayMetadata)
-            else assay.id if isinstance(assay, AssayDataset) else assay
-        )
         return PredictorModel(
             session=self.session,
             job=api.predictor_fit_gp_post(
                 session=self.session,
                 assay_id=assay_id,
                 properties=properties,
-                feature_type=feature_type,
+                feature_type=feature_type.value,
                 model_id=model_id,
                 reduction=reduction,
                 name=name,

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/predictor/schemas.py RENAMED Viewed

@@ -29,6 +29,8 @@ class Features(BaseModel):
     model_id: str | None = None
     reduction: str | None = None
+    # TODO: model extra kwargs
     model_config = ConfigDict(protected_namespaces=())

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/protein.py RENAMED Viewed

@@ -29,38 +29,6 @@ _BACKBONE_ATOM_TYPES = ("N", "CA", "C")
 _NAN_BFACTOR_VALUE = 9999.75  # can't/hard to use 9999.99 due to precision issues
-def calc_rmsd(
-    xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
-) -> tuple[float, npt.NDArray[np.floating]]:
-    """
-    Calculates RMSD between two sets of atoms (L, 3)
-    Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719
-    """
-    # center to CA centroid
-    xyz1 = xyz1 - xyz1.mean(0)
-    xyz2 = xyz2 - xyz2.mean(0)
-    # Computation of the covariance matrix
-    C = xyz2.T @ xyz1
-    # Compute otimal rotation matrix using SVD
-    V, S, W = np.linalg.svd(C)
-    # get sign to ensure right-handedness
-    d = np.ones([3, 3])
-    d[:, -1] = np.sign(np.linalg.det(V) * np.linalg.det(W))
-    # Rotation matrix U
-    U = (d * V) @ W
-    # Rotate xyz2
-    xyz2_ = xyz2 @ U
-    L = xyz2_.shape[0]
-    rmsd = np.sqrt(np.sum((xyz2_ - xyz1) * (xyz2_ - xyz1), axis=(0, 1)) / L + eps)
-    return rmsd, U
 class Protein:
     """
     Represents a protein with optional sequence, atomic coordinates, per-residue
@@ -416,10 +384,12 @@ class Protein:
                 else:
                     atom.b_iso = _NAN_BFACTOR_VALUE
                 atom = residue.add_atom(atom)
-        block = structure.make_mmcif_block()
         # NB: gemmi doesn't seem to write the _chem_comp category properly... it says
         #     the type is `.`, but is should be something like `L-PEPTIDE LINKING`...
-        block.find_mmcif_category("_chem_comp").erase()  # ...so we remove it
+        #     see also: https://github.com/project-gemmi/gemmi/discussions/362
+        block = structure.make_mmcif_block(
+            groups=gemmi.MmcifOutputGroups(True, chem_comp=False)
+        )
         return block.as_string()
     def make_fasta_bytes(self) -> bytes:
@@ -479,7 +449,6 @@ class Protein:
         model_idx: int = 0,
         verbose: bool = True,
     ) -> "Protein":
-        filestring = filestring if isinstance(filestring, str) else filestring.decode()
         if format == "pdb":
             structure = gemmi.read_pdb_string(filestring)
         elif format == "cif":
@@ -507,7 +476,7 @@ class Protein:
         structure.setup_entities()
         structure.assign_label_seq_id()
         if use_bfactor_as_plddt is None:
-            use_bfactor_as_plddt = structure.resolution == 0.0
+            use_bfactor_as_plddt = _use_bfactor_as_plddt(structure=structure)
         model = structure[model_idx]
         chain = model.find_chain(chain_id)
         assert chain is not None
@@ -585,3 +554,51 @@ def parse_fasta_as_proteins(path: str | Path) -> list[Protein]:
         for name, sequence in fasta.parse_stream(fp):
             proteins.append(Protein(name=name, sequence=sequence))
     return proteins
+def _use_bfactor_as_plddt(structure: gemmi.Structure) -> bool:
+    """
+    This heuristic decides whether to use B-factor as pLDDT.
+    It uses B-factor as pLDDT when all of the following fields are *not* set:
+        - structure resolution
+        - _pdbx_database_status.recvd_initial_deposition_date
+    This heuristic may be changed in the future.
+    """
+    return (structure.resolution == 0.0) and (
+        structure.make_mmcif_block(
+            groups=gemmi.MmcifOutputGroups(False, database_status=True)
+        ).find_value("_pdbx_database_status.recvd_initial_deposition_date")
+        is None
+    )
+def calc_rmsd(
+    xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
+) -> tuple[float, npt.NDArray[np.floating]]:
+    """
+    Calculates RMSD between two sets of atoms (L, 3)
+    Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719
+    """
+    # center to CA centroid
+    xyz1 = xyz1 - xyz1.mean(0)
+    xyz2 = xyz2 - xyz2.mean(0)
+    # Computation of the covariance matrix
+    C = xyz2.T @ xyz1
+    # Compute otimal rotation matrix using SVD
+    V, S, W = np.linalg.svd(C)
+    # get sign to ensure right-handedness
+    d = np.ones([3, 3])
+    d[:, -1] = np.sign(np.linalg.det(V) * np.linalg.det(W))
+    # Rotation matrix U
+    U = (d * V) @ W
+    # Rotate xyz2
+    xyz2_ = xyz2 @ U
+    L = xyz2_.shape[0]
+    rmsd = np.sqrt(np.sum((xyz2_ - xyz1) * (xyz2_ - xyz1), axis=(0, 1)) / L + eps)
+    return rmsd, U

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/svd/svd.py RENAMED Viewed

@@ -1,5 +1,7 @@
 """SVD API providing the interface for creating and using SVD models."""
+from typing import Literal
 from openprotein.base import APISession
 from openprotein.common import ReductionType
 from openprotein.data import AssayDataset, AssayMetadata
@@ -20,11 +22,11 @@ class SVDAPI:
     def fit_svd(
         self,
-        model_id: str,
+        model_id: str | EmbeddingModel,
         sequences: list[bytes] | list[str] | None = None,
         assay: AssayMetadata | AssayDataset | str | None = None,
         n_components: int = 1024,
-        reduction: ReductionType | None = None,
+        reduction: Literal["MEAN", "SUM"] | None = None,
         **kwargs,
     ) -> SVDModel:
         """
@@ -32,7 +34,7 @@ class SVDAPI:
         Parameters
         ----------
-        model_id : str
+        model_id : str or EmbeddingModel
             ID of embeddings model to use.
         sequences : list of bytes or None, optional
             Optional sequences to fit SVD with. Either use sequences or
@@ -43,7 +45,7 @@ class SVDAPI:
             Ignored if sequences are provided.
         n_components : int, optional
             The number of components for the SVD. Defaults to 1024.
-        reduction : str or None, optional
+        reduction : str or ReductionType or None, optional
             Type of embedding reduction to use for computing features.
             E.g. "MEAN" or "SUM". Useful when dealing with variable length
             sequence. Defaults to None.

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/openprotein/umap/umap.py RENAMED Viewed

@@ -1,7 +1,10 @@
 """UMAP API providing the interface to fit and run UMAP visualizations."""
+import typing
+from typing import Literal
 from openprotein.base import APISession
-from openprotein.common import FeatureType, ReductionType
+from openprotein.common import Feature, FeatureType, Reduction, ReductionType
 from openprotein.data import AssayDataset, AssayMetadata
 from openprotein.embeddings import EmbeddingModel, EmbeddingsAPI
 from openprotein.errors import InvalidParameterError
@@ -21,16 +24,35 @@ class UMAPAPI:
     ):
         self.session = session
+    @typing.overload
+    def fit_umap(
+        self,
+        model: EmbeddingModel,
+        reduction: Reduction | ReductionType,
+        feature_type: Literal["PLM"] = "PLM",
+        sequences: list[bytes] | list[str] | None = None,
+        assay: AssayDataset | AssayMetadata | str | None = None,
+        n_components: int = 2,
+        n_neighbors: int = 15,
+        min_dist: float = 0.1,
+    ) -> UMAPModel: ...
+    @typing.overload
+    def fit_umap(
+        self,
+        model: EmbeddingModel,
+    ) -> UMAPModel: ...
     def fit_umap(
         self,
         model: EmbeddingModel | SVDModel | str,
-        feature_type: FeatureType | None = None,
+        reduction: Reduction | ReductionType | None = None,
+        feature_type: Feature | FeatureType | None = None,
         sequences: list[bytes] | list[str] | None = None,
-        assay: AssayMetadata | AssayDataset | str | None = None,
+        assay: AssayDataset | AssayMetadata | str | None = None,
         n_components: int = 2,
         n_neighbors: int = 15,
         min_dist: float = 0.1,
-        reduction: ReductionType | None = None,
         **kwargs,
     ) -> UMAPModel:
         """
@@ -42,14 +64,14 @@ class UMAPAPI:
             Optional sequences to fit UMAP with. Either use sequences or
             assay_id. sequences is preferred.
         assay : AssayMetadata or AssayDataset or str or None, optional
-            Optional assay containing sequences to fit SVD with.
+            Optional assay containing sequences to fit UMAP with.
             Or its assay_id. Either use sequences or assay.
             Ignored if sequences are provided.
         model : EmbeddingModel or SVDModel or str
             Instance of either EmbeddingModel or SVDModel to use depending
             on feature type. Can also be a str specifying the model id,
             but then feature_type would have to be specified.
-        feature_type : FeatureType or None, optional
+        feature_type : str or FeatureType or None, optional
             Type of features to use for encoding sequences. "SVD" or "PLM".
             None would require model to be EmbeddingModel or SVDModel.
         n_components : int, optional
@@ -58,7 +80,7 @@ class UMAPAPI:
             Number of neighbors to use for fitting. Defaults to 15.
         min_dist : float, optional
             Minimum distance in UMAP fitting. Defaults to 0.1.
-        reduction : str or None, optional
+        reduction : str or ReductionType or None, optional
             Type of embedding reduction to use for computing features.
             E.g. "MEAN" or "SUM". Useful when dealing with variable length
             sequence. Defaults to None.
@@ -70,6 +92,13 @@ class UMAPAPI:
         UMAPModel
             The UMAP model being fit.
         """
+        # 1. Check assay data input - just need the id
+        # get assay_id
+        assay_id = (
+            assay.assay_id
+            if isinstance(assay, AssayMetadata)
+            else assay.id if isinstance(assay, AssayDataset) else assay
+        )
         # extract feature type
         feature_type = (
             FeatureType.PLM
@@ -80,11 +109,15 @@ class UMAPAPI:
             raise InvalidParameterError(
                 "Expected feature_type to be provided if passing str model_id as model"
             )
+        if isinstance(feature_type, str):
+            feature_type = FeatureType(feature_type)
+        if isinstance(reduction, str):
+            reduction = ReductionType(reduction)
         # get model if model_id
         if feature_type == FeatureType.PLM:
             if reduction is None:
                 raise InvalidParameterError(
-                    "Expected reduction if using EmbeddingModel"
+                    "Expected reduction if using embedding model"
                 )
             if isinstance(model, str):
                 embeddings_api = getattr(self.session, "embedding", None)
@@ -93,18 +126,14 @@ class UMAPAPI:
             assert isinstance(model, EmbeddingModel), "Expected EmbeddingModel"
             model_id = model.id
         elif feature_type == FeatureType.SVD:
+            if reduction is not None:
+                raise InvalidParameterError("Unexpected reduction when using SVD model")
             if isinstance(model, str):
                 svd_api = getattr(self.session, "svd", None)
                 assert isinstance(svd_api, SVDAPI)
                 model = svd_api.get_svd(model)
             assert isinstance(model, SVDModel), "Expected SVDModel"
             model_id = model.id
-        # get assay_id
-        assay_id = (
-            assay.assay_id
-            if isinstance(assay, AssayMetadata)
-            else assay.id if isinstance(assay, AssayDataset) else assay
-        )
         return UMAPModel(
             session=self.session,
             job=api.umap_fit_post(

{openprotein_python-0.8.4 → openprotein_python-0.8.5}/pyproject.toml RENAMED Viewed

@@ -35,6 +35,7 @@ dev = [
   "matplotlib>=3.9.2,<4",
   "scipy>=1.14.1,<2",
   "hatchling>=1.26.1",
+  "hatch-vcs>=0.5,<1",
   "editables>=0.5,<0.6",
   "seaborn>=0.13.2,<0.14",
   "jupyterlab>=4.4.1,<5",
@@ -55,24 +56,6 @@ jupyterinstall = "python -m ipykernel install --user --name=openprotein-python"
 [tool.pixi.environments]
 dev = ["dev"]
-[tool.pixi.package]
-name = "openprotein-python"
-[tool.pixi.package.build]
-backend = { name = "pixi-build-python", version = "0.1.*" }
-channels = ["conda-forge"]
-[tool.pixi.package.host-dependencies]
-hatchling = "*"
-[tool.pixi.package.run-dependencies]
-requests = ">=2.32.3,<3"
-pydantic = ">=2.5,<3"
-tqdm = ">=4.66.5,<5"
-pandas = ">=2.2.2,<3"
-numpy = ">=1.9,<3"
-gemmi = ">=0.7.0,<0.8"
 [build-system]
 requires = ["hatchling>=1.26.1", "hatch-vcs>=0.5.0"]
 build-backend = "hatchling.build"

openprotein_python-0.8.4/openprotein/common/features.py DELETED Viewed

@@ -1,7 +0,0 @@
-from enum import Enum
-class FeatureType(str, Enum):
-    PLM = "PLM"
-    SVD = "SVD"

openprotein_python-0.8.4/openprotein/common/reduction.py DELETED Viewed

@@ -1,8 +0,0 @@
-"""Reduction types used in OpenProtein."""
-from enum import Enum
-class ReductionType(str, Enum):
-    MEAN = "MEAN"
-    SUM = "SUM"