PyPI - openprotein-python - Versions diffs - 0.8.2__1-py3-none-any.whl - Mend

openprotein-python 0.8.2__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84)

openprotein/__init__.py +164 -0
openprotein/_version.py +48 -0
openprotein/align/__init__.py +8 -0
openprotein/align/align.py +395 -0
openprotein/align/api.py +428 -0
openprotein/align/future.py +55 -0
openprotein/align/msa.py +129 -0
openprotein/align/schemas.py +165 -0
openprotein/base.py +181 -0
openprotein/chains.py +88 -0
openprotein/common/__init__.py +5 -0
openprotein/common/features.py +7 -0
openprotein/common/model_metadata.py +33 -0
openprotein/common/reduction.py +8 -0
openprotein/config.py +9 -0
openprotein/csv.py +31 -0
openprotein/data/__init__.py +9 -0
openprotein/data/api.py +218 -0
openprotein/data/assaydataset.py +178 -0
openprotein/data/data.py +93 -0
openprotein/data/schemas.py +27 -0
openprotein/design/__init__.py +16 -0
openprotein/design/api.py +259 -0
openprotein/design/design.py +125 -0
openprotein/design/future.py +146 -0
openprotein/design/schemas.py +607 -0
openprotein/embeddings/__init__.py +27 -0
openprotein/embeddings/api.py +619 -0
openprotein/embeddings/embeddings.py +151 -0
openprotein/embeddings/esm.py +33 -0
openprotein/embeddings/future.py +146 -0
openprotein/embeddings/models.py +421 -0
openprotein/embeddings/openprotein.py +21 -0
openprotein/embeddings/poet.py +446 -0
openprotein/embeddings/poet2.py +505 -0
openprotein/embeddings/schemas.py +78 -0
openprotein/errors.py +76 -0
openprotein/fasta.py +92 -0
openprotein/fold/__init__.py +21 -0
openprotein/fold/alphafold2.py +131 -0
openprotein/fold/api.py +287 -0
openprotein/fold/boltz.py +691 -0
openprotein/fold/esmfold.py +54 -0
openprotein/fold/fold.py +107 -0
openprotein/fold/future.py +509 -0
openprotein/fold/models.py +139 -0
openprotein/fold/schemas.py +39 -0
openprotein/jobs/__init__.py +9 -0
openprotein/jobs/api.py +71 -0
openprotein/jobs/futures.py +746 -0
openprotein/jobs/jobs.py +69 -0
openprotein/jobs/schemas.py +135 -0
openprotein/models/__init__.py +4 -0
openprotein/models/base.py +63 -0
openprotein/models/foundation/rfdiffusion.py +283 -0
openprotein/models/models.py +33 -0
openprotein/predictor/__init__.py +25 -0
openprotein/predictor/api.py +384 -0
openprotein/predictor/models.py +374 -0
openprotein/predictor/prediction.py +79 -0
openprotein/predictor/predictor.py +242 -0
openprotein/predictor/schemas.py +113 -0
openprotein/predictor/validate.py +40 -0
openprotein/prompt/__init__.py +9 -0
openprotein/prompt/api.py +505 -0
openprotein/prompt/models.py +142 -0
openprotein/prompt/prompt.py +130 -0
openprotein/prompt/schemas.py +49 -0
openprotein/protein.py +587 -0
openprotein/svd/__init__.py +9 -0
openprotein/svd/api.py +206 -0
openprotein/svd/models.py +288 -0
openprotein/svd/schemas.py +31 -0
openprotein/svd/svd.py +134 -0
openprotein/umap/__init__.py +9 -0
openprotein/umap/api.py +259 -0
openprotein/umap/models.py +211 -0
openprotein/umap/schemas.py +35 -0
openprotein/umap/umap.py +175 -0
openprotein/utils/uuid.py +29 -0
openprotein_python-0.8.2.dist-info/METADATA +176 -0
openprotein_python-0.8.2.dist-info/RECORD +84 -0
openprotein_python-0.8.2.dist-info/WHEEL +4 -0
openprotein_python-0.8.2.dist-info/licenses/LICENSE.txt +30 -0

openprotein/fasta.py ADDED Viewed

@@ -0,0 +1,92 @@
+from typing import Iterator, Sequence, overload
+@overload
+def parse_stream(
+    lines: Iterator[str], comment: str = "#"
+) -> Iterator[tuple[str, str]]: ...
+@overload
+def parse_stream(
+    lines: Iterator[bytes], comment: str = "#"
+) -> Iterator[tuple[bytes, bytes]]: ...
+def parse_stream(
+    lines: Iterator[str] | Iterator[bytes], comment: str = "#"
+) -> Iterator[tuple[str, str]] | Iterator[tuple[bytes, bytes]]:
+    is_bytes: bool | None = None
+    name = None
+    sequence = []
+    for line in lines:
+        if not line:
+            continue  # skip empty lines
+        if is_bytes := isinstance(line, bytes):
+            line = line.decode()
+        if line.startswith(comment):
+            continue
+        line = line.strip()
+        if line.startswith(">"):
+            if name is not None:
+                sequence = "".join(sequence)
+                if is_bytes:
+                    name = name.encode()
+                    sequence = sequence.encode()
+                    yield name, sequence
+                else:
+                    yield name, sequence
+            name = line[1:].strip()
+            sequence = []
+        else:
+            sequence.append(line.strip())
+    if name is not None:
+        sequence = "".join(sequence)
+        if is_bytes:
+            name = name.encode()
+            sequence = sequence.encode()
+            yield name, sequence
+        else:
+            yield name, sequence
+def parse(
+    f: Sequence[str] | Sequence[bytes], comment: str = "#"
+) -> tuple[list[str], list[str]] | tuple[list[bytes], list[bytes]]:
+    is_bytes: bool | None = None
+    names = []
+    sequences = []
+    name = None
+    sequence = []
+    for line in f:
+        if is_bytes := isinstance(line, bytes):
+            line = line.decode()
+        if line.startswith(comment):
+            continue
+        line = line.strip()
+        if line.startswith(">"):
+            # its a new entry
+            if name is not None:
+                sequence = "".join(sequence)
+                if is_bytes:
+                    name = name.encode()
+                    sequence = sequence.encode()
+                names.append(name)
+                sequences.append(sequence)
+            # reset the reading
+            name = line[1:]
+            sequence = []
+        else:
+            sequence.append(line.upper())
+    if name is not None:
+        # last entry
+        sequence = "".join(sequence)
+        if is_bytes:
+            name = name.encode()
+            sequence = sequence.encode()
+        names.append(name)
+        sequences.append(sequence)
+    return names, sequences

openprotein/fold/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""
+Fold module for predicting structures on OpenProtein.
+isort:skip_file
+"""
+from .schemas import FoldJob, FoldMetadata
+from .models import FoldModel
+from .esmfold import ESMFoldModel
+from .alphafold2 import AlphaFold2Model
+from .boltz import (
+    Boltz1Model,
+    Boltz1xModel,
+    Boltz2Model,
+    BoltzAffinity,
+    BoltzConfidence,
+    BoltzConstraint,
+    BoltzProperty,
+)
+from .future import FoldResultFuture, FoldComplexResultFuture
+from .fold import FoldAPI

openprotein/fold/alphafold2.py ADDED Viewed

@@ -0,0 +1,131 @@
+"""Community-based AlphaFold 2 model running using ColabFold."""
+import warnings
+from collections import Counter
+from openprotein.align import MSAFuture
+from openprotein.base import APISession
+from openprotein.common import ModelMetadata
+from openprotein.protein import Protein
+from . import api
+from .future import FoldComplexResultFuture
+from .models import FoldModel
class AlphaFold2Model(FoldModel):
    """
    Class providing inference endpoints for AlphaFold2 structure prediction models, based on the implementation by ColabFold.
    """

    model_id: str = "alphafold2"

    def __init__(
        self,
        session: APISession,
        model_id: str,
        metadata: ModelMetadata | None = None,
    ):
        """
        Initialize the model by forwarding all arguments to ``FoldModel``.

        Parameters
        ----------
        session : APISession
            Session used for API calls.
        model_id : str
            Identifier of the model.
        metadata : ModelMetadata or None
            Optional metadata for the model.
        """
        super().__init__(session=session, model_id=model_id, metadata=metadata)

    def fold(
        self,
        proteins: list[Protein] | MSAFuture | None = None,
        num_recycles: int | None = None,
        num_models: int = 1,
        num_relax: int = 0,
        **kwargs,
    ) -> FoldComplexResultFuture:
        """
        Post sequences to alphafold model.

        Parameters
        ----------
        proteins : List[Protein] | MSAFuture
            List of protein sequences to fold. `Protein` objects must be tagged with an `msa`. Alternatively, supply an `MSAFuture` to use all query sequences as a multimer.
        num_recycles : int, optional
            Number of recycles for structure prediction.
        num_models : int
            Number of models to generate.
        num_relax : int
            Number of relaxation steps.
        **kwargs
            ``msa`` is accepted as a deprecated alias for ``proteins`` (a
            warning is emitted). ``ligands``, ``dnas`` and ``rnas`` trigger a
            warning and are otherwise ignored.

        Returns
        -------
        FoldComplexResultFuture
            Future wrapping the submitted fold job.

        Raises
        ------
        TypeError
            If ``proteins`` is missing, or is neither a list nor an
            ``MSAFuture``.
        ValueError
            If the list is empty, a protein has no ``msa``, a protein's
            sequence is not part of its MSA seed, more than one MSA is
            referenced, or the proteins do not exactly make up the MSA seed.
        """
        if "msa" in kwargs:
            warnings.warn(
                "Inputs to AlphaFold 2 have been updated. 'msa' should be supplied as 'proteins' argument. Support will be dropped in the future."
            )
            proteins = kwargs["msa"]
        if "ligands" in kwargs or "dnas" in kwargs or "rnas" in kwargs:
            with warnings.catch_warnings():
                warnings.simplefilter("always")  # Force warning to always show
                warnings.warn(
                    "Alphafold 2 only supports proteins. All other chains will be ignored"
                )
        if proteins is None:
            raise TypeError("Expected 'proteins' argument")
        if isinstance(proteins, list):
            # An empty list would otherwise surface as a bare IndexError when
            # the single MSA is looked up further down.
            if not proteins:
                raise ValueError(
                    "Expected at least one protein when using AlphaFold 2"
                )
            # Maps each referenced MSA id to a Counter of its seed sequences.
            msa_to_seed: dict[str, Counter] = dict()
            for protein in proteins:
                if (msa := protein.msa) is None:
                    raise ValueError("Expected msa for protein when using AlphaFold 2")
                msa_id = msa.id if isinstance(msa, MSAFuture) else msa
                if msa_id in msa_to_seed:
                    seeds = msa_to_seed[msa_id]
                else:
                    from openprotein.align import AlignAPI

                    align_api = getattr(self.session, "align", None)
                    assert isinstance(align_api, AlignAPI)
                    seed = align_api.get_seed(job_id=msa_id)
                    # need a counter so we can make sure later that the proteins make up the msa completely
                    # (the seed is split on ":" into its individual sequences)
                    seeds = Counter(seed.split(":"))
                    msa_to_seed[msa_id] = seeds
                # check that this protein is in the seed
                if protein.sequence.decode() not in seeds:
                    raise ValueError(
                        f"Expected specified msa_id {msa_id} for protein {protein.sequence} to contain the sequence as part of its seed/query"
                    )
            # now make sure we only have one msa
            if len(msa_to_seed) > 1:
                raise ValueError("Expected only 1 unique msa when using AlphaFold 2")
            # now check that the list of proteins completely make up the msa
            msa_id, seeds = next(iter(msa_to_seed.items()))  # exactly 1 entry
            for protein in proteins:
                sequence = protein.sequence.decode()
                # make sure to account for multimers
                seeds[sequence] -= (
                    len(protein.chain_id) if isinstance(protein.chain_id, list) else 1
                )
                # handle when too many of a sequence in the list of proteins
                if seeds[sequence] < 0:
                    raise ValueError(
                        "List of proteins does not completely make up the MSA seed"
                    )
            if seeds.total() != 0:
                # handle when overall mismatch - 1 and -1 case is handled above
                raise ValueError(
                    "List of proteins does not completely make up the MSA seed"
                )
        elif isinstance(proteins, MSAFuture):
            msa_id = proteins.id
        else:
            raise TypeError("Expected either list of Proteins or MSAFuture")
        return FoldComplexResultFuture.create(
            session=self.session,
            job=api.fold_models_post(
                self.session,
                model_id=self.model_id,
                msa_id=msa_id,
                num_recycles=num_recycles,
                num_models=num_models,
                num_relax=num_relax,
            ),
        )

openprotein/fold/api.py ADDED Viewed

@@ -0,0 +1,287 @@
+"""Fold REST API interface for making HTTP calls to our fold backend."""
+import io
+from typing import Literal
+import numpy as np
+from pydantic import TypeAdapter
+from openprotein.base import APISession
+from openprotein.common import ModelMetadata
+from openprotein.errors import HTTPError
+from .schemas import FoldJob, FoldMetadata
+PATH_PREFIX = "v1/fold"
def fold_models_list_get(session: APISession) -> list[str]:
    """
    Retrieve the fold models offered by the backend.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.

    Returns
    -------
    list of str
        The decoded JSON body of the ``/models`` endpoint (model names).
    """
    return session.get(PATH_PREFIX + "/models").json()
def fold_model_get(session: APISession, model_id: str) -> ModelMetadata:
    """
    Fetch the metadata describing one fold model.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.
    model_id : str
        Identifier of the model to look up.

    Returns
    -------
    ModelMetadata
        Metadata built from the JSON object returned by the backend.
    """
    payload = session.get(PATH_PREFIX + f"/models/{model_id}").json()
    return ModelMetadata(**payload)
def fold_get(session: APISession, job_id: str) -> FoldMetadata:
    """
    Fetch the metadata of a fold request.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.
    job_id : str
        Identifier of the fold job.

    Returns
    -------
    FoldMetadata
        The validated metadata of the fold job.
    """
    raw = session.get(PATH_PREFIX + f"/{job_id}").json()
    return FoldMetadata.model_validate(raw)
def fold_get_sequences(session: APISession, job_id: str) -> list[bytes]:
    """
    Fetch the sequences that belong to a fold request.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.
    job_id : str
        Identifier of the fold job.

    Returns
    -------
    list of bytes
        The JSON response validated as a list of byte strings.
    """
    raw = session.get(PATH_PREFIX + f"/{job_id}/sequences").json()
    adapter = TypeAdapter(list[bytes])
    return adapter.validate_python(raw)
def fold_get_sequence_result(
    session: APISession, job_id: str, sequence: bytes | str
) -> bytes:
    """
    Download the encoded fold result of a single sequence.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.
    job_id : str
        Identifier of the fold job holding the result.
    sequence : bytes or str
        Sequence whose result is requested; bytes are decoded to text
        before being placed in the request path.

    Returns
    -------
    bytes
        Raw body of the response.
    """
    sequence_text = sequence.decode() if isinstance(sequence, bytes) else sequence
    return session.get(PATH_PREFIX + f"/{job_id}/{sequence_text}").content
def fold_get_complex_result(
    session: APISession, job_id: str, format: Literal["pdb", "mmcif"]
) -> bytes:
    """
    Download the encoded structure of a folded complex.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.
    job_id : str
        Identifier of the fold job holding the result.
    format : {'pdb', 'mmcif'}
        Requested file format, sent as the ``format`` query parameter.

    Returns
    -------
    bytes
        Raw body of the response.
    """
    query = {"format": format}
    return session.get(PATH_PREFIX + f"/{job_id}/complex", params=query).content
def fold_get_complex_extra_result(
    session: APISession,
    job_id: str,
    key: Literal["pae", "pde", "plddt", "confidence", "affinity"],
) -> np.ndarray | list[dict]:
    """
    Download an auxiliary output of a folded complex.

    Parameters
    ----------
    session : APISession
        API session used to issue the GET request.
    job_id : str
        Identifier of the fold job holding the result.
    key : {'pae', 'pde', 'plddt', 'confidence', 'affinity'}
        Which auxiliary output to download.

    Returns
    -------
    numpy.ndarray or list of dict
        For "pae", "pde" and "plddt" the response body loaded with
        ``numpy.load``; for "confidence" and "affinity" the decoded JSON.

    Raises
    ------
    ValueError
        If ``key`` is not one of the supported values (checked before any
        request is made), or if the backend answers 400 for "affinity".
    HTTPError
        Any other HTTP failure is re-raised unchanged.
    """
    wants_array = key in {"pae", "pde", "plddt"}
    # Reject unknown keys up front so no request is sent for them.
    if not wants_array and key not in {"confidence", "affinity"}:
        raise ValueError(f"Unexpected key: {key}")
    try:
        response = session.get(PATH_PREFIX + f"/{job_id}/complex/{key}")
    except HTTPError as err:
        # A 400 on the affinity endpoint is reported as a missing result.
        if err.status_code == 400 and key == "affinity":
            raise ValueError("affinity not found for request") from None
        raise
    output: np.ndarray | list[dict]
    if wants_array:
        output = np.load(io.BytesIO(response.content))
    else:
        output = response.json()
    return output
def fold_models_post(
    session: APISession,
    model_id: str,
    **kwargs,
) -> FoldJob:
    """
    Submit a structure prediction request for the given model.

    Only recognised keyword arguments with a truthy value are placed in the
    request body; anything falsy (``None``, ``0``, ``False``, empty
    containers) is left out of the payload.

    Parameters
    ----------
    session : APISession
        Session object for API communication.
    model_id : str
        Model ID to use for prediction.
    sequences : sequence of bytes or str, optional
        Sequences to request results for. Bytes items are decoded to text;
        any other item is forwarded unchanged.
    msa_id : str, optional
        MSA ID to use.
    num_recycles : int, optional
        Number of recycles for structure prediction.
    num_models : int, optional
        Number of models to generate.
    num_relax : int, optional
        Number of relaxation steps.
    use_potentials : bool, optional
        Whether to use potentials.
    diffusion_samples : int, optional
        Number of diffusion samples (boltz).
    recycling_steps : int, optional
        Number of recycling steps (boltz).
    sampling_steps : int, optional
        Number of sampling steps (boltz).
    step_scale : float, optional
        Step scale (boltz).
    constraints : dict, optional
        Constraints to apply.
    templates : list, optional
        Templates to use.
    properties : dict, optional
        Additional properties.
    method : optional
        Forwarded as-is under the ``method`` key.

    Returns
    -------
    FoldJob
        Job object referring to this request, usable to retrieve results
        later.
    """
    # Fields copied verbatim into the body, in the order they are emitted.
    passthrough_fields = (
        "msa_id",
        "num_recycles",
        "num_models",
        "num_relax",
        "use_potentials",
        # boltz
        "diffusion_samples",
        "recycling_steps",
        "sampling_steps",
        "step_scale",
        "constraints",
        "templates",
        "properties",
        "method",
    )
    payload: dict = {}
    raw_sequences = kwargs.get("sequences")
    if raw_sequences:
        # NOTE we are handling the boltz form here too
        payload["sequences"] = [
            item.decode() if isinstance(item, bytes) else item
            for item in raw_sequences
        ]
    for field in passthrough_fields:
        value = kwargs.get(field)
        if value:
            payload[field] = value
    response = session.post(PATH_PREFIX + f"/models/{model_id}", json=payload)
    return FoldJob.model_validate(response.json())