protein-quest 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,2 +1,2 @@
- __version__ = "0.6.0"
+ __version__ = "0.7.0"
  """The version of the package."""
@@ -8,33 +8,57 @@ from yarl import URL
  class EntrySummary:
  """Dataclass representing a summary of an AlphaFold entry.

- Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
+ Modelled after NewEntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
+ with URL types and without deprecated fields.
  """

- entryId: str
- uniprotAccession: str
- uniprotId: str
- uniprotDescription: str
- taxId: int
- organismScientificName: str
- uniprotStart: int
- uniprotEnd: int
- uniprotSequence: str
- modelCreatedDate: str
- latestVersion: int
  allVersions: list[int]
  bcifUrl: URL
  cifUrl: URL
- pdbUrl: URL
- paeImageUrl: URL
+ entityType: str
+ fractionPlddtConfident: float
+ fractionPlddtLow: float
+ fractionPlddtVeryHigh: float
+ fractionPlddtVeryLow: float
+ globalMetricValue: float
+ isUniProt: bool
+ latestVersion: int
+ modelCreatedDate: str
+ modelEntityId: str
  paeDocUrl: URL
- gene: str | None = None
- sequenceChecksum: str | None = None
- sequenceVersionDate: str | None = None
- amAnnotationsUrl: URL | None = None
+ pdbUrl: URL
+ providerId: str
+ sequence: str
+ sequenceChecksum: str
+ sequenceEnd: int
+ sequenceStart: int
+ sequenceVersionDate: str
+ toolUsed: str
+ alternativeNames: list[str] | None = None
  amAnnotationsHg19Url: URL | None = None
  amAnnotationsHg38Url: URL | None = None
- isReviewed: bool | None = None
- isReferenceProteome: bool | None = None
- # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
- # TODO like fractionPlddt* fields which can be used in filter_files_on_confidence()
+ amAnnotationsUrl: URL | None = None
+ catalyticActivities: list[str] | None = None
+ complexName: str | None = None
+ functions: list[str] | None = None
+ gene: str | None = None
+ geneSynonyms: list[str] | None = None
+ ipSAE: float | None = None
+ ipTM: float | None = None
+ isUniProtReferenceProteome: bool | None = None
+ isUniProtReviewed: bool | None = None
+ keywords: list[str] | None = None
+ msaUrl: URL | None = None
+ organismCommonNames: list[str] | None = None
+ organismScientificName: str | None = None
+ organismSynonyms: list[str] | None = None
+ plddtDocUrl: URL | None = None
+ proteinFullNames: list[str] | None = None
+ proteinShortNames: list[str] | None = None
+ stoichiometry: int | None = None
+ taxId: int | None = None
+ taxonomyLineage: list[str] | None = None
+ # uniprotAccession is isoform id (<uniprot_accession>-<isoform number>) when entry has multiple isoforms.
+ uniprotAccession: str | None = None
+ uniprotDescription: str | None = None
+ uniprotId: str | None = None
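The new fractionPlddt* fields answer the removed TODO comments: overall model confidence can now be judged from the summary alone. Below is a minimal, hypothetical helper (not part of protein-quest) that filters already-fetched summaries on those fields; the assumption that fractionPlddtConfident and fractionPlddtVeryHigh are disjoint residue fractions follows from the field names, not from this diff.

# Hypothetical helper, not part of the package: keep summaries whose models
# are mostly confident according to the new fractionPlddt* fields.
from collections.abc import Iterable


def mostly_confident(summaries: Iterable["EntrySummary"], min_fraction: float = 0.8) -> list["EntrySummary"]:
    # Assumption: the two fractions are disjoint pLDDT bands, so their sum is
    # the share of residues with pLDDT >= 70.
    return [s for s in summaries if s.fractionPlddtConfident + s.fractionPlddtVeryHigh >= min_fraction]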
@@ -7,8 +7,9 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Literal, cast, get_args

+ import aiofiles
+ from aiofiles.ospath import exists
  from aiohttp_retry import RetryClient
- from aiopath import AsyncPath
  from tqdm.asyncio import tqdm
  from yarl import URL

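The aiopath dependency is dropped in favour of aiofiles. A minimal equivalence sketch of the swap, using only the calls that appear later in this diff (aiofiles.ospath.exists and aiofiles.open); the file name is made up.

import asyncio

import aiofiles
from aiofiles.ospath import exists


async def read_bytes_if_exists(path: str) -> bytes | None:
    # Old style: `if await AsyncPath(path).exists(): return await AsyncPath(path).read_bytes()`
    if await exists(path):
        async with aiofiles.open(path, "rb") as f:
            return await f.read()
    return None


asyncio.run(read_bytes_if_exists("O60481.json"))  # file name is made up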
@@ -24,11 +25,12 @@ DownloadableFormat = Literal[
  "bcif",
  "cif",
  "pdb",
- "paeImage",
  "paeDoc",
  "amAnnotations",
  "amAnnotationsHg19",
  "amAnnotationsHg38",
+ "msaUrl",
+ "plddtDocUrl",
  ]
  """Types of formats that can be downloaded from the AlphaFold web service."""

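In other words, "paeImage" is no longer a valid download format, while "msaUrl" and "plddtDocUrl" are new (note that, unlike the older names, these two keep the Url suffix of the summary attribute). A small sketch of a `what` selection under this change, assuming DownloadableFormat is imported from the module shown in this diff:

# Accepted after 0.7.0 (format names taken from the Literal above):
what: set[DownloadableFormat] = {"summary", "cif", "paeDoc", "msaUrl", "plddtDocUrl"}

# No longer accepted: "paeImage" was removed, so files_to_download() below would
# raise ValueError because set(what) would not be a subset of downloadable_formats.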
@@ -43,22 +45,23 @@ def _camel_to_snake_case(name: str) -> str:

  @dataclass
  class AlphaFoldEntry:
- """AlphaFoldEntry represents a minimal single entry in the AlphaFold database.
+ """AlphaFold entry with summary object and optionally local files.

- See https://alphafold.ebi.ac.uk/api-docs for more details on the API and data structure.
+ See https://alphafold.ebi.ac.uk/api-docs for more details on the summary data structure.
  """

- uniprot_acc: str
- summary: EntrySummary | None
+ uniprot_accession: str
+ summary: EntrySummary
  summary_file: Path | None = None
  bcif_file: Path | None = None
  cif_file: Path | None = None
  pdb_file: Path | None = None
- pae_image_file: Path | None = None
  pae_doc_file: Path | None = None
  am_annotations_file: Path | None = None
  am_annotations_hg19_file: Path | None = None
  am_annotations_hg38_file: Path | None = None
+ msa_file: Path | None = None
+ plddt_doc_file: Path | None = None

  @classmethod
  def format2attr(cls, dl_format: DownloadableFormat) -> str:
@@ -120,25 +123,28 @@ async def fetch_summary(

  Returns:
  A list of EntrySummary objects representing the fetched summary.
+ When qualifier has multiple isoforms then multiple summaries are returned,
+ otherwise a list of a single summary is returned.
  """
  url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
- fn: AsyncPath | None = None
+ fn: Path | None = None
  if save_dir is not None:
- fn = AsyncPath(save_dir / f"{qualifier}.json")
- if await fn.exists():
+ fn = save_dir / f"{qualifier}.json"
+ if await exists(fn):
  logger.debug(f"File {fn} already exists. Skipping download from {url}.")
- raw_data = await fn.read_bytes()
+ async with aiofiles.open(fn, "rb") as f:
+ raw_data = await f.read()
  return converter.loads(raw_data, list[EntrySummary])
  cached_file = await cacher.copy_from_cache(Path(fn))
  if cached_file is not None:
  logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
- raw_data = await AsyncPath(cached_file).read_bytes()
+ async with aiofiles.open(cached_file, "rb") as f:
+ raw_data = await f.read()
  return converter.loads(raw_data, list[EntrySummary])
  async with semaphore, session.get(url) as response:
  response.raise_for_status()
  raw_data = await response.content.read()
  if fn is not None:
- # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
  await cacher.write_bytes(Path(fn), raw_data)
  return converter.loads(raw_data, list[EntrySummary])

@@ -148,7 +154,7 @@ async def fetch_summaries(
  save_dir: Path | None = None,
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
- ) -> AsyncGenerator[EntrySummary]:
+ ) -> AsyncGenerator[tuple[str, EntrySummary]]:
  semaphore = Semaphore(max_parallel_downloads)
  if save_dir is not None:
  save_dir.mkdir(parents=True, exist_ok=True)
@@ -159,9 +165,9 @@ async def fetch_summaries(
  summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
  *tasks, desc="Fetching Alphafold summaries"
  )
- for summaries in summaries_per_qualifier:
+ for qualifier, summaries in zip(qualifiers, summaries_per_qualifier, strict=True):
  for summary in summaries:
- yield summary
+ yield qualifier, summary


  async def fetch_many_async(
@@ -171,17 +177,20 @@ async def fetch_many_async(
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
+ all_isoforms: bool = False,
  ) -> AsyncGenerator[AlphaFoldEntry]:
  """Asynchronously fetches summaries and files from
  [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).

  Args:
- uniprot_accessions: A set of Uniprot acessions to fetch.
+ uniprot_accessions: A set of Uniprot accessions to fetch.
  save_dir: The directory to save the fetched files to.
  what: A set of formats to download.
  max_parallel_downloads: The maximum number of parallel downloads.
  cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
  gzip_files: Whether to gzip the downloaded files.
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
+ When False then yields only the canonical sequence of uniprot entry.

  Yields:
  A dataclass containing the summary, pdb file, and pae file.
@@ -193,8 +202,10 @@ async def fetch_many_async(
  async for s in fetch_summaries(
  uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
  )
+ # Filter out isoforms if all_isoforms is False
+ # O60481 is canonical and O60481-2 is isoform, so we skip the isoform
+ if all_isoforms or s[0] == s[1].uniprotAccession
  ]
-
  files = files_to_download(what, summaries, gzip_files)

  await retrieve_files(
@@ -205,16 +216,16 @@ async def fetch_many_async(
  cacher=cacher,
  gzip_files=gzip_files,
  )
+
  gzext = ".gz" if gzip_files else ""
- for summary in summaries:
+ for uniprot_accession, summary in summaries:
  yield AlphaFoldEntry(
- uniprot_acc=summary.uniprotAccession,
+ uniprot_accession=uniprot_accession,
  summary=summary,
- summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
+ summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
  bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
  cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
  pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
- pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
  pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
  am_annotations_file=(
  save_dir / (summary.amAnnotationsUrl.name + gzext)
@@ -231,11 +242,15 @@ async def fetch_many_async(
  if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
  else None
  ),
+ msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msaUrl" in what and summary.msaUrl else None),
+ plddt_doc_file=(
+ save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDocUrl" in what and summary.plddtDocUrl else None
+ ),
  )


  def files_to_download(
- what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
+ what: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
  ) -> set[tuple[URL, str]]:
  if not (set(what) <= downloadable_formats):
  msg = (
@@ -245,14 +260,14 @@ def files_to_download(
  raise ValueError(msg)

  url_filename_pairs: set[tuple[URL, str]] = set()
- for summary in summaries:
+ for _, summary in summaries:
  for fmt in what:
  if fmt == "summary":
  # summary is handled already in fetch_summary
  continue
  url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
  if url is None:
- logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
+ logger.warning(f"Summary {summary.modelEntityId} does not have a URL for format '{fmt}'. Skipping.")
  continue
  fn = url.name + (".gz" if gzip_files else "")
  url_filename_pair = (url, fn)
@@ -267,6 +282,7 @@ def fetch_many(
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
+ all_isoforms: bool = False,
  ) -> list[AlphaFoldEntry]:
  """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.

@@ -277,6 +293,8 @@ def fetch_many(
  max_parallel_downloads: The maximum number of parallel downloads.
  cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
  gzip_files: Whether to gzip the downloaded files.
+ all_isoforms: Whether to return all isoforms of each uniprot entry.
+ When False then returns only the canonical sequence of uniprot entry.

  Returns:
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -286,7 +304,13 @@ def fetch_many(
  return [
  entry
  async for entry in fetch_many_async(
- ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher, gzip_files=gzip_files
+ ids,
+ save_dir,
+ what,
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ all_isoforms=all_isoforms,
  )
  ]

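Putting the new pieces together, a hedged usage sketch of the synchronous wrapper after this release. The import path, accession, and save directory are assumptions; the parameters and format names come from the hunks above.

from pathlib import Path

# Import path is a guess based on the package name; adjust to the real module.
from protein_quest.alphafold.fetch import fetch_many

entries = fetch_many(
    {"O60481"},
    Path("downloads"),
    {"summary", "cif", "paeDoc", "plddtDocUrl"},
    gzip_files=False,
    all_isoforms=False,  # keep only the canonical entry per accession
)
for entry in entries:
    print(entry.uniprot_accession, entry.cif_file, entry.plddt_doc_file)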
@@ -304,13 +328,12 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
  An AlphaFoldEntry instance with paths relative to the session directory.
  """
  return AlphaFoldEntry(
- uniprot_acc=entry.uniprot_acc,
+ uniprot_accession=entry.uniprot_accession,
  summary=entry.summary,
  summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
  bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
  cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
  pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
- pae_image_file=entry.pae_image_file.relative_to(session_dir) if entry.pae_image_file else None,
  pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
  am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
  am_annotations_hg19_file=(
@@ -319,4 +342,6 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
  am_annotations_hg38_file=(
  entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
  ),
+ msa_file=entry.msa_file.relative_to(session_dir) if entry.msa_file else None,
+ plddt_doc_file=entry.plddt_doc_file.relative_to(session_dir) if entry.plddt_doc_file else None,
  )
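relative_to now carries the new msa_file and plddt_doc_file fields through as well. A short continuation of the fetch_many sketch above, assuming the downloaded files live under the directory that was passed as save_dir and that relative_to sits in the same (guessed) module:

from protein_quest.alphafold.fetch import relative_to  # import path is a guess

session_dir = Path("downloads")
for entry in entries:
    rel = relative_to(entry, session_dir)
    # All set paths, including msa_file and plddt_doc_file, are now relative
    # to session_dir; fields that are None stay None.
    print(rel.uniprot_accession, rel.cif_file, rel.plddt_doc_file)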