protein-quest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__version__.py +2 -1
- protein_quest/alphafold/confidence.py +44 -17
- protein_quest/alphafold/entry_summary.py +11 -9
- protein_quest/alphafold/fetch.py +37 -63
- protein_quest/cli.py +187 -30
- protein_quest/converter.py +45 -0
- protein_quest/filters.py +78 -35
- protein_quest/go.py +1 -4
- protein_quest/mcp_server.py +8 -5
- protein_quest/parallel.py +37 -1
- protein_quest/pdbe/fetch.py +15 -1
- protein_quest/pdbe/io.py +142 -46
- protein_quest/ss.py +264 -0
- protein_quest/taxonomy.py +13 -3
- protein_quest/utils.py +65 -3
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/METADATA +21 -11
- protein_quest-0.3.2.dist-info/RECORD +26 -0
- protein_quest-0.3.0.dist-info/RECORD +0 -24
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED

@@ -1 +1,2 @@
-__version__ = "0.3.0"
+__version__ = "0.3.2"
+"""The version of the package."""
protein_quest/alphafold/confidence.py
CHANGED

@@ -7,7 +7,10 @@ from pathlib import Path
 
 import gemmi
 
+from protein_quest.converter import Percentage, PositiveInt, converter
 from protein_quest.pdbe.io import write_structure
+from protein_quest.ss import nr_of_residues_in_total
+from protein_quest.utils import CopyMethod, copyfile
 
 """
 Methods to filter AlphaFoldDB structures on confidence scores.
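The `Percentage`, `PositiveInt`, and `converter` names come from the new `protein_quest/converter.py` module (+45 lines in the file list above, not included in this diff). A hypothetical sketch of what that module may contain, inferred only from how the names are used here:

```python
# Hypothetical reconstruction; protein_quest/converter.py is not part of this
# diff excerpt. Percentage and PositiveInt are assumed to be Annotated aliases
# and `converter` a shared cattrs orjson converter.
from typing import Annotated

from annotated_types import Ge, Le
from cattrs.preconf.orjson import make_converter

Percentage = Annotated[float, Ge(0), Le(100)]  # assumed bounds
PositiveInt = Annotated[int, Ge(0)]  # assumed definition

converter = make_converter()
```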
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
     Parameters:
         confidence: The confidence threshold for filtering residues.
             Residues with a pLDDT (b-factor) above this value are considered high confidence.
-
-
+        min_residues: The minimum number of high-confidence residues required to keep the structure.
+        max_residues: The maximum number of high-confidence residues required to keep the structure.
     """
 
-    confidence:
-
-
+    confidence: Percentage
+    min_residues: PositiveInt
+    max_residues: PositiveInt
+
+
+base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+@converter.register_structure_hook
+def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+    result: ConfidenceFilterQuery = base_query_hook(val, _type)
+    if result.min_residues > result.max_residues:
+        msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+        raise ValueError(msg)
+    return result
 
 
 @dataclass
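The new structure hook folds cross-field validation into cattrs structuring: the default hook builds the dataclass, then inconsistent bounds are rejected. A small usage sketch (threshold values are illustrative):

```python
from protein_quest.alphafold.confidence import ConfidenceFilterQuery
from protein_quest.converter import converter

# Valid bounds structure as usual.
query = converter.structure(
    {"confidence": 70, "min_residues": 100, "max_residues": 1000},
    ConfidenceFilterQuery,
)

# min_residues > max_residues is now rejected during structuring.
try:
    converter.structure(
        {"confidence": 70, "min_residues": 1000, "max_residues": 100},
        ConfidenceFilterQuery,
    )
except ValueError as e:
    print(e)  # min_residues 1000 cannot be larger than max_residues 100
```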
@@ -93,38 +108,46 @@ class ConfidenceFilterResult:
     """
 
     input_file: str
-    count:
+    count: PositiveInt
     filtered_file: Path | None = None
 
 
-def filter_file_on_residues(
-
+def filter_file_on_residues(
+    file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+) -> ConfidenceFilterResult:
+    """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
 
     Args:
         file: The path to the PDB file to filter.
         query: The confidence filter query.
         filtered_dir: The directory to save the filtered PDB file.
+        copy_method: How to copy when no residues have to be removed.
 
     Returns:
         result with filtered_file property set to Path where filtered PDB file is saved.
-
+        or None if structure was filtered out.
     """
     structure = gemmi.read_structure(str(file))
     residues = set(find_high_confidence_residues(structure, query.confidence))
     count = len(residues)
-    if count < query.
+    if count < query.min_residues or count > query.max_residues:
         # Skip structure that is outside the min and max threshold
         # just return number of high confidence residues
         return ConfidenceFilterResult(
             input_file=file.name,
             count=count,
         )
+    total_residues = nr_of_residues_in_total(structure)
     filtered_file = filtered_dir / file.name
-
-
-
-
-
+    if count == total_residues:
+        # if no residues have to be removed then copy instead of slower gemmi writing
+        copyfile(file, filtered_file, copy_method)
+    else:
+        new_structure = filter_out_low_confidence_residues(
+            structure,
+            residues,
+        )
+        write_structure(new_structure, filtered_file)
     return ConfidenceFilterResult(
         input_file=file.name,
         count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
 
 
 def filter_files_on_confidence(
-    alphafold_pdb_files: list[Path],
+    alphafold_pdb_files: list[Path],
+    query: ConfidenceFilterQuery,
+    filtered_dir: Path,
+    copy_method: CopyMethod = "copy",
 ) -> Generator[ConfidenceFilterResult]:
     """Filter AlphaFoldDB structures based on confidence.
 

@@ -141,6 +167,7 @@ def filter_files_on_confidence(
         alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
         query: The confidence filter query containing the confidence thresholds.
         filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+        copy_method: How to copy when a direct copy is possible.
 
     Yields:
         For each mmcif/PDB files yields whether it was filtered or not,

@@ -150,4 +177,4 @@ def filter_files_on_confidence(
     # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
     # here we filter on file level and inside file remove low confidence residues
     for pdb_file in alphafold_pdb_files:
-        yield filter_file_on_residues(pdb_file, query, filtered_dir)
+        yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
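Taken together, 0.3.2 threads a `copy_method` argument through the filter API and copies files untouched when every residue passes the confidence threshold. A hypothetical call (paths and thresholds are illustrative):

```python
from pathlib import Path

from protein_quest.alphafold.confidence import (
    ConfidenceFilterQuery,
    filter_files_on_confidence,
)

query = ConfidenceFilterQuery(confidence=70, min_residues=100, max_residues=1000)
for result in filter_files_on_confidence(
    sorted(Path("afdb").glob("*.cif")), query, Path("filtered"), copy_method="copy"
):
    # filtered_file stays None when the structure fell outside the residue bounds
    print(result.input_file, result.count, result.filtered_file)
```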
protein_quest/alphafold/entry_summary.py
CHANGED

@@ -1,12 +1,14 @@
 # ruff: noqa: N815 allow camelCase follow what api returns
 from dataclasses import dataclass
 
+from yarl import URL
+
 
 @dataclass
 class EntrySummary:
     """Dataclass representing a summary of an AlphaFold entry.
 
-    Modelled after EntrySummary in https://alphafold.ebi.ac.uk/api/openapi.json
+    Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
     """
 
     entryId: str

@@ -21,17 +23,17 @@ class EntrySummary:
     modelCreatedDate: str
     latestVersion: int
     allVersions: list[int]
-    bcifUrl:
-    cifUrl:
-    pdbUrl:
-    paeImageUrl:
-    paeDocUrl:
+    bcifUrl: URL
+    cifUrl: URL
+    pdbUrl: URL
+    paeImageUrl: URL
+    paeDocUrl: URL
     gene: str | None = None
     sequenceChecksum: str | None = None
     sequenceVersionDate: str | None = None
-    amAnnotationsUrl:
-    amAnnotationsHg19Url:
-    amAnnotationsHg38Url:
+    amAnnotationsUrl: URL | None = None
+    amAnnotationsHg19Url: URL | None = None
+    amAnnotationsHg38Url: URL | None = None
     isReviewed: bool | None = None
     isReferenceProteome: bool | None = None
     # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
protein_quest/alphafold/fetch.py
CHANGED
@@ -1,26 +1,26 @@
 """Module for fetch Alphafold data."""
 
-import asyncio
 import logging
 from asyncio import Semaphore
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from textwrap import dedent
-from typing import Literal
+from typing import Literal, cast, get_args
 
 from aiohttp_retry import RetryClient
 from aiopath import AsyncPath
-from cattrs.preconf.orjson import make_converter
 from tqdm.asyncio import tqdm
+from yarl import URL
 
 from protein_quest.alphafold.entry_summary import EntrySummary
-from protein_quest.
+from protein_quest.converter import converter
+from protein_quest.utils import friendly_session, retrieve_files, run_async
 
 logger = logging.getLogger(__name__)
-
+
 
 DownloadableFormat = Literal[
+    "summary",
     "bcif",
     "cif",
     "pdb",
@@ -32,16 +32,7 @@ DownloadableFormat = Literal[
 ]
 """Types of formats that can be downloaded from the AlphaFold web service."""
 
-downloadable_formats: set[DownloadableFormat] = {
-    "bcif",
-    "cif",
-    "pdb",
-    "paeImage",
-    "paeDoc",
-    "amAnnotations",
-    "amAnnotationsHg19",
-    "amAnnotationsHg38",
-}
+downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
 """Set of formats that can be downloaded from the AlphaFold web service."""
 
 
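Deriving the runtime set with `typing.get_args` keeps it in sync with the `Literal` definition instead of maintaining the members twice. A minimal illustration of the mechanism:

```python
from typing import Literal, get_args

# Shortened stand-in for DownloadableFormat
Format = Literal["summary", "bcif", "cif", "pdb"]

# get_args returns the Literal's members as a tuple
formats: set[Format] = set(get_args(Format))
print(formats)  # {'summary', 'bcif', 'cif', 'pdb'} (set order varies)
```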
@@ -59,6 +50,7 @@ class AlphaFoldEntry:
 
     uniprot_acc: str
     summary: EntrySummary | None
+    summary_file: Path | None = None
     bcif_file: Path | None = None
     cif_file: Path | None = None
     pdb_file: Path | None = None

@@ -127,10 +119,6 @@ async def fetch_summary(
 
     Returns:
         A list of EntrySummary objects representing the fetched summary.
-
-    Raises:
-        HTTPError: If the HTTP request returns an error status code.
-        Exception: If there is an error during file reading/writing or data conversion.
     """
     url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
     fn: AsyncPath | None = None

@@ -144,6 +132,7 @@ async def fetch_summary(
             response.raise_for_status()
             raw_data = await response.content.read()
             if fn is not None:
+                # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
                 await fn.write_bytes(raw_data)
             return converter.loads(raw_data, list[EntrySummary])
 
@@ -164,19 +153,14 @@
             yield summary
 
 
-def url2name(url: str) -> str:
-    """Given a URL, return the final path component as the name of the file."""
-    return url.split("/")[-1]
-
-
 async def fetch_many_async(
-
+    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> AsyncGenerator[AlphaFoldEntry]:
-    """Asynchronously fetches summaries and
+    """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
 
     Args:
-
+        uniprot_accessions: A set of Uniprot acessions to fetch.
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
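The removed `url2name` helper becomes redundant once the summary URLs are typed as `yarl.URL`, which exposes the final path segment as `.name` (the example URL below is illustrative):

```python
from yarl import URL

url = URL("https://alphafold.ebi.ac.uk/files/AF-P12345-F1-model_v4.cif")
print(url.name)  # AF-P12345-F1-model_v4.cif
print(url.name == url.path.split("/")[-1])  # True, same as the old url2name()
```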
@@ -184,7 +168,13 @@ async def fetch_many_async(
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
-
+    save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+    summaries = [
+        s
+        async for s in fetch_summaries(
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+        )
+    ]
 
     files = files_to_download(what, summaries)
 

@@ -198,30 +188,31 @@ async def fetch_many_async(
         yield AlphaFoldEntry(
             uniprot_acc=summary.uniprotAccession,
             summary=summary,
-
-
-
-
-
+            summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
+            bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
+            cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
+            pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
+            pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
+            pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
             am_annotations_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsUrl.name
                 if "amAnnotations" in what and summary.amAnnotationsUrl
                 else None
             ),
             am_annotations_hg19_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg19Url.name
                 if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
                 else None
             ),
             am_annotations_hg38_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg38Url.name
                 if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
                 else None
             ),
         )
 
 
-def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[
+def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
     if not (set(what) <= downloadable_formats):
         msg = (
             f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -229,24 +220,21 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
         )
         raise ValueError(msg)
 
-    files: set[tuple[
+    files: set[tuple[URL, str]] = set()
    for summary in summaries:
         for fmt in what:
-
+            if fmt == "summary":
+                # summary is handled already in fetch_summary
+                continue
+            url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
             if url is None:
                 logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
                 continue
-            file = (url,
+            file = (url, url.name)
             files.add(file)
     return files
 
 
-class NestedAsyncIOLoopError(RuntimeError):
-    """Custom error for nested async I/O loops."""
-
-    pass
-
-
 def fetch_many(
     ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> list[AlphaFoldEntry]:

@@ -260,9 +248,6 @@ def fetch_many(
 
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
-
-    Raises:
-        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
     """
 
     async def gather_entries():
@@ -271,19 +256,7 @@
             async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
         ]
 
-    try:
-        return asyncio.run(gather_entries())
-    except RuntimeError as e:
-        msg = dedent("""\
-            Can not run async method from an environment where the asyncio event loop is already running.
-            Like a Jupyter notebook.
-
-            Please use the `fetch_many_async` function directly or before call
-
-            import nest_asyncio
-            nest_asyncio.apply()
-            """)
-        raise NestedAsyncIOLoopError(msg) from e
+    return run_async(gather_entries())
 
 
 def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
@@ -299,6 +272,7 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
     return AlphaFoldEntry(
         uniprot_acc=entry.uniprot_acc,
         summary=entry.summary,
+        summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
         bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
         cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
         pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
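With `NestedAsyncIOLoopError` and the nest_asyncio advice gone, event-loop handling is delegated to `run_async` from `protein_quest/utils.py` (its body is not shown in this diff). A hypothetical synchronous call (accession and paths are illustrative):

```python
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

# Fetches the summary JSON and PDB file for each accession into save_dir.
entries = fetch_many(["P12345"], Path("downloads"), what={"summary", "pdb"})
for entry in entries:
    print(entry.uniprot_acc, entry.summary_file, entry.pdb_file)
```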