protein-quest 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

File without changes
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0"
@@ -0,0 +1 @@
1
+ """Modules related to AlphaFold Knowledge Base."""
@@ -0,0 +1,153 @@
1
+ """Module for filtering alphafold structures on confidence."""
2
+
3
+ import logging
4
+ from collections.abc import Generator
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ import gemmi
9
+
10
+ from protein_quest.pdbe.io import write_structure
11
+
12
+ """
13
+ Methods to filter AlphaFoldDB structures on confidence scores.
14
+
15
+ In AlphaFold PDB files, the b-factor column has the
16
+ predicted local distance difference test (pLDDT).
17
+
18
+ See https://www.ebi.ac.uk/training/online/courses/alphafold/inputs-and-outputs/evaluating-alphafolds-predicted-structures-using-confidence-scores/plddt-understanding-local-confidence/
19
+ """
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def find_high_confidence_residues(structure: gemmi.Structure, confidence: float) -> Generator[int]:
    """Yield sequence numbers of residues whose pLDDT exceeds a threshold.

    Args:
        structure: The AlphaFoldDB structure to search.
        confidence: The pLDDT threshold; only residues strictly above it are yielded.

    Yields:
        The sequence numbers of residues with pLDDT above the confidence threshold.
    """
    for model in structure:
        for chain in model:
            for residue in chain:
                # pLDDT lives in the b-factor column; the first atom's value
                # is used as the residue's confidence.
                plddt = residue[0].b_iso
                if plddt <= confidence:
                    continue
                seq_num = residue.seqid.num
                if seq_num is not None:
                    yield seq_num
42
+
43
+
44
def filter_out_low_confidence_residues(structure: gemmi.Structure, allowed_residues: set[int]) -> gemmi.Structure:
    """Return a copy of the structure containing only the allowed residues.

    Args:
        structure: The AlphaFoldDB structure to filter.
        allowed_residues: The set of residue sequence numbers to keep.

    Returns:
        A new AlphaFoldDB structure with low confidence residues removed.
    """
    filtered = structure.clone()
    for model in filtered:
        rebuilt = []
        for chain in model:
            kept = gemmi.Chain(chain.name)
            for residue in chain:
                if residue.seqid.num in allowed_residues:
                    kept.add_residue(residue)
            rebuilt.append(kept)
        # Swap each original chain for its filtered counterpart after the
        # iteration, so the model is not mutated while being traversed.
        for kept in rebuilt:
            model.remove_chain(kept.name)
            model.add_chain(kept)
    return filtered
67
+
68
+
69
@dataclass
class ConfidenceFilterQuery:
    """Query for filtering AlphaFoldDB structures based on confidence.

    Parameters:
        confidence: The confidence threshold for filtering residues.
            Residues with a pLDDT (b-factor) above this value are considered high confidence.
        min_threshold: The minimum number of high-confidence residues required to keep the structure.
        max_threshold: The maximum number of high-confidence residues allowed for the structure to be kept.
    """

    confidence: float
    min_threshold: int
    max_threshold: int
83
+
84
+
85
@dataclass
class ConfidenceFilterResult:
    """Result of filtering AlphaFoldDB structures based on confidence (pLDDT).

    Parameters:
        input_file: The name of the mmcif/PDB file that was processed.
        count: The number of residues with a pLDDT above the confidence threshold.
        filtered_file: The path to the filtered mmcif/PDB file.
            None when the structure did not pass the thresholds and no file was written.
    """

    input_file: str
    count: int
    filtered_file: Path | None = None
98
+
99
+
100
def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
    """Filter a single AlphaFoldDB structure file based on confidence.

    Args:
        file: The path to the PDB file to filter.
        query: The confidence filter query.
        filtered_dir: The directory to save the filtered PDB file.

    Returns:
        Result with the count of high-confidence residues.
        Its filtered_file is set to the written file's path only when the
        structure passed the min/max thresholds; otherwise it stays None.
    """
    structure = gemmi.read_structure(str(file))
    high_confidence = set(find_high_confidence_residues(structure, query.confidence))
    nr_high = len(high_confidence)
    if not (query.min_threshold <= nr_high <= query.max_threshold):
        # Structure falls outside the requested range: report only the count,
        # without writing a filtered file.
        return ConfidenceFilterResult(
            input_file=file.name,
            count=nr_high,
        )
    out_file = filtered_dir / file.name
    filtered_structure = filter_out_low_confidence_residues(structure, high_confidence)
    write_structure(filtered_structure, out_file)
    return ConfidenceFilterResult(
        input_file=file.name,
        count=nr_high,
        filtered_file=out_file,
    )
133
+
134
+
135
def filter_files_on_confidence(
    alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
) -> Generator[ConfidenceFilterResult]:
    """Filter AlphaFoldDB structures based on confidence.

    Args:
        alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
        query: The confidence filter query containing the confidence thresholds.
        filtered_dir: Directory where the filtered mmcif/PDB files will be saved.

    Yields:
        One result per mmcif/PDB file, telling whether it passed the filter
        and how many residues had a pLDDT above the confidence threshold.
    """
    # Note on why code looks duplicated:
    # ../filter.py:filter_files_on_residues() filters on residue count at the
    # file level only; here we also strip low-confidence residues inside each file.
    yield from (filter_file_on_residues(pdb_file, query, filtered_dir) for pdb_file in alphafold_pdb_files)
@@ -0,0 +1,38 @@
1
+ # ruff: noqa: N815 allow camelCase follow what api returns
2
+ from dataclasses import dataclass
3
+
4
+
5
@dataclass
class EntrySummary:
    """Dataclass representing a summary of an AlphaFold entry.

    Modelled after EntrySummary in https://alphafold.ebi.ac.uk/api/openapi.json
    """

    # Identity of the model and of the UniProt entry it was predicted for.
    entryId: str
    uniprotAccession: str
    uniprotId: str
    uniprotDescription: str
    # Source organism.
    taxId: int
    organismScientificName: str
    # Region of the UniProt sequence covered by the model.
    uniprotStart: int
    uniprotEnd: int
    uniprotSequence: str
    # Versioning of the prediction.
    modelCreatedDate: str
    latestVersion: int
    allVersions: list[int]
    # Download URLs for structure files and predicted aligned error (PAE) artifacts.
    bcifUrl: str
    cifUrl: str
    pdbUrl: str
    paeImageUrl: str
    paeDocUrl: str
    # Optional metadata; not present for every entry.
    gene: str | None = None
    sequenceChecksum: str | None = None
    sequenceVersionDate: str | None = None
    # "am" annotation URLs (presumably AlphaMissense; hg19/hg38 look like
    # genome-build variants — confirm against the AlphaFold API docs).
    amAnnotationsUrl: str | None = None
    amAnnotationsHg19Url: str | None = None
    amAnnotationsHg38Url: str | None = None
    isReviewed: bool | None = None
    isReferenceProteome: bool | None = None
    # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
    # TODO like fractionPlddt* fields which can be used in filter_files_on_confidence()
@@ -0,0 +1,314 @@
1
+ """Module for fetch Alphafold data."""
2
+
3
import asyncio
import logging
from asyncio import Semaphore
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass, fields, replace
from pathlib import Path
from textwrap import dedent
from typing import Literal, get_args

from aiohttp_retry import RetryClient
from aiopath import AsyncPath
from cattrs.preconf.orjson import make_converter
from tqdm.asyncio import tqdm

from protein_quest.alphafold.entry_summary import EntrySummary
from protein_quest.utils import friendly_session, retrieve_files
19
+
20
+ logger = logging.getLogger(__name__)
21
+ converter = make_converter()
22
+
23
DownloadableFormat = Literal[
    "bcif",
    "cif",
    "pdb",
    "paeImage",
    "paeDoc",
    "amAnnotations",
    "amAnnotationsHg19",
    "amAnnotationsHg38",
]
"""Types of formats that can be downloaded from the AlphaFold web service."""

# Derive the runtime set from the Literal so the two can never drift apart
# (the set used to be a hand-maintained copy of the Literal members).
downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
"""Set of formats that can be downloaded from the AlphaFold web service."""
46
+
47
+
48
+ def _camel_to_snake_case(name: str) -> str:
49
+ """Convert a camelCase string to snake_case."""
50
+ return "".join(["_" + c.lower() if c.isupper() else c for c in name]).lstrip("_")
51
+
52
+
53
@dataclass
class AlphaFoldEntry:
    """AlphaFoldEntry represents a minimal single entry in the AlphaFold database.

    See https://alphafold.ebi.ac.uk/api-docs for more details on the API and data structure.
    """

    uniprot_acc: str
    summary: EntrySummary | None
    bcif_file: Path | None = None
    cif_file: Path | None = None
    pdb_file: Path | None = None
    pae_image_file: Path | None = None
    pae_doc_file: Path | None = None
    am_annotations_file: Path | None = None
    am_annotations_hg19_file: Path | None = None
    am_annotations_hg38_file: Path | None = None

    @classmethod
    def format2attr(cls, dl_format: DownloadableFormat) -> str:
        """Map a download format to the name of the dataclass attribute holding its file.

        Args:
            dl_format: The format for which to get the attribute name.

        Returns:
            The attribute name corresponding to the download format.

        Raises:
            ValueError: If the format is not valid.
        """
        if dl_format not in downloadable_formats:
            msg = f"Invalid format: {dl_format}. Valid formats are: {downloadable_formats}"
            raise ValueError(msg)
        return f"{_camel_to_snake_case(dl_format)}_file"

    def by_format(self, dl_format: DownloadableFormat) -> Path | None:
        """Look up the file path stored for a given download format.

        Args:
            dl_format: The format for which to get the file path.

        Returns:
            The file path for the format, or None if the file is not set.

        Raises:
            ValueError: If the format is not valid.
        """
        return getattr(self, self.format2attr(dl_format), None)

    def nr_of_files(self) -> int:
        """Count how many of the *_file attributes are set.

        Returns:
            The number of _file properties that are set.
        """
        file_attrs = (name for name in vars(self) if name.endswith("_file"))
        return sum(getattr(self, name) is not None for name in file_attrs)
112
+
113
+
114
async def fetch_summary(
    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
) -> list[EntrySummary]:
    """Fetch the prediction summary for a single qualifier from the AlphaFold API.

    Args:
        qualifier: The uniprot accession for the protein or entry to fetch.
            For example `Q5VSL9`.
        session: An asynchronous HTTP client session with retry capabilities.
        semaphore: A semaphore to limit the number of concurrent requests.
        save_dir: Optional directory used as an on-disk cache for summaries.
            When set and the summary file exists, the summary is loaded from disk
            instead of being fetched from the API.
            When not set, the summary is never saved and is always fetched from the API.

    Returns:
        A list of EntrySummary objects representing the fetched summary.

    Raises:
        HTTPError: If the HTTP request returns an error status code.
        Exception: If there is an error during file reading/writing or data conversion.
    """
    url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
    cache_file: AsyncPath | None = None
    if save_dir is not None:
        cache_file = AsyncPath(save_dir / f"{qualifier}.json")
        if await cache_file.exists():
            logger.debug(f"File {cache_file} already exists. Skipping download from {url}.")
            cached = await cache_file.read_bytes()
            return converter.loads(cached, list[EntrySummary])
    async with semaphore, session.get(url) as response:
        response.raise_for_status()
        payload = await response.content.read()
        # Populate the cache so the next call can skip the network round-trip.
        if cache_file is not None:
            await cache_file.write_bytes(payload)
        return converter.loads(payload, list[EntrySummary])
149
+
150
+
151
async def fetch_summaries(
    qualifiers: Iterable[str], save_dir: Path | None = None, max_parallel_downloads: int = 5
) -> AsyncGenerator[EntrySummary]:
    """Fetch AlphaFold prediction summaries for many qualifiers concurrently.

    Args:
        qualifiers: Uniprot accessions to fetch summaries for.
        save_dir: Optional directory used as an on-disk cache for summaries.
        max_parallel_downloads: The maximum number of concurrent requests.

    Yields:
        Each EntrySummary from each qualifier's summary list, in qualifier order.
    """
    semaphore = Semaphore(max_parallel_downloads)
    if save_dir is not None:
        save_dir.mkdir(parents=True, exist_ok=True)
    async with friendly_session() as session:
        jobs = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
        per_qualifier: list[list[EntrySummary]] = await tqdm.gather(*jobs, desc="Fetching Alphafold summaries")
        # Flatten: one qualifier may map to several summaries.
        for batch in per_qualifier:
            for summary in batch:
                yield summary
165
+
166
+
167
def url2name(url: str) -> str:
    """Given a URL, return the final path component as the name of the file."""
    _, _, name = url.rpartition("/")
    return name if name or "/" in url else url
170
+
171
+
172
async def fetch_many_async(
    ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
) -> AsyncGenerator[AlphaFoldEntry]:
    """Asynchronously fetches summaries and pdb and pae (predicted alignment error) files from
    [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).

    Args:
        ids: A set of Uniprot IDs to fetch.
        save_dir: The directory to save the fetched files to.
        what: A set of formats to download.
        max_parallel_downloads: The maximum number of parallel downloads.

    Yields:
        A dataclass containing the summary and the local path of each downloaded file.
    """
    summaries = [s async for s in fetch_summaries(ids, save_dir, max_parallel_downloads=max_parallel_downloads)]

    files = files_to_download(what, summaries)

    await retrieve_files(
        files,
        save_dir,
        desc="Downloading AlphaFold files",
        max_parallel_downloads=max_parallel_downloads,
    )
    for summary in summaries:
        # Build the <format>_file keyword arguments generically from the requested
        # formats instead of eight hand-written conditionals, so adding a new
        # format only requires extending DownloadableFormat and EntrySummary.
        file_kwargs: dict[str, Path | None] = {}
        for dl_format in what:
            url = getattr(summary, f"{dl_format}Url", None)
            # Optional formats (e.g. amAnnotations*) may have no URL; map those to None.
            file_kwargs[AlphaFoldEntry.format2attr(dl_format)] = save_dir / url2name(url) if url else None
        yield AlphaFoldEntry(
            uniprot_acc=summary.uniprotAccession,
            summary=summary,
            **file_kwargs,
        )
222
+
223
+
224
def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[str, str]]:
    """Collect (url, filename) pairs for the requested formats of the given summaries.

    Args:
        what: The set of formats to download.
        summaries: The entry summaries providing the download URLs.

    Returns:
        A set of (url, filename) tuples; summaries missing a URL for a format are skipped.

    Raises:
        ValueError: If any requested format is not a valid downloadable format.
    """
    unknown = set(what) - downloadable_formats
    if unknown:
        msg = (
            f"Invalid format(s) specified: {unknown}. "
            f"Valid formats are: {downloadable_formats}"
        )
        raise ValueError(msg)

    pairs: set[tuple[str, str]] = set()
    for summary in summaries:
        for fmt in what:
            url = getattr(summary, f"{fmt}Url", None)
            if url is None:
                logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
                continue
            pairs.add((url, url2name(url)))
    return pairs
242
+
243
+
244
class NestedAsyncIOLoopError(RuntimeError):
    """Custom error for nested async I/O loops.

    Raised by the synchronous wrappers (see fetch_many) when they are called
    from an environment whose asyncio event loop is already running,
    such as a Jupyter notebook.
    """
    # Note: the redundant `pass` after the docstring was removed; a docstring
    # alone is a valid class body.
248
+
249
+
250
def fetch_many(
    ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
) -> list[AlphaFoldEntry]:
    """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.

    Thin blocking wrapper around [fetch_many_async][] using asyncio.run.

    Args:
        ids: A set of Uniprot IDs to fetch.
        save_dir: The directory to save the fetched files to.
        what: A set of formats to download.
        max_parallel_downloads: The maximum number of parallel downloads.

    Returns:
        A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.

    Raises:
        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
    """

    async def _collect() -> list[AlphaFoldEntry]:
        entries: list[AlphaFoldEntry] = []
        async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads):
            entries.append(entry)
        return entries

    try:
        return asyncio.run(_collect())
    except RuntimeError as e:
        # asyncio.run refuses to start when a loop is already running;
        # translate that into a package-specific, actionable error.
        msg = dedent("""\
            Can not run async method from an environment where the asyncio event loop is already running.
            Like a Jupyter notebook.

            Please use the `fetch_many_async` function directly or before call

            import nest_asyncio
            nest_asyncio.apply()
            """)
        raise NestedAsyncIOLoopError(msg) from e
287
+
288
+
289
def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
    """Convert paths in an AlphaFoldEntry to be relative to the session directory.

    Args:
        entry: An AlphaFoldEntry instance with absolute paths.
        session_dir: The session directory to which the paths should be made relative.

    Returns:
        An AlphaFoldEntry instance with paths relative to the session directory.
    """
    # Rewrite every *_file field generically instead of listing all eight by
    # hand, so newly added formats are covered automatically (see the TODOs
    # about upcoming fields in EntrySummary).
    updates = {
        f.name: path.relative_to(session_dir) if (path := getattr(entry, f.name)) else None
        for f in fields(entry)
        if f.name.endswith("_file")
    }
    return replace(entry, **updates)