protein-quest 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic.

protein_quest/emdb.py ADDED
@@ -0,0 +1,34 @@
+ """Module dealing with Electron Microscopy Data Bank (EMDB)."""
+
+ from collections.abc import Iterable, Mapping
+ from pathlib import Path
+
+ from protein_quest.utils import retrieve_files
+
+
+ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
+     # https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-19583/map/emd_19583.map.gz
+     fn = emdb_id.lower().replace("emd-", "emd_") + ".map.gz"
+     url = f"https://ftp.ebi.ac.uk/pub/databases/emdb/structures/{emdb_id}/map/{fn}"
+     return url, fn
+
+
+ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1) -> Mapping[str, Path]:
+     """Fetches volume files from the EMDB database.
+
+     Args:
+         emdb_ids: A list of EMDB IDs to fetch.
+         save_dir: The directory to save the downloaded files.
+         max_parallel_downloads: The maximum number of parallel downloads.
+
+     Returns:
+         A mapping of EMDB IDs to their downloaded files.
+     """
+     id2urls = {emdb_id: _map_id2volume_url(emdb_id) for emdb_id in emdb_ids}
+     urls = list(id2urls.values())
+     id2paths = {emdb_id: save_dir / fn for emdb_id, (_, fn) in id2urls.items()}
+
+     # TODO show progress of each item
+     # TODO handle failed downloads, by skipping them instead of raising an error
+     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+     return id2paths
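
For orientation, a minimal usage sketch of the `fetch` coroutine above. The EMDB ID comes from the example URL in `_map_id2volume_url`; the behaviour of `protein_quest.utils.retrieve_files` (e.g. whether it creates the save directory) is not shown in this diff, so the directory is created up front as a precaution.

```python
import asyncio
from pathlib import Path

from protein_quest.emdb import fetch


async def main() -> None:
    save_dir = Path("emdb_volumes")
    # Precaution: retrieve_files is not part of this diff, so create the directory ourselves.
    save_dir.mkdir(parents=True, exist_ok=True)
    # Download the gzipped volume map for a single EMDB entry.
    id2paths = await fetch(["EMD-19583"], save_dir, max_parallel_downloads=1)
    for emdb_id, path in id2paths.items():
        print(emdb_id, path)  # EMD-19583 emdb_volumes/emd_19583.map.gz


asyncio.run(main())
```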
@@ -0,0 +1,107 @@
+ """Module for filtering structure files and their contents."""
+
+ import logging
+ from collections.abc import Generator
+ from dataclasses import dataclass
+ from pathlib import Path
+ from shutil import copyfile
+ from typing import cast
+
+ from dask.distributed import Client, progress
+ from distributed.deploy.cluster import Cluster
+ from tqdm.auto import tqdm
+
+ from protein_quest.parallel import configure_dask_scheduler
+ from protein_quest.pdbe.io import (
+     locate_structure_file,
+     nr_residues_in_chain,
+     write_single_chain_pdb_file,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def filter_files_on_chain(
+     input_dir: Path,
+     id2chains: dict[str, str],
+     output_dir: Path,
+     scheduler_address: str | Cluster | None = None,
+     out_chain: str = "A",
+ ) -> list[tuple[str, str, Path | None]]:
+     """Filter mmCIF/PDB files by chain.
+
+     Args:
+         input_dir: The directory containing the input mmCIF/PDB files.
+         id2chains: Which chain to keep for each PDB ID. Key is the PDB ID, value is the chain ID.
+         output_dir: The directory where the filtered files will be written.
+         scheduler_address: The address of the Dask scheduler.
+         out_chain: The chain identifier under which to write the kept chain.
+
+     Returns:
+         A list of tuples containing the PDB ID, chain ID, and path to the filtered file.
+         The last tuple item is None if something went wrong, such as the chain not being present.
+     """
+     output_dir.mkdir(parents=True, exist_ok=True)
+     scheduler_address = configure_dask_scheduler(
+         scheduler_address,
+         name="filter-chain",
+     )
+
+     def task(id2chain: tuple[str, str]) -> tuple[str, str, Path | None]:
+         pdb_id, chain = id2chain
+         input_file = locate_structure_file(input_dir, pdb_id)
+         return pdb_id, chain, write_single_chain_pdb_file(input_file, chain, output_dir, out_chain=out_chain)
+
+     with Client(scheduler_address) as client:
+         logger.info(f"Follow progress on dask dashboard at: {client.dashboard_link}")
+
+         futures = client.map(task, id2chains.items())
+
+         progress(futures)
+
+         results = client.gather(futures)
+         return cast("list[tuple[str, str, Path | None]]", results)
+
+
+ @dataclass
+ class FilterStat:
+     """Statistics for filtering files based on residue count in a specific chain.
+
+     Parameters:
+         input_file: The path to the input file.
+         residue_count: The number of residues.
+         passed: Whether the file passed the filtering criteria.
+         output_file: The path to the output file, if passed.
+     """
+
+     input_file: Path
+     residue_count: int
+     passed: bool
+     output_file: Path | None
+
+
+ def filter_files_on_residues(
+     input_files: list[Path], output_dir: Path, min_residues: int, max_residues: int, chain: str = "A"
+ ) -> Generator[FilterStat]:
+     """Filter PDB/mmCIF files by the number of residues in a given chain.
+
+     Args:
+         input_files: The list of input PDB/mmCIF files.
+         output_dir: The directory where the filtered files will be written.
+         min_residues: The minimum number of residues in the chain.
+         max_residues: The maximum number of residues in the chain.
+         chain: The chain to count residues of.
+
+     Yields:
+         FilterStat objects containing information about the filtering process for each input file.
+     """
+     output_dir.mkdir(parents=True, exist_ok=True)
+     for input_file in tqdm(input_files, unit="file"):
+         residue_count = nr_residues_in_chain(input_file, chain=chain)
+         passed = min_residues <= residue_count <= max_residues
+         if passed:
+             output_file = output_dir / input_file.name
+             copyfile(input_file, output_file)
+             yield FilterStat(input_file, residue_count, True, output_file)
+         else:
+             yield FilterStat(input_file, residue_count, False, None)
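
A short sketch of how the synchronous `filter_files_on_residues` generator above could be driven. The import path `protein_quest.filter` is an assumption, since the hunk header for this module does not show its filename.

```python
from pathlib import Path

# NOTE: the module path below is assumed; the diff does not show this file's name.
from protein_quest.filter import FilterStat, filter_files_on_residues

input_files = sorted(Path("structures").glob("*.cif.gz"))
stats: list[FilterStat] = list(
    filter_files_on_residues(
        input_files,
        output_dir=Path("filtered"),
        min_residues=100,
        max_residues=200,
        chain="A",
    )
)
# Files whose chain A residue count falls inside [100, 200] are copied to ./filtered;
# the others are only reported with passed=False.
for stat in stats:
    print(stat.input_file.name, stat.residue_count, stat.passed)
```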
protein_quest/go.py ADDED
@@ -0,0 +1,168 @@
+ """Module for Gene Ontology (GO) functions."""
+
+ import csv
+ import logging
+ from collections.abc import Generator
+ from dataclasses import dataclass
+ from io import TextIOWrapper
+ from typing import Literal, get_args
+
+ from cattrs.gen import make_dict_structure_fn, override
+ from cattrs.preconf.orjson import make_converter
+
+ from protein_quest.utils import friendly_session
+
+ logger = logging.getLogger(__name__)
+
+ Aspect = Literal["cellular_component", "biological_process", "molecular_function"]
+ """The aspect of the GO term."""
+ allowed_aspects = set(get_args(Aspect))
+ """Allowed aspects for GO terms."""
+
+
+ @dataclass(frozen=True, slots=True)
+ class GoTerm:
+     """A Gene Ontology (GO) term.
+
+     Parameters:
+         id: The unique identifier for the GO term, e.g., 'GO:0043293'.
+         is_obsolete: Whether the GO term is obsolete.
+         name: The name of the GO term.
+         definition: The definition of the GO term.
+         aspect: The aspect of the GO term.
+     """
+
+     id: str
+     is_obsolete: bool
+     name: str
+     definition: str
+     aspect: Aspect
+
+
+ @dataclass(frozen=True, slots=True)
+ class PageInfo:
+     current: int
+     total: int
+
+
+ @dataclass(frozen=True, slots=True)
+ class SearchResponse:
+     results: list[GoTerm]
+     number_of_hits: int
+     page_info: PageInfo
+
+
+ converter = make_converter()
+
+
+ def flatten_definition(definition, _context) -> str:
+     return definition["text"]
+
+
+ # Use hook to convert incoming camelCase to snake_case
+ # and to flatten definition {text} to text
+ # see https://catt.rs/en/stable/customizing.html#rename
+ converter.register_structure_hook(
+     GoTerm,
+     make_dict_structure_fn(
+         GoTerm,
+         converter,
+         is_obsolete=override(rename="isObsolete"),
+         definition=override(struct_hook=flatten_definition),
+     ),
+ )
+ converter.register_structure_hook(
+     SearchResponse,
+     make_dict_structure_fn(
+         SearchResponse, converter, number_of_hits=override(rename="numberOfHits"), page_info=override(rename="pageInfo")
+     ),
+ )
+
+
+ async def search_gene_ontology_term(
+     term: str, aspect: Aspect | None = None, include_obsolete: bool = False, limit: int = 100
+ ) -> list[GoTerm]:
+     """Search for a Gene Ontology (GO) term by its name or ID.
+
+     Calls the EBI QuickGO API at https://www.ebi.ac.uk/QuickGO/api/index.html .
+
+     Examples:
+         To search for `apoptosome` terms do:
+
+         >>> from protein_quest.go import search_gene_ontology_term
+         >>> r = await search_gene_ontology_term('apoptosome')
+         >>> len(r)
+         5
+         >>> r[0]
+         GoTerm(id='GO:0043293', is_obsolete=False, name='apoptosome', definition='A multisubunit protein ...')
+
+     Args:
+         term: The GO term to search for. For example `nucleus` or `GO:0006816`.
+         aspect: The aspect to filter by. If not given, all aspects are included.
+         include_obsolete: Whether to include obsolete terms. By default, obsolete terms are excluded.
+         limit: The maximum number of results to return.
+
+     Returns:
+         List of GO terms.
+
+     Raises:
+         ValueError: If the aspect is invalid.
+     """
+     url = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/search"
+     page_limit = 100
+     params = {"query": term, "limit": str(page_limit), "page": "1"}
+     if aspect is not None and aspect not in allowed_aspects:
+         msg = f"Invalid aspect: {aspect}. Allowed aspects are: {allowed_aspects} or None."
+         raise ValueError(msg)
+     logger.debug("Fetching GO terms from %s with params %s", url, params)
+     async with friendly_session() as session:
+         # Fetch first page to learn how many pages there are
+         async with session.get(url, params=params) as response:
+             response.raise_for_status()
+             raw_data = await response.read()
+             data = converter.loads(raw_data, SearchResponse)
+
+         terms = list(_filter_go_terms(data.results, aspect, include_obsolete))
+         if len(terms) >= limit:
+             # Do not fetch additional pages if we have enough results
+             return terms[:limit]
+         total_pages = data.page_info.total
+         logger.debug("GO search returned %s pages (current=%s)", total_pages, data.page_info.current)
+
+         # Retrieve remaining pages (if any) and extend results
+         if total_pages > 1:
+             for page in range(2, total_pages + 1):
+                 params["page"] = str(page)
+                 logger.debug("Fetching additional GO terms page %s/%s with params %s", page, total_pages, params)
+                 async with session.get(url, params=params) as response:
+                     response.raise_for_status()
+                     raw_data = await response.read()
+                     data = converter.loads(raw_data, SearchResponse)
+                 terms.extend(_filter_go_terms(data.results, aspect, include_obsolete))
+                 if len(terms) >= limit:
+                     # Do not fetch additional pages if we have enough results
+                     break
+
+     return terms[:limit]
+
+
+ def _filter_go_terms(terms: list[GoTerm], aspect: Aspect | None, include_obsolete: bool) -> Generator[GoTerm]:
+     for oboterm in terms:
+         if not include_obsolete and oboterm.is_obsolete:
+             continue
+         if aspect and oboterm.aspect != aspect:
+             continue
+         yield oboterm
+
+
+ def write_go_terms_to_csv(terms: list[GoTerm], csv_file: TextIOWrapper) -> None:
+     """Write a list of GO terms to a CSV file.
+
+     Args:
+         terms: The list of GO terms to write.
+         csv_file: The CSV file to write to.
+     """
+     writer = csv.writer(csv_file)
+     writer.writerow(["id", "name", "aspect", "definition"])
+     for term in terms:
+         writer.writerow([term.id, term.name, term.aspect, term.definition])
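
A minimal sketch of calling `search_gene_ontology_term` and writing the results with `write_go_terms_to_csv`, following the signatures above; it queries the live QuickGO API, so it needs network access.

```python
import asyncio

from protein_quest.go import search_gene_ontology_term, write_go_terms_to_csv


async def main() -> None:
    # Only cellular_component terms, capped at 10 results.
    terms = await search_gene_ontology_term("nucleus", aspect="cellular_component", limit=10)
    with open("go_terms.csv", "w", newline="") as csv_file:
        write_go_terms_to_csv(terms, csv_file)


asyncio.run(main())
```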
protein_quest/mcp_server.py ADDED
@@ -0,0 +1,208 @@
+ """MCP server for protein-quest.
+
+ Can be run with:
+
+ ```shell
+ # for development
+ fastmcp dev src/protein_quest/mcp_server.py
+ # or from inspector
+ npx @modelcontextprotocol/inspector
+ # transport type: stdio
+ # command: protein-quest
+ # arguments: mcp
+
+ # or with server and inspector
+ protein-quest mcp --transport streamable-http
+ # in another shell
+ npx @modelcontextprotocol/inspector
+ # transport type: streamable http
+ # URL: http://127.0.0.1:8000/mcp
+
+ # or with copilot in VS code
+ # ctrl + shift + p
+ # mcp: add server...
+ # Choose STDIO
+ # command: uv run protein-quest mcp
+ # id: protein-quest
+ # Prompt: What are the PDBe structures for `A8MT69` uniprot accession?
+ ```
+
+ Examples:
+
+     For search_pdb use `A8MT69` as input.
+
+ """
+
+ from pathlib import Path
+ from textwrap import dedent
+ from typing import Annotated
+
+ from fastmcp import FastMCP
+ from pydantic import Field
+
+ from protein_quest.alphafold.confidence import ConfidenceFilterQuery, ConfidenceFilterResult, filter_file_on_residues
+ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
+ from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
+ from protein_quest.emdb import fetch as emdb_fetch
+ from protein_quest.go import search_gene_ontology_term
+ from protein_quest.pdbe.fetch import fetch as pdbe_fetch
+ from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
+ from protein_quest.taxonomy import search_taxon
+ from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+
+ mcp = FastMCP("protein-quest")
+
+ # do not want to make dataclasses in non-mcp code into Pydantic models,
+ # so we use Annotated here to add description on roots.
+
+
+ @mcp.tool
+ def search_uniprot(
+     uniprot_query: Annotated[Query, Field(description=Query.__doc__)],
+     limit: Annotated[int, Field(gt=0, description="Limit the number of uniprot accessions returned")] = 100,
+ ) -> set[str]:
+     """Search UniProt for proteins matching the given query."""
+     return search4uniprot(uniprot_query, limit=limit)
+
+
+ @mcp.tool
+ def search_pdb(
+     uniprot_accs: set[str],
+     limit: Annotated[int, Field(gt=0, description="Limit the number of entries returned")] = 100,
+ ) -> Annotated[
+     dict[str, set[PdbResult]],
+     Field(
+         description=dedent(f"""\
+             Dictionary with protein IDs as keys and sets of PDB results as values.
+             A PDB result is {PdbResult.__doc__}""")
+     ),
+ ]:
+     """Search PDBe structures for given uniprot accessions."""
+     return search4pdb(uniprot_accs, limit=limit)
+
+
+ mcp.tool(pdbe_fetch, name="fetch_pdbe_structures")
+
+
+ @mcp.tool
+ def extract_single_chain_from_structure(
+     input_file: Path,
+     chain2keep: str,
+     output_dir: Path,
+     out_chain: str = "A",
+ ) -> Path | None:
+     """
+     Extract a single chain from a mmCIF/PDB file and write it to a new file.
+
+     Args:
+         input_file: Path to the input mmCIF/PDB file.
+         chain2keep: The chain to keep.
+         output_dir: Directory to save the output file.
+         out_chain: The chain identifier for the output file.
+
+     Returns:
+         Path to the output mmCIF/PDB file or None if not created.
+     """
+     return write_single_chain_pdb_file(input_file, chain2keep, output_dir, out_chain)
+
+
+ @mcp.tool
+ def list_structure_files(path: Path) -> list[Path]:
+     """List structure files (.pdb, .pdb.gz, .cif, .cif.gz) in the specified directory."""
+     return list(glob_structure_files(path))
+
+
+ # TODO replace remaining decorators with wrapper if tool does single function call
+ # so we do not have to replicate docstring,
+ # minor con is that it does not show up in api docs
+ mcp.tool(nr_residues_in_chain)
+ mcp.tool(search_taxon)
+ mcp.tool(search_gene_ontology_term)
+
+
+ @mcp.tool
+ def search_alphafolds(
+     uniprot_accs: set[str],
+     limit: Annotated[int, Field(gt=0, description="Limit the number of entries returned")] = 100,
+ ) -> Annotated[
+     set[str],
+     Field(description="Set of uniprot accessions which have an AlphaFold entry"),
+ ]:
+     """Search for AlphaFold entries for the given UniProtKB accessions."""
+     # each uniprot accession can have one or more AlphaFold IDs
+     # an AlphaFold ID is the same as the uniprot accession
+     # so we return a subset of uniprot_accs
+     results = search4af(uniprot_accs, limit)
+     return {k for k, v in results.items() if v}
+
+
+ mcp.tool(search4emdb, name="search_emdb")
+
+
+ @mcp.tool
+ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[AlphaFoldEntry]:
+     """Fetch the AlphaFold summary and mmCIF file for given UniProt accessions.
+
+     Args:
+         uniprot_accs: A set of UniProt accessions.
+         save_dir: The directory to save the fetched files.
+
+     Returns:
+         A list of AlphaFold entries.
+     """
+     what: set[DownloadableFormat] = {"cif"}
+     return alphafold_fetch(uniprot_accs, save_dir, what)
+
+
+ mcp.tool(emdb_fetch, name="fetch_emdb_volumes")
+
+
+ @mcp.tool
+ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
+     """Take a mmCIF/PDB file and filter it based on confidence (pLDDT) scores.
+
+     If the file passes the filter, a copy containing only residues above the confidence threshold is written to filtered_dir.
+     """
+     return filter_file_on_residues(file, query, filtered_dir)
+
+
+ @mcp.prompt
+ def candidate_structures(
+     species: str = "Human",
+     cellular_location: str = "nucleus",
+     confidence: int = 90,
+     min_residues: int = 100,
+     max_residues: int = 200,
+ ) -> str:
+     """Prompt to find candidate structures.
+
+     Args:
+         species: The species to search for (default: "Human").
+         cellular_location: The cellular location to search for (default: "nucleus").
+         confidence: The confidence threshold for AlphaFold structures (default: 90).
+         min_residues: Minimum number of high confidence residues (default: 100).
+         max_residues: Maximum number of high confidence residues (default: 200).
+
+     Returns:
+         A prompt string to find candidate structures.
+     """
+     return dedent(f"""\
+         Given the species '{species}' and cellular location '{cellular_location}' find the candidate structures.
+         Download structures from 2 sources, namely PDB and AlphaFold.
+         For AlphaFold I only want to use high confidence scores of over {confidence}
+         and only keep structures with number of high confidence residues between {min_residues} and {max_residues}.
+
+         1. Search uniprot for proteins related to {species} and {cellular_location}.
+             1. For the species find the NCBI taxonomy id.
+             2. For cellular location find the associated GO term.
+             3. Find uniprot accessions based on NCBI taxonomy id and cellular location GO term.
+         2. For PDB
+             1. Search for structures related to the identified proteins.
+             2. Download each PDB entry from PDBe.
+             3. Extract chain for the protein of interest.
+         3. For AlphaFold
+             1. Search for AlphaFold entries related to the identified proteins.
+             2. Download each AlphaFold entry.
+             3. Filter the structures based on {confidence} as confidence
+                and nr residues between {min_residues} and {max_residues}.
+         """)
protein_quest/parallel.py ADDED
@@ -0,0 +1,68 @@
+ """Dask helper functions."""
+
+ import logging
+ import os
+
+ from dask.distributed import LocalCluster
+ from distributed.deploy.cluster import Cluster
+ from psutil import cpu_count
+
+ logger = logging.getLogger(__name__)
+
+
+ def configure_dask_scheduler(
+     scheduler_address: str | Cluster | None,
+     name: str,
+     nproc: int = 1,
+ ) -> str | Cluster:
+     """Configure the Dask scheduler by reusing an existing cluster or creating a new local one.
+
+     Args:
+         scheduler_address: Address of the Dask scheduler to connect to, or None for a local cluster.
+         name: Name for the Dask cluster.
+         nproc: Number of processes to use per worker for CPU support.
+
+     Returns:
+         A Dask Cluster instance or a string address for the scheduler.
+     """
+     if scheduler_address is None:
+         scheduler_address = _configure_cpu_dask_scheduler(nproc, name)
+         logger.info(f"Using local Dask cluster: {scheduler_address}")
+
+     return scheduler_address
+
+
+ def nr_cpus() -> int:
+     """Determine the number of CPU cores to use.
+
+     If the environment variables SLURM_CPUS_PER_TASK or OMP_NUM_THREADS are set,
+     their value is used. Otherwise, the number of physical CPU cores is returned.
+
+     Returns:
+         The number of CPU cores to use.
+
+     Raises:
+         ValueError: If the number of physical CPU cores cannot be determined.
+     """
+     physical_cores = cpu_count(logical=False)
+     if physical_cores is None:
+         msg = "Cannot determine number of physical CPU cores."
+         raise ValueError(msg)
+     for var in ["SLURM_CPUS_PER_TASK", "OMP_NUM_THREADS"]:
+         value = os.environ.get(var)
+         if value is not None:
+             logger.warning(
+                 'Not using all CPU cores (%s) of machine, environment variable "%s" is set to %s.',
+                 physical_cores,
+                 var,
+                 value,
+             )
+             return int(value)
+     return physical_cores
+
+
+ def _configure_cpu_dask_scheduler(nproc: int, name: str) -> LocalCluster:
+     total_cpus = nr_cpus()
+     n_workers = total_cpus // nproc
+     # Use single thread per worker to prevent GIL slowing down the computations
+     return LocalCluster(name=name, threads_per_worker=1, n_workers=n_workers)
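
A small sketch showing how `configure_dask_scheduler` above is used, mirroring the call in `filter_files_on_chain`: passing None creates a `LocalCluster` sized by `nr_cpus()`, while a scheduler address or existing cluster would be passed through unchanged.

```python
from dask.distributed import Client

from protein_quest.parallel import configure_dask_scheduler, nr_cpus

print("CPU cores to use:", nr_cpus())

# No address given, so a LocalCluster with one thread per worker is created.
scheduler = configure_dask_scheduler(None, name="example", nproc=1)
with Client(scheduler) as client:
    print("Dashboard:", client.dashboard_link)
```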
protein_quest/pdbe/__init__.py ADDED
@@ -0,0 +1 @@
+ """Modules related to PDBe (Protein Data Bank in Europe)."""
protein_quest/pdbe/fetch.py ADDED
@@ -0,0 +1,51 @@
+ """Module for fetching structures from PDBe."""
+
+ from collections.abc import Iterable, Mapping
+ from pathlib import Path
+
+ from protein_quest.utils import retrieve_files
+
+
+ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
+     """
+     Map a PDB id to a gzipped mmCIF download URL and filename.
+
+     For example for PDB id "8WAS", the url will be
+     "https://www.ebi.ac.uk/pdbe/entry-files/download/8was.cif.gz" and the file will be "8was.cif.gz".
+
+     Args:
+         pdb_id: The PDB ID to map.
+
+     Returns:
+         A tuple containing the URL to download the mmCIF file and the filename.
+     """
+     fn = f"{pdb_id.lower()}.cif.gz"
+     # On PDBe you can sometimes download an updated mmCIF file.
+     # The current url is for the archive mmCIF file.
+     # TODO check if archive is OK, or if we should try to download the updated file
+     # this will cause many more requests, so we should only do this if needed
+     url = f"https://www.ebi.ac.uk/pdbe/entry-files/download/{fn}"
+     return url, fn
+
+
+ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+     """Fetches mmCIF files from the PDBe database.
+
+     Args:
+         ids: A set of PDB IDs to fetch.
+         save_dir: The directory to save the fetched mmCIF files to.
+         max_parallel_downloads: The maximum number of parallel downloads.
+
+     Returns:
+         A dict of ids and paths to the downloaded mmCIF files.
+     """
+
+     # The downloaded results are in a different order than the input ids,
+     # so we need to map the ids to the urls and filenames ourselves.
+
+     id2urls = {pdb_id: _map_id_mmcif(pdb_id) for pdb_id in ids}
+     urls = list(id2urls.values())
+     id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}
+
+     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+     return id2paths
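
Finally, a minimal usage sketch for the PDBe `fetch` coroutine above, using the `8WAS` example id from `_map_id_mmcif`. As with the EMDB module, `retrieve_files` is not shown in this diff, so the save directory is created up front as a precaution.

```python
import asyncio
from pathlib import Path

from protein_quest.pdbe.fetch import fetch


async def main() -> None:
    save_dir = Path("mmcif")
    save_dir.mkdir(parents=True, exist_ok=True)  # precaution; retrieve_files is not in this diff
    id2paths = await fetch(["8WAS"], save_dir)
    print(id2paths)  # e.g. {'8WAS': PosixPath('mmcif/8was.cif.gz')}


asyncio.run(main())
```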