protein-quest 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of protein-quest might be problematic.
- protein_quest/__version__.py +2 -1
- protein_quest/alphafold/confidence.py +2 -2
- protein_quest/alphafold/entry_summary.py +11 -9
- protein_quest/alphafold/fetch.py +37 -61
- protein_quest/cli.py +35 -18
- protein_quest/filters.py +43 -32
- protein_quest/mcp_server.py +4 -5
- protein_quest/parallel.py +37 -1
- protein_quest/pdbe/fetch.py +15 -1
- protein_quest/pdbe/io.py +25 -10
- protein_quest/taxonomy.py +12 -0
- protein_quest/utils.py +38 -3
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/METADATA +4 -8
- protein_quest-0.3.1.dist-info/RECORD +24 -0
- protein_quest-0.3.0.dist-info/RECORD +0 -24
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
@@ -1 +1,2 @@
-__version__ = "0.3.0"
+__version__ = "0.3.1"
+"""The version of the package."""
protein_quest/alphafold/confidence.py
CHANGED
@@ -98,7 +98,7 @@ class ConfidenceFilterResult:
 def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
-    """Filter a single AlphaFoldDB structure file based on confidence.
+    """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

     Args:
         file: The path to the PDB file to filter.
@@ -107,7 +107,7 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d

     Returns:
         result with filtered_file property set to Path where filtered PDB file is saved.
-
+            or None if structure was filtered out.
     """
     structure = gemmi.read_structure(str(file))
     residues = set(find_high_confidence_residues(structure, query.confidence))
protein_quest/alphafold/entry_summary.py
CHANGED
@@ -1,12 +1,14 @@
 # ruff: noqa: N815 allow camelCase follow what api returns
 from dataclasses import dataclass

+from yarl import URL
+

 @dataclass
 class EntrySummary:
     """Dataclass representing a summary of an AlphaFold entry.

-    Modelled after EntrySummary in https://alphafold.ebi.ac.uk/api/openapi.json
+    Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
     """

     entryId: str
@@ -21,17 +23,17 @@ class EntrySummary:
     modelCreatedDate: str
     latestVersion: int
     allVersions: list[int]
-    bcifUrl:
-    cifUrl:
-    pdbUrl:
-    paeImageUrl:
-    paeDocUrl:
+    bcifUrl: URL
+    cifUrl: URL
+    pdbUrl: URL
+    paeImageUrl: URL
+    paeDocUrl: URL
     gene: str | None = None
     sequenceChecksum: str | None = None
     sequenceVersionDate: str | None = None
-    amAnnotationsUrl:
-    amAnnotationsHg19Url:
-    amAnnotationsHg38Url:
+    amAnnotationsUrl: URL | None = None
+    amAnnotationsHg19Url: URL | None = None
+    amAnnotationsHg38Url: URL | None = None
     isReviewed: bool | None = None
     isReferenceProteome: bool | None = None
     # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
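EntrySummary now types its download links as `yarl.URL`, so any cattrs converter that structures the prediction JSON needs a structure hook for `URL` (the fetch module registers exactly this hook, as the next diff shows). A minimal sketch of that pattern, using a trimmed-down, hypothetical `Summary` stand-in rather than the real `EntrySummary` and a made-up entry:

```python
# Sketch only: `Summary` is a hypothetical stand-in for EntrySummary.
from dataclasses import dataclass

from cattrs.preconf.orjson import make_converter
from yarl import URL


@dataclass
class Summary:
    entryId: str
    cifUrl: URL


converter = make_converter()
# Without this hook cattrs cannot build a yarl.URL from the JSON string.
converter.register_structure_hook(URL, lambda v, _: URL(v))

doc = b'[{"entryId": "AF-P12345-F1", "cifUrl": "https://alphafold.ebi.ac.uk/files/AF-P12345-F1-model_v4.cif"}]'
summaries = converter.loads(doc, list[Summary])
print(summaries[0].cifUrl.name)  # AF-P12345-F1-model_v4.cif
```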
protein_quest/alphafold/fetch.py
CHANGED
@@ -1,26 +1,28 @@
 """Module for fetch Alphafold data."""

-import asyncio
 import logging
 from asyncio import Semaphore
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from
-from typing import Literal
+from typing import Literal, cast, get_args

 from aiohttp_retry import RetryClient
 from aiopath import AsyncPath
 from cattrs.preconf.orjson import make_converter
 from tqdm.asyncio import tqdm
+from yarl import URL

 from protein_quest.alphafold.entry_summary import EntrySummary
-from protein_quest.utils import friendly_session, retrieve_files
+from protein_quest.utils import friendly_session, retrieve_files, run_async

 logger = logging.getLogger(__name__)
 converter = make_converter()
+"""cattrs converter to read AlphaFold summary JSON document."""
+converter.register_structure_hook(URL, lambda v, _: URL(v))

 DownloadableFormat = Literal[
+    "summary",
     "bcif",
     "cif",
     "pdb",
@@ -32,16 +34,7 @@ DownloadableFormat = Literal[
 ]
 """Types of formats that can be downloaded from the AlphaFold web service."""

-downloadable_formats: set[DownloadableFormat] =
-    "bcif",
-    "cif",
-    "pdb",
-    "paeImage",
-    "paeDoc",
-    "amAnnotations",
-    "amAnnotationsHg19",
-    "amAnnotationsHg38",
-}
+downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
 """Set of formats that can be downloaded from the AlphaFold web service."""

@@ -59,6 +52,7 @@ class AlphaFoldEntry:

     uniprot_acc: str
     summary: EntrySummary | None
+    summary_file: Path | None = None
     bcif_file: Path | None = None
     cif_file: Path | None = None
     pdb_file: Path | None = None
@@ -127,10 +121,6 @@ async def fetch_summary(

     Returns:
         A list of EntrySummary objects representing the fetched summary.
-
-    Raises:
-        HTTPError: If the HTTP request returns an error status code.
-        Exception: If there is an error during file reading/writing or data conversion.
     """
     url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
     fn: AsyncPath | None = None
@@ -144,6 +134,7 @@ async def fetch_summary(
         response.raise_for_status()
         raw_data = await response.content.read()
         if fn is not None:
+            # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
             await fn.write_bytes(raw_data)
         return converter.loads(raw_data, list[EntrySummary])

@@ -164,19 +155,14 @@ async def fetch_summaries(
             yield summary


-def url2name(url: str) -> str:
-    """Given a URL, return the final path component as the name of the file."""
-    return url.split("/")[-1]
-
-
 async def fetch_many_async(
-
+    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> AsyncGenerator[AlphaFoldEntry]:
-    """Asynchronously fetches summaries and
+    """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).

     Args:
-
+        uniprot_accessions: A set of Uniprot acessions to fetch.
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
@@ -184,7 +170,13 @@ async def fetch_many_async(
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
-
+    save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+    summaries = [
+        s
+        async for s in fetch_summaries(
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+        )
+    ]

     files = files_to_download(what, summaries)

@@ -198,30 +190,31 @@ async def fetch_many_async(
         yield AlphaFoldEntry(
             uniprot_acc=summary.uniprotAccession,
             summary=summary,
-
-
-
-
-
+            summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
+            bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
+            cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
+            pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
+            pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
+            pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
             am_annotations_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsUrl.name
                 if "amAnnotations" in what and summary.amAnnotationsUrl
                 else None
             ),
             am_annotations_hg19_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg19Url.name
                 if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
                 else None
             ),
             am_annotations_hg38_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg38Url.name
                 if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
                 else None
             ),
         )


-def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[
+def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
     if not (set(what) <= downloadable_formats):
         msg = (
             f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -229,24 +222,21 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
         )
         raise ValueError(msg)

-    files: set[tuple[
+    files: set[tuple[URL, str]] = set()
     for summary in summaries:
         for fmt in what:
-
+            if fmt == "summary":
+                # summary is handled already in fetch_summary
+                continue
+            url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
             if url is None:
                 logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
                 continue
-            file = (url,
+            file = (url, url.name)
             files.add(file)
     return files


-class NestedAsyncIOLoopError(RuntimeError):
-    """Custom error for nested async I/O loops."""
-
-    pass
-
-
 def fetch_many(
     ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> list[AlphaFoldEntry]:
@@ -260,9 +250,6 @@ def fetch_many(

     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
-
-    Raises:
-        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
     """

     async def gather_entries():
@@ -271,19 +258,7 @@ def fetch_many(
             async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
         ]

-
-        return asyncio.run(gather_entries())
-    except RuntimeError as e:
-        msg = dedent("""\
-            Can not run async method from an environment where the asyncio event loop is already running.
-            Like a Jupyter notebook.
-
-            Please use the `fetch_many_async` function directly or before call
-
-            import nest_asyncio
-            nest_asyncio.apply()
-            """)
-        raise NestedAsyncIOLoopError(msg) from e
+    return run_async(gather_entries())


 def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
@@ -299,6 +274,7 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
     return AlphaFoldEntry(
         uniprot_acc=entry.uniprot_acc,
         summary=entry.summary,
+        summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
         bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
         cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
         pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
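With `summary` added as a downloadable format and the nest-asyncio handling moved into `protein_quest.utils.run_async`, a synchronous caller can use `fetch_many` roughly as below. This is a usage sketch based only on the signatures in the diff; the accession and directory are placeholders. Inside an already-running event loop (for example a Jupyter cell) `run_async` raises `NestedAsyncIOLoopError`, so use `fetch_many_async` there instead.

```python
# Sketch: download summary JSON and mmCIF files for one UniProt accession.
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

entries = fetch_many(
    ["P12345"],               # placeholder UniProt accession
    Path("af_downloads"),
    what={"summary", "cif"},  # "summary" also keeps the prediction JSON on disk
    max_parallel_downloads=5,
)
for entry in entries:
    # summary_file/cif_file are only set for the formats that were requested.
    print(entry.uniprot_acc, entry.summary_file, entry.cif_file)
```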
protein_quest/cli.py
CHANGED
@@ -5,7 +5,8 @@ import asyncio
 import csv
 import logging
 import os
-
+import sys
+from collections.abc import Callable, Generator, Iterable
 from importlib.util import find_spec
 from io import TextIOWrapper
 from pathlib import Path
@@ -14,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm

@@ -25,7 +27,7 @@ from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
 from protein_quest.pdbe import fetch as pdbe_fetch
-from protein_quest.pdbe.io import glob_structure_files
+from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
 from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot

@@ -246,12 +248,12 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded AlphaFold files")
     parser.add_argument(
-        "--what-
+        "--what-formats",
         type=str,
         action="append",
         choices=sorted(downloadable_formats),
         help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
-            Default is '
+            Default is 'summary' and 'cif'."""),
     )
     parser.add_argument(
         "--max-parallel-downloads",
@@ -585,17 +587,17 @@ def _handle_retrieve_pdbe(args):

 def _handle_retrieve_alphafold(args):
     download_dir = args.output_dir
-
+    what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads

-    if
-
+    if what_formats is None:
+        what_formats = {"summary", "cif"}

     # TODO besides `uniprot_acc,af_id\n` csv also allow headless single column format
     #
-    af_ids =
-    validated_what: set[DownloadableFormat] = structure(
+    af_ids = _read_column_from_csv(alphafold_csv, "af_id")
+    validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
     afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
     total_nr_files = sum(af.nr_of_files() for af in afs)
@@ -658,12 +660,32 @@ def _handle_filter_chain(args):
     pdb_id2chain_mapping_file = args.chains
     scheduler_address = args.scheduler_address

+    # make sure files in input dir with entries in mapping file are the same
+    # complain when files from mapping file are missing on disk
     rows = list(_iter_csv_rows(pdb_id2chain_mapping_file))
-
+    file2chain: set[tuple[Path, str]] = set()
+    errors: list[FileNotFoundError] = []

-
+    for row in rows:
+        pdb_id = row["pdb_id"]
+        chain = row["chain"]
+        try:
+            f = locate_structure_file(input_dir, pdb_id)
+            file2chain.add((f, chain))
+        except FileNotFoundError as e:
+            errors.append(e)

-
+    if errors:
+        msg = f"Some structure files could not be found ({len(errors)} missing), skipping them"
+        rprint(Panel(os.linesep.join(map(str, errors)), title=msg, style="red"))
+
+    if not file2chain:
+        rprint("[red]No valid structure files found. Exiting.")
+        sys.exit(1)
+
+    results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
+
+    nr_written = len([r for r in results if r.passed])

     rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

@@ -768,12 +790,7 @@ def _write_dict_of_sets2csv(file: TextIOWrapper, data: dict[str, set[str]], ref_
         writer.writerow({"uniprot_acc": uniprot_acc, ref_id_field: ref_id})


-def
-    reader = csv.DictReader(file)
-    yield from reader
-
-
-def _iter_csv_rows(file: TextIOWrapper):
+def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
     reader = csv.DictReader(file)
     yield from reader

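The handler now validates the requested formats by structuring them into `set[DownloadableFormat]` with cattrs, so an unknown format fails before any download starts. A small sketch of that validation step (the invalid value is deliberate):

```python
# Sketch of the cattrs-based validation used by _handle_retrieve_alphafold.
from cattrs import structure

from protein_quest.alphafold.fetch import DownloadableFormat

what_formats = {"summary", "cif"}  # the new default when --what-formats is omitted
validated = structure(what_formats, set[DownloadableFormat])
print(validated)

try:
    structure({"cif", "bogus"}, set[DownloadableFormat])
except Exception as exc:
    print(f"rejected invalid format: {exc}")
```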
protein_quest/filters.py
CHANGED
@@ -1,19 +1,17 @@
 """Module for filtering structure files and their contents."""

 import logging
-from collections.abc import Generator
+from collections.abc import Collection, Generator
 from dataclasses import dataclass
 from pathlib import Path
 from shutil import copyfile
-from typing import cast

-from dask.distributed import Client
+from dask.distributed import Client
 from distributed.deploy.cluster import Cluster
 from tqdm.auto import tqdm

-from protein_quest.parallel import configure_dask_scheduler
+from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
 from protein_quest.pdbe.io import (
-    locate_structure_file,
     nr_residues_in_chain,
     write_single_chain_pdb_file,
 )
@@ -21,25 +19,48 @@ from protein_quest.pdbe.io import (
 logger = logging.getLogger(__name__)


+@dataclass
+class ChainFilterStatistics:
+    input_file: Path
+    chain_id: str
+    passed: bool = False
+    output_file: Path | None = None
+    discard_reason: Exception | None = None
+
+
+def filter_file_on_chain(
+    file_and_chain: tuple[Path, str], output_dir: Path, out_chain: str = "A"
+) -> ChainFilterStatistics:
+    input_file, chain_id = file_and_chain
+    try:
+        output_file = write_single_chain_pdb_file(input_file, chain_id, output_dir, out_chain=out_chain)
+        return ChainFilterStatistics(
+            input_file=input_file,
+            chain_id=chain_id,
+            output_file=output_file,
+            passed=True,
+        )
+    except Exception as e:  # noqa: BLE001 - error is handled downstream
+        return ChainFilterStatistics(input_file=input_file, chain_id=chain_id, discard_reason=e)
+
+
 def filter_files_on_chain(
-
-    id2chains: dict[str, str],
+    file2chains: Collection[tuple[Path, str]],
     output_dir: Path,
-    scheduler_address: str | Cluster | None = None,
     out_chain: str = "A",
-
+    scheduler_address: str | Cluster | None = None,
+) -> list[ChainFilterStatistics]:
     """Filter mmcif/PDB files by chain.

     Args:
-
-
+        file2chains: Which chain to keep for each PDB file.
+            First item is the PDB file path, second item is the chain ID.
         output_dir: The directory where the filtered files will be written.
-        scheduler_address: The address of the Dask scheduler.
         out_chain: Under what name to write the kept chain.
+        scheduler_address: The address of the Dask scheduler.

     Returns:
-
-        Last tuple item is None if something went wrong like chain not present.
+        Result of the filtering process.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     scheduler_address = configure_dask_scheduler(
@@ -47,24 +68,14 @@ def filter_files_on_chain(
         name="filter-chain",
     )

-    def task(id2chain: tuple[str, str]) -> tuple[str, str, Path | None]:
-        pdb_id, chain = id2chain
-        input_file = locate_structure_file(input_dir, pdb_id)
-        return pdb_id, chain, write_single_chain_pdb_file(input_file, chain, output_dir, out_chain=out_chain)
-
     with Client(scheduler_address) as client:
-
-
-
-
-        progress(futures)
-
-        results = client.gather(futures)
-        return cast("list[tuple[str,str, Path | None]]", results)
+        return dask_map_with_progress(
+            client, filter_file_on_chain, file2chains, output_dir=output_dir, out_chain=out_chain
+        )


 @dataclass
-class
+class ResidueFilterStatistics:
     """Statistics for filtering files based on residue count in a specific chain.

     Parameters:
@@ -82,7 +93,7 @@ class FilterStat:

 def filter_files_on_residues(
     input_files: list[Path], output_dir: Path, min_residues: int, max_residues: int, chain: str = "A"
-) -> Generator[
+) -> Generator[ResidueFilterStatistics]:
     """Filter PDB/mmCIF files by number of residues in given chain.

     Args:
@@ -93,7 +104,7 @@ def filter_files_on_residues(
         chain: The chain to count residues of.

     Yields:
-
+        Objects containing information about the filtering process for each input file.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     for input_file in tqdm(input_files, unit="file"):
@@ -102,6 +113,6 @@ def filter_files_on_residues(
         if passed:
             output_file = output_dir / input_file.name
             copyfile(input_file, output_file)
-            yield
+            yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
         else:
-            yield
+            yield ResidueFilterStatistics(input_file, residue_count, False, None)
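`filter_files_on_chain` now takes (file, chain) pairs instead of a PDB-id-to-chain mapping plus an input directory, and it reports one `ChainFilterStatistics` per pair instead of bare tuples. A hedged usage sketch with placeholder paths:

```python
# Sketch: keep chain B of one file and chain C of another, writing each as chain A.
from pathlib import Path

from protein_quest.filters import filter_files_on_chain

file2chains = {
    (Path("downloads/1abc.cif"), "B"),  # placeholder files and chains
    (Path("downloads/2xyz.cif"), "C"),
}
results = filter_files_on_chain(file2chains, Path("single_chain"))
for r in results:
    if r.passed:
        print(f"{r.input_file} chain {r.chain_id} -> {r.output_file}")
    else:
        print(f"{r.input_file} skipped: {r.discard_reason}")
```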
protein_quest/mcp_server.py
CHANGED
@@ -24,12 +24,11 @@ npx @modelcontextprotocol/inspector
 # Choose STDIO
 # command: uv run protein-quest mcp
 # id: protein-quest
-# Prompt: What are the PDBe structures for `A8MT69` uniprot accession?
 ```

 Examples:

-
+- What are the PDBe structures for `A8MT69` uniprot accession?

 """

@@ -90,7 +89,7 @@ def extract_single_chain_from_structure(
     chain2keep: str,
     output_dir: Path,
     out_chain: str = "A",
-) -> Path
+) -> Path:
     """
     Extract a single chain from a mmCIF/pdb file and write to a new file.

@@ -101,7 +100,7 @@ def extract_single_chain_from_structure(
         out_chain: The chain identifier for the output file.

     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output mmCIF/pdb file
     """
     return write_single_chain_pdb_file(input_file, chain2keep, output_dir, out_chain)

@@ -150,7 +149,7 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     Returns:
         A list of AlphaFold entries.
     """
-    what: set[DownloadableFormat] = {"cif"}
+    what: set[DownloadableFormat] = {"summary", "cif"}
     return alphafold_fetch(uniprot_accs, save_dir, what)

protein_quest/parallel.py
CHANGED
@@ -2,8 +2,10 @@

 import logging
 import os
+from collections.abc import Callable, Collection
+from typing import Concatenate, ParamSpec, cast

-from dask.distributed import LocalCluster
+from dask.distributed import Client, LocalCluster, progress
 from distributed.deploy.cluster import Cluster
 from psutil import cpu_count

@@ -66,3 +68,37 @@ def _configure_cpu_dask_scheduler(nproc: int, name: str) -> LocalCluster:
     n_workers = total_cpus // nproc
     # Use single thread per worker to prevent GIL slowing down the computations
     return LocalCluster(name=name, threads_per_worker=1, n_workers=n_workers)
+
+
+# Generic type parameters used across helpers
+P = ParamSpec("P")
+
+
+def dask_map_with_progress[T, R, **P](
+    client: Client,
+    func: Callable[Concatenate[T, P], R],
+    iterable: Collection[T],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> list[R]:
+    """
+    Wrapper for map, progress, and gather of Dask that returns a correctly typed list.
+
+    Args:
+        client: Dask client.
+        func: Function to map; first parameter comes from ``iterable`` and any
+            additional parameters can be provided positionally via ``*args`` or
+            as keyword arguments via ``**kwargs``.
+        iterable: Collection of arguments to map over.
+        *args: Additional positional arguments to pass to client.map().
+        **kwargs: Additional keyword arguments to pass to client.map().
+
+    Returns:
+        List of results of type returned by `func` function.
+    """
+    if client.dashboard_link:
+        logger.info(f"Follow progress on dask dashboard at: {client.dashboard_link}")
+    futures = client.map(func, iterable, *args, **kwargs)
+    progress(futures)
+    results = client.gather(futures)
+    return cast("list[R]", results)
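`dask_map_with_progress` factors out the map/progress/gather pattern that `filter_files_on_chain` previously inlined; extra keyword arguments are forwarded through `client.map()` to the mapped function. A minimal sketch with a throwaway local cluster and a trivial function (cluster size and the `add` function are arbitrary):

```python
# Sketch: map a simple function over a list with a local Dask cluster.
from dask.distributed import Client, LocalCluster

from protein_quest.parallel import dask_map_with_progress


def add(x: int, offset: int = 0) -> int:
    return x + offset


if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        # offset=10 is forwarded to every add() call via client.map().
        results = dask_map_with_progress(client, add, [1, 2, 3], offset=10)
        print(results)  # [11, 12, 13]
```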
protein_quest/pdbe/fetch.py
CHANGED
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path

-from protein_quest.utils import retrieve_files
+from protein_quest.utils import retrieve_files, run_async


 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -49,3 +49,17 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int

     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
     return id2paths
+
+
+def sync_fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+    """Synchronously fetches mmCIF files from the PDBe database.
+
+    Args:
+        ids: A set of PDB IDs to fetch.
+        save_dir: The directory to save the fetched mmCIF files to.
+        max_parallel_downloads: The maximum number of parallel downloads.
+
+    Returns:
+        A dict of id and paths to the downloaded mmCIF files.
+    """
+    return run_async(fetch(ids, save_dir, max_parallel_downloads))
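`sync_fetch` wraps the existing async `fetch` in `run_async`, so scripts can download PDBe mmCIF files without touching asyncio. A sketch with example PDB IDs:

```python
# Sketch: fetch two PDBe entries synchronously.
from pathlib import Path

from protein_quest.pdbe.fetch import sync_fetch

id2path = sync_fetch(["1ubq", "4hhb"], Path("mmcif"), max_parallel_downloads=2)
for pdb_id, path in id2path.items():
    print(pdb_id, path)
```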
protein_quest/pdbe/io.py
CHANGED
@@ -11,6 +11,11 @@ from protein_quest import __version__

 logger = logging.getLogger(__name__)

+# TODO remove once v0.7.4 of gemmi is released,
+# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
+# Swallow gemmi leaked function warnings
+gemmi.set_leak_warnings(False)
+

 def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
     """Returns the number of residues in a specific chain from a mmCIF/pdb file.
@@ -131,9 +136,16 @@ def glob_structure_files(input_dir: Path) -> Generator[Path]:
         yield from input_dir.glob(f"*{ext}")


-
-
-
+class ChainNotFoundError(IndexError):
+    """Exception raised when a chain is not found in a structure."""
+
+    def __init__(self, chain: str, file: Path | str):
+        super().__init__(f"Chain {chain} not found in {file}")
+        self.chain_id = chain
+        self.file = file
+
+
+def write_single_chain_pdb_file(input_file: Path, chain2keep: str, output_dir: Path, out_chain: str = "A") -> Path:
     """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.

     Args:
@@ -143,7 +155,11 @@ def write_single_chain_pdb_file(
         out_chain: The chain identifier for the output file.

     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output mmCIF/pdb file
+
+    Raises:
+        FileNotFoundError: If the input file does not exist.
+        ChainNotFoundError: If the specified chain is not found in the input file.
     """

     structure = gemmi.read_structure(str(input_file))
@@ -154,15 +170,14 @@ def write_single_chain_pdb_file(

     chain = find_chain_in_model(model, chain2keep)
     if chain is None:
-
-            "Chain %s not found in %s. Skipping.",
-            chain2keep,
-            input_file,
-        )
-        return None
+        raise ChainNotFoundError(chain2keep, input_file)
     name, extension = _split_name_and_extension(input_file.name)
     output_file = output_dir / f"{name}_{chain.name}2{out_chain}{extension}"

+    if output_file.exists():
+        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
+        return output_file
+
     new_structure = gemmi.Structure()
     new_structure.resolution = structure.resolution
     new_id = structure.name + f"{chain2keep}2{out_chain}"
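`write_single_chain_pdb_file` no longer returns `None` when the chain is missing; it raises `ChainNotFoundError`, and it returns the already-existing output path instead of rewriting it. A sketch of the new error-based contract, with a placeholder input file:

```python
# Sketch: extract chain B from a structure file, handling the new exceptions.
from pathlib import Path

from protein_quest.pdbe.io import ChainNotFoundError, write_single_chain_pdb_file

try:
    out = write_single_chain_pdb_file(Path("downloads/1abc.cif"), "B", Path("single_chain"))
    print(f"wrote {out}")
except FileNotFoundError as e:
    print(f"input file missing: {e}")
except ChainNotFoundError as e:
    # The exception carries chain_id and file for reporting.
    print(f"chain {e.chain_id} not found in {e.file}")
```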
protein_quest/taxonomy.py
CHANGED
@@ -20,6 +20,16 @@ logger = logging.getLogger(__name__)

 @dataclass(frozen=True, slots=True)
 class Taxon:
+    """Dataclass representing a taxon.
+
+    Arguments:
+        taxon_id: The unique identifier for the taxon.
+        scientific_name: The scientific name of the taxon.
+        rank: The taxonomic rank of the taxon (e.g., species, genus).
+        common_name: The common name of the taxon (if available).
+        other_names: A set of other names for the taxon (if available).
+    """
+
     taxon_id: str
     scientific_name: str
     rank: str
@@ -47,7 +57,9 @@ converter.register_structure_hook(
 )

 SearchField = Literal["tax_id", "scientific", "common", "parent"]
+"""Type of search field"""
 search_fields: set[SearchField | None] = set(get_args(SearchField)) | {None}
+"""Set of valid search fields"""


 def _get_next_page(response: ClientResponse) -> URL | str | None:
protein_quest/utils.py
CHANGED
@@ -2,20 +2,23 @@

 import asyncio
 import logging
-from collections.abc import Iterable
+from collections.abc import Coroutine, Iterable
 from contextlib import asynccontextmanager
 from pathlib import Path
+from textwrap import dedent
+from typing import Any

 import aiofiles
 import aiohttp
 from aiohttp_retry import ExponentialRetry, RetryClient
 from tqdm.asyncio import tqdm
+from yarl import URL

 logger = logging.getLogger(__name__)


 async def retrieve_files(
-    urls: Iterable[tuple[str, str]],
+    urls: Iterable[tuple[URL | str, str]],
     save_dir: Path,
     max_parallel_downloads: int = 5,
     retries: int = 3,
@@ -45,7 +48,7 @@ async def retrieve_files(

 async def _retrieve_file(
     session: RetryClient,
-    url: str,
+    url: URL | str,
     save_path: Path,
     semaphore: asyncio.Semaphore,
     ovewrite: bool = False,
@@ -103,3 +106,35 @@ async def friendly_session(retries: int = 3, total_timeout: int = 300):
     async with aiohttp.ClientSession(timeout=timeout) as session:
         client = RetryClient(client_session=session, retry_options=retry_options)
         yield client
+
+
+class NestedAsyncIOLoopError(RuntimeError):
+    """Custom error for nested async I/O loops."""
+
+    def __init__(self) -> None:
+        msg = dedent("""\
+            Can not run async method from an environment where the asyncio event loop is already running.
+            Like a Jupyter notebook.
+
+            Please use the async function directly or
+            call `import nest_asyncio; nest_asyncio.apply()` and try again.
+            """)
+        super().__init__(msg)
+
+
+def run_async[R](coroutine: Coroutine[Any, Any, R]) -> R:
+    """Run an async coroutine with nicer error.
+
+    Args:
+        coroutine: The async coroutine to run.
+
+    Returns:
+        The result of the coroutine.
+
+    Raises:
+        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
+    """
+    try:
+        return asyncio.run(coroutine)
+    except RuntimeError as e:
+        raise NestedAsyncIOLoopError from e
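`run_async` is a thin wrapper around `asyncio.run` that turns the "event loop already running" failure into `NestedAsyncIOLoopError`; `fetch_many` and `sync_fetch` are built on it. A minimal sketch with an arbitrary coroutine:

```python
# Sketch: run a coroutine synchronously with the friendlier error.
import asyncio

from protein_quest.utils import NestedAsyncIOLoopError, run_async


async def double(x: int) -> int:
    await asyncio.sleep(0)
    return 2 * x


try:
    print(run_async(double(21)))  # 42
except NestedAsyncIOLoopError:
    # Raised inside an already-running event loop (e.g. a Jupyter cell);
    # there, await the coroutine directly or apply nest_asyncio first.
    print("already inside an event loop")
```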
{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.3.0
+Version: 0.3.1
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -13,19 +13,16 @@ Requires-Dist: aiohttp-retry>=2.9.1
 Requires-Dist: aiohttp[speedups]>=3.11.18
 Requires-Dist: aiopath>=0.7.7
 Requires-Dist: attrs>=25.3.0
-Requires-Dist: bokeh>=3.7.3
 Requires-Dist: cattrs[orjson]>=24.1.3
 Requires-Dist: dask>=2025.5.1
 Requires-Dist: distributed>=2025.5.1
 Requires-Dist: gemmi>=0.7.3
-Requires-Dist: molviewspec>=1.6.0
-Requires-Dist: pandas>=2.3.0
-Requires-Dist: platformdirs>=4.3.8
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: rich-argparse>=1.7.1
 Requires-Dist: rich>=14.0.0
 Requires-Dist: sparqlwrapper>=2.0.0
 Requires-Dist: tqdm>=4.67.1
+Requires-Dist: yarl>=1.20.1
 Provides-Extra: mcp
 Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
 Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
@@ -37,8 +34,7 @@ Description-Content-Type: text/markdown
 [](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml)
 [](https://www.research-software.nl/software/protein-quest)
 [](https://pypi.org/project/protein-quest/)
-
-[](https://doi.org/10.5281/zenodo.15632658)
+[](https://doi.org/10.5281/zenodo.16941288)
 [](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)

 Python package to search/retrieve/filter proteins and protein structures.
@@ -90,7 +86,7 @@ pip install git+https://github.com/haddocking/protein-quest.git

 The main entry point is the `protein-quest` command line tool which has multiple subcommands to perform actions.

-To use programmaticly, see [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
+To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).

 ### Search Uniprot accessions

protein_quest-0.3.1.dist-info/RECORD
ADDED
@@ -0,0 +1,24 @@
+protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/__version__.py,sha256=Bu2gp24I4eIxc1qgY2e0PnF8N-szjUpFQwVAe10IRAo,56
+protein_quest/cli.py,sha256=xjiWtRDqv-Ruv1fpvXq4dmDSuuyewxw81akDs1ktVbI,31772
+protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
+protein_quest/filters.py,sha256=3vqfFH87Lz7r9uYiSvwMxzShMfRNv1Zv_freJtDljrU,4051
+protein_quest/go.py,sha256=ycV3-grxuIKFt28bFgH6iRKmt5AEGi7txoTbaAnBxQE,5684
+protein_quest/mcp_server.py,sha256=1_CGC0peqoNUFBvgFWupKwIWjmHsKxN5Vxy1K7dt5Dw,7130
+protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
+protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/taxonomy.py,sha256=wPzLjum5n_SEkL2rHUKvyRnjL1pG7bhEnE2vMmXixEc,5105
+protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
+protein_quest/utils.py,sha256=YhlTJreIr1bExbh1M514l6sz4GmLVa3RN57mI1kjjuw,4730
+protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
+protein_quest/alphafold/confidence.py,sha256=GGd_vYsqVvs9InvFKtqHdGKB_61GHllPmDyIztvzG7E,5625
+protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
+protein_quest/alphafold/fetch.py,sha256=1mDbQNm01cxlwFNDsKHBWD7MEwzB3PaheskdaLN7XJs,11491
+protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
+protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
+protein_quest/pdbe/io.py,sha256=J6fHlRLHLALnpxDgSUUnFCNFV9Hr3u6eJDO6j81ftT4,6936
+protein_quest-0.3.1.dist-info/METADATA,sha256=fWvmMbm5aEMb3WbWgPAqwEOWeYJSY47iuZLaRIgBuuk,7305
+protein_quest-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+protein_quest-0.3.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+protein_quest-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+protein_quest-0.3.1.dist-info/RECORD,,

protein_quest-0.3.0.dist-info/RECORD
DELETED
@@ -1,24 +0,0 @@
-protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-protein_quest/__version__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
-protein_quest/cli.py,sha256=oyDin6Z92Q17mUmTCasKgju3YUJbPu298gniNakQUwY,31121
-protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
-protein_quest/filters.py,sha256=GNtM1N1S1mNUqAvX7OvyhOvnUWo4qx2hMneORbc-Qz8,3797
-protein_quest/go.py,sha256=ycV3-grxuIKFt28bFgH6iRKmt5AEGi7txoTbaAnBxQE,5684
-protein_quest/mcp_server.py,sha256=xIaOy6sY_gW5R_oMImI2yBmbBGtZZICOxXLzOkFmm-w,7197
-protein_quest/parallel.py,sha256=kCH6KCJYJZVoq0_Qz8ZLbHnf2OJG-h4uxd9oH2rLNKc,2201
-protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-protein_quest/taxonomy.py,sha256=kAKKZT_mOtmX8ZWNIE9i7emE23VEewkj12X7d_t3p2Y,4659
-protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
-protein_quest/utils.py,sha256=HUvqfsuMBIFOVFlb_QC2to_UQkiZ0_fwHLlckifuXss,3700
-protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
-protein_quest/alphafold/confidence.py,sha256=-lbwijzVMhRd98bxwFDbSi7idiUKJ5BpOsGFrvuTEnQ,5596
-protein_quest/alphafold/entry_summary.py,sha256=P-S8qrXkU-wwIccA1nGol1lfDkUW0Sg0th_3EU-WjN8,1187
-protein_quest/alphafold/fetch.py,sha256=eq__PfqisuUIQBUM8KVghpiEOBGF-zXWNC6Ll_Hlz2E,11828
-protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
-protein_quest/pdbe/fetch.py,sha256=iTyS4ucV2KZl4jTgrUFOZhsXs3cRUIuvmTbXNm_pY8U,1850
-protein_quest/pdbe/io.py,sha256=0ldsrIHKaaurrM2FfWXbqm1iRj3q6xw8-lptfYU1yEw,6231
-protein_quest-0.3.0.dist-info/METADATA,sha256=yiHZn4gDdwilbCoxrF0pCjVk04v_O5pwpwrtr6oPLrE,7369
-protein_quest-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-protein_quest-0.3.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
-protein_quest-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-protein_quest-0.3.0.dist-info/RECORD,,

{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/WHEEL
File without changes
{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/entry_points.txt
File without changes
{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/licenses/LICENSE
File without changes