protein-quest 0.7.0-py3-none-any.whl → 0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
protein_quest/__version__.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = "0.7.0"
+ __version__ = "0.8.0"
  """The version of the package."""
protein_quest/alphafold/fetch.py CHANGED
@@ -29,14 +29,19 @@ DownloadableFormat = Literal[
  "amAnnotations",
  "amAnnotationsHg19",
  "amAnnotationsHg38",
- "msaUrl",
- "plddtDocUrl",
+ "msa",
+ "plddtDoc",
  ]
  """Types of formats that can be downloaded from the AlphaFold web service."""

  downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
  """Set of formats that can be downloaded from the AlphaFold web service."""

+ UrlFileNamePair = tuple[URL, str]
+ """A tuple of a URL and a filename."""
+ UrlFileNamePairsOfFormats = dict[DownloadableFormat, UrlFileNamePair]
+ """A mapping of DownloadableFormat to UrlFileNamePair."""
+

  def _camel_to_snake_case(name: str) -> str:
  """Convert a camelCase string to snake_case."""
@@ -51,7 +56,7 @@ class AlphaFoldEntry:
  """

  uniprot_accession: str
- summary: EntrySummary
+ summary: EntrySummary | None = None
  summary_file: Path | None = None
  bcif_file: Path | None = None
  cif_file: Path | None = None
@@ -105,6 +110,35 @@ class AlphaFoldEntry:
  """
  return sum(1 for attr in vars(self) if attr.endswith("_file") and getattr(self, attr) is not None)

+ def relative_to(self, session_dir: Path) -> "AlphaFoldEntry":
+ """Convert paths in an AlphaFoldEntry to be relative to the session directory.
+
+ Args:
+ entry: An AlphaFoldEntry instance with absolute paths.
+ session_dir: The session directory to which the paths should be made relative.
+
+ Returns:
+ An AlphaFoldEntry instance with paths relative to the session directory.
+ """
+ return AlphaFoldEntry(
+ uniprot_accession=self.uniprot_accession,
+ summary=self.summary,
+ summary_file=self.summary_file.relative_to(session_dir) if self.summary_file else None,
+ bcif_file=self.bcif_file.relative_to(session_dir) if self.bcif_file else None,
+ cif_file=self.cif_file.relative_to(session_dir) if self.cif_file else None,
+ pdb_file=self.pdb_file.relative_to(session_dir) if self.pdb_file else None,
+ pae_doc_file=self.pae_doc_file.relative_to(session_dir) if self.pae_doc_file else None,
+ am_annotations_file=self.am_annotations_file.relative_to(session_dir) if self.am_annotations_file else None,
+ am_annotations_hg19_file=(
+ self.am_annotations_hg19_file.relative_to(session_dir) if self.am_annotations_hg19_file else None
+ ),
+ am_annotations_hg38_file=(
+ self.am_annotations_hg38_file.relative_to(session_dir) if self.am_annotations_hg38_file else None
+ ),
+ msa_file=self.msa_file.relative_to(session_dir) if self.msa_file else None,
+ plddt_doc_file=self.plddt_doc_file.relative_to(session_dir) if self.plddt_doc_file else None,
+ )
+

  async def fetch_summary(
  qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
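
The module-level `relative_to` helper (removed at the end of this file, see below) became a method on the dataclass, and `summary` is now optional, so an entry can exist without an API summary. A small sketch under those assumptions; the accession and filename are illustrative:

```python
from pathlib import Path
from protein_quest.alphafold.fetch import AlphaFoldEntry

session_dir = Path("downloads-af")
entry = AlphaFoldEntry(
    uniprot_accession="P05067",  # illustrative accession
    cif_file=session_dir / "AF-P05067-F1-model_v6.cif",
)
# 0.7.0: relative_to(entry, session_dir); 0.8.0: a method on the entry
rel = entry.relative_to(session_dir)
assert rel.cif_file == Path("AF-P05067-F1-model_v6.cif")
```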
@@ -170,32 +204,16 @@ async def fetch_summaries(
  yield qualifier, summary


- async def fetch_many_async(
+ async def _fetch_many_async_with_summary(
  uniprot_accessions: Iterable[str],
  save_dir: Path,
- what: set[DownloadableFormat],
+ formats: set[DownloadableFormat],
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
  all_isoforms: bool = False,
  ) -> AsyncGenerator[AlphaFoldEntry]:
- """Asynchronously fetches summaries and files from
- [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
-
- Args:
- uniprot_accessions: A set of Uniprot accessions to fetch.
- save_dir: The directory to save the fetched files to.
- what: A set of formats to download.
- max_parallel_downloads: The maximum number of parallel downloads.
- cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
- gzip_files: Whether to gzip the downloaded files.
- all_isoforms: Whether to yield all isoforms of each uniprot entry.
- When False then yields only the canonical sequence of uniprot entry.
-
- Yields:
- A dataclass containing the summary, pdb file, and pae file.
- """
- save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+ save_dir_for_summaries = save_dir if "summary" in formats else None

  summaries = [
  s
@@ -206,7 +224,7 @@ async def fetch_many_async(
  # O60481 is canonical and O60481-2 is isoform, so we skip the isoform
  if all_isoforms or s[0] == s[1].uniprotAccession
  ]
- files = files_to_download(what, summaries, gzip_files)
+ files = files_to_download(formats, summaries, gzip_files)

  await retrieve_files(
  files,
@@ -223,45 +241,45 @@ async def fetch_many_async(
  uniprot_accession=uniprot_accession,
  summary=summary,
  summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
- bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
- cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
- pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
- pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
+ bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in formats else None,
+ cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in formats else None,
+ pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in formats else None,
+ pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in formats else None,
  am_annotations_file=(
  save_dir / (summary.amAnnotationsUrl.name + gzext)
- if "amAnnotations" in what and summary.amAnnotationsUrl
+ if "amAnnotations" in formats and summary.amAnnotationsUrl
  else None
  ),
  am_annotations_hg19_file=(
  save_dir / (summary.amAnnotationsHg19Url.name + gzext)
- if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
+ if "amAnnotationsHg19" in formats and summary.amAnnotationsHg19Url
  else None
  ),
  am_annotations_hg38_file=(
  save_dir / (summary.amAnnotationsHg38Url.name + gzext)
- if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
+ if "amAnnotationsHg38" in formats and summary.amAnnotationsHg38Url
  else None
  ),
- msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msaUrl" in what and summary.msaUrl else None),
+ msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msa" in formats and summary.msaUrl else None),
  plddt_doc_file=(
- save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDocUrl" in what and summary.plddtDocUrl else None
+ save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDoc" in formats and summary.plddtDocUrl else None
  ),
  )


  def files_to_download(
- what: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
- ) -> set[tuple[URL, str]]:
- if not (set(what) <= downloadable_formats):
+ formats: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
+ ) -> set[UrlFileNamePair]:
+ if not (set(formats) <= downloadable_formats):
  msg = (
- f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
+ f"Invalid format(s) specified: {set(formats) - downloadable_formats}. "
  f"Valid formats are: {downloadable_formats}"
  )
  raise ValueError(msg)

- url_filename_pairs: set[tuple[URL, str]] = set()
+ url_filename_pairs: set[UrlFileNamePair] = set()
  for _, summary in summaries:
- for fmt in what:
+ for fmt in formats:
  if fmt == "summary":
  # summary is handled already in fetch_summary
  continue
@@ -275,26 +293,224 @@ def files_to_download(
  return url_filename_pairs


+ async def fetch_alphafold_db_version() -> str:
+ """Fetch the current version of the AlphaFold database.
+
+ Returns:
+ The current version of the AlphaFold database as a string. For example: "6".
+ """
+ url = "https://ftp.ebi.ac.uk/pub/databases/alphafold/accession_ids.csv"
+ headers = {"Range": "bytes=0-200"}
+ logger.debug(f"Detecting AlphaFold DB version from head of {url}")
+ async with friendly_session() as session, session.get(url, headers=headers) as response:
+ response.raise_for_status()
+ raw = await response.content.read(200)
+ text = raw.decode("utf-8")
+ first_line = text.splitlines()[1]
+ version = first_line.split(",")[-1]
+ logger.debug(f"Found current AlphaFold DB version is '{version}'")
+ return version
+
+
+ def _files_for_alphafold_entry(
+ uniprot_accession: str,
+ formats: set[DownloadableFormat],
+ db_version: str,
+ gzip_files: bool,
+ ) -> UrlFileNamePairsOfFormats:
+ templates: dict[DownloadableFormat, URL] = {
+ "bcif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.bcif"),
+ "cif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.cif"),
+ "pdb": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.pdb"),
+ "paeDoc": URL(
+ f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-predicted_aligned_error_v{db_version}.json"
+ ),
+ "amAnnotations": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-aa-substitutions.csv"),
+ "amAnnotationsHg19": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg19.csv"),
+ "amAnnotationsHg38": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg38.csv"),
+ "msa": URL(f"https://alphafold.ebi.ac.uk/files/msa/AF-{uniprot_accession}-F1-msa_v{db_version}.a3m"),
+ "plddtDoc": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-confidence_v{db_version}.json"),
+ }
+ url_filename_pairs = {}
+ for fmt in formats:
+ if fmt == "summary":
+ # Summaries are downloaded separately as its using API instead of static files
+ continue
+ if fmt not in templates:
+ logger.warning(f"No URL template found for format '{fmt}'. Skipping.")
+ continue
+ url = templates[cast("DownloadableFormat", fmt)]
+ fn = url.name
+ if gzip_files:
+ fn += ".gz"
+ url_filename_pair = (url, fn)
+ url_filename_pairs[fmt] = url_filename_pair
+ return url_filename_pairs
+
+
+ def files_for_alphafold_entries(
+ uniprot_accessions: Iterable[str],
+ formats: set[DownloadableFormat],
+ db_version: str,
+ gzip_files: bool,
+ ) -> dict[str, UrlFileNamePairsOfFormats]:
+ """Get the files to download for multiple AlphaFold entries.
+
+ Args:
+ uniprot_accessions: A set of Uniprot accessions.
+ formats: A set of formats to download.
+ db_version: The version of the AlphaFold database to use.
+ gzip_files: Whether to download gzipped files. Otherwise downloads uncompressed files.
+
+ Returns:
+ A mapping of Uniprot accession to a mapping of DownloadableFormat to UrlFileNamePair.
+ """
+ return {
+ uniprot_accession: _files_for_alphafold_entry(
+ uniprot_accession, formats=formats, db_version=db_version, gzip_files=gzip_files
+ )
+ for uniprot_accession in uniprot_accessions
+ }
+
+
+ async def _fetch_many_async_without_summary(
+ uniprot_accessions: Iterable[str],
+ save_dir: Path,
+ formats: set[DownloadableFormat],
+ db_version: str | None = None,
+ max_parallel_downloads: int = 5,
+ cacher: Cacher | None = None,
+ gzip_files: bool = False,
+ ) -> AsyncGenerator[AlphaFoldEntry]:
+ if db_version is None:
+ db_version = await fetch_alphafold_db_version()
+ nested_files = files_for_alphafold_entries(
+ uniprot_accessions, formats=formats, db_version=db_version, gzip_files=gzip_files
+ )
+ files: set[UrlFileNamePair] = set()
+ for uniprot_accession in uniprot_accessions:
+ files.update(nested_files[uniprot_accession].values())
+
+ retrieved_files = await retrieve_files(
+ files,
+ save_dir,
+ desc="Downloading AlphaFold files",
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ raise_for_not_found=False,
+ )
+
+ retrieved_files_set = set(retrieved_files)
+ for uniprot_accession in uniprot_accessions:
+ entry = AlphaFoldEntry(
+ uniprot_accession=uniprot_accession,
+ )
+
+ for af_format, url_filename_pair in nested_files[uniprot_accession].items():
+ _, filename = url_filename_pair
+ filepath = save_dir / filename
+ if filepath in retrieved_files_set:
+ attr = AlphaFoldEntry.format2attr(af_format)
+ setattr(entry, attr, filepath)
+ # else: File was not found (404) during download, so we leave the attribute as None
+
+ yield entry
+
+
+ def fetch_many_async(
+ uniprot_accessions: Iterable[str],
+ save_dir: Path,
+ formats: set[DownloadableFormat],
+ db_version: str | None = None,
+ max_parallel_downloads: int = 5,
+ cacher: Cacher | None = None,
+ gzip_files: bool = False,
+ all_isoforms: bool = False,
+ ) -> AsyncGenerator[AlphaFoldEntry]:
+ """Asynchronously fetches summaries and/or files from
+ [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
+
+ Args:
+ uniprot_accessions: A set of Uniprot accessions to fetch.
+ save_dir: The directory to save the fetched files to.
+ formats: A set of formats to download.
+ If `summary` is in the set then summaries will be fetched using the API endpoint.
+ and later the other files will be downloaded using static file URLs.
+ If `summary` is not in the set then all files will be downloaded using static file
+ URLs only.
+ db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
+ max_parallel_downloads: The maximum number of parallel downloads.
+ cacher: A cacher to use for caching the fetched files.
+ gzip_files: Whether to gzip the downloaded files.
+ Summaries are never gzipped.
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
+ When False then yields only the canonical sequence per uniprot entry.
+
+ Yields:
+ A dataclass containing the summary, pdb file, and pae file.
+
+ Raises:
+ ValueError: If 'formats' set is empty.
+ ValueError: If all_isoforms is True and 'summary' is not in 'formats' set.
+ """
+ if len(formats) == 0:
+ msg = "At least one format must be specified. The 'formats' argument is empty."
+ raise ValueError(msg)
+ if "summary" in formats:
+ if db_version is not None:
+ logger.warning("db_version is ignored when 'summary' is in 'formats' set. Always uses latest version.")
+ return _fetch_many_async_with_summary(
+ uniprot_accessions,
+ save_dir,
+ formats,
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ all_isoforms=all_isoforms,
+ )
+ if all_isoforms:
+ msg = "Cannot fetch all isoforms when 'summary' is not in 'formats' set."
+ raise ValueError(msg)
+ return _fetch_many_async_without_summary(
+ uniprot_accessions,
+ save_dir,
+ formats,
+ db_version=db_version,
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ )
+
+
  def fetch_many(
- ids: Iterable[str],
+ uniprot_accessions: Iterable[str],
  save_dir: Path,
- what: set[DownloadableFormat],
+ formats: set[DownloadableFormat],
+ db_version: str | None = None,
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
  all_isoforms: bool = False,
  ) -> list[AlphaFoldEntry]:
- """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
+ """Synchronously fetches summaries and/or files like cif from AlphaFold Protein Structure Database.

  Args:
- ids: A set of Uniprot IDs to fetch.
+ uniprot_accessions: A set of Uniprot accessions to fetch.
  save_dir: The directory to save the fetched files to.
- what: A set of formats to download.
+ formats: A set of formats to download.
+ If `summary` is in the set then summaries will be fetched using the API endpoint.
+ and later the other files will be downloaded using static file URLs.
+ If `summary` is not in the set then all files will be downloaded using static file
+ URLs only.
+ Excluding 'summary' is much faster as it avoids slow API calls.
+ db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
  max_parallel_downloads: The maximum number of parallel downloads.
- cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
+ cacher: A cacher to use for caching the fetched files.
  gzip_files: Whether to gzip the downloaded files.
- all_isoforms: Whether to return all isoforms of each uniprot entry.
- When False then returns only the canonical sequence of uniprot entry.
+ Summaries are never gzipped.
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
+ When False then yields only the canonical sequence per uniprot entry.

  Returns:
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
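
When `summary` is not requested, 0.8.0 plans downloads from static file URLs instead of the API. A sketch of that path using the two new public helpers shown above; the accession and format choices are illustrative:

```python
import asyncio

from protein_quest.alphafold.fetch import (
    fetch_alphafold_db_version,
    files_for_alphafold_entries,
)

# detect the current AlphaFold DB version from the head of accession_ids.csv
db_version = asyncio.run(fetch_alphafold_db_version())  # e.g. "6"

# plan (URL, filename) pairs per accession and format, without any API calls
plan = files_for_alphafold_entries(
    ["P05067"],  # illustrative accession
    formats={"cif", "paeDoc"},
    db_version=db_version,
    gzip_files=False,
)
print(plan["P05067"]["cif"])
```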
@@ -304,9 +520,10 @@ def fetch_many(
  return [
  entry
  async for entry in fetch_many_async(
- ids,
+ uniprot_accessions,
  save_dir,
- what,
+ formats,
+ db_version=db_version,
  max_parallel_downloads=max_parallel_downloads,
  cacher=cacher,
  gzip_files=gzip_files,
@@ -315,33 +532,3 @@ def fetch_many(
  ]

  return run_async(gather_entries())
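
A hedged usage sketch of the synchronous wrapper with the renamed parameters (`uniprot_accessions`, `formats`) and the new `db_version`; values are illustrative:

```python
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

# omitting "summary" from formats makes 0.8.0 use static URLs only
entries = fetch_many(
    ["P05067"],  # illustrative accession
    Path("downloads-af"),
    formats={"cif"},
    db_version="6",  # pin a version; None auto-detects the latest
)
for entry in entries:
    print(entry.uniprot_accession, entry.cif_file)
```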
-
-
- def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
- """Convert paths in an AlphaFoldEntry to be relative to the session directory.
-
- Args:
- entry: An AlphaFoldEntry instance with absolute paths.
- session_dir: The session directory to which the paths should be made relative.
-
- Returns:
- An AlphaFoldEntry instance with paths relative to the session directory.
- """
- return AlphaFoldEntry(
- uniprot_accession=entry.uniprot_accession,
- summary=entry.summary,
- summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
- bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
- cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
- pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
- pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
- am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
- am_annotations_hg19_file=(
- entry.am_annotations_hg19_file.relative_to(session_dir) if entry.am_annotations_hg19_file else None
- ),
- am_annotations_hg38_file=(
- entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
- ),
- msa_file=entry.msa_file.relative_to(session_dir) if entry.msa_file else None,
- plddt_doc_file=entry.plddt_doc_file.relative_to(session_dir) if entry.plddt_doc_file else None,
- )
protein_quest/cli.py CHANGED
@@ -13,6 +13,7 @@ from io import BytesIO, TextIOWrapper
  from pathlib import Path
  from textwrap import dedent

+ import shtab
  from cattrs import structure
  from rich.console import Console
  from rich.logging import RichHandler
@@ -81,7 +82,7 @@ def _add_search_uniprot_parser(subparsers: argparse._SubParsersAction):
  "output",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output text file for UniProt accessions (one per line). Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--taxon-id", type=str, help="NCBI Taxon ID, e.g. 9606 for Homo Sapiens")
  parser.add_argument(
  "--reviewed",
@@ -124,7 +125,7 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
@@ -136,7 +137,7 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
  and `chain_length` is the length of the chain, for example `100`.
  Use `-` for stdout.
  """),
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--limit", type=int, default=10_000, help="Maximum number of PDB uniprot accessions combinations to return"
  )
@@ -150,6 +151,15 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
  type=int,
  help="Maximum number of residues allowed in chain mapped to the UniProt accession.",
  )
+ parser.add_argument(
+ "--keep-invalid",
+ action="store_true",
+ help=dedent("""\
+ Keep PDB results when chain length could not be determined.
+ If not given, such results are dropped.
+ Only applies if min/max residues arguments are set.
+ """),
+ )
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")


@@ -165,12 +175,12 @@ def _add_search_alphafold_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with AlphaFold IDs per UniProt accession. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
  parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
  parser.add_argument(
@@ -194,12 +204,12 @@ def _add_search_emdb_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accs",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with EMDB IDs per UniProt accession. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of EMDB entry identifiers to return")
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")

@@ -222,7 +232,7 @@ def _add_search_go_parser(subparsers: argparse._SubParsersAction):
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with GO term results. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--limit", type=int, default=100, help="Maximum number of GO term results to return")


@@ -244,7 +254,7 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with taxonomy results. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--field",
  type=str,
@@ -285,7 +295,7 @@ def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersActi
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
  )
@@ -316,12 +326,12 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV file with complex results. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")

@@ -354,12 +364,12 @@ def _add_search_uniprot_details_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with UniProt details. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
  parser.add_argument("--batch-size", type=int, default=1_000, help="Number of accessions to query per batch")

@@ -387,12 +397,13 @@ def _add_cacher_arguments(parser: argparse.ArgumentParser):
  action="store_true",
  help="Disable caching of files to central location.",
  )
- parser.add_argument(
+ cache_dir_action = parser.add_argument(
  "--cache-dir",
  type=Path,
  default=user_cache_root_dir(),
  help="Directory to use as cache for files.",
  )
+ cache_dir_action.complete = shtab.DIRECTORY # type: ignore[missing-attribute]
  _add_copy_method_arguments(parser)


@@ -411,8 +422,10 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
  "pdbe_csv",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `pdb_id` column. Other columns are ignored. Use `-` for stdin.",
- )
- parser.add_argument("output_dir", type=Path, help="Directory to store downloaded PDBe mmCIF files")
+ ).complete = shtab.FILE
+ parser.add_argument(
+ "output_dir", type=Path, help="Directory to store downloaded PDBe mmCIF files"
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--max-parallel-downloads",
  type=int,
@@ -434,15 +447,22 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
  "alphafold_csv",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `af_id` column. Other columns are ignored. Use `-` for stdin.",
- )
- parser.add_argument("output_dir", type=Path, help="Directory to store downloaded AlphaFold files")
+ ).complete = shtab.FILE
  parser.add_argument(
- "--what-formats",
+ "output_dir", type=Path, help="Directory to store downloaded AlphaFold files"
+ ).complete = shtab.DIRECTORY
+ parser.add_argument(
+ "--format",
  type=str,
  action="append",
  choices=sorted(downloadable_formats),
  help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
- Default is 'summary' and 'cif'."""),
+ Default is 'cif'."""),
+ )
+ parser.add_argument(
+ "--db-version",
+ type=str,
+ help="AlphaFold database version to use. If not given, the latest version is used. For example '6'.",
  )
  parser.add_argument(
  "--gzip-files",
@@ -481,8 +501,10 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
  "emdb_csv",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
- )
- parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
+ ).complete = shtab.FILE
+ parser.add_argument(
+ "output_dir", type=Path, help="Directory to store downloaded EMDB volume files"
+ ).complete = shtab.DIRECTORY
  _add_cacher_arguments(parser)


@@ -496,8 +518,12 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
  Passed files are written with residues below threshold removed."""),
  formatter_class=ArgumentDefaultsRichHelpFormatter,
  )
- parser.add_argument("input_dir", type=Path, help="Directory with AlphaFold mmcif/PDB files")
- parser.add_argument("output_dir", type=Path, help="Directory to write filtered mmcif/PDB files")
+ parser.add_argument(
+ "input_dir", type=Path, help="Directory with AlphaFold mmcif/PDB files"
+ ).complete = shtab.DIRECTORY
+ parser.add_argument(
+ "output_dir", type=Path, help="Directory to write filtered mmcif/PDB files"
+ ).complete = shtab.DIRECTORY
  parser.add_argument("--confidence-threshold", type=float, default=70, help="pLDDT confidence threshold (0-100)")
  parser.add_argument(
  "--min-residues", type=int, default=0, help="Minimum number of high-confidence residues a structure should have"
@@ -515,7 +541,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
  Write filter statistics to file.
  In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
  Use `-` for stdout."""),
- )
+ ).complete = shtab.FILE
  _add_copy_method_arguments(parser)


@@ -535,7 +561,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
  "chains",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `pdb_id` and `chain` columns. Other columns are ignored.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "input_dir",
  type=Path,
@@ -543,13 +569,13 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
  Directory with PDB/mmCIF files.
  Expected filenames are `{pdb_id}.cif.gz`, `{pdb_id}.cif`, `{pdb_id}.pdb.gz` or `{pdb_id}.pdb`.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output_dir",
  type=Path,
  help=dedent("""\
  Directory to write the single-chain PDB/mmCIF files. Output files are in same format as input files."""),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--scheduler-address",
  help=dedent("""Address of the Dask scheduler to connect to.
@@ -569,14 +595,16 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
  """),
  formatter_class=ArgumentDefaultsRichHelpFormatter,
  )
- parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+ parser.add_argument(
+ "input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output_dir",
  type=Path,
  help=dedent("""\
  Directory to write filtered PDB/mmCIF files. Files are copied without modification.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
  parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
  parser.add_argument(
@@ -586,7 +614,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
  Write filter statistics to file.
  In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
  Use `-` for stdout."""),
- )
+ ).complete = shtab.FILE
  _add_copy_method_arguments(parser)


@@ -598,14 +626,16 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
  description="Filter PDB/mmCIF files by secondary structure",
  formatter_class=ArgumentDefaultsRichHelpFormatter,
  )
- parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+ parser.add_argument(
+ "input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output_dir",
  type=Path,
  help=dedent("""\
  Directory to write filtered PDB/mmCIF files. Files are copied without modification.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
  parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
  parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
@@ -623,7 +653,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
  <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
  Use `-` for stdout.
  """),
- )
+ ).complete = shtab.FILE
  _add_copy_method_arguments(parser)


@@ -687,12 +717,12 @@ def _add_convert_uniprot_parser(subparsers: argparse._SubParsersAction):
  "input_dir",
  type=Path,
  help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output",
  type=argparse.FileType("wt", encoding="UTF-8"),
  help="Output text file with UniProt accessions (one per line). Use '-' for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--grouped",
  action="store_true",
@@ -712,14 +742,14 @@ def _add_convert_structures_parser(subparsers: argparse._SubParsersAction):
  "input_dir",
  type=Path,
  help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--output-dir",
  type=Path,
  help=dedent("""\
  Directory to write converted structure files. If not given, files are written to `input_dir`.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--format",
  type=str,
@@ -768,6 +798,7 @@ def make_parser() -> argparse.ArgumentParser:
  )
  parser.add_argument("--log-level", default="WARNING", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
  parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+ shtab.add_argument_to(parser, ["--print-completion"])

  subparsers = parser.add_subparsers(dest="command", required=True)

@@ -825,6 +856,7 @@ def _handle_search_pdbe(args):
  output_csv = args.output_csv
  min_residues = converter.structure(args.min_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
  max_residues = converter.structure(args.max_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
+ keep_invalid = args.keep_invalid

  accs = set(_read_lines(uniprot_accessions))
  rprint(f"Finding PDB entries for {len(accs)} uniprot accessions")
@@ -833,7 +865,7 @@ def _handle_search_pdbe(args):
  raw_nr_results = len(results)
  raw_total_pdbs = sum([len(v) for v in results.values()])
  if min_residues or max_residues:
- results = filter_pdb_results_on_chain_length(results, min_residues, max_residues)
+ results = filter_pdb_results_on_chain_length(results, min_residues, max_residues, keep_invalid=keep_invalid)
  total_pdbs = sum([len(v) for v in results.values()])
  rprint(f"Before filtering found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions.")
  rprint(
@@ -976,25 +1008,26 @@ def _handle_retrieve_pdbe(args: argparse.Namespace):

  def _handle_retrieve_alphafold(args):
  download_dir = args.output_dir
- what_formats = args.what_formats
+ raw_formats = args.format
  alphafold_csv = args.alphafold_csv
  max_parallel_downloads = args.max_parallel_downloads
  cacher = _initialize_cacher(args)
  gzip_files = args.gzip_files
  all_isoforms = args.all_isoforms
+ db_version = args.db_version

- if what_formats is None:
- what_formats = {"summary", "cif"}
+ if raw_formats is None:
+ raw_formats = {"cif"}

  # TODO besides `uniprot_accession,af_id\n` csv also allow headless single column format
- #
  af_ids = _read_column_from_csv(alphafold_csv, "af_id")
- validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
- rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
+ formats: set[DownloadableFormat] = structure(raw_formats, set[DownloadableFormat])
+ rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {formats}")
  afs = af_fetch(
  af_ids,
  download_dir,
- what=validated_what,
+ formats=formats,
+ db_version=db_version,
  max_parallel_downloads=max_parallel_downloads,
  cacher=cacher,
  gzip_files=gzip_files,
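
The shtab pattern used throughout this file is: mark each path-like argument with `shtab.FILE`/`shtab.DIRECTORY` and register a `--print-completion` flag on the parser. A standalone sketch of the same pattern, with illustrative argument names:

```python
import argparse

import shtab

parser = argparse.ArgumentParser(prog="example")
# mark arguments so generated completions suggest files or directories
parser.add_argument("output", help="Output file").complete = shtab.FILE
parser.add_argument("--cache-dir", help="Cache directory").complete = shtab.DIRECTORY
# adds a --print-completion {bash,zsh,tcsh} option to the parser
shtab.add_argument_to(parser, ["--print-completion"])
```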
protein_quest/mcp_server.py CHANGED
@@ -167,7 +167,7 @@ mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes"

  @mcp.tool
  def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[AlphaFoldEntry]:
- """Fetch the AlphaFold summary and mmcif file for given UniProt accessions.
+ """Fetch the AlphaFold mmCIF file for given UniProt accessions.

  Args:
  uniprot_accs: A set of UniProt accessions.
@@ -176,8 +176,8 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
  Returns:
  A list of AlphaFold entries.
  """
- what: set[DownloadableFormat] = {"summary", "cif"}
- return alphafold_fetch(uniprot_accs, save_dir, what)
+ formats: set[DownloadableFormat] = {"cif"}
+ return alphafold_fetch(uniprot_accs, save_dir, formats)


  @mcp.tool
protein_quest/uniprot.py CHANGED
@@ -93,6 +93,14 @@ def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
  return total_length


+ class PdbChainLengthError(ValueError):
+ """Raised when a UniProt chain description does not yield a chain length."""
+
+ def __init__(self, pdb_id: str, uniprot_chains: str):
+ msg = f"Could not determine chain length of '{pdb_id}' from '{uniprot_chains}'"
+ super().__init__(msg)
+
+
  @dataclass(frozen=True)
  class PdbResult:
  """Result of a PDB search in UniProtKB.
@@ -117,7 +125,10 @@ class PdbResult:
  @cached_property
  def chain_length(self) -> int:
  """The length of the chain from the UniProt chains aka self.uniprot_chains."""
- return _chain_length_from_uniprot_chains(self.uniprot_chains)
+ try:
+ return _chain_length_from_uniprot_chains(self.uniprot_chains)
+ except ValueError as e:
+ raise PdbChainLengthError(self.id, self.uniprot_chains) from e


  type PdbResults = dict[str, set[PdbResult]]
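
A sketch of catching the new, more specific error; `pdb_entry` stands in for a `PdbResult` obtained from an earlier PDBe search step:

```python
from protein_quest.uniprot import PdbChainLengthError

try:
    length = pdb_entry.chain_length  # raises if uniprot_chains can not be parsed
except PdbChainLengthError as e:
    print(f"Skipping entry: {e}")
```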
@@ -128,6 +139,7 @@ def filter_pdb_results_on_chain_length(
  pdb_results: PdbResults,
  min_residues: int | None,
  max_residues: int | None,
+ keep_invalid: bool = False,
  ) -> PdbResults:
  """Filter PDB results based on chain length.

@@ -137,6 +149,9 @@ def filter_pdb_results_on_chain_length(
  If None, no minimum is applied.
  max_residues: Maximum number of residues allowed in chain mapped to the UniProt accession.
  If None, no maximum is applied.
+ keep_invalid: If True, PDB results with invalid chain length (could not be determined) are kept.
+ If False, PDB results with invalid chain length are filtered out.
+ Warnings are logged when length can not be determined.

  Returns:
  Filtered dictionary with protein IDs as keys and sets of PDB results as values.
@@ -149,12 +164,26 @@ def filter_pdb_results_on_chain_length(
  raise ValueError(msg)
  results: PdbResults = {}
  for uniprot_accession, pdb_entries in pdb_results.items():
- filtered_pdb_entries = {
- pdb_entry
- for pdb_entry in pdb_entries
- if (min_residues is None or pdb_entry.chain_length >= min_residues)
- and (max_residues is None or pdb_entry.chain_length <= max_residues)
- }
+ filtered_pdb_entries = set()
+ for pdb_entry in pdb_entries:
+ try:
+ if (min_residues is None or pdb_entry.chain_length >= min_residues) and (
+ max_residues is None or pdb_entry.chain_length <= max_residues
+ ):
+ filtered_pdb_entries.add(pdb_entry)
+ except PdbChainLengthError:
+ if keep_invalid:
+ logger.warning(
+ f"Could not determine chain length of '{pdb_entry.id}' from '{pdb_entry.uniprot_chains}' "
+ f"belonging to uniprot accession '{uniprot_accession}', "
+ "for completeness not filtering it out"
+ )
+ filtered_pdb_entries.add(pdb_entry)
+ else:
+ logger.warning(
+ f"Filtering out PDB entry '{pdb_entry.id}' belonging to uniprot accession "
+ f"'{uniprot_accession}' due to invalid chain length from '{pdb_entry.uniprot_chains}'"
+ )
  if filtered_pdb_entries:
  # Only include uniprot_accession if there are any pdb entries left after filtering
  results[uniprot_accession] = filtered_pdb_entries
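
How the new flag is meant to be used, as a sketch; `pdb_results` is assumed to come from an earlier PDBe search step, and the residue bounds are illustrative:

```python
from protein_quest.uniprot import filter_pdb_results_on_chain_length

filtered = filter_pdb_results_on_chain_length(
    pdb_results,
    min_residues=50,
    max_residues=500,
    keep_invalid=True,  # keep entries whose chain length could not be determined
)
```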
@@ -337,13 +366,13 @@ def _build_sparql_query_sequence_length_filter(min_length: int | None = None, ma
  # - http://purl.uniprot.org/isoforms/P42284-2 is ok
  # - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
  # - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
- # TODO use same approach as in retrieve_uniprot_details function
  header = dedent("""\
  ?protein up:sequence ?isoform .
- FILTER NOT EXISTS { ?isoform up:basedOn ?parent_isoform }
- FILTER(
- STRAFTER(STR(?protein), "http://purl.uniprot.org/uniprot/") =
- STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-"))
+ ?isoform a up:Simple_Sequence .
+ BIND (IRI(STRBEFORE(REPLACE(
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
+ ), "-")) AS ?ac_of_isoform)
+ FILTER (?protein = ?ac_of_isoform)
  ?isoform rdf:value ?sequence .
  BIND (STRLEN(?sequence) AS ?seq_length)
  """)
@@ -875,8 +904,10 @@ def map_uniprot_accessions2uniprot_details(
  ?protein up:sequence ?isoform .
  ?isoform a up:Simple_Sequence .
  ?isoform rdf:value ?sequence .
- BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
- FILTER(?ac_of_isoform = ?ac)
+ BIND (IRI(STRBEFORE(REPLACE(
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
+ ), "-")) AS ?ac_of_isoform)
+ FILTER(?ac_of_isoform = ?protein)
  }
  ```

@@ -898,17 +929,20 @@ def map_uniprot_accessions2uniprot_details(
  (STRLEN(?sequence) AS ?seq_length)
  """)
  where_clause = dedent("""
- ?protein a up:Protein .
  ?protein up:mnemonic ?uniprot_id .
  ?protein up:organism ?organism .
  ?organism up:scientificName ?taxon_name .
  ?protein up:reviewed ?reviewed .
+ OPTIONAL {
  ?protein up:recommendedName/up:fullName ?protein_name .
+ }
  ?protein up:sequence ?isoform .
  ?isoform a up:Simple_Sequence .
  ?isoform rdf:value ?sequence .
- BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
- FILTER(?ac_of_isoform = ?ac)
+ BIND (IRI(STRBEFORE(REPLACE(
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
+ ), "-")) AS ?ac_of_isoform)
+ FILTER(?ac_of_isoform = ?protein)
  """)
  total = len(uniprot_accessions)
  with tqdm(
@@ -927,12 +961,13 @@ def map_uniprot_accessions2uniprot_details(
  timeout=timeout,
  )
  for raw_result in raw_results:
+ protein_name = raw_result.get("protein_name", {}).get("value", "")
  result = UniprotDetails(
  uniprot_accession=raw_result["uniprot_accession"]["value"],
  uniprot_id=raw_result["uniprot_id"]["value"],
  sequence_length=int(raw_result["seq_length"]["value"]),
  reviewed=raw_result["reviewed"]["value"] == "true",
- protein_name=raw_result["protein_name"]["value"],
+ protein_name=protein_name,
  taxon_id=int(raw_result["taxon_id"]["value"]),
  taxon_name=raw_result["taxon_name"]["value"],
  )
protein_quest/utils.py CHANGED
@@ -266,6 +266,7 @@ async def retrieve_files(
  cacher: Cacher | None = None,
  chunk_size: int = 524288, # 512 KiB
  gzip_files: bool = False,
+ raise_for_not_found: bool = True,
  ) -> list[Path]:
  """Retrieve files from a list of URLs and save them to a directory.

@@ -279,6 +280,9 @@ async def retrieve_files(
  cacher: An optional cacher to use for caching files.
  chunk_size: The size of each chunk to read from the response.
  gzip_files: Whether to gzip the downloaded files.
+ This requires the server can send gzip encoded content.
+ raise_for_not_found: Whether to raise an error for HTTP 404 errors.
+ If false then function does not returns Path for which url gave HTTP 404 error and logs as debug message.

  Returns:
  A list of paths to the downloaded files.
@@ -295,11 +299,12 @@ async def retrieve_files(
  cacher=cacher,
  chunk_size=chunk_size,
  gzip_files=gzip_files,
+ raise_for_not_found=raise_for_not_found,
  )
  for url, filename in urls
  ]
- files: list[Path] = await tqdm.gather(*tasks, desc=desc)
- return files
+ raw_files: list[Path | None] = await tqdm.gather(*tasks, desc=desc)
+ return [f for f in raw_files if f is not None]


  class InvalidContentEncodingError(aiohttp.ClientResponseError):
@@ -314,7 +319,8 @@ async def _retrieve_file(
  cacher: Cacher | None = None,
  chunk_size: int = 524288, # 512 KiB
  gzip_files: bool = False,
- ) -> Path:
+ raise_for_not_found=True,
+ ) -> Path | None:
  """Retrieve a single file from a URL and save it to a specified path.

  Args:
@@ -325,6 +331,9 @@ async def _retrieve_file(
  cacher: An optional cacher to use for caching files.
  chunk_size: The size of each chunk to read from the response.
  gzip_files: Whether to gzip the downloaded file.
+ This requires the server can send gzip encoded content.
+ raise_for_not_found: Whether to raise an error for HTTP 404 errors.
+ If false then function returns None on HTTP 404 errors and logs as debug message.

  Returns:
  The path to the saved file.
@@ -348,6 +357,9 @@ async def _retrieve_file(
  semaphore,
  session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
  ):
+ if not raise_for_not_found and resp.status == 404:
+ logger.debug(f"File not found at {url}, skipping download.")
+ return None
  resp.raise_for_status()
  if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
  msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
protein_quest-0.8.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: protein_quest
- Version: 0.7.0
+ Version: 0.8.0
  Summary: Search/retrieve/filter proteins and protein structures
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -21,6 +21,7 @@ Requires-Dist: platformdirs>=4.3.8
  Requires-Dist: psutil>=7.0.0
  Requires-Dist: rich-argparse>=1.7.1
  Requires-Dist: rich>=14.0.0
+ Requires-Dist: shtab>=1.7.2
  Requires-Dist: sparqlwrapper>=2.0.0
  Requires-Dist: tqdm>=4.67.1
  Requires-Dist: yarl>=1.20.1
@@ -154,7 +155,7 @@ protein-quest retrieve pdbe pdbe.csv downloads-pdbe/
  protein-quest retrieve alphafold alphafold.csv downloads-af/
  ```

- For each entry downloads the summary.json and cif file.
+ For each entry downloads the cif file.

  ### To retrieve EMDB volume files

@@ -299,6 +300,26 @@ protein-quest mcp

  The mcp server contains an prompt template to search/retrieve/filter candidate structures.

+ ## Shell autocompletion
+
+ The `protein-quest` command line tool supports shell autocompletion using [shtab](https://shtab.readthedocs.io/).
+
+ Initialize for bash shell with:
+
+ ```shell
+ mkdir -p ~/.local/share/bash-completion/completions
+ protein-quest --print-completion bash > ~/.local/share/bash-completion/completions/protein-quest
+ ```
+
+ Initialize for zsh shell with:
+
+ ```shell
+ mkdir -p ~/.local/share/zsh/site-functions
+ protein-quest --print-completion zsh > ~/.local/share/zsh/site-functions/_protein-quest
+ fpath=("$HOME/.local/share/zsh/site-functions" $fpath)
+ autoload -Uz compinit && compinit
+ ```
+
  ## Contributing

  For development information and contribution guidelines, please see [CONTRIBUTING.md](CONTRIBUTING.md).
protein_quest-0.8.0.dist-info/RECORD CHANGED
@@ -1,27 +1,27 @@
  protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- protein_quest/__version__.py,sha256=F9kNagC7uEvuPDju8Gzo4Jt81LSvbf0VyONV3GMXT2M,56
- protein_quest/cli.py,sha256=082CmSSmxVZoWbnX35AmhqedA4T1dD9v-eMe0vsIDp4,55572
+ protein_quest/__version__.py,sha256=z22DsH46rJUgc917FJyc2z9XDmdScvBS92-z4i4GZ98,56
+ protein_quest/cli.py,sha256=bE0Xq93LjdMnDoHeRIDUXUU79LyWICnhX8B3m2Lk8ZE,57264
  protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
  protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
  protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
  protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
  protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
- protein_quest/mcp_server.py,sha256=tZkSG1yx4ocN1rlKgVlU8nUbs6LKpyLrNqP3y6fbJm0,8564
+ protein_quest/mcp_server.py,sha256=oHbNjN-Lctc2mY-sjEuo82yRsp1bBsHo2Ag5MwsWx8k,8547
  protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
  protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
  protein_quest/structure.py,sha256=QozElPz0kbPB_HW-J1WxArTT5e-1vRyBJoBSfHnwoRM,8117
  protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
- protein_quest/uniprot.py,sha256=mODAcneCnDvinvJ3jffyR11klsgq5b96T_4aVWd-Luw,35158
- protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
+ protein_quest/uniprot.py,sha256=kV1lOZ_ugcF-LUff9hvmJPaGwA_uaHPJCL_3DLBIvSE,36798
+ protein_quest/utils.py,sha256=5Ncdid-dslggy-Ti1yhOHwdAM7Bxpyia7Re-xDkc2P0,19909
  protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
  protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
  protein_quest/alphafold/entry_summary.py,sha256=Qhnw75RXFaoOU332g7axg_jYbbdZbUpsGPUOwPNDSeU,2114
- protein_quest/alphafold/fetch.py,sha256=l8pcXeuDfoXYiwpW5N_uB_9oZpomBgUeF9kROLrM11M,14038
+ protein_quest/alphafold/fetch.py,sha256=eKCQHkAMko-d36VvRHLCllLxuAXBdbBUhUONOSCPsds,21970
  protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
  protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
- protein_quest-0.7.0.dist-info/METADATA,sha256=JvsZl9XGN57iJn5oSBRIVNIqL6aYEHXQlGpE87nsSvQ,10722
- protein_quest-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- protein_quest-0.7.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
- protein_quest-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- protein_quest-0.7.0.dist-info/RECORD,,
+ protein_quest-0.8.0.dist-info/METADATA,sha256=jotRxaLadElgixAW72Axk8qL8wAvzl-cq26mYJBy9zc,11335
+ protein_quest-0.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ protein_quest-0.8.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+ protein_quest-0.8.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ protein_quest-0.8.0.dist-info/RECORD,,