protein-quest 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


protein_quest/__version__.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = "0.4.0"
+ __version__ = "0.5.0"
  """The version of the package."""
protein_quest/alphafold/fetch.py CHANGED
@@ -14,7 +14,7 @@ from yarl import URL

  from protein_quest.alphafold.entry_summary import EntrySummary
  from protein_quest.converter import converter
- from protein_quest.utils import friendly_session, retrieve_files, run_async
+ from protein_quest.utils import Cacher, PassthroughCacher, friendly_session, retrieve_files, run_async

  logger = logging.getLogger(__name__)

@@ -104,7 +104,7 @@ class AlphaFoldEntry:


  async def fetch_summary(
-     qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
+     qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
  ) -> list[EntrySummary]:
      """Fetches a summary from the AlphaFold database for a given qualifier.

@@ -116,6 +116,7 @@ async def fetch_summary(
          save_dir: An optional directory to save the fetched summary as a JSON file.
              If set and summary exists then summary will be loaded from disk instead of being fetched from the API.
              If not set then the summary will not be saved to disk and will always be fetched from the API.
+         cacher: A cacher to use for caching the fetched summary. Only used if save_dir is not None.

      Returns:
          A list of EntrySummary objects representing the fetched summary.
@@ -124,6 +125,11 @@ async def fetch_summary(
      fn: AsyncPath | None = None
      if save_dir is not None:
          fn = AsyncPath(save_dir / f"{qualifier}.json")
+         cached_file = await cacher.copy_from_cache(Path(fn))
+         if cached_file is not None:
+             logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
+             raw_data = await AsyncPath(cached_file).read_bytes()
+             return converter.loads(raw_data, list[EntrySummary])
          if await fn.exists():
              logger.debug(f"File {fn} already exists. Skipping download from {url}.")
              raw_data = await fn.read_bytes()
@@ -133,18 +139,23 @@ async def fetch_summary(
          raw_data = await response.content.read()
      if fn is not None:
          # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
-         await fn.write_bytes(raw_data)
+         await cacher.write_bytes(Path(fn), raw_data)
      return converter.loads(raw_data, list[EntrySummary])


  async def fetch_summaries(
-     qualifiers: Iterable[str], save_dir: Path | None = None, max_parallel_downloads: int = 5
+     qualifiers: Iterable[str],
+     save_dir: Path | None = None,
+     max_parallel_downloads: int = 5,
+     cacher: Cacher | None = None,
  ) -> AsyncGenerator[EntrySummary]:
      semaphore = Semaphore(max_parallel_downloads)
      if save_dir is not None:
          save_dir.mkdir(parents=True, exist_ok=True)
+     if cacher is None:
+         cacher = PassthroughCacher()
      async with friendly_session() as session:
-         tasks = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
+         tasks = [fetch_summary(qualifier, session, semaphore, save_dir, cacher) for qualifier in qualifiers]
          summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
              *tasks, desc="Fetching Alphafold summaries"
          )
@@ -154,7 +165,11 @@ async def fetch_summaries(


  async def fetch_many_async(
-     uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+     uniprot_accessions: Iterable[str],
+     save_dir: Path,
+     what: set[DownloadableFormat],
+     max_parallel_downloads: int = 5,
+     cacher: Cacher | None = None,
  ) -> AsyncGenerator[AlphaFoldEntry]:
      """Asynchronously fetches summaries and files from
      [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -164,15 +179,17 @@ async def fetch_many_async(
          save_dir: The directory to save the fetched files to.
          what: A set of formats to download.
          max_parallel_downloads: The maximum number of parallel downloads.
+         cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.

      Yields:
          A dataclass containing the summary, pdb file, and pae file.
      """
      save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+
      summaries = [
          s
          async for s in fetch_summaries(
-             uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+             uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
          )
      ]

@@ -183,6 +200,7 @@ async def fetch_many_async(
          save_dir,
          desc="Downloading AlphaFold files",
          max_parallel_downloads=max_parallel_downloads,
+         cacher=cacher,
      )
      for summary in summaries:
          yield AlphaFoldEntry(
@@ -236,7 +254,11 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu


  def fetch_many(
-     ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+     ids: Iterable[str],
+     save_dir: Path,
+     what: set[DownloadableFormat],
+     max_parallel_downloads: int = 5,
+     cacher: Cacher | None = None,
  ) -> list[AlphaFoldEntry]:
      """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.

@@ -245,6 +267,7 @@ def fetch_many(
          save_dir: The directory to save the fetched files to.
          what: A set of formats to download.
          max_parallel_downloads: The maximum number of parallel downloads.
+         cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.

      Returns:
          A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -253,7 +276,9 @@ def fetch_many(
      async def gather_entries():
          return [
              entry
-             async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
+             async for entry in fetch_many_async(
+                 ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+             )
          ]

      return run_async(gather_entries())
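
The hunks above thread an optional `Cacher` through the AlphaFold fetch API. A minimal usage sketch, assuming only the signatures shown in this diff; the accession and directory names below are placeholders:

```python
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many
from protein_quest.utils import DirectoryCacher

# Placeholder accession and output directory, for illustration only.
accessions = ["P05067"]
save_dir = Path("downloads/alphafold")

# Reuse files already present in the shared cache; omitting cacher keeps the
# previous behaviour (a PassthroughCacher is substituted internally).
cacher = DirectoryCacher(copy_method="symlink")

entries = fetch_many(accessions, save_dir, what={"summary", "cif"}, cacher=cacher)
print(f"Fetched {len(entries)} AlphaFold entries")
```
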
protein_quest/cli.py CHANGED
@@ -43,7 +43,15 @@ from protein_quest.uniprot import (
      search4pdb,
      search4uniprot,
  )
- from protein_quest.utils import CopyMethod, copy_methods, copyfile
+ from protein_quest.utils import (
+     Cacher,
+     CopyMethod,
+     DirectoryCacher,
+     PassthroughCacher,
+     copy_methods,
+     copyfile,
+     user_cache_root_dir,
+ )

  logger = logging.getLogger(__name__)

@@ -312,6 +320,7 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
          default=5,
          help="Maximum number of parallel downloads",
      )
+     _add_cacher_arguments(parser)


  def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
@@ -342,6 +351,7 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
          default=5,
          help="Maximum number of parallel downloads",
      )
+     _add_cacher_arguments(parser)


  def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
@@ -361,22 +371,7 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
          help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
      )
      parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
-
-
- def _add_copy_method_argument(parser: argparse.ArgumentParser):
-     """Add copy method argument to parser."""
-     default_copy_method = "symlink"
-     if os.name == "nt":
-         # On Windows you need developer mode or admin privileges to create symlinks
-         # so we default to copying files instead of symlinking
-         default_copy_method = "copy"
-     parser.add_argument(
-         "--copy-method",
-         type=str,
-         choices=copy_methods,
-         default=default_copy_method,
-         help="How to copy files when no changes are needed to output file.",
-     )
+     _add_cacher_arguments(parser)


  def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
@@ -409,7 +404,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
              In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
              Use `-` for stdout."""),
      )
-     _add_copy_method_argument(parser)
+     _add_copy_method_arguments(parser)


  def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -449,7 +444,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
              If not provided, will create a local cluster.
              If set to `sequential` will run tasks sequentially."""),
      )
-     _add_copy_method_argument(parser)
+     _add_copy_method_arguments(parser)


  def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -472,7 +467,6 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
      parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
-     _add_copy_method_argument(parser)
      parser.add_argument(
          "--write-stats",
          type=argparse.FileType("w", encoding="UTF-8"),
@@ -481,6 +475,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
              In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
              Use `-` for stdout."""),
      )
+     _add_copy_method_arguments(parser)


  def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
@@ -507,7 +502,6 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
      parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
      parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
      parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
-     _add_copy_method_argument(parser)
      parser.add_argument(
          "--write-stats",
          type=argparse.FileType("w", encoding="UTF-8"),
@@ -518,6 +512,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
              Use `-` for stdout.
              """),
      )
+     _add_copy_method_arguments(parser)


  def _add_search_subcommands(subparsers: argparse._SubParsersAction):
@@ -585,6 +580,38 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
      parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")


+ def _add_copy_method_arguments(parser):
+     parser.add_argument(
+         "--copy-method",
+         type=str,
+         choices=copy_methods,
+         default="hardlink",
+         help=dedent("""\
+             How to make target file be same file as source file.
+             By default uses hardlinks to save disk space.
+             Note that hardlinks only work within the same filesystem and are harder to track.
+             If you want to track cached files easily then use 'symlink'.
+             On Windows you need developer mode or admin privileges to create symlinks.
+             """),
+     )
+
+
+ def _add_cacher_arguments(parser: argparse.ArgumentParser):
+     """Add cacher arguments to parser."""
+     parser.add_argument(
+         "--no-cache",
+         action="store_true",
+         help="Disable caching of files to central location.",
+     )
+     parser.add_argument(
+         "--cache-dir",
+         type=Path,
+         default=user_cache_root_dir(),
+         help="Directory to use as cache for files.",
+     )
+     _add_copy_method_arguments(parser)
+
+
  def make_parser() -> argparse.ArgumentParser:
      parser = argparse.ArgumentParser(
          description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -742,14 +769,26 @@ def _handle_search_complexes(args: argparse.Namespace):
      _write_complexes_csv(results, output_csv)


- def _handle_retrieve_pdbe(args):
+ def _initialize_cacher(args: argparse.Namespace) -> Cacher:
+     if args.no_cache:
+         return PassthroughCacher()
+     return DirectoryCacher(
+         cache_dir=args.cache_dir,
+         copy_method=args.copy_method,
+     )
+
+
+ def _handle_retrieve_pdbe(args: argparse.Namespace):
      pdbe_csv = args.pdbe_csv
      output_dir = args.output_dir
      max_parallel_downloads = args.max_parallel_downloads
+     cacher = _initialize_cacher(args)

      pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
      rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
-     result = asyncio.run(pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads))
+     result = asyncio.run(
+         pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
+     )
      rprint(f"Retrieved {len(result)} PDBe entries")


@@ -758,6 +797,7 @@ def _handle_retrieve_alphafold(args):
      what_formats = args.what_formats
      alphafold_csv = args.alphafold_csv
      max_parallel_downloads = args.max_parallel_downloads
+     cacher = _initialize_cacher(args)

      if what_formats is None:
          what_formats = {"summary", "cif"}
@@ -767,7 +807,9 @@ def _handle_retrieve_alphafold(args):
      af_ids = _read_column_from_csv(alphafold_csv, "af_id")
      validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
      rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
-     afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
+     afs = af_fetch(
+         af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+     )
      total_nr_files = sum(af.nr_of_files() for af in afs)
      rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")

@@ -775,10 +817,11 @@ def _handle_retrieve_emdb(args):
  def _handle_retrieve_emdb(args):
      emdb_csv = args.emdb_csv
      output_dir = args.output_dir
+     cacher = _initialize_cacher(args)

      emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
      rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
-     result = asyncio.run(emdb_fetch(emdb_ids, output_dir))
+     result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
      rprint(f"Retrieved {len(result)} EMDB entries")

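
The new `--no-cache`, `--cache-dir`, and `--copy-method` flags are turned into a `Cacher` by `_initialize_cacher`. A rough equivalent for use outside the CLI, assuming only the public helpers shown in this diff (the `build_cacher` helper below is hypothetical):

```python
from pathlib import Path

from protein_quest.utils import Cacher, CopyMethod, DirectoryCacher, PassthroughCacher, user_cache_root_dir


def build_cacher(no_cache: bool = False, cache_dir: Path | None = None, copy_method: CopyMethod = "hardlink") -> Cacher:
    """Hypothetical helper mirroring the CLI's _initialize_cacher selection logic."""
    if no_cache:
        # Equivalent of --no-cache: skip the shared cache entirely.
        return PassthroughCacher()
    # Defaults match _add_cacher_arguments: user cache dir and hardlinks.
    return DirectoryCacher(cache_dir=cache_dir or user_cache_root_dir(), copy_method=copy_method)


cacher = build_cacher(copy_method="symlink")
```
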
protein_quest/emdb.py CHANGED
@@ -3,7 +3,7 @@
  from collections.abc import Iterable, Mapping
  from pathlib import Path

- from protein_quest.utils import retrieve_files
+ from protein_quest.utils import Cacher, retrieve_files


  def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
@@ -13,13 +13,16 @@ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
      return url, fn


- async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1) -> Mapping[str, Path]:
+ async def fetch(
+     emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+ ) -> Mapping[str, Path]:
      """Fetches volume files from the EMDB database.

      Args:
          emdb_ids: A list of EMDB IDs to fetch.
          save_dir: The directory to save the downloaded files.
          max_parallel_downloads: The maximum number of parallel downloads.
+         cacher: An optional cacher to use for caching downloaded files.

      Returns:
          A mapping of EMDB IDs to their downloaded files.
@@ -30,5 +33,5 @@ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads:

      # TODO show progress of each item
      # TODO handle failed downloads, by skipping them instead of raising an error
-     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
      return id2paths
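
As a usage sketch for the updated signature, the optional `cacher` is passed straight through to `retrieve_files`; the EMDB ID and output directory below are placeholders:

```python
import asyncio
from pathlib import Path

from protein_quest.emdb import fetch as emdb_fetch
from protein_quest.utils import DirectoryCacher


async def main() -> None:
    # Placeholder EMDB ID and output directory, for illustration only.
    id2path = await emdb_fetch(["EMD-19583"], Path("downloads/emdb"), cacher=DirectoryCacher())
    for emdb_id, path in id2path.items():
        print(emdb_id, "->", path)


asyncio.run(main())
```
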
protein_quest/mcp_server.py CHANGED
@@ -32,6 +32,7 @@ Examples:

  """

+ from collections.abc import Mapping
  from pathlib import Path
  from textwrap import dedent
  from typing import Annotated
@@ -89,7 +90,18 @@ def search_pdb(
      return search4pdb(uniprot_accs, limit=limit)


- mcp.tool(pdbe_fetch, name="fetch_pdbe_structures")
+ @mcp.tool
+ async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+     """Fetch the PDBe structures for given PDB IDs.
+
+     Args:
+         pdb_ids: A set of PDB IDs.
+         save_dir: The directory to save the fetched files.
+
+     Returns:
+         A mapping of PDB ID to the path of the fetched structure file.
+     """
+     return await pdbe_fetch(pdb_ids, save_dir)


  @mcp.tool
@@ -163,7 +175,17 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
      return alphafold_fetch(uniprot_accs, save_dir, what)


- mcp.tool(emdb_fetch, name="fetch_emdb_volumes")
+ @mcp.tool
+ async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+     """Fetch EMDB volumes for given EMDB IDs.
+
+     Args:
+         emdb_ids: A set of EMDB IDs.
+         save_dir: The directory to save the fetched files.
+     Returns:
+         A mapping of EMDB ID to the path of the fetched volume file.
+     """
+     return await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)


  @mcp.tool
protein_quest/pdbe/fetch.py CHANGED
@@ -3,7 +3,7 @@
  from collections.abc import Iterable, Mapping
  from pathlib import Path

- from protein_quest.utils import retrieve_files, run_async
+ from protein_quest.utils import Cacher, retrieve_files, run_async


  def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -28,13 +28,16 @@ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
      return url, fn


- async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+ async def fetch(
+     ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
+ ) -> Mapping[str, Path]:
      """Fetches mmCIF files from the PDBe database.

      Args:
          ids: A set of PDB IDs to fetch.
          save_dir: The directory to save the fetched mmCIF files to.
          max_parallel_downloads: The maximum number of parallel downloads.
+         cacher: An optional cacher to use for caching downloaded files.

      Returns:
          A dict of id and paths to the downloaded mmCIF files.
@@ -47,7 +50,7 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int
      urls = list(id2urls.values())
      id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}

-     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files", cacher=cacher)
      return id2paths

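
Callers that do not want the shared cache can keep the old behaviour by omitting `cacher` or passing a `PassthroughCacher` explicitly, mirroring the CLI's `--no-cache` flag. A small sketch, assuming the signature above; the PDB ID and directory are placeholders:

```python
from pathlib import Path

from protein_quest.pdbe.fetch import fetch as pdbe_fetch
from protein_quest.utils import PassthroughCacher, run_async

# Placeholder PDB ID and output directory, for illustration only.
id2path = run_async(pdbe_fetch(["1ubq"], Path("downloads/pdbe"), cacher=PassthroughCacher()))
print(id2path)
```
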
protein_quest/utils.py CHANGED
@@ -1,22 +1,260 @@
  """Module for functions that are used in multiple places."""

+ import argparse
  import asyncio
+ import hashlib
  import logging
  import shutil
- from collections.abc import Coroutine, Iterable
+ from collections.abc import Coroutine, Iterable, Sequence
  from contextlib import asynccontextmanager
+ from functools import lru_cache
  from pathlib import Path
  from textwrap import dedent
- from typing import Any, Literal, get_args
+ from typing import Any, Literal, Protocol, get_args, runtime_checkable

  import aiofiles
+ import aiofiles.os
  import aiohttp
+ import rich
+ from aiohttp.streams import AsyncStreamIterator
  from aiohttp_retry import ExponentialRetry, RetryClient
+ from platformdirs import user_cache_dir
+ from rich_argparse import ArgumentDefaultsRichHelpFormatter
  from tqdm.asyncio import tqdm
  from yarl import URL

  logger = logging.getLogger(__name__)

+ CopyMethod = Literal["copy", "symlink", "hardlink"]
+ """Methods for copying files."""
+ copy_methods = set(get_args(CopyMethod))
+ """Set of valid copy methods."""
+
+
+ @lru_cache
+ def _cache_sub_dir(root_cache_dir: Path, filename: str, hash_length: int = 4) -> Path:
+     """Get the cache sub-directory for a given path.
+
+     To not have too many files in a single directory,
+     we create sub-directories based on the hash of the filename.
+
+     Args:
+         root_cache_dir: The root directory for the cache.
+         filename: The filename to be cached.
+         hash_length: The length of the hash to use for the sub-directory.
+
+     Returns:
+         The parent path to the cached file.
+     """
+     full_hash = hashlib.blake2b(filename.encode("utf-8")).hexdigest()
+     cache_sub_dir = full_hash[:hash_length]
+     cache_sub_dir_path = root_cache_dir / cache_sub_dir
+     cache_sub_dir_path.mkdir(parents=True, exist_ok=True)
+     return cache_sub_dir_path
+
+
+ @runtime_checkable
+ class Cacher(Protocol):
+     """Protocol for a cacher."""
+
+     def __contains__(self, item: str | Path) -> bool:
+         """Check if a file is in the cache.
+
+         Args:
+             item: The filename or Path to check.
+
+         Returns:
+             True if the file is in the cache, False otherwise.
+         """
+         ...
+
+     async def copy_from_cache(self, target: Path) -> Path | None:
+         """Copy a file from the cache to a target location if it exists in the cache.
+
+         Assumes:
+
+         - target does not exist.
+         - the parent directory of target exists.
+
+         Args:
+             target: The path to copy the file to.
+
+         Returns:
+             The path to the cached file if it was copied, None otherwise.
+         """
+         ...
+
+     async def write_iter(self, target: Path, content: AsyncStreamIterator[bytes]) -> Path:
+         """Write content to a file and cache it.
+
+         Args:
+             target: The path to write the content to.
+             content: An async iterator that yields bytes to write to the file.
+
+         Returns:
+             The path to the cached file.
+
+         Raises:
+             FileExistsError: If the target file already exists.
+         """
+         ...
+
+     async def write_bytes(self, target: Path, content: bytes) -> Path:
+         """Write bytes to a file and cache it.
+
+         Args:
+             target: The path to write the content to.
+             content: The bytes to write to the file.
+
+         Returns:
+             The path to the cached file.
+
+         Raises:
+             FileExistsError: If the target file already exists.
+         """
+         ...
+
+
+ class PassthroughCacher(Cacher):
+     """A cacher that caches nothing.
+
+     On writes it just writes to the target path.
+     """
+
+     def __contains__(self, item: str | Path) -> bool:
+         # We don't have anything cached ever
+         return False
+
+     async def copy_from_cache(self, target: Path) -> Path | None:  # noqa: ARG002
+         # We don't have anything cached ever
+         return None
+
+     async def write_iter(self, target: Path, content: AsyncStreamIterator[bytes]) -> Path:
+         if target.exists():
+             raise FileExistsError(target)
+         target.write_bytes(b"".join([chunk async for chunk in content]))
+         return target
+
+     async def write_bytes(self, target: Path, content: bytes) -> Path:
+         if target.exists():
+             raise FileExistsError(target)
+         target.write_bytes(content)
+         return target
+
+
+ def user_cache_root_dir() -> Path:
+     """Get the users root directory for caching files.
+
+     Returns:
+         The path to the user's cache directory for protein-quest.
+     """
+     return Path(user_cache_dir("protein-quest"))
+
+
+ class DirectoryCacher(Cacher):
+     """Class to cache files in a directory.
+
+     Caching logic is based on the file name only.
+     If file name of paths are the same then the files are considered the same.
+
+     Attributes:
+         cache_dir: The directory to use for caching.
+         copy_method: The method to use for copying files.
+     """
+
+     def __init__(
+         self,
+         cache_dir: Path | None = None,
+         copy_method: CopyMethod = "hardlink",
+     ) -> None:
+         """Initialize the cacher.
+
+         If file name of paths are the same then the files are considered the same.
+
+         Args:
+             cache_dir: The directory to use for caching.
+                 If None, a default cache directory (~/.cache/protein-quest) is used.
+             copy_method: The method to use for copying.
+         """
+         if cache_dir is None:
+             cache_dir = user_cache_root_dir()
+         self.cache_dir: Path = cache_dir
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         if copy_method == "copy":
+             logger.warning(
+                 "Using copy as copy_method to cache files is not recommended. "
+                 "This will use more disk space and be slower than symlink or hardlink."
+             )
+         if copy_method not in copy_methods:
+             msg = f"Unknown copy method: {copy_method}. Must be one of {copy_methods}."
+             raise ValueError(msg)
+         self.copy_method: CopyMethod = copy_method
+
+     def __contains__(self, item: str | Path) -> bool:
+         cached_file = self._as_cached_path(item)
+         return cached_file.exists()
+
+     def _as_cached_path(self, item: str | Path) -> Path:
+         file_name = item.name if isinstance(item, Path) else item
+         cache_sub_dir = _cache_sub_dir(self.cache_dir, file_name)
+         return cache_sub_dir / file_name
+
+     async def copy_from_cache(self, target: Path) -> Path | None:
+         cached_file = self._as_cached_path(target.name)
+         exists = await aiofiles.os.path.exists(str(cached_file))
+         if exists:
+             await async_copyfile(cached_file, target, copy_method=self.copy_method)
+             return cached_file
+         return None
+
+     async def write_iter(self, target: Path, content: AsyncStreamIterator[bytes]) -> Path:
+         cached_file = self._as_cached_path(target.name)
+         # Write file to cache dir
+         async with aiofiles.open(cached_file, "xb") as f:
+             async for chunk in content:
+                 await f.write(chunk)
+         # Copy to target location
+         await async_copyfile(cached_file, target, copy_method=self.copy_method)
+         return cached_file
+
+     async def write_bytes(self, target: Path, content: bytes) -> Path:
+         cached_file = self._as_cached_path(target.name)
+         # Write file to cache dir
+         async with aiofiles.open(cached_file, "xb") as f:
+             await f.write(content)
+         # Copy to target location
+         await async_copyfile(cached_file, target, copy_method=self.copy_method)
+         return cached_file
+
+     def populate_cache(self, source_dir: Path) -> dict[Path, Path]:
+         """Populate the cache from an existing directory.
+
+         This will copy all files from the source directory to the cache directory.
+         If a file with the same name already exists in the cache, it will be skipped.
+
+         Args:
+             source_dir: The directory to populate the cache from.
+
+         Returns:
+             A dictionary mapping source file paths to their cached paths.
+
+         Raises:
+             NotADirectoryError: If the source_dir is not a directory.
+         """
+         if not source_dir.is_dir():
+             raise NotADirectoryError(source_dir)
+         cached = {}
+         for file_path in source_dir.iterdir():
+             if not file_path.is_file():
+                 continue
+             cached_path = self._as_cached_path(file_path.name)
+             if cached_path.exists():
+                 logger.debug(f"File {file_path.name} already in cache. Skipping.")
+                 continue
+             copyfile(file_path, cached_path, copy_method=self.copy_method)
+             cached[file_path] = cached_path
+         return cached
+

  async def retrieve_files(
      urls: Iterable[tuple[URL | str, str]],
@@ -25,6 +263,8 @@ async def retrieve_files(
      retries: int = 3,
      total_timeout: int = 300,
      desc: str = "Downloading files",
+     cacher: Cacher | None = None,
+     chunk_size: int = 524288,  # 512 KiB
  ) -> list[Path]:
      """Retrieve files from a list of URLs and save them to a directory.

@@ -35,6 +275,8 @@ async def retrieve_files(
          retries: The number of times to retry a failed download.
          total_timeout: The total timeout for a download in seconds.
          desc: Description for the progress bar.
+         cacher: An optional cacher to use for caching files.
+         chunk_size: The size of each chunk to read from the response.

      Returns:
          A list of paths to the downloaded files.
@@ -42,7 +284,17 @@ async def retrieve_files(
      save_dir.mkdir(parents=True, exist_ok=True)
      semaphore = asyncio.Semaphore(max_parallel_downloads)
      async with friendly_session(retries, total_timeout) as session:
-         tasks = [_retrieve_file(session, url, save_dir / filename, semaphore) for url, filename in urls]
+         tasks = [
+             _retrieve_file(
+                 session=session,
+                 url=url,
+                 save_path=save_dir / filename,
+                 semaphore=semaphore,
+                 cacher=cacher,
+                 chunk_size=chunk_size,
+             )
+             for url, filename in urls
+         ]
          files: list[Path] = await tqdm.gather(*tasks, desc=desc)
      return files

@@ -52,8 +304,8 @@ async def _retrieve_file(
      url: URL | str,
      save_path: Path,
      semaphore: asyncio.Semaphore,
-     ovewrite: bool = False,
-     chunk_size: int = 131072,  # 128 KiB
+     cacher: Cacher | None = None,
+     chunk_size: int = 524288,  # 512 KiB
  ) -> Path:
      """Retrieve a single file from a URL and save it to a specified path.

@@ -62,26 +314,28 @@ async def _retrieve_file(
          url: The URL to download the file from.
          save_path: The path where the file should be saved.
          semaphore: A semaphore to limit the number of concurrent downloads.
-         ovewrite: Whether to overwrite the file if it already exists.
+         cacher: An optional cacher to use for caching files.
          chunk_size: The size of each chunk to read from the response.

      Returns:
          The path to the saved file.
      """
      if save_path.exists():
-         if ovewrite:
-             save_path.unlink()
-         else:
-             logger.debug(f"File {save_path} already exists. Skipping download from {url}.")
-             return save_path
+         logger.debug(f"File {save_path} already exists. Skipping download from {url}.")
+         return save_path
+
+     if cacher is None:
+         cacher = PassthroughCacher()
+     if cached_file := await cacher.copy_from_cache(save_path):
+         logger.debug(f"File {save_path} was copied from cache {cached_file}. Skipping download from {url}.")
+         return save_path
+
      async with (
          semaphore,
-         aiofiles.open(save_path, "xb") as f,
          session.get(url) as resp,
      ):
          resp.raise_for_status()
-         async for chunk in resp.content.iter_chunked(chunk_size):
-             await f.write(chunk)
+         await cacher.write_iter(save_path, resp.content.iter_chunked(chunk_size))
      return save_path


@@ -141,27 +395,117 @@ def run_async[R](coroutine: Coroutine[Any, Any, R]) -> R:
      raise NestedAsyncIOLoopError from e


- CopyMethod = Literal["copy", "symlink"]
- copy_methods = set(get_args(CopyMethod))
-
-
  def copyfile(source: Path, target: Path, copy_method: CopyMethod = "copy"):
-     """Make target path be same file as source by either copying or symlinking.
+     """Make target path be same file as source by either copying or symlinking or hardlinking.
+
+     Note that the hardlink copy method only works within the same filesystem and is harder to track.
+     If you want to track cached files easily then use 'symlink'.
+     On Windows you need developer mode or admin privileges to create symlinks.

      Args:
-         source: The source file to copy or symlink.
+         source: The source file to copy or link.
          target: The target file to create.
          copy_method: The method to use for copying.

      Raises:
          FileNotFoundError: If the source file or parent of target does not exist.
-         ValueError: If the method is not "copy" or "symlink".
+         FileExistsError: If the target file already exists.
+         ValueError: If an unknown copy method is provided.
      """
      if copy_method == "copy":
          shutil.copyfile(source, target)
      elif copy_method == "symlink":
-         rel_source = source.relative_to(target.parent, walk_up=True)
+         rel_source = source.absolute().relative_to(target.parent.absolute(), walk_up=True)
          target.symlink_to(rel_source)
+     elif copy_method == "hardlink":
+         target.hardlink_to(source)
      else:
-         msg = f"Unknown method: {copy_method}"
+         msg = f"Unknown method: {copy_method}. Valid methods are: {copy_methods}"
          raise ValueError(msg)
+
+
+ async def async_copyfile(
+     source: Path,
+     target: Path,
+     copy_method: CopyMethod = "copy",
+ ):
+     """Asynchronously make target path be same file as source by either copying or symlinking or hardlinking.
+
+     Note that the hardlink copy method only works within the same filesystem and is harder to track.
+     If you want to track cached files easily then use 'symlink'.
+     On Windows you need developer mode or admin privileges to create symlinks.
+
+     Args:
+         source: The source file to copy.
+         target: The target file to create.
+         copy_method: The method to use for copying.
+
+     Raises:
+         FileNotFoundError: If the source file or parent of target does not exist.
+         FileExistsError: If the target file already exists.
+         ValueError: If an unknown copy method is provided.
+     """
+     if copy_method == "copy":
+         # Could use loop of chunks with aiofiles,
+         # but shutil is ~1.9x faster on my machine
+         # due to fastcopy and sendfile optimizations in shutil.
+         await asyncio.to_thread(shutil.copyfile, source, target)
+     elif copy_method == "symlink":
+         rel_source = source.relative_to(target.parent, walk_up=True)
+         await aiofiles.os.symlink(str(rel_source), str(target))
+     elif copy_method == "hardlink":
+         await aiofiles.os.link(str(source), str(target))
+     else:
+         msg = f"Unknown method: {copy_method}. Valid methods are: {copy_methods}"
+         raise ValueError(msg)
+
+
+ def populate_cache_command(raw_args: Sequence[str] | None = None):
+     """Command line interface to populate the cache from an existing directory.
+
+     Can be called from the command line as:
+
+     ```bash
+     python3 -m protein_quest.utils populate-cache /path/to/source/dir
+     ```
+
+     Args:
+         raw_args: The raw command line arguments to parse. If None, uses sys.argv.
+     """
+     root_parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsRichHelpFormatter)
+     subparsers = root_parser.add_subparsers(dest="command")
+
+     desc = "Populate the cache directory with files from the source directory."
+     populate_cache_parser = subparsers.add_parser(
+         "populate-cache",
+         help=desc,
+         description=desc,
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     populate_cache_parser.add_argument("source_dir", type=Path)
+     populate_cache_parser.add_argument(
+         "--cache-dir",
+         type=Path,
+         default=user_cache_root_dir(),
+         help="Directory to use for caching. If not provided, a default cache directory is used.",
+     )
+     populate_cache_parser.add_argument(
+         "--copy-method",
+         type=str,
+         default="hardlink",
+         choices=copy_methods,
+         help="Method to use for copying files to cache.",
+     )
+
+     args = root_parser.parse_args(raw_args)
+     if args.command == "populate-cache":
+         source_dir = args.source_dir
+         cacher = DirectoryCacher(cache_dir=args.cache_dir, copy_method=args.copy_method)
+         cached_files = cacher.populate_cache(source_dir)
+         rich.print(f"Cached {len(cached_files)} files from {source_dir} to {cacher.cache_dir}")
+         for src, cached in cached_files.items():
+             rich.print(f"- {src.relative_to(source_dir)} -> {cached.relative_to(cacher.cache_dir)}")
+
+
+ if __name__ == "__main__":
+     populate_cache_command()
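
To make the caching flow concrete, here is a small sketch exercising `DirectoryCacher` directly, based on the methods defined above; the directories and file name are placeholders and are assumed to sit on the same filesystem as the cache so that hardlinking works:

```python
import asyncio
from pathlib import Path

from protein_quest.utils import DirectoryCacher


async def demo() -> None:
    # Placeholder directories and file name, for illustration only.
    cacher = DirectoryCacher(cache_dir=Path("/tmp/pq-cache"), copy_method="hardlink")

    run1 = Path("/tmp/pq-run1")
    run1.mkdir(parents=True, exist_ok=True)
    # write_bytes stores the payload under the cache dir and links it to the target path.
    await cacher.write_bytes(run1 / "entry.json", b"{}")

    run2 = Path("/tmp/pq-run2")
    run2.mkdir(parents=True, exist_ok=True)
    # A later run asking for the same file name is served from the cache instead of the network.
    cached = await cacher.copy_from_cache(run2 / "entry.json")
    print("entry.json" in cacher, cached)


asyncio.run(demo())
```
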
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: protein_quest
- Version: 0.4.0
+ Version: 0.5.0
  Summary: Search/retrieve/filter proteins and protein structures
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -17,6 +17,7 @@ Requires-Dist: cattrs[orjson]>=24.1.3
  Requires-Dist: dask>=2025.5.1
  Requires-Dist: distributed>=2025.5.1
  Requires-Dist: gemmi>=0.7.3
+ Requires-Dist: platformdirs>=4.3.8
  Requires-Dist: psutil>=7.0.0
  Requires-Dist: rich-argparse>=1.7.1
  Requires-Dist: rich>=14.0.0
@@ -47,6 +48,10 @@ It uses
  - [gemmi](https://project-gemmi.github.io/) to work with macromolecular models.
  - [dask-distributed](https://docs.dask.org/en/latest/) to compute in parallel.

+ The package is used by
+
+ - [protein-detective](https://github.com/haddocking/protein-detective)
+
  An example workflow:

  ```mermaid
@@ -94,6 +99,9 @@ The main entry point is the `protein-quest` command line tool which has multiple

  To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).

+ While downloading or copying files it uses a global cache (located at `~/.cache/protein-quest`) and hardlinks to save disk space and improve speed.
+ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--copy-method` command line arguments.
+
  ### Search Uniprot accessions

  ```shell
@@ -1,26 +1,26 @@
  protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- protein_quest/__version__.py,sha256=je7v2gXyxr6yRVCFAS0wS-iABSLJOuCb-IPR-x90UAU,56
- protein_quest/cli.py,sha256=9Cgvn5BXbrAloIU2KCiFxLxJSyAoa2RLdmuB0HGsUJM,43078
+ protein_quest/__version__.py,sha256=AyGZhrskazcQPC8spzJ45d4XNxgla5DnO1bmKuzRj_Q,56
+ protein_quest/cli.py,sha256=xiXt_2l3MxbTbmxm2sz0w8_OdJr8gz_B68GBVv5wHjE,44182
  protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
- protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
+ protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
  protein_quest/filters.py,sha256=-gasSXR4g5SzYSYbkfcDwR-tm2KCAhCMdpIVJrUPR1w,5224
  protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
- protein_quest/mcp_server.py,sha256=CXw5rTStunXdAVQ3BWPXy19zmgQGwV5uPcWlN1HF9do,7389
+ protein_quest/mcp_server.py,sha256=PCXxcU3GElKg2sjMlxbsM63OiFxg9AtmfKwBJ1_0AQE,8130
  protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
  protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  protein_quest/ss.py,sha256=qOr0aMycNAtZmXXvhCN-KZH3Qp4EejnBcE6fsFgCrmY,10343
  protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
  protein_quest/uniprot.py,sha256=DIwQYzWZREZ7SGhkJT4Ozgl36pdz47FNfZ1QoEgEaXE,24239
- protein_quest/utils.py,sha256=z4PPPcog6nvPhA93DWVf7stv5uJ4h_2BP5owdhoO5mo,5626
+ protein_quest/utils.py,sha256=2lQ7jPHWtDySBTYnoL9VTKl5XUgQVYgp9Prb7qEnjtQ,17982
  protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
  protein_quest/alphafold/confidence.py,sha256=pYIuwYdkuPuHLagcX1dSvSyZ_84xboRLfHUxkEoc4MY,6766
  protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
- protein_quest/alphafold/fetch.py,sha256=iFHORaO-2NvPwmpm33tfOFUcSJx8mBGwMXxwc4bRuk8,11336
+ protein_quest/alphafold/fetch.py,sha256=wIsgPZmtnE5EoAL9G22Y6Ehx9d0md53Mw88-6LLGp0Q,12298
  protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
- protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
+ protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
  protein_quest/pdbe/io.py,sha256=iGLvmsD-eEYnrgZDYfkGWIDCzwDRRD5dwqB480talCs,10037
- protein_quest-0.4.0.dist-info/METADATA,sha256=y5DAnM4mhSincjslsvQZ4zk1QcMysGmnsBltK_Vz4MQ,8842
- protein_quest-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- protein_quest-0.4.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
- protein_quest-0.4.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- protein_quest-0.4.0.dist-info/RECORD,,
+ protein_quest-0.5.0.dist-info/METADATA,sha256=atoElM2xwPd9ubxXSQsFQYz2hjALJi-AegCRkrynEYc,9236
+ protein_quest-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ protein_quest-0.5.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+ protein_quest-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ protein_quest-0.5.0.dist-info/RECORD,,