protein-quest 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of protein-quest might be problematic.

@@ -1,2 +1,2 @@
-__version__ = "0.3.2"
+__version__ = "0.5.0"
 """The version of the package."""
@@ -14,7 +14,7 @@ from yarl import URL
 
 from protein_quest.alphafold.entry_summary import EntrySummary
 from protein_quest.converter import converter
-from protein_quest.utils import friendly_session, retrieve_files, run_async
+from protein_quest.utils import Cacher, PassthroughCacher, friendly_session, retrieve_files, run_async
 
 logger = logging.getLogger(__name__)
 
@@ -104,7 +104,7 @@ class AlphaFoldEntry:
 
 
 async def fetch_summary(
-    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
+    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
 ) -> list[EntrySummary]:
     """Fetches a summary from the AlphaFold database for a given qualifier.
 
@@ -116,6 +116,7 @@ async def fetch_summary(
         save_dir: An optional directory to save the fetched summary as a JSON file.
             If set and summary exists then summary will be loaded from disk instead of being fetched from the API.
            If not set then the summary will not be saved to disk and will always be fetched from the API.
+        cacher: A cacher to use for caching the fetched summary. Only used if save_dir is not None.
 
     Returns:
         A list of EntrySummary objects representing the fetched summary.
@@ -124,6 +125,11 @@ async def fetch_summary(
     fn: AsyncPath | None = None
     if save_dir is not None:
         fn = AsyncPath(save_dir / f"{qualifier}.json")
+        cached_file = await cacher.copy_from_cache(Path(fn))
+        if cached_file is not None:
+            logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
+            raw_data = await AsyncPath(cached_file).read_bytes()
+            return converter.loads(raw_data, list[EntrySummary])
         if await fn.exists():
             logger.debug(f"File {fn} already exists. Skipping download from {url}.")
             raw_data = await fn.read_bytes()
@@ -133,18 +139,23 @@ async def fetch_summary(
         raw_data = await response.content.read()
         if fn is not None:
             # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
-            await fn.write_bytes(raw_data)
+            await cacher.write_bytes(Path(fn), raw_data)
         return converter.loads(raw_data, list[EntrySummary])
 
 
 async def fetch_summaries(
-    qualifiers: Iterable[str], save_dir: Path | None = None, max_parallel_downloads: int = 5
+    qualifiers: Iterable[str],
+    save_dir: Path | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[EntrySummary]:
     semaphore = Semaphore(max_parallel_downloads)
     if save_dir is not None:
         save_dir.mkdir(parents=True, exist_ok=True)
+    if cacher is None:
+        cacher = PassthroughCacher()
     async with friendly_session() as session:
-        tasks = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
+        tasks = [fetch_summary(qualifier, session, semaphore, save_dir, cacher) for qualifier in qualifiers]
         summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
             *tasks, desc="Fetching Alphafold summaries"
         )
@@ -154,7 +165,11 @@ async def fetch_summaries(
 
 
 async def fetch_many_async(
-    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[AlphaFoldEntry]:
     """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -164,15 +179,17 @@ async def fetch_many_async(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
 
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
     save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+
     summaries = [
         s
         async for s in fetch_summaries(
-            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
         )
     ]
 
@@ -183,6 +200,7 @@ async def fetch_many_async(
         save_dir,
         desc="Downloading AlphaFold files",
         max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
     )
     for summary in summaries:
         yield AlphaFoldEntry(
@@ -236,7 +254,11 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
 
 
 def fetch_many(
-    ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+    ids: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> list[AlphaFoldEntry]:
     """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
 
@@ -245,6 +267,7 @@ def fetch_many(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
 
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -253,7 +276,9 @@ def fetch_many(
     async def gather_entries():
         return [
             entry
-            async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
+            async for entry in fetch_many_async(
+                ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+            )
         ]
 
     return run_async(gather_entries())
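
The new cacher parameter threads through fetch_summary, fetch_summaries, fetch_many_async and fetch_many, with PassthroughCacher as the fallback when none is given. Below is a minimal usage sketch, not taken from the package: the module path protein_quest.alphafold.fetch and the accession are assumptions, while the DirectoryCacher keyword arguments and the {"summary", "cif"} formats mirror what appears elsewhere in this diff.

from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many  # assumed module path for the hunks above
from protein_quest.utils import DirectoryCacher

# Keyword arguments mirror _initialize_cacher() in cli.py further down in this diff.
cacher = DirectoryCacher(cache_dir=Path("~/.cache/protein-quest").expanduser(), copy_method="hardlink")
entries = fetch_many(
    ["P12345"],               # hypothetical UniProt accession
    Path("downloads"),
    what={"summary", "cif"},  # same formats _handle_retrieve_alphafold uses as defaults
    cacher=cacher,            # omit or pass None to keep the old, cache-free behaviour
)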
protein_quest/cli.py CHANGED
@@ -15,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.markdown import Markdown
 from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm
@@ -31,8 +32,26 @@ from protein_quest.pdbe import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
-from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
-from protein_quest.utils import CopyMethod, copy_methods, copyfile
+from protein_quest.uniprot import (
+    ComplexPortalEntry,
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4interaction_partners,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
+from protein_quest.utils import (
+    Cacher,
+    CopyMethod,
+    DirectoryCacher,
+    PassthroughCacher,
+    copy_methods,
+    copyfile,
+    user_cache_root_dir,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -211,6 +230,73 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
     parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")
 
 
+def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
+    """Add search interaction partners subcommand parser."""
+    parser = subparsers.add_parser(
+        "interaction-partners",
+        help="Search for interaction partners of given UniProt accession",
+        description=dedent("""\
+            Search for interaction partners of given UniProt accession
+            in the Uniprot SPARQL endpoint and Complex Portal.
+        """),
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_acc",
+        type=str,
+        help="UniProt accession (for example P12345).",
+    )
+    parser.add_argument(
+        "--exclude",
+        type=str,
+        action="append",
+        help="UniProt accessions to exclude from the results. For example already known interaction partners.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
+    )
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
+def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
+    """Add search complexes subcommand parser."""
+    description = dedent("""\
+        Search for complexes in the Complex Portal.
+        https://www.ebi.ac.uk/complexportal/
+
+        The output CSV file has the following columns:
+
+        - query_protein: UniProt accession used as query
+        - complex_id: Complex Portal identifier
+        - complex_url: URL to the Complex Portal entry
+        - complex_title: Title of the complex
+        - members: Semicolon-separated list of UniProt accessions of complex members
+    """)
+    parser = subparsers.add_parser(
+        "complexes",
+        help="Search for complexes in the Complex Portal",
+        description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_accs",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV file with complex results. Use `-` for stdout.",
+    )
+    parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
 def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
     """Add retrieve pdbe subcommand parser."""
     parser = subparsers.add_parser(
@@ -234,6 +320,7 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)
 
 
 def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
@@ -264,6 +351,7 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)
 
 
 def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
@@ -283,22 +371,7 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
         help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
-
-
-def _add_copy_method_argument(parser: argparse.ArgumentParser):
-    """Add copy method argument to parser."""
-    default_copy_method = "symlink"
-    if os.name == "nt":
-        # On Windows you need developer mode or admin privileges to create symlinks
-        # so we default to copying files instead of symlinking
-        default_copy_method = "copy"
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default=default_copy_method,
-        help="How to copy files when no changes are needed to output file.",
-    )
+    _add_cacher_arguments(parser)
 
 
 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
@@ -331,7 +404,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
            In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
            Use `-` for stdout."""),
     )
-    _add_copy_method_argument(parser)
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -371,7 +444,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
            If not provided, will create a local cluster.
            If set to `sequential` will run tasks sequentially."""),
     )
-    _add_copy_method_argument(parser)
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -394,7 +467,6 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -403,6 +475,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
            In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
            Use `-` for stdout."""),
     )
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
@@ -429,7 +502,6 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -440,6 +512,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
            Use `-` for stdout.
            """),
     )
+    _add_copy_method_arguments(parser)
 
 
 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
@@ -458,6 +531,8 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     _add_search_emdb_parser(subsubparsers)
     _add_search_go_parser(subsubparsers)
     _add_search_taxonomy_parser(subsubparsers)
+    _add_search_interaction_partners_parser(subsubparsers)
+    _add_search_complexes_parser(subsubparsers)
 
 
 def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -505,6 +580,38 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
 
 
+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+        """),
+    )
+
+
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
+
+
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -636,14 +743,52 @@ def _handle_search_taxonomy(args):
     _write_taxonomy_csv(results, output_csv)
 
 
-def _handle_retrieve_pdbe(args):
+def _handle_search_interaction_partners(args: argparse.Namespace):
+    uniprot_acc: str = args.uniprot_acc
+    excludes: set[str] = set(args.exclude) if args.exclude else set()
+    limit: int = args.limit
+    timeout: int = args.timeout
+    output_csv: TextIOWrapper = args.output_csv
+
+    rprint(f"Searching for interaction partners of '{uniprot_acc}'")
+    results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+    _write_lines(output_csv, results.keys())
+
+
+def _handle_search_complexes(args: argparse.Namespace):
+    uniprot_accs = args.uniprot_accs
+    limit = args.limit
+    timeout = args.timeout
+    output_csv = args.output_csv
+
+    accs = _read_lines(uniprot_accs)
+    rprint(f"Finding complexes for {len(accs)} uniprot accessions")
+    results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+    _write_complexes_csv(results, output_csv)
+
+
+def _initialize_cacher(args: argparse.Namespace) -> Cacher:
+    if args.no_cache:
+        return PassthroughCacher()
+    return DirectoryCacher(
+        cache_dir=args.cache_dir,
+        copy_method=args.copy_method,
+    )
+
+
+def _handle_retrieve_pdbe(args: argparse.Namespace):
     pdbe_csv = args.pdbe_csv
     output_dir = args.output_dir
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)
 
     pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
     rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
-    result = asyncio.run(pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads))
+    result = asyncio.run(
+        pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
+    )
     rprint(f"Retrieved {len(result)} PDBe entries")
 
 
@@ -652,6 +797,7 @@ def _handle_retrieve_alphafold(args):
     what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)
 
     if what_formats is None:
         what_formats = {"summary", "cif"}
@@ -661,7 +807,9 @@ def _handle_retrieve_alphafold(args):
     af_ids = _read_column_from_csv(alphafold_csv, "af_id")
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
-    afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
+    afs = af_fetch(
+        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+    )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
 
@@ -669,10 +817,11 @@ def _handle_retrieve_alphafold(args):
 def _handle_retrieve_emdb(args):
     emdb_csv = args.emdb_csv
     output_dir = args.output_dir
+    cacher = _initialize_cacher(args)
 
     emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
     rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
-    result = asyncio.run(emdb_fetch(emdb_ids, output_dir))
+    result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
     rprint(f"Retrieved {len(result)} EMDB entries")
 
 
@@ -875,6 +1024,8 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
     ("search", "emdb"): _handle_search_emdb,
     ("search", "go"): _handle_search_go,
     ("search", "taxonomy"): _handle_search_taxonomy,
+    ("search", "interaction-partners"): _handle_search_interaction_partners,
+    ("search", "complexes"): _handle_search_complexes,
     ("retrieve", "pdbe"): _handle_retrieve_pdbe,
     ("retrieve", "alphafold"): _handle_retrieve_alphafold,
     ("retrieve", "emdb"): _handle_retrieve_emdb,
@@ -937,3 +1088,33 @@ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
 
 def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
     return {row[column] for row in _iter_csv_rows(file)}
+
+
+def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
+    """Write ComplexPortal information to a CSV file.
+
+    Args:
+        complexes: List of ComplexPortalEntry objects.
+        output_csv: TextIOWrapper to write the CSV data to.
+    """
+    writer = csv.writer(output_csv)
+    writer.writerow(
+        [
+            "query_protein",
+            "complex_id",
+            "complex_url",
+            "complex_title",
+            "members",
+        ]
+    )
+    for entry in complexes:
+        members_str = ";".join(sorted(entry.members))
+        writer.writerow(
+            [
+                entry.query_protein,
+                entry.complex_id,
+                entry.complex_url,
+                entry.complex_title,
+                members_str,
+            ]
+        )
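
The two new CLI handlers above are thin wrappers around the new search functions in protein_quest.uniprot. A hedged sketch of calling them directly from Python, based only on the signatures and attributes visible in this diff (the accession and the exact return types are assumptions):

from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes

# Mapping-like result keyed by partner UniProt accession (the handler iterates results.keys()).
partners = search4interaction_partners("P12345", excludes=set(), limit=10_000, timeout=1_800)
print(sorted(partners.keys()))

# List of ComplexPortalEntry objects with the fields that _write_complexes_csv writes out.
complexes = search4macromolecular_complexes(["P12345"], limit=100, timeout=1_800)
for entry in complexes:
    print(entry.complex_id, entry.complex_title, ";".join(sorted(entry.members)))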
@@ -13,6 +13,7 @@ type PositiveInt = int
 converter = make_converter()
 """cattrs converter to read JSON document or dict to Python objects."""
 converter.register_structure_hook(URL, lambda v, _: URL(v))
+converter.register_unstructure_hook(URL, lambda u: str(u))
 
 
 @converter.register_structure_hook
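
With the matching unstructure hook registered, the converter can now serialize URL fields back to plain strings as well as parse them. A small round-trip sketch under that assumption (the module path comes from the import in the alphafold hunks above):

from yarl import URL

from protein_quest.converter import converter

url = converter.structure("https://alphafold.ebi.ac.uk/", URL)          # existing structure hook: str -> URL
assert converter.unstructure(url) == "https://alphafold.ebi.ac.uk/"     # new unstructure hook: URL -> str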
protein_quest/emdb.py CHANGED
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path
 
-from protein_quest.utils import retrieve_files
+from protein_quest.utils import Cacher, retrieve_files
 
 
 def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
@@ -13,13 +13,16 @@ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
     return url, fn
 
 
-async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1) -> Mapping[str, Path]:
+async def fetch(
+    emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches volume files from the EMDB database.
 
     Args:
         emdb_ids: A list of EMDB IDs to fetch.
         save_dir: The directory to save the downloaded files.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.
 
     Returns:
         A mapping of EMDB IDs to their downloaded files.
@@ -30,5 +33,5 @@ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads:
 
     # TODO show progress of each item
     # TODO handle failed downloads, by skipping them instead of raising an error
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
     return id2paths
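
The EMDB fetcher gains the same optional cacher argument, passed straight through to retrieve_files; leaving it as None presumably keeps the previous behaviour. A hedged async usage sketch (the EMDB ID is a placeholder, and the DirectoryCacher settings are assumptions):

import asyncio
from pathlib import Path

from protein_quest.emdb import fetch as emdb_fetch
from protein_quest.utils import DirectoryCacher

cacher = DirectoryCacher(cache_dir=Path("~/.cache/protein-quest").expanduser(), copy_method="hardlink")
# Returns a mapping of EMDB ID to downloaded volume file, as documented in the docstring above.
id2path = asyncio.run(emdb_fetch(["EMD-1234"], Path("volumes"), cacher=cacher))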
@@ -32,6 +32,7 @@ Examples:
 
 """
 
+from collections.abc import Mapping
 from pathlib import Path
 from textwrap import dedent
 from typing import Annotated
@@ -48,7 +49,15 @@ from protein_quest.pdbe.fetch import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
 from protein_quest.ss import filter_file_on_secondary_structure
 from protein_quest.taxonomy import search_taxon
-from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+from protein_quest.uniprot import (
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
 
 mcp = FastMCP("protein-quest")
 
@@ -81,7 +90,18 @@ def search_pdb(
     return search4pdb(uniprot_accs, limit=limit)
 
 
-mcp.tool(pdbe_fetch, name="fetch_pdbe_structures")
+@mcp.tool
+async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch the PDBe structures for given PDB IDs.
+
+    Args:
+        pdb_ids: A set of PDB IDs.
+        save_dir: The directory to save the fetched files.
+
+    Returns:
+        A mapping of PDB ID to the path of the fetched structure file.
+    """
+    return await pdbe_fetch(pdb_ids, save_dir)
 
 
 @mcp.tool
@@ -137,6 +157,7 @@ def search_alphafolds(
 
 
 mcp.tool(search4emdb, name="search_emdb")
+mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes")
 
 
 @mcp.tool
@@ -154,7 +175,17 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     return alphafold_fetch(uniprot_accs, save_dir, what)
 
 
-mcp.tool(emdb_fetch, name="fetch_emdb_volumes")
+@mcp.tool
+async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch EMDB volumes for given EMDB IDs.
+
+    Args:
+        emdb_ids: A set of EMDB IDs.
+        save_dir: The directory to save the fetched files.
+    Returns:
+        A mapping of EMDB ID to the path of the fetched volume file.
+    """
+    return await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)
 
 
 @mcp.tool
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path
 
-from protein_quest.utils import retrieve_files, run_async
+from protein_quest.utils import Cacher, retrieve_files, run_async
 
 
 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -28,13 +28,16 @@ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
     return url, fn
 
 
-async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+async def fetch(
+    ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches mmCIF files from the PDBe database.
 
     Args:
         ids: A set of PDB IDs to fetch.
         save_dir: The directory to save the fetched mmCIF files to.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.
 
     Returns:
         A dict of id and paths to the downloaded mmCIF files.
@@ -47,7 +50,7 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int
     urls = list(id2urls.values())
     id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}
 
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files", cacher=cacher)
    return id2paths
 
 
protein_quest/ss.py CHANGED
@@ -111,6 +111,26 @@ class SecondaryStructureFilterQuery:
     ratio_min_sheet_residues: Ratio | None = None
     ratio_max_sheet_residues: Ratio | None = None
 
+    def is_actionable(self) -> bool:
+        """Check if the secondary structure query has any actionable filters.
+
+        Returns:
+            True if any of the filters are set, False otherwise.
+        """
+        return any(
+            field is not None
+            for field in [
+                self.abs_min_helix_residues,
+                self.abs_max_helix_residues,
+                self.abs_min_sheet_residues,
+                self.abs_max_sheet_residues,
+                self.ratio_min_helix_residues,
+                self.ratio_max_helix_residues,
+                self.ratio_min_sheet_residues,
+                self.ratio_max_sheet_residues,
+            ]
+        )
+
 
 def _check_range(min_val, max_val, label):
     if min_val is not None and max_val is not None and min_val >= max_val:
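
The new is_actionable method lets callers skip secondary-structure filtering when a query carries no constraints. A short sketch of the intended use, assuming SecondaryStructureFilterQuery is a dataclass whose optional fields (all defaulting to None, as shown above) can be passed as keyword arguments:

from protein_quest.ss import SecondaryStructureFilterQuery

empty = SecondaryStructureFilterQuery()
assert not empty.is_actionable()  # no filters set, nothing to do

helix_only = SecondaryStructureFilterQuery(abs_min_helix_residues=30)  # field name taken from the list above
assert helix_only.is_actionable()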