PyPI - protein-quest - Versions diffs - 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

protein-quest 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of protein-quest might be problematic. Click here for more details.

Files changed (16) hide show

protein_quest/__version__.py +1 -1
protein_quest/alphafold/confidence.py +2 -2
protein_quest/alphafold/fetch.py +28 -19
protein_quest/cli.py +133 -68
protein_quest/filters.py +2 -5
protein_quest/io.py +350 -0
protein_quest/mcp_server.py +8 -5
protein_quest/ss.py +3 -7
protein_quest/{pdbe/io.py → structure.py} +53 -126
protein_quest/utils.py +26 -2
{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/METADATA +12 -1
protein_quest-0.6.0.dist-info/RECORD +27 -0
protein_quest-0.5.1.dist-info/RECORD +0 -26
{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/WHEEL +0 -0
{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/entry_points.txt +0 -0
{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/licenses/LICENSE +0 -0

protein_quest/__version__.py CHANGED Viewed

@@ -1,2 +1,2 @@
-__version__ = "0.5.1"
+__version__ = "0.6.0"
 """The version of the package."""

protein_quest/alphafold/confidence.py CHANGED Viewed

@@ -8,7 +8,7 @@ from pathlib import Path
 import gemmi
 from protein_quest.converter import Percentage, PositiveInt, converter
-from protein_quest.pdbe.io import write_structure
+from protein_quest.io import read_structure, write_structure
 from protein_quest.ss import nr_of_residues_in_total
 from protein_quest.utils import CopyMethod, copyfile
@@ -127,7 +127,7 @@ def filter_file_on_residues(
         result with filtered_file property set to Path where filtered PDB file is saved.
             or None if structure was filtered out.
     """
-    structure = gemmi.read_structure(str(file))
+    structure = read_structure(file)
     residues = set(find_high_confidence_residues(structure, query.confidence))
     count = len(residues)
     if count < query.min_residues or count > query.max_residues:

protein_quest/alphafold/fetch.py CHANGED Viewed

@@ -125,15 +125,15 @@ async def fetch_summary(
     fn: AsyncPath | None = None
     if save_dir is not None:
         fn = AsyncPath(save_dir / f"{qualifier}.json")
+        if await fn.exists():
+            logger.debug(f"File {fn} already exists. Skipping download from {url}.")
+            raw_data = await fn.read_bytes()
+            return converter.loads(raw_data, list[EntrySummary])
         cached_file = await cacher.copy_from_cache(Path(fn))
         if cached_file is not None:
             logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
             raw_data = await AsyncPath(cached_file).read_bytes()
             return converter.loads(raw_data, list[EntrySummary])
-        if await fn.exists():
-            logger.debug(f"File {fn} already exists. Skipping download from {url}.")
-            raw_data = await fn.read_bytes()
-            return converter.loads(raw_data, list[EntrySummary])
     async with semaphore, session.get(url) as response:
         response.raise_for_status()
         raw_data = await response.content.read()
@@ -170,6 +170,7 @@ async def fetch_many_async(
     what: set[DownloadableFormat],
     max_parallel_downloads: int = 5,
     cacher: Cacher | None = None,
+    gzip_files: bool = False,
 ) -> AsyncGenerator[AlphaFoldEntry]:
     """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -180,6 +181,7 @@ async def fetch_many_async(
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
         cacher: A cacher to use for caching the fetched files. Only used if 'summary' is in the `what` set.
+        gzip_files: Whether to gzip the downloaded files.
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
@@ -193,7 +195,7 @@ async def fetch_many_async(
         )
     ]
-    files = files_to_download(what, summaries)
+    files = files_to_download(what, summaries, gzip_files)
     await retrieve_files(
         files,
@@ -201,36 +203,40 @@ async def fetch_many_async(
         desc="Downloading AlphaFold files",
         max_parallel_downloads=max_parallel_downloads,
         cacher=cacher,
+        gzip_files=gzip_files,
     )
+    gzext = ".gz" if gzip_files else ""
     for summary in summaries:
         yield AlphaFoldEntry(
             uniprot_acc=summary.uniprotAccession,
             summary=summary,
             summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
-            bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
-            cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
-            pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
-            pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
-            pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
+            bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
+            cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
+            pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
+            pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
+            pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
             am_annotations_file=(
-                save_dir / summary.amAnnotationsUrl.name
+                save_dir / (summary.amAnnotationsUrl.name + gzext)
                 if "amAnnotations" in what and summary.amAnnotationsUrl
                 else None
             ),
             am_annotations_hg19_file=(
-                save_dir / summary.amAnnotationsHg19Url.name
+                save_dir / (summary.amAnnotationsHg19Url.name + gzext)
                 if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
                 else None
             ),
             am_annotations_hg38_file=(
-                save_dir / summary.amAnnotationsHg38Url.name
+                save_dir / (summary.amAnnotationsHg38Url.name + gzext)
                 if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
                 else None
             ),
         )
-def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
+def files_to_download(
+    what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
+) -> set[tuple[URL, str]]:
     if not (set(what) <= downloadable_formats):
         msg = (
             f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -238,7 +244,7 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
         )
         raise ValueError(msg)
-    files: set[tuple[URL, str]] = set()
+    url_filename_pairs: set[tuple[URL, str]] = set()
     for summary in summaries:
         for fmt in what:
             if fmt == "summary":
@@ -248,9 +254,10 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
             if url is None:
                 logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
                 continue
-            file = (url, url.name)
-            files.add(file)
-    return files
+            fn = url.name + (".gz" if gzip_files else "")
+            url_filename_pair = (url, fn)
+            url_filename_pairs.add(url_filename_pair)
+    return url_filename_pairs
 def fetch_many(
@@ -259,6 +266,7 @@ def fetch_many(
     what: set[DownloadableFormat],
     max_parallel_downloads: int = 5,
     cacher: Cacher | None = None,
+    gzip_files: bool = False,
 ) -> list[AlphaFoldEntry]:
     """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
@@ -268,6 +276,7 @@ def fetch_many(
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
         cacher: A cacher to use for caching the fetched files. Only used if 'summary' is in the `what` set.
+        gzip_files: Whether to gzip the downloaded files.
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -277,7 +286,7 @@ def fetch_many(
         return [
             entry
             async for entry in fetch_many_async(
-                ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+                ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher, gzip_files=gzip_files
             )
         ]

protein_quest/cli.py CHANGED Viewed

@@ -28,8 +28,13 @@ from protein_quest.converter import converter
 from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
+from protein_quest.io import (
+    convert_to_cif_files,
+    glob_structure_files,
+    locate_structure_file,
+    valid_structure_file_extensions,
+)
 from protein_quest.pdbe import fetch as pdbe_fetch
-from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
 from protein_quest.uniprot import (
@@ -297,6 +302,38 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+        """),
+    )
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
 def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
     """Add retrieve pdbe subcommand parser."""
     parser = subparsers.add_parser(
@@ -345,6 +382,11 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
             Default is 'summary' and 'cif'."""),
     )
+    parser.add_argument(
+        "--gzip-files",
+        action="store_true",
+        help="Whether to gzip the downloaded files. Excludes summary files, they are always uncompressed.",
+    )
     parser.add_argument(
         "--max-parallel-downloads",
         type=int,
@@ -561,6 +603,33 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
     _add_filter_ss_parser(subsubparsers)
+def _add_convert_subcommands(subparsers: argparse._SubParsersAction):
+    """Add convert command."""
+    parser = subparsers.add_parser(
+        "convert", help="Convert structure files between formats", formatter_class=ArgumentDefaultsRichHelpFormatter
+    )
+    parser.add_argument(
+        "input_dir",
+        type=Path,
+        help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        help=dedent("""\
+            Directory to write converted structure files. If not given, files are written to `input_dir`.
+        """),
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        choices=("cif",),
+        default="cif",
+        help="Output format to convert to.",
+    )
+    _add_copy_method_arguments(parser)
 def _add_mcp_command(subparsers: argparse._SubParsersAction):
     """Add MCP command."""
@@ -580,38 +649,6 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
-def _add_copy_method_arguments(parser):
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default="hardlink",
-        help=dedent("""\
-            How to make target file be same file as source file.
-            By default uses hardlinks to save disk space.
-            Note that hardlinks only work within the same filesystem and are harder to track.
-            If you want to track cached files easily then use 'symlink'.
-            On Windows you need developer mode or admin privileges to create symlinks.
-        """),
-    )
-def _add_cacher_arguments(parser: argparse.ArgumentParser):
-    """Add cacher arguments to parser."""
-    parser.add_argument(
-        "--no-cache",
-        action="store_true",
-        help="Disable caching of files to central location.",
-    )
-    parser.add_argument(
-        "--cache-dir",
-        type=Path,
-        default=user_cache_root_dir(),
-        help="Directory to use as cache for files.",
-    )
-    _add_copy_method_arguments(parser)
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -624,27 +661,12 @@ def make_parser() -> argparse.ArgumentParser:
     _add_search_subcommands(subparsers)
     _add_retrieve_subcommands(subparsers)
     _add_filter_subcommands(subparsers)
+    _add_convert_subcommands(subparsers)
     _add_mcp_command(subparsers)
     return parser
-def main():
-    """Main entry point for the CLI."""
-    parser = make_parser()
-    args = parser.parse_args()
-    logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
-    # Dispatch table to reduce complexity
-    cmd = args.command
-    sub = getattr(args, f"{cmd}_cmd", None)
-    handler = HANDLERS.get((cmd, sub))
-    if handler is None:
-        msg = f"Unknown command: {cmd} {sub}"
-        raise SystemExit(msg)
-    handler(args)
 def _handle_search_uniprot(args):
     taxon_id = args.taxon_id
     reviewed = args.reviewed
@@ -798,6 +820,7 @@ def _handle_retrieve_alphafold(args):
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
     cacher = _initialize_cacher(args)
+    gzip_files = args.gzip_files
     if what_formats is None:
         what_formats = {"summary", "cif"}
@@ -808,7 +831,12 @@ def _handle_retrieve_alphafold(args):
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
     afs = af_fetch(
-        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+        af_ids,
+        download_dir,
+        what=validated_what,
+        max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
+        gzip_files=gzip_files,
     )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
@@ -1017,24 +1045,24 @@ def _handle_mcp(args):
         mcp.run(transport=args.transport, host=args.host, port=args.port)
-HANDLERS: dict[tuple[str, str | None], Callable] = {
-    ("search", "uniprot"): _handle_search_uniprot,
-    ("search", "pdbe"): _handle_search_pdbe,
-    ("search", "alphafold"): _handle_search_alphafold,
-    ("search", "emdb"): _handle_search_emdb,
-    ("search", "go"): _handle_search_go,
-    ("search", "taxonomy"): _handle_search_taxonomy,
-    ("search", "interaction-partners"): _handle_search_interaction_partners,
-    ("search", "complexes"): _handle_search_complexes,
-    ("retrieve", "pdbe"): _handle_retrieve_pdbe,
-    ("retrieve", "alphafold"): _handle_retrieve_alphafold,
-    ("retrieve", "emdb"): _handle_retrieve_emdb,
-    ("filter", "confidence"): _handle_filter_confidence,
-    ("filter", "chain"): _handle_filter_chain,
-    ("filter", "residue"): _handle_filter_residue,
-    ("filter", "secondary-structure"): _handle_filter_ss,
-    ("mcp", None): _handle_mcp,
-}
+def _handle_convert(args):
+    input_dir = structure(args.input_dir, Path)
+    output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+    input_files = sorted(glob_structure_files(input_dir))
+    rprint(f"Converting {len(input_files)} files in {input_dir} directory to cif format.")
+    for _ in tqdm(
+        convert_to_cif_files(
+            input_files,
+            output_dir,
+            copy_method=copy_method,
+        ),
+        total=len(input_files),
+        unit="file",
+    ):
+        pass
+    rprint(f"Converted {len(input_files)} files into {output_dir}.")
 def _read_lines(file: TextIOWrapper) -> list[str]:
@@ -1118,3 +1146,40 @@ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIO
                 members_str,
             ]
         )
+HANDLERS: dict[tuple[str, str | None], Callable] = {
+    ("search", "uniprot"): _handle_search_uniprot,
+    ("search", "pdbe"): _handle_search_pdbe,
+    ("search", "alphafold"): _handle_search_alphafold,
+    ("search", "emdb"): _handle_search_emdb,
+    ("search", "go"): _handle_search_go,
+    ("search", "taxonomy"): _handle_search_taxonomy,
+    ("search", "interaction-partners"): _handle_search_interaction_partners,
+    ("search", "complexes"): _handle_search_complexes,
+    ("retrieve", "pdbe"): _handle_retrieve_pdbe,
+    ("retrieve", "alphafold"): _handle_retrieve_alphafold,
+    ("retrieve", "emdb"): _handle_retrieve_emdb,
+    ("filter", "confidence"): _handle_filter_confidence,
+    ("filter", "chain"): _handle_filter_chain,
+    ("filter", "residue"): _handle_filter_residue,
+    ("filter", "secondary-structure"): _handle_filter_ss,
+    ("mcp", None): _handle_mcp,
+    ("convert", None): _handle_convert,
+}
+def main():
+    """Main entry point for the CLI."""
+    parser = make_parser()
+    args = parser.parse_args()
+    logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
+    # Dispatch table to reduce complexity
+    cmd = args.command
+    sub = getattr(args, f"{cmd}_cmd", None)
+    handler = HANDLERS.get((cmd, sub))
+    if handler is None:
+        msg = f"Unknown command: {cmd} {sub}"
+        raise SystemExit(msg)
+    handler(args)

protein_quest/filters.py CHANGED Viewed

@@ -11,10 +11,7 @@ from distributed.deploy.cluster import Cluster
 from tqdm.auto import tqdm
 from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
-from protein_quest.pdbe.io import (
-    nr_residues_in_chain,
-    write_single_chain_pdb_file,
-)
+from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
 from protein_quest.utils import CopyMethod, copyfile
 logger = logging.getLogger(__name__)
@@ -38,7 +35,7 @@ def filter_file_on_chain(
     input_file, chain_id = file_and_chain
     logger.debug("Filtering %s on chain %s", input_file, chain_id)
     try:
-        output_file = write_single_chain_pdb_file(
+        output_file = write_single_chain_structure_file(
             input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
         )
         return ChainFilterStatistics(

protein_quest/io.py ADDED Viewed

@@ -0,0 +1,350 @@
+"""Module for structure file input/output."""
+import gzip
+import logging
+import shutil
+import tempfile
+from collections.abc import Generator, Iterable
+from io import StringIO
+from pathlib import Path
+from typing import Literal, get_args
+from urllib.request import urlopen
+import gemmi
+from mmcif.api.DictionaryApi import DictionaryApi
+from mmcif.io.BinaryCifReader import BinaryCifReader
+from mmcif.io.BinaryCifWriter import BinaryCifWriter
+from mmcif.io.PdbxReader import PdbxReader
+from mmcif.io.PdbxWriter import PdbxWriter
+from protein_quest.utils import CopyMethod, copyfile, user_cache_root_dir
+logger = logging.getLogger(__name__)
+# TODO remove once v0.7.4 of gemmi is released,
+# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
+# Swallow gemmi leaked function warnings
+gemmi.set_leak_warnings(False)
+StructureFileExtensions = Literal[".pdb", ".pdb.gz", ".ent", ".ent.gz", ".cif", ".cif.gz", ".bcif", ".bcif.gz"]
+"""Type of supported structure file extensions."""
+valid_structure_file_extensions: set[str] = set(get_args(StructureFileExtensions))
+"""Set of valid structure file extensions."""
+def write_structure(structure: gemmi.Structure, path: Path):
+    """Write a gemmi structure to a file.
+    Args:
+        structure: The gemmi structure to write.
+        path: The file path to write the structure to.
+            The format depends on the file extension.
+            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
+            for supported extensions.
+    Raises:
+        ValueError: If the file extension is not supported.
+    """
+    if path.name.endswith(".pdb") or path.name.endswith(".ent"):
+        body: str = structure.make_pdb_string()
+        path.write_text(body)
+    elif path.name.endswith(".pdb.gz") or path.name.endswith(".ent.gz"):
+        body: str = structure.make_pdb_string()
+        with gzip.open(path, "wt") as f:
+            f.write(body)
+    elif path.name.endswith(".cif"):
+        # do not write chem_comp so it is viewable by molstar
+        # see https://github.com/project-gemmi/gemmi/discussions/362
+        doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
+        doc.write_file(str(path))
+    elif path.name.endswith(".cif.gz"):
+        doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
+        cif_str = doc.as_string()
+        with gzip.open(path, "wt") as f:
+            f.write(cif_str)
+    elif path.name.endswith(".bcif"):
+        structure2bcif(structure, path)
+    elif path.name.endswith(".bcif.gz"):
+        structure2bcifgz(structure, path)
+    else:
+        msg = f"Unsupported file extension in {path.name}. Supported extensions are: {valid_structure_file_extensions}"
+        raise ValueError(msg)
+def read_structure(file: Path) -> gemmi.Structure:
+    """Read a structure from a file.
+    Args:
+        file: Path to the input structure file.
+            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
+            for supported extensions.
+    Returns:
+        A gemmi Structure object representing the structure in the file.
+    """
+    if file.name.endswith(".bcif"):
+        return bcif2structure(file)
+    if file.name.endswith(".bcif.gz"):
+        return bcifgz2structure(file)
+    return gemmi.read_structure(str(file))
+def bcif2cif(bcif_file: Path) -> str:
+    """Convert a binary CIF (bcif) file to a CIF string.
+    Args:
+        bcif_file: Path to the binary CIF file.
+    Returns:
+        A string containing the CIF representation of the structure.
+    """
+    reader = BinaryCifReader()
+    container = reader.deserialize(str(bcif_file))
+    capture = StringIO()
+    writer = PdbxWriter(capture)
+    writer.write(container)
+    return capture.getvalue()
+def bcifgz2structure(bcif_gz_file: Path) -> gemmi.Structure:
+    """Read a binary CIF (bcif) gzipped file and return a gemmi Structure object.
+    This is slower than other formats because gemmi does not support reading bcif files directly.
+    So we first gunzip the file to a temporary location, convert it to a cif string using mmcif package,
+    and then read the cif string using gemmi.
+    Args:
+        bcif_gz_file: Path to the binary CIF gzipped file.
+    Returns:
+        A gemmi Structure object representing the structure in the bcif.gz file.
+    """
+    with tempfile.NamedTemporaryFile(suffix=".bcif", delete=True) as tmp_bcif:
+        tmp_path = Path(tmp_bcif.name)
+        gunzip_file(bcif_gz_file, output_file=tmp_path, keep_original=True)
+        return bcif2structure(tmp_path)
+def bcif2structure(bcif_file: Path) -> gemmi.Structure:
+    """Read a binary CIF (bcif) file and return a gemmi Structure object.
+    This is slower than other formats because gemmi does not support reading bcif files directly.
+    So we convert it to a cif string first using mmcif package and then read the cif string using gemmi.
+    Args:
+        bcif_file: Path to the binary CIF file.
+    Returns:
+        A gemmi Structure object representing the structure in the bcif file.
+    """
+    cif_content = bcif2cif(bcif_file)
+    doc = gemmi.cif.read_string(cif_content)
+    block = doc.sole_block()
+    return gemmi.make_structure_from_block(block)
+def _initialize_dictionary_api(containers) -> DictionaryApi:
+    dict_local = user_cache_root_dir() / "mmcif_pdbx_v5_next.dic"
+    if not dict_local.exists():
+        dict_url = "https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v5_next.dic"
+        logger.info("Downloading mmcif dictionary from %s to %s", dict_url, dict_local)
+        dict_local.parent.mkdir(parents=True, exist_ok=True)
+        with dict_local.open("wb") as f, urlopen(dict_url) as response:  # noqa: S310 url is hardcoded and https
+            f.write(response.read())
+    return DictionaryApi(containerList=containers, consolidate=True)
+def structure2bcif(structure: gemmi.Structure, bcif_file: Path):
+    """Write a gemmi Structure object to a binary CIF (bcif) file.
+    This is slower than other formats because gemmi does not support writing bcif files directly.
+    So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
+    Args:
+        structure: The gemmi Structure object to write.
+        bcif_file: Path to the output binary CIF file.
+    """
+    doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
+    containers = []
+    with StringIO(doc.as_string()) as sio:
+        reader = PdbxReader(sio)
+        reader.read(containers)
+    dict_api = _initialize_dictionary_api(containers)
+    writer = BinaryCifWriter(dictionaryApi=dict_api)
+    writer.serialize(str(bcif_file), containers)
+def gunzip_file(gz_file: Path, output_file: Path | None = None, keep_original: bool = True) -> Path:
+    """Unzip a .gz file.
+    Args:
+        gz_file: Path to the .gz file.
+        output_file: Optional path to the output unzipped file. If None, the .gz suffix is removed from gz_file.
+        keep_original: Whether to keep the original .gz file. Default is True.
+    Returns:
+        Path to the unzipped file.
+    Raises:
+        ValueError: If output_file is None and gz_file does not end with .gz.
+    """
+    if output_file is None and not gz_file.name.endswith(".gz"):
+        msg = f"If output_file is not provided, {gz_file} must end with .gz"
+        raise ValueError(msg)
+    out_file = output_file or gz_file.with_suffix("")
+    with gzip.open(gz_file, "rb") as f_in, out_file.open("wb") as f_out:
+        shutil.copyfileobj(f_in, f_out)
+    if not keep_original:
+        gz_file.unlink()
+    return out_file
+def structure2bcifgz(structure: gemmi.Structure, bcif_gz_file: Path):
+    """Write a gemmi Structure object to a binary CIF gzipped (bcif.gz) file.
+    This is slower than other formats because gemmi does not support writing bcif files directly.
+    So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
+    Finally, we gzip the bcif file.
+    Args:
+        structure: The gemmi Structure object to write.
+        bcif_gz_file: Path to the output binary CIF gzipped file.
+    """
+    with tempfile.NamedTemporaryFile(suffix=".bcif", delete=True) as tmp_bcif:
+        tmp_path = Path(tmp_bcif.name)
+        structure2bcif(structure, tmp_path)
+        with tmp_path.open("rb") as f_in, gzip.open(bcif_gz_file, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+def convert_to_cif_files(
+    input_files: Iterable[Path], output_dir: Path, copy_method: CopyMethod
+) -> Generator[tuple[Path, Path]]:
+    """Convert structure files to .cif format.
+    Args:
+        input_files: Iterable of structure files to convert.
+        output_dir: Directory to save the converted .cif files.
+        copy_method: How to copy when no changes are needed to output file.
+    Yields:
+        A tuple of the input file and the output file.
+    """
+    for input_file in input_files:
+        output_file = convert_to_cif_file(input_file, output_dir, copy_method)
+        yield input_file, output_file
+def convert_to_cif_file(input_file: Path, output_dir: Path, copy_method: CopyMethod) -> Path:
+    """Convert a single structure file to .cif format.
+    Args:
+        input_file: The structure file to convert.
+            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
+            for supported extensions.
+        output_dir: Directory to save the converted .cif file.
+        copy_method: How to copy when no changes are needed to output file.
+    Returns:
+        Path to the converted .cif file.
+    """
+    name, extension = split_name_and_extension(input_file.name)
+    output_file = output_dir / f"{name}.cif"
+    if output_file.exists():
+        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
+    elif extension in {".pdb", ".pdb.gz", ".ent", ".ent.gz"}:
+        structure = read_structure(input_file)
+        write_structure(structure, output_file)
+    elif extension == ".cif":
+        logger.info("File %s is already in .cif format, copying to %s", input_file, output_dir)
+        copyfile(input_file, output_file, copy_method)
+    elif extension == ".cif.gz":
+        gunzip_file(input_file, output_file=output_file, keep_original=True)
+    elif extension == ".bcif":
+        with output_file.open("w") as f:
+            f.write(bcif2cif(input_file))
+    else:
+        msg = (
+            f"Unsupported file extension {extension} in {input_file}. "
+            f"Supported extensions are {valid_structure_file_extensions}."
+        )
+        raise ValueError(msg)
+    return output_file
+def split_name_and_extension(name: str) -> tuple[str, str]:
+    """Split a filename into its name and extension.
+    `.gz` is considered part of the extension if present.
+    Examples:
+        Some example usages.
+        >>> from protein_quest.io import split_name_and_extension
+        >>> split_name_and_extension("1234.pdb")
+        ('1234', '.pdb')
+        >>> split_name_and_extension("1234.pdb.gz")
+        ('1234', '.pdb.gz')
+    Args:
+        name: The filename to split.
+    Returns:
+        A tuple containing the name and the extension.
+    """
+    ext = ""
+    if name.endswith(".gz"):
+        ext = ".gz"
+        name = name.removesuffix(".gz")
+    i = name.rfind(".")
+    if 0 < i < len(name) - 1:
+        ext = name[i:] + ext
+        name = name[:i]
+    return name, ext
+def locate_structure_file(root: Path, pdb_id: str) -> Path:
+    """Locate a structure file for a given PDB ID in the specified directory.
+    Uses [StructureFileExtensions][protein_quest.io.StructureFileExtensions] as potential extensions.
+    Also tries different casing of the PDB ID.
+    Args:
+        root: The root directory to search in.
+        pdb_id: The PDB ID to locate.
+    Returns:
+        The path to the located structure file.
+    Raises:
+        FileNotFoundError: If no structure file is found for the given PDB ID.
+    """
+    for ext in valid_structure_file_extensions:
+        candidates = (
+            root / f"{pdb_id}{ext}",
+            root / f"{pdb_id.lower()}{ext}",
+            root / f"{pdb_id.upper()}{ext}",
+            root / f"pdb{pdb_id.lower()}{ext}",
+        )
+        for candidate in candidates:
+            if candidate.exists():
+                return candidate
+    msg = f"No structure file found for {pdb_id} in {root}"
+    raise FileNotFoundError(msg)
+def glob_structure_files(input_dir: Path) -> Generator[Path]:
+    """Glob for structure files in a directory.
+    Uses [StructureFileExtensions][protein_quest.io.StructureFileExtensions] as valid extensions.
+    Does not search recursively.
+    Args:
+        input_dir: The input directory to search for structure files.
+    Yields:
+        Paths to the found structure files.
+    """
+    for ext in valid_structure_file_extensions:
+        yield from input_dir.glob(f"*{ext}")

protein_quest/mcp_server.py CHANGED Viewed

@@ -45,9 +45,10 @@ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
 from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
 from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.go import search_gene_ontology_term
+from protein_quest.io import convert_to_cif_file, glob_structure_files
 from protein_quest.pdbe.fetch import fetch as pdbe_fetch
-from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
 from protein_quest.ss import filter_file_on_secondary_structure
+from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
 from protein_quest.taxonomy import search_taxon
 from protein_quest.uniprot import (
     PdbResult,
@@ -112,18 +113,18 @@ def extract_single_chain_from_structure(
     out_chain: str = "A",
 ) -> Path:
     """
-    Extract a single chain from a mmCIF/pdb file and write to a new file.
+    Extract a single chain from a structure (mmCIF or pdb) file and write to a new file.
     Args:
-        input_file: Path to the input mmCIF/pdb file.
+        input_file: Path to the input structure (mmCIF or pdb) file.
         chain2keep: The chain to keep.
         output_dir: Directory to save the output file.
         out_chain: The chain identifier for the output file.
     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output structure (mmCIF or pdb) file
     """
-    return write_single_chain_pdb_file(input_file, chain2keep, output_dir, out_chain)
+    return write_single_chain_structure_file(input_file, chain2keep, output_dir, out_chain)
 @mcp.tool
@@ -199,6 +200,8 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
 mcp.tool(filter_file_on_secondary_structure)
+mcp.tool(convert_to_cif_file)
 @mcp.prompt
 def candidate_structures(

protein_quest/ss.py CHANGED Viewed

@@ -5,17 +5,13 @@ from collections.abc import Generator, Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from gemmi import Structure, read_structure, set_leak_warnings
+from gemmi import Structure
 from protein_quest.converter import PositiveInt, Ratio, converter
+from protein_quest.io import read_structure
 logger = logging.getLogger(__name__)
-# TODO remove once v0.7.4 of gemmi is released,
-# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
-# Swallow gemmi leaked function warnings
-set_leak_warnings(False)
 # TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
 # https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
 # gemmi executable is in https://pypi.org/project/gemmi-program/
@@ -261,7 +257,7 @@ def filter_file_on_secondary_structure(
     Returns:
         Filtering statistics and whether file passed.
     """
-    structure = read_structure(str(file_path))
+    structure = read_structure(file_path)
     return filter_on_secondary_structure(structure, query)

protein_quest/{pdbe/io.py → structure.py} RENAMED Viewed

@@ -1,51 +1,29 @@
-"""Module for structure file input/output."""
+"""Module for querying and modifying [gemmi structures][gemmi.Structure]."""
-import gzip
 import logging
-from collections.abc import Generator, Iterable
+from collections.abc import Iterable
 from datetime import UTC, datetime
 from pathlib import Path
 import gemmi
 from protein_quest.__version__ import __version__
+from protein_quest.io import read_structure, split_name_and_extension, write_structure
 from protein_quest.utils import CopyMethod, copyfile
 logger = logging.getLogger(__name__)
-# TODO remove once v0.7.4 of gemmi is released,
-# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
-# Swallow gemmi leaked function warnings
-gemmi.set_leak_warnings(False)
-def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
-    """Returns the number of residues in a specific chain from a mmCIF/pdb file.
+def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
+    """Find a chain in a model.
     Args:
-        file: Path to the input mmCIF/pdb file.
-        chain: Chain to count residues of.
+        model: The gemmi model to search in.
+        wanted_chain: The chain identifier to search for.
     Returns:
-        The number of residues in the specified chain.
+        The found chain or None if not found.
     """
-    structure = gemmi.read_structure(str(file))
-    gchain = find_chain_in_structure(structure, chain)
-    if gchain is None:
-        logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
-        return 0
-    return len(gchain)
-def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
-    for model in structure:
-        chain = find_chain_in_model(model, wanted_chain)
-        if chain is not None:
-            return chain
-    return None
-def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
     chain = model.find_chain(wanted_chain)
     if chain is None:
         # For chain A in 4v92 the find_chain method returns None,
@@ -57,106 +35,39 @@ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain |
     return chain
-def write_structure(structure: gemmi.Structure, path: Path):
-    """Write a gemmi structure to a file.
-    Args:
-        structure: The gemmi structure to write.
-        path: The file path to write the structure to.
-            The format depends on the file extension.
-            Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.
-    Raises:
-        ValueError: If the file extension is not supported.
-    """
-    if path.name.endswith(".pdb"):
-        body: str = structure.make_pdb_string()
-        path.write_text(body)
-    elif path.name.endswith(".pdb.gz"):
-        body: str = structure.make_pdb_string()
-        with gzip.open(path, "wt") as f:
-            f.write(body)
-    elif path.name.endswith(".cif"):
-        # do not write chem_comp so it is viewable by molstar
-        # see https://github.com/project-gemmi/gemmi/discussions/362
-        doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
-        doc.write_file(str(path))
-    elif path.name.endswith(".cif.gz"):
-        doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
-        cif_str = doc.as_string()
-        with gzip.open(path, "wt") as f:
-            f.write(cif_str)
-    else:
-        msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
-        raise ValueError(msg)
-def _split_name_and_extension(name: str) -> tuple[str, str]:
-    # 1234.pdb -> (1234, .pdb)
-    # 1234.pdb.gz -> (1234, .pdb.gz)
-    # 1234.cif -> (1234, .cif)
-    # 1234.cif.gz -> (1234, .cif.gz)
-    if name.endswith(".pdb.gz"):
-        return name.replace(".pdb.gz", ""), ".pdb.gz"
-    if name.endswith(".cif.gz"):
-        return name.replace(".cif.gz", ""), ".cif.gz"
-    if name.endswith(".pdb"):
-        return name.replace(".pdb", ""), ".pdb"
-    if name.endswith(".cif"):
-        return name.replace(".cif", ""), ".cif"
-    msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
-    raise ValueError(msg)
-def locate_structure_file(root: Path, pdb_id: str) -> Path:
-    """Locate a structure file for a given PDB ID in the specified directory.
+def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
+    """Find a chain in a structure.
     Args:
-        root: The root directory to search in.
-        pdb_id: The PDB ID to locate.
+        structure: The gemmi structure to search in.
+        wanted_chain: The chain identifier to search for.
     Returns:
-        The path to the located structure file.
-    Raises:
-        FileNotFoundError: If no structure file is found for the given PDB ID.
+        The found chain or None if not found.
     """
-    exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb", ".ent", ".ent.gz"]
-    for ext in exts:
-        candidates = (
-            root / f"{pdb_id}{ext}",
-            root / f"{pdb_id.lower()}{ext}",
-            root / f"{pdb_id.upper()}{ext}",
-            root / f"pdb{pdb_id.lower()}{ext}",
-        )
-        for candidate in candidates:
-            if candidate.exists():
-                return candidate
-    msg = f"No structure file found for {pdb_id} in {root}"
-    raise FileNotFoundError(msg)
+    for model in structure:
+        chain = find_chain_in_model(model, wanted_chain)
+        if chain is not None:
+            return chain
+    return None
-def glob_structure_files(input_dir: Path) -> Generator[Path]:
-    """Glob for structure files in a directory.
+def nr_residues_in_chain(file: Path, chain: str = "A") -> int:
+    """Returns the number of residues in a specific chain from a structure file.
     Args:
-        input_dir: The input directory to search for structure files.
+        file: Path to the input structure file.
+        chain: Chain to count residues of.
-    Yields:
-        Paths to the found structure files.
+    Returns:
+        The number of residues in the specified chain.
     """
-    for ext in [".cif.gz", ".cif", ".pdb.gz", ".pdb"]:
-        yield from input_dir.glob(f"*{ext}")
-class ChainNotFoundError(IndexError):
-    """Exception raised when a chain is not found in a structure."""
-    def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
-        super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
-        self.chain_id = chain
-        self.file = file
+    structure = read_structure(file)
+    gchain = find_chain_in_structure(structure, chain)
+    if gchain is None:
+        logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
+        return 0
+    return len(gchain)
 def _dedup_helices(structure: gemmi.Structure):
@@ -198,18 +109,34 @@ def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain:
 def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
-    """Get a list of chains in a structure."""
+    """Get the set of chains in a structure.
+    Args:
+        structure: The gemmi structure to get chains from.
+    Returns:
+        A set of chains in the structure.
+    """
     return {c for model in structure for c in model}
-def write_single_chain_pdb_file(
+class ChainNotFoundError(IndexError):
+    """Exception raised when a chain is not found in a structure."""
+    def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
+        super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
+        self.chain_id = chain
+        self.file = file
+def write_single_chain_structure_file(
     input_file: Path,
     chain2keep: str,
     output_dir: Path,
     out_chain: str = "A",
     copy_method: CopyMethod = "copy",
 ) -> Path:
-    """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.
+    """Write a single chain from a structure file to a new structure file.
     Also
@@ -226,14 +153,14 @@ def write_single_chain_pdb_file(
     ```
     Args:
-        input_file: Path to the input mmCIF/pdb file.
+        input_file: Path to the input structure file.
         chain2keep: The chain to keep.
         output_dir: Directory to save the output file.
         out_chain: The chain identifier for the output file.
         copy_method: How to copy when no changes are needed to output file.
     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output structure file
     Raises:
         FileNotFoundError: If the input file does not exist.
@@ -241,7 +168,7 @@ def write_single_chain_pdb_file(
     """
     logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
-    structure = gemmi.read_structure(str(input_file))
+    structure = read_structure(input_file)
     structure.setup_entities()
     chain = find_chain_in_structure(structure, chain2keep)
@@ -249,7 +176,7 @@ def write_single_chain_pdb_file(
     if chain is None:
         raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
     chain_name = chain.name
-    name, extension = _split_name_and_extension(input_file.name)
+    name, extension = split_name_and_extension(input_file.name)
     output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
     if output_file.exists():

protein_quest/utils.py CHANGED Viewed

@@ -265,6 +265,7 @@ async def retrieve_files(
     desc: str = "Downloading files",
     cacher: Cacher | None = None,
     chunk_size: int = 524288,  # 512 KiB
+    gzip_files: bool = False,
 ) -> list[Path]:
     """Retrieve files from a list of URLs and save them to a directory.
@@ -277,6 +278,7 @@ async def retrieve_files(
         desc: Description for the progress bar.
         cacher: An optional cacher to use for caching files.
         chunk_size: The size of each chunk to read from the response.
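+        gzip_files: Whether to save the downloaded files gzip-compressed.
+            The gzip-encoded response of the server is stored as-is, so the server must send gzip-encoded content.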
     Returns:
         A list of paths to the downloaded files.
@@ -292,6 +294,7 @@ async def retrieve_files(
                 semaphore=semaphore,
                 cacher=cacher,
                 chunk_size=chunk_size,
+                gzip_files=gzip_files,
             )
             for url, filename in urls
         ]
@@ -299,6 +302,10 @@ async def retrieve_files(
         return files
+class InvalidContentEncodingError(aiohttp.ClientResponseError):
+    """Content encoding is invalid."""
 async def _retrieve_file(
     session: RetryClient,
     url: URL | str,
@@ -306,6 +313,7 @@ async def _retrieve_file(
     semaphore: asyncio.Semaphore,
     cacher: Cacher | None = None,
     chunk_size: int = 524288,  # 512 KiB
+    gzip_files: bool = False,
 ) -> Path:
     """Retrieve a single file from a URL and save it to a specified path.
@@ -316,6 +324,7 @@ async def _retrieve_file(
         semaphore: A semaphore to limit the number of concurrent downloads.
         cacher: An optional cacher to use for caching files.
         chunk_size: The size of each chunk to read from the response.
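+        gzip_files: Whether to save the downloaded file gzip-compressed.
+            The gzip-encoded response of the server is stored as-is;
+            an InvalidContentEncodingError is raised if the server does not send gzip-encoded content.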
     Returns:
         The path to the saved file.
@@ -330,12 +339,27 @@ async def _retrieve_file(
         logger.debug(f"File {save_path} was copied from cache {cached_file}. Skipping download from {url}.")
         return save_path
+    # The AlphaFold server and many other web servers can return gzipped responses.
+    # When we want to save as *.gz, we read the raw (still compressed) stream;
+    # otherwise aiohttp decompresses the response automatically for us.
+    auto_decompress = not gzip_files
+    headers = {"Accept-Encoding": "gzip"}
     async with (
         semaphore,
-        session.get(url) as resp,
+        session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
     ):
         resp.raise_for_status()
-        await cacher.write_iter(save_path, resp.content.iter_chunked(chunk_size))
+        if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
+            msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
+            raise InvalidContentEncodingError(
+                request_info=resp.request_info,
+                history=resp.history,
+                status=415,
+                message=msg,
+                headers=resp.headers,
+            )
+        iterator = resp.content.iter_chunked(chunk_size)
+        await cacher.write_iter(save_path, iterator)
     return save_path

{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.5.1
+Version: 0.6.0
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -17,6 +17,7 @@ Requires-Dist: cattrs[orjson]>=24.1.3
 Requires-Dist: dask>=2025.5.1
 Requires-Dist: distributed>=2025.5.1
 Requires-Dist: gemmi>=0.7.3
+Requires-Dist: mmcif>=0.92.0
 Requires-Dist: platformdirs>=4.3.8
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: rich-argparse>=1.7.1
@@ -71,6 +72,7 @@ graph TB;
     fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
     confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
     residuefilter --> |mmcif_files| ssfilter
+    ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
     classDef dashedBorder stroke-dasharray: 5 5;
     goterm:::dashedBorder
     taxonomy:::dashedBorder
@@ -78,6 +80,7 @@ graph TB;
     fetchemdb:::dashedBorder
     searchintactionpartners:::dashedBorder
     searchcomplexes:::dashedBorder
+    convert2cif:::dashedBorder
 ```
 (Dotted nodes and edges are side-quests.)
@@ -242,6 +245,14 @@ query_protein,complex_id,complex_url,complex_title,members
 Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
 ```
+### Convert structure files to .cif format
+Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
+```shell
+protein-quest convert --output-dir ./filtered-cif ./filtered-ss
+```
 ##  Model Context Protocol (MCP) server
 Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.

protein_quest-0.6.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,27 @@
+protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/__version__.py,sha256=z_nR_Ti0YfIwFSKDD18DIrz_r3zxWQ8EGCNr2XUWkY0,56
+protein_quest/cli.py,sha256=pWwMIzWBrtqhZbvTIkvd1XhA5u9J-WAAg7A3hJZGtlk,46201
+protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
+protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
+protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
+protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
+protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
+protein_quest/mcp_server.py,sha256=rQv2srhF3_SYYK1TD3htIyxNiunU7a8FDC7CYT_oJFE,8269
+protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
+protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
+protein_quest/structure.py,sha256=1FTKN0mYKTwZHlyIB4ORSAgSHFKK-UAK7T-qoFo1vyI,7162
+protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
+protein_quest/uniprot.py,sha256=92G5YiJAJwUBKJQHPrM6DZlaLe-XG4qBg0zy0BDGFYY,24354
+protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
+protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
+protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
+protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
+protein_quest/alphafold/fetch.py,sha256=n5SlqbQfU1PE4X8saV4O1nCrKRn3Q2UcMlrNw5-163w,12801
+protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
+protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
+protein_quest-0.6.0.dist-info/METADATA,sha256=8rX0ixi4Xl516LkxOlOKKRe364nKIjP7mKn67xuOcDA,9623
+protein_quest-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+protein_quest-0.6.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+protein_quest-0.6.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+protein_quest-0.6.0.dist-info/RECORD,,

protein_quest-0.5.1.dist-info/RECORD DELETED Viewed

@@ -1,26 +0,0 @@
-protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-protein_quest/__version__.py,sha256=iRjDp09jO2JFmZdsWS3ikyYYQ8S33AzhMdrr00gEG9g,56
-protein_quest/cli.py,sha256=xiXt_2l3MxbTbmxm2sz0w8_OdJr8gz_B68GBVv5wHjE,44182
-protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
-protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
-protein_quest/filters.py,sha256=-gasSXR4g5SzYSYbkfcDwR-tm2KCAhCMdpIVJrUPR1w,5224
-protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
-protein_quest/mcp_server.py,sha256=PCXxcU3GElKg2sjMlxbsM63OiFxg9AtmfKwBJ1_0AQE,8130
-protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
-protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-protein_quest/ss.py,sha256=qOr0aMycNAtZmXXvhCN-KZH3Qp4EejnBcE6fsFgCrmY,10343
-protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
-protein_quest/uniprot.py,sha256=92G5YiJAJwUBKJQHPrM6DZlaLe-XG4qBg0zy0BDGFYY,24354
-protein_quest/utils.py,sha256=2lQ7jPHWtDySBTYnoL9VTKl5XUgQVYgp9Prb7qEnjtQ,17982
-protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
-protein_quest/alphafold/confidence.py,sha256=pYIuwYdkuPuHLagcX1dSvSyZ_84xboRLfHUxkEoc4MY,6766
-protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
-protein_quest/alphafold/fetch.py,sha256=wIsgPZmtnE5EoAL9G22Y6Ehx9d0md53Mw88-6LLGp0Q,12298
-protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
-protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
-protein_quest/pdbe/io.py,sha256=iGLvmsD-eEYnrgZDYfkGWIDCzwDRRD5dwqB480talCs,10037
-protein_quest-0.5.1.dist-info/METADATA,sha256=MPfZLLa8XC1tZ3okRYIT3Hs3pMvd8ShA17Yy2axKBe8,9236
-protein_quest-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-protein_quest-0.5.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
-protein_quest-0.5.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-protein_quest-0.5.1.dist-info/RECORD,,

{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

protein-quest 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

Potentially problematic release.

protein-quest 0.5.1py3-none-any.whl → 0.6.0py3-none-any.whl