protein-quest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of protein-quest might be problematic.
- protein_quest/__version__.py +2 -1
- protein_quest/alphafold/confidence.py +44 -17
- protein_quest/alphafold/entry_summary.py +11 -9
- protein_quest/alphafold/fetch.py +37 -63
- protein_quest/cli.py +187 -30
- protein_quest/converter.py +45 -0
- protein_quest/filters.py +78 -35
- protein_quest/go.py +1 -4
- protein_quest/mcp_server.py +8 -5
- protein_quest/parallel.py +37 -1
- protein_quest/pdbe/fetch.py +15 -1
- protein_quest/pdbe/io.py +142 -46
- protein_quest/ss.py +264 -0
- protein_quest/taxonomy.py +13 -3
- protein_quest/utils.py +65 -3
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/METADATA +21 -11
- protein_quest-0.3.2.dist-info/RECORD +26 -0
- protein_quest-0.3.0.dist-info/RECORD +0 -24
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/licenses/LICENSE +0 -0
protein_quest/cli.py
CHANGED

@@ -5,7 +5,8 @@ import asyncio
 import csv
 import logging
 import os
-
+import sys
+from collections.abc import Callable, Generator, Iterable
 from importlib.util import find_spec
 from io import TextIOWrapper
 from pathlib import Path
@@ -14,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm

@@ -21,13 +23,16 @@ from protein_quest.__version__ import __version__
 from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
 from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
 from protein_quest.alphafold.fetch import fetch_many as af_fetch
+from protein_quest.converter import converter
 from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
 from protein_quest.pdbe import fetch as pdbe_fetch
-from protein_quest.pdbe.io import glob_structure_files
+from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
+from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
 from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+from protein_quest.utils import CopyMethod, copy_methods, copyfile

 logger = logging.getLogger(__name__)

@@ -246,12 +251,12 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded AlphaFold files")
     parser.add_argument(
-        "--what-
+        "--what-formats",
         type=str,
         action="append",
         choices=sorted(downloadable_formats),
         help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
-            Default is '
+            Default is 'summary' and 'cif'."""),
     )
     parser.add_argument(
         "--max-parallel-downloads",
@@ -280,6 +285,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")


+def _add_copy_method_argument(parser: argparse.ArgumentParser):
+    """Add copy method argument to parser."""
+    default_copy_method = "symlink"
+    if os.name == "nt":
+        # On Windows you need developer mode or admin privileges to create symlinks
+        # so we default to copying files instead of symlinking
+        default_copy_method = "copy"
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default=default_copy_method,
+        help="How to copy files when no changes are needed to output file.",
+    )
+
+
 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
     """Add filter confidence subcommand parser."""
     parser = subparsers.add_parser(
@@ -310,6 +331,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
            In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
            Use `-` for stdout."""),
     )
+    _add_copy_method_argument(parser)


 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -345,8 +367,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument(
         "--scheduler-address",
-        help="Address of the Dask scheduler to connect to.
+        help=dedent("""Address of the Dask scheduler to connect to.
+            If not provided, will create a local cluster.
+            If set to `sequential` will run tasks sequentially."""),
     )
+    _add_copy_method_argument(parser)


 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -369,6 +394,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
+    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -379,6 +405,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )


+def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
+    """Add filter secondary structure subcommand parser."""
+    parser = subparsers.add_parser(
+        "secondary-structure",
+        help="Filter PDB/mmCIF files by secondary structure",
+        description="Filter PDB/mmCIF files by secondary structure",
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+    parser.add_argument(
+        "output_dir",
+        type=Path,
+        help=dedent("""\
+            Directory to write filtered PDB/mmCIF files. Files are copied without modification.
+            """),
+    )
+    parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
+    parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
+    parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
+    parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
+    parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
+    parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
+    parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
+    parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
+    _add_copy_method_argument(parser)
+    parser.add_argument(
+        "--write-stats",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help=dedent("""
+            Write filter statistics to file. In CSV format with columns:
+            `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
+            <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
+            Use `-` for stdout.
+            """),
+    )
+
+
 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     """Add search command and its subcommands."""
     parser = subparsers.add_parser(
@@ -420,6 +483,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
     _add_filter_confidence_parser(subsubparsers)
     _add_filter_chain_parser(subsubparsers)
     _add_filter_residue_parser(subsubparsers)
+    _add_filter_ss_parser(subsubparsers)


 def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -585,17 +649,17 @@ def _handle_retrieve_pdbe(args):

 def _handle_retrieve_alphafold(args):
     download_dir = args.output_dir
-
+    what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads

-    if
-
+    if what_formats is None:
+        what_formats = {"summary", "cif"}

     # TODO besides `uniprot_acc,af_id\n` csv also allow headless single column format
     #
-    af_ids =
-    validated_what: set[DownloadableFormat] = structure(
+    af_ids = _read_column_from_csv(alphafold_csv, "af_id")
+    validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
     afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
     total_nr_files = sum(af.nr_of_files() for af in afs)
@@ -618,21 +682,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
     # to get rid of duplication
     input_dir = structure(args.input_dir, Path)
     output_dir = structure(args.output_dir, Path)
-
-
-    min_residues =
-    max_residues =
+
+    confidence_threshold = args.confidence_threshold
+    min_residues = args.min_residues
+    max_residues = args.max_residues
     stats_file: TextIOWrapper | None = args.write_stats
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

     output_dir.mkdir(parents=True, exist_ok=True)
     input_files = sorted(glob_structure_files(input_dir))
     nr_input_files = len(input_files)
     rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
-    query = structure(
+    query = converter.structure(
         {
             "confidence": confidence_threshold,
-            "
-            "
+            "min_residues": min_residues,
+            "max_residues": max_residues,
         },
         ConfidenceFilterQuery,
     )
@@ -641,7 +706,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
         writer.writerow(["input_file", "residue_count", "passed", "output_file"])

     passed_count = 0
-    for r in tqdm(
+    for r in tqdm(
+        filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
+        total=len(input_files),
+        unit="file",
+    ):
         if r.filtered_file:
             passed_count += 1
             if stats_file:
@@ -654,25 +723,53 @@ def _handle_filter_confidence(args: argparse.Namespace):

 def _handle_filter_chain(args):
     input_dir = args.input_dir
-    output_dir = args.output_dir
+    output_dir = structure(args.output_dir, Path)
     pdb_id2chain_mapping_file = args.chains
-    scheduler_address = args.scheduler_address
+    scheduler_address = structure(args.scheduler_address, str | None)  # pyright: ignore[reportArgumentType]
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

+    # make sure files in input dir with entries in mapping file are the same
+    # complain when files from mapping file are missing on disk
     rows = list(_iter_csv_rows(pdb_id2chain_mapping_file))
-
+    file2chain: set[tuple[Path, str]] = set()
+    errors: list[FileNotFoundError] = []

-
+    for row in rows:
+        pdb_id = row["pdb_id"]
+        chain = row["chain"]
+        try:
+            f = locate_structure_file(input_dir, pdb_id)
+            file2chain.add((f, chain))
+        except FileNotFoundError as e:
+            errors.append(e)

-
+    if errors:
+        msg = f"Some structure files could not be found ({len(errors)} missing), skipping them"
+        rprint(Panel(os.linesep.join(map(str, errors)), title=msg, style="red"))
+
+    if not file2chain:
+        rprint("[red]No valid structure files found. Exiting.")
+        sys.exit(1)
+
+    results = filter_files_on_chain(
+        file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
+    )
+
+    nr_written = len([r for r in results if r.passed])

     rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

+    for result in results:
+        if result.discard_reason:
+            rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
+

 def _handle_filter_residue(args):
     input_dir = structure(args.input_dir, Path)
     output_dir = structure(args.output_dir, Path)
     min_residues = structure(args.min_residues, int)
     max_residues = structure(args.max_residues, int)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
     stats_file: TextIOWrapper | None = args.write_stats

     if stats_file:
@@ -683,7 +780,9 @@ def _handle_filter_residue(args):
     input_files = sorted(glob_structure_files(input_dir))
     nr_total = len(input_files)
     rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
-    for r in filter_files_on_residues(
+    for r in filter_files_on_residues(
+        input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
+    ):
         if stats_file:
             writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
         if r.passed:
@@ -694,6 +793,68 @@ def _handle_filter_residue(args):
     rprint(f"Statistics written to {stats_file.name}")


+def _handle_filter_ss(args):
+    input_dir = structure(args.input_dir, Path)
+    output_dir = structure(args.output_dir, Path)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+    stats_file: TextIOWrapper | None = args.write_stats
+
+    raw_query = {
+        "abs_min_helix_residues": args.abs_min_helix_residues,
+        "abs_max_helix_residues": args.abs_max_helix_residues,
+        "abs_min_sheet_residues": args.abs_min_sheet_residues,
+        "abs_max_sheet_residues": args.abs_max_sheet_residues,
+        "ratio_min_helix_residues": args.ratio_min_helix_residues,
+        "ratio_max_helix_residues": args.ratio_max_helix_residues,
+        "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
+        "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
+    }
+    query = converter.structure(raw_query, SecondaryStructureFilterQuery)
+    input_files = sorted(glob_structure_files(input_dir))
+    nr_total = len(input_files)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    if stats_file:
+        writer = csv.writer(stats_file)
+        writer.writerow(
+            [
+                "input_file",
+                "nr_residues",
+                "nr_helix_residues",
+                "nr_sheet_residues",
+                "helix_ratio",
+                "sheet_ratio",
+                "passed",
+                "output_file",
+            ]
+        )
+
+    rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
+    nr_passed = 0
+    for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
+        output_file: Path | None = None
+        if result.passed:
+            output_file = output_dir / input_file.name
+            copyfile(input_file, output_file, copy_method)
+            nr_passed += 1
+        if stats_file:
+            writer.writerow(
+                [
+                    input_file,
+                    result.stats.nr_residues,
+                    result.stats.nr_helix_residues,
+                    result.stats.nr_sheet_residues,
+                    round(result.stats.helix_ratio, 3),
+                    round(result.stats.sheet_ratio, 3),
+                    result.passed,
+                    output_file,
+                ]
+            )
+    rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
+    if stats_file:
+        rprint(f"Statistics written to {stats_file.name}")
+
+
 def _handle_mcp(args):
     if find_spec("fastmcp") is None:
         msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
@@ -720,6 +881,7 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
     ("filter", "confidence"): _handle_filter_confidence,
     ("filter", "chain"): _handle_filter_chain,
     ("filter", "residue"): _handle_filter_residue,
+    ("filter", "secondary-structure"): _handle_filter_ss,
     ("mcp", None): _handle_mcp,
 }

@@ -768,12 +930,7 @@ def _write_dict_of_sets2csv(file: TextIOWrapper, data: dict[str, set[str]], ref_
         writer.writerow({"uniprot_acc": uniprot_acc, ref_id_field: ref_id})


-def
-    reader = csv.DictReader(file)
-    yield from reader
-
-
-def _iter_csv_rows(file: TextIOWrapper):
+def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
     reader = csv.DictReader(file)
     yield from reader

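The new `--copy-method` option added above is threaded through every filter handler via `copyfile` from `protein_quest.utils`. A minimal sketch of what those handlers end up doing for a file that passes a filter, assuming protein-quest 0.3.2 is installed; the paths are hypothetical:

from pathlib import Path

from protein_quest.utils import copyfile

# Hypothetical paths. "symlink" is the default copy method on non-Windows
# platforms, "copy" on Windows (see _add_copy_method_argument above).
copyfile(Path("filtered/af-P12345.cif"), Path("out/af-P12345.cif"), "symlink")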
protein_quest/converter.py
ADDED

@@ -0,0 +1,45 @@
+"""Convert json or dict to Python objects."""
+
+from cattrs.preconf.orjson import make_converter
+from yarl import URL
+
+type Percentage = float
+"""Type alias for percentage values (0.0-100.0)."""
+type Ratio = float
+"""Type alias for ratio values (0.0-1.0)."""
+type PositiveInt = int
+"""Type alias for positive integer values (>= 0)."""
+
+converter = make_converter()
+"""cattrs converter to read JSON document or dict to Python objects."""
+converter.register_structure_hook(URL, lambda v, _: URL(v))
+
+
+@converter.register_structure_hook
+def percentage_hook(val, _) -> Percentage:
+    value = float(val)
+    """Cattrs hook to validate percentage values."""
+    if not 0.0 <= value <= 100.0:
+        msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+        raise ValueError(msg)
+    return value
+
+
+@converter.register_structure_hook
+def ratio_hook(val, _) -> Ratio:
+    """Cattrs hook to validate ratio values."""
+    value = float(val)
+    if not 0.0 <= value <= 1.0:
+        msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+        raise ValueError(msg)
+    return value
+
+
+@converter.register_structure_hook
+def positive_int_hook(val, _) -> PositiveInt:
+    """Cattrs hook to validate positive integer values."""
+    value = int(val)
+    if value < 0:
+        msg = f"Value {value} is not a valid positive integer (>= 0)"
+        raise ValueError(msg)
+    return value
protein_quest/filters.py
CHANGED

@@ -1,70 +1,107 @@
 """Module for filtering structure files and their contents."""

 import logging
-from collections.abc import Generator
+from collections.abc import Collection, Generator
 from dataclasses import dataclass
 from pathlib import Path
-from
-from typing import cast
+from typing import Literal

-from dask.distributed import Client
+from dask.distributed import Client
 from distributed.deploy.cluster import Cluster
 from tqdm.auto import tqdm

-from protein_quest.parallel import configure_dask_scheduler
+from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
 from protein_quest.pdbe.io import (
-    locate_structure_file,
     nr_residues_in_chain,
     write_single_chain_pdb_file,
 )
+from protein_quest.utils import CopyMethod, copyfile

 logger = logging.getLogger(__name__)


+@dataclass
+class ChainFilterStatistics:
+    input_file: Path
+    chain_id: str
+    passed: bool = False
+    output_file: Path | None = None
+    discard_reason: Exception | None = None
+
+
+def filter_file_on_chain(
+    file_and_chain: tuple[Path, str],
+    output_dir: Path,
+    out_chain: str = "A",
+    copy_method: CopyMethod = "copy",
+) -> ChainFilterStatistics:
+    input_file, chain_id = file_and_chain
+    logger.debug("Filtering %s on chain %s", input_file, chain_id)
+    try:
+        output_file = write_single_chain_pdb_file(
+            input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
+        )
+        return ChainFilterStatistics(
+            input_file=input_file,
+            chain_id=chain_id,
+            output_file=output_file,
+            passed=True,
+        )
+    except Exception as e:  # noqa: BLE001 - error is handled downstream
+        return ChainFilterStatistics(input_file=input_file, chain_id=chain_id, discard_reason=e)
+
+
 def filter_files_on_chain(
-
-    id2chains: dict[str, str],
+    file2chains: Collection[tuple[Path, str]],
     output_dir: Path,
-    scheduler_address: str | Cluster | None = None,
     out_chain: str = "A",
-
+    scheduler_address: str | Cluster | Literal["sequential"] | None = None,
+    copy_method: CopyMethod = "copy",
+) -> list[ChainFilterStatistics]:
     """Filter mmcif/PDB files by chain.

     Args:
-
-
+        file2chains: Which chain to keep for each PDB file.
+            First item is the PDB file path, second item is the chain ID.
         output_dir: The directory where the filtered files will be written.
-        scheduler_address: The address of the Dask scheduler.
         out_chain: Under what name to write the kept chain.
+        scheduler_address: The address of the Dask scheduler.
+            If not provided, will create a local cluster.
+            If set to `sequential` will run tasks sequentially.
+        copy_method: How to copy when a direct copy is possible.

     Returns:
-
-        Last tuple item is None if something went wrong like chain not present.
+        Result of the filtering process.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
+    if scheduler_address == "sequential":
+
+        def task(file_and_chain: tuple[Path, str]) -> ChainFilterStatistics:
+            return filter_file_on_chain(file_and_chain, output_dir, out_chain=out_chain, copy_method=copy_method)
+
+        return list(map(task, file2chains))
+
+    # TODO make logger.debug in filter_file_on_chain show to user when --log
+    # GPT-5 generated a fairly difficult setup with a WorkerPlugin, need to find a simpler approach
     scheduler_address = configure_dask_scheduler(
         scheduler_address,
         name="filter-chain",
     )

-    def task(id2chain: tuple[str, str]) -> tuple[str, str, Path | None]:
-        pdb_id, chain = id2chain
-        input_file = locate_structure_file(input_dir, pdb_id)
-        return pdb_id, chain, write_single_chain_pdb_file(input_file, chain, output_dir, out_chain=out_chain)
-
     with Client(scheduler_address) as client:
-
-
-
-
-
-
-
-
+        client.forward_logging()
+        return dask_map_with_progress(
+            client,
+            filter_file_on_chain,
+            file2chains,
+            output_dir=output_dir,
+            out_chain=out_chain,
+            copy_method=copy_method,
+        )


 @dataclass
-class
+class ResidueFilterStatistics:
     """Statistics for filtering files based on residue count in a specific chain.

     Parameters:
@@ -81,8 +118,13 @@ class FilterStat:


 def filter_files_on_residues(
-    input_files: list[Path],
-
+    input_files: list[Path],
+    output_dir: Path,
+    min_residues: int,
+    max_residues: int,
+    chain: str = "A",
+    copy_method: CopyMethod = "copy",
+) -> Generator[ResidueFilterStatistics]:
     """Filter PDB/mmCIF files by number of residues in given chain.

     Args:
@@ -91,9 +133,10 @@ def filter_files_on_residues(
         min_residues: The minimum number of residues in chain.
         max_residues: The maximum number of residues in chain.
         chain: The chain to count residues of.
+        copy_method: How to copy passed files to output directory:

     Yields:
-
+        Objects containing information about the filtering process for each input file.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     for input_file in tqdm(input_files, unit="file"):
@@ -101,7 +144,7 @@ def filter_files_on_residues(
         passed = min_residues <= residue_count <= max_residues
         if passed:
             output_file = output_dir / input_file.name
-            copyfile(input_file, output_file)
-            yield
+            copyfile(input_file, output_file, copy_method)
+            yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
         else:
-            yield
+            yield ResidueFilterStatistics(input_file, residue_count, False, None)
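With `scheduler_address="sequential"` the chain filter can now run in-process, without spinning up a Dask cluster. A minimal sketch of calling the new API from a script, assuming protein-quest 0.3.2 is installed; the structure files and chains are hypothetical:

from pathlib import Path

from protein_quest.filters import filter_files_on_chain

# Hypothetical (structure file, chain) pairs, mirroring what the CLI builds from its CSV mapping.
file2chains = {(Path("downloads/1abc.cif"), "B"), (Path("downloads/2xyz.cif"), "A")}
results = filter_files_on_chain(
    file2chains,
    Path("filtered/"),
    scheduler_address="sequential",  # run tasks in-process instead of on a Dask cluster
    copy_method="copy",
)
for r in results:
    if r.discard_reason:
        print(f"{r.input_file} skipped: {r.discard_reason}")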
protein_quest/go.py
CHANGED

@@ -8,8 +8,8 @@ from io import TextIOWrapper
 from typing import Literal, get_args

 from cattrs.gen import make_dict_structure_fn, override
-from cattrs.preconf.orjson import make_converter

+from protein_quest.converter import converter
 from protein_quest.utils import friendly_session

 logger = logging.getLogger(__name__)
@@ -52,9 +52,6 @@ class SearchResponse:
     page_info: PageInfo


-converter = make_converter()
-
-
 def flatten_definition(definition, _context) -> str:
     return definition["text"]

protein_quest/mcp_server.py
CHANGED

@@ -24,12 +24,11 @@ npx @modelcontextprotocol/inspector
 # Choose STDIO
 # command: uv run protein-quest mcp
 # id: protein-quest
-# Prompt: What are the PDBe structures for `A8MT69` uniprot accession?
 ```

 Examples:

-
+- What are the PDBe structures for `A8MT69` uniprot accession?

 """

@@ -47,6 +46,7 @@ from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.go import search_gene_ontology_term
 from protein_quest.pdbe.fetch import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
+from protein_quest.ss import filter_file_on_secondary_structure
 from protein_quest.taxonomy import search_taxon
 from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot

@@ -90,7 +90,7 @@ def extract_single_chain_from_structure(
     chain2keep: str,
     output_dir: Path,
     out_chain: str = "A",
-) -> Path
+) -> Path:
     """
     Extract a single chain from a mmCIF/pdb file and write to a new file.

@@ -101,7 +101,7 @@ def extract_single_chain_from_structure(
         out_chain: The chain identifier for the output file.

     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output mmCIF/pdb file
     """
     return write_single_chain_pdb_file(input_file, chain2keep, output_dir, out_chain)

@@ -150,7 +150,7 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     Returns:
         A list of AlphaFold entries.
     """
-    what: set[DownloadableFormat] = {"cif"}
+    what: set[DownloadableFormat] = {"summary", "cif"}
     return alphafold_fetch(uniprot_accs, save_dir, what)


@@ -166,6 +166,9 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
     return filter_file_on_residues(file, query, filtered_dir)


+mcp.tool(filter_file_on_secondary_structure)
+
+
 @mcp.prompt
 def candidate_structures(
     species: str = "Human",