protein-quest: 0.3.1-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

protein_quest/__version__.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = "0.3.1"
+ __version__ = "0.4.0"
  """The version of the package."""
protein_quest/alphafold/confidence.py CHANGED
@@ -7,7 +7,10 @@ from pathlib import Path

  import gemmi

+ from protein_quest.converter import Percentage, PositiveInt, converter
  from protein_quest.pdbe.io import write_structure
+ from protein_quest.ss import nr_of_residues_in_total
+ from protein_quest.utils import CopyMethod, copyfile

  """
  Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
      Parameters:
          confidence: The confidence threshold for filtering residues.
              Residues with a pLDDT (b-factor) above this value are considered high confidence.
-         min_threshold: The minimum number of high-confidence residues required to keep the structure.
-         max_threshold: The maximum number of high-confidence residues required to keep the structure.
+         min_residues: The minimum number of high-confidence residues required to keep the structure.
+         max_residues: The maximum number of high-confidence residues required to keep the structure.
      """

-     confidence: float
-     min_threshold: int
-     max_threshold: int
+     confidence: Percentage
+     min_residues: PositiveInt
+     max_residues: PositiveInt
+
+
+ base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+ @converter.register_structure_hook
+ def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+     result: ConfidenceFilterQuery = base_query_hook(val, _type)
+     if result.min_residues > result.max_residues:
+         msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+         raise ValueError(msg)
+     return result


  @dataclass
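
The new cattrs hook above rejects queries where min_residues exceeds max_residues, on top of the per-field Percentage/PositiveInt validation. A minimal sketch of structuring a query through the shared converter after this change; the threshold numbers are made-up examples:

    # Illustrative only: thresholds are made-up values.
    from protein_quest.alphafold.confidence import ConfidenceFilterQuery
    from protein_quest.converter import converter

    query = converter.structure(
        {"confidence": 70.0, "min_residues": 100, "max_residues": 10_000},
        ConfidenceFilterQuery,
    )

    # Swapping the bounds now fails fast instead of producing an invalid query:
    # converter.structure({"confidence": 70.0, "min_residues": 500, "max_residues": 100}, ConfidenceFilterQuery)
    # raises ValueError("min_residues 500 cannot be larger than max_residues 100")
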
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
      """

      input_file: str
-     count: int
+     count: PositiveInt
      filtered_file: Path | None = None


- def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
+ def filter_file_on_residues(
+     file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+ ) -> ConfidenceFilterResult:
      """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

      Args:
          file: The path to the PDB file to filter.
          query: The confidence filter query.
          filtered_dir: The directory to save the filtered PDB file.
+         copy_method: How to copy when no residues have to be removed.

      Returns:
          result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
      structure = gemmi.read_structure(str(file))
      residues = set(find_high_confidence_residues(structure, query.confidence))
      count = len(residues)
-     if count < query.min_threshold or count > query.max_threshold:
+     if count < query.min_residues or count > query.max_residues:
          # Skip structure that is outside the min and max threshold
          # just return number of high confidence residues
          return ConfidenceFilterResult(
              input_file=file.name,
              count=count,
          )
+     total_residues = nr_of_residues_in_total(structure)
      filtered_file = filtered_dir / file.name
-     new_structure = filter_out_low_confidence_residues(
-         structure,
-         residues,
-     )
-     write_structure(new_structure, filtered_file)
+     if count == total_residues:
+         # if no residues have to be removed then copy instead of slower gemmi writing
+         copyfile(file, filtered_file, copy_method)
+     else:
+         new_structure = filter_out_low_confidence_residues(
+             structure,
+             residues,
+         )
+         write_structure(new_structure, filtered_file)
      return ConfidenceFilterResult(
          input_file=file.name,
          count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d


  def filter_files_on_confidence(
-     alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
+     alphafold_pdb_files: list[Path],
+     query: ConfidenceFilterQuery,
+     filtered_dir: Path,
+     copy_method: CopyMethod = "copy",
  ) -> Generator[ConfidenceFilterResult]:
      """Filter AlphaFoldDB structures based on confidence.

@@ -141,6 +167,7 @@ def filter_files_on_confidence(
          alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
          query: The confidence filter query containing the confidence thresholds.
          filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+         copy_method: How to copy when a direct copy is possible.

      Yields:
          For each mmcif/PDB files yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
      # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
      # here we filter on file level and inside file remove low confidence residues
      for pdb_file in alphafold_pdb_files:
-         yield filter_file_on_residues(pdb_file, query, filtered_dir)
+         yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
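
Taken together, the confidence filter now accepts a copy_method and skips the slower gemmi rewrite when every residue in a structure is already high confidence. A hedged usage sketch; the directory names and thresholds are hypothetical:

    # Hedged sketch: directory names and thresholds are hypothetical.
    from pathlib import Path

    from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
    from protein_quest.converter import converter

    query = converter.structure(
        {"confidence": 70.0, "min_residues": 1, "max_residues": 100_000},
        ConfidenceFilterQuery,
    )
    filtered_dir = Path("filtered")
    filtered_dir.mkdir(exist_ok=True)
    files = sorted(Path("alphafold").glob("*.cif"))
    for result in filter_files_on_confidence(files, query, filtered_dir, copy_method="symlink"):
        print(result.input_file, result.count, result.filtered_file)
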
protein_quest/alphafold/fetch.py CHANGED
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args

  from aiohttp_retry import RetryClient
  from aiopath import AsyncPath
- from cattrs.preconf.orjson import make_converter
  from tqdm.asyncio import tqdm
  from yarl import URL

  from protein_quest.alphafold.entry_summary import EntrySummary
+ from protein_quest.converter import converter
  from protein_quest.utils import friendly_session, retrieve_files, run_async

  logger = logging.getLogger(__name__)
- converter = make_converter()
- """cattrs converter to read AlphaFold summary JSON document."""
- converter.register_structure_hook(URL, lambda v, _: URL(v))
+

  DownloadableFormat = Literal[
      "summary",
protein_quest/cli.py CHANGED
@@ -15,6 +15,7 @@ from textwrap import dedent
  from cattrs import structure
  from rich import print as rprint
  from rich.logging import RichHandler
+ from rich.markdown import Markdown
  from rich.panel import Panel
  from rich_argparse import ArgumentDefaultsRichHelpFormatter
  from tqdm.rich import tqdm
@@ -23,13 +24,26 @@ from protein_quest.__version__ import __version__
  from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
  from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
  from protein_quest.alphafold.fetch import fetch_many as af_fetch
+ from protein_quest.converter import converter
  from protein_quest.emdb import fetch as emdb_fetch
  from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
  from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
  from protein_quest.pdbe import fetch as pdbe_fetch
  from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
+ from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
  from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
- from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+ from protein_quest.uniprot import (
+     ComplexPortalEntry,
+     PdbResult,
+     Query,
+     search4af,
+     search4emdb,
+     search4interaction_partners,
+     search4macromolecular_complexes,
+     search4pdb,
+     search4uniprot,
+ )
+ from protein_quest.utils import CopyMethod, copy_methods, copyfile

  logger = logging.getLogger(__name__)

@@ -208,6 +222,73 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
      parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")


+ def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
+     """Add search interaction partners subcommand parser."""
+     parser = subparsers.add_parser(
+         "interaction-partners",
+         help="Search for interaction partners of given UniProt accession",
+         description=dedent("""\
+             Search for interaction partners of given UniProt accession
+             in the Uniprot SPARQL endpoint and Complex Portal.
+         """),
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument(
+         "uniprot_acc",
+         type=str,
+         help="UniProt accession (for example P12345).",
+     )
+     parser.add_argument(
+         "--exclude",
+         type=str,
+         action="append",
+         help="UniProt accessions to exclude from the results. For example already known interaction partners.",
+     )
+     parser.add_argument(
+         "output_csv",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
+     )
+     parser.add_argument(
+         "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
+     )
+     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
+ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
+     """Add search complexes subcommand parser."""
+     description = dedent("""\
+         Search for complexes in the Complex Portal.
+         https://www.ebi.ac.uk/complexportal/
+
+         The output CSV file has the following columns:
+
+         - query_protein: UniProt accession used as query
+         - complex_id: Complex Portal identifier
+         - complex_url: URL to the Complex Portal entry
+         - complex_title: Title of the complex
+         - members: Semicolon-separated list of UniProt accessions of complex members
+     """)
+     parser = subparsers.add_parser(
+         "complexes",
+         help="Search for complexes in the Complex Portal",
+         description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument(
+         "uniprot_accs",
+         type=argparse.FileType("r", encoding="UTF-8"),
+         help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
+     )
+     parser.add_argument(
+         "output_csv",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help="Output CSV file with complex results. Use `-` for stdout.",
+     )
+     parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
+     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
  def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
      """Add retrieve pdbe subcommand parser."""
      parser = subparsers.add_parser(
@@ -282,6 +363,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
      parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")


+ def _add_copy_method_argument(parser: argparse.ArgumentParser):
+     """Add copy method argument to parser."""
+     default_copy_method = "symlink"
+     if os.name == "nt":
+         # On Windows you need developer mode or admin privileges to create symlinks
+         # so we default to copying files instead of symlinking
+         default_copy_method = "copy"
+     parser.add_argument(
+         "--copy-method",
+         type=str,
+         choices=copy_methods,
+         default=default_copy_method,
+         help="How to copy files when no changes are needed to output file.",
+     )
+
+
  def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
      """Add filter confidence subcommand parser."""
      parser = subparsers.add_parser(
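
The shared --copy-method option defaults to symlink, except on Windows where plain copies are used. For illustration only, a helper with this shape could back the option; this is an assumption about protein_quest.utils.copyfile and copy_methods, not their actual implementation:

    # Assumed shape of the copy helper behind --copy-method; the real
    # protein_quest.utils.copyfile and copy_methods may differ.
    import shutil
    from pathlib import Path
    from typing import Literal, get_args

    CopyMethod = Literal["copy", "symlink"]  # only these two values appear in this diff
    copy_methods = get_args(CopyMethod)

    def copyfile(source: Path, target: Path, copy_method: CopyMethod = "copy") -> None:
        """Copy or symlink source to target, replacing an existing target."""
        if target.exists() or target.is_symlink():
            target.unlink()
        if copy_method == "symlink":
            target.symlink_to(source.resolve())
        else:
            shutil.copyfile(source, target)
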
@@ -312,6 +409,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
              In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
              Use `-` for stdout."""),
      )
+     _add_copy_method_argument(parser)


  def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -347,8 +445,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument(
          "--scheduler-address",
-         help="Address of the Dask scheduler to connect to. If not provided, will create a local cluster.",
+         help=dedent("""Address of the Dask scheduler to connect to.
+             If not provided, will create a local cluster.
+             If set to `sequential` will run tasks sequentially."""),
      )
+     _add_copy_method_argument(parser)


  def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -371,6 +472,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
      parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
+     _add_copy_method_argument(parser)
      parser.add_argument(
          "--write-stats",
          type=argparse.FileType("w", encoding="UTF-8"),
@@ -381,6 +483,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )


+ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
+     """Add filter secondary structure subcommand parser."""
+     parser = subparsers.add_parser(
+         "secondary-structure",
+         help="Filter PDB/mmCIF files by secondary structure",
+         description="Filter PDB/mmCIF files by secondary structure",
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+     parser.add_argument(
+         "output_dir",
+         type=Path,
+         help=dedent("""\
+             Directory to write filtered PDB/mmCIF files. Files are copied without modification.
+         """),
+     )
+     parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
+     parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
+     parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
+     parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
+     parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
+     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
+     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
+     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
+     _add_copy_method_argument(parser)
+     parser.add_argument(
+         "--write-stats",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help=dedent("""
+             Write filter statistics to file. In CSV format with columns:
+             `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
+             <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
+             Use `-` for stdout.
+         """),
+     )
+
+
  def _add_search_subcommands(subparsers: argparse._SubParsersAction):
      """Add search command and its subcommands."""
      parser = subparsers.add_parser(
@@ -397,6 +536,8 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
      _add_search_emdb_parser(subsubparsers)
      _add_search_go_parser(subsubparsers)
      _add_search_taxonomy_parser(subsubparsers)
+     _add_search_interaction_partners_parser(subsubparsers)
+     _add_search_complexes_parser(subsubparsers)


  def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -422,6 +563,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
      _add_filter_confidence_parser(subsubparsers)
      _add_filter_chain_parser(subsubparsers)
      _add_filter_residue_parser(subsubparsers)
+     _add_filter_ss_parser(subsubparsers)


  def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -574,6 +716,32 @@ def _handle_search_taxonomy(args):
      _write_taxonomy_csv(results, output_csv)


+ def _handle_search_interaction_partners(args: argparse.Namespace):
+     uniprot_acc: str = args.uniprot_acc
+     excludes: set[str] = set(args.exclude) if args.exclude else set()
+     limit: int = args.limit
+     timeout: int = args.timeout
+     output_csv: TextIOWrapper = args.output_csv
+
+     rprint(f"Searching for interaction partners of '{uniprot_acc}'")
+     results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
+     rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+     _write_lines(output_csv, results.keys())
+
+
+ def _handle_search_complexes(args: argparse.Namespace):
+     uniprot_accs = args.uniprot_accs
+     limit = args.limit
+     timeout = args.timeout
+     output_csv = args.output_csv
+
+     accs = _read_lines(uniprot_accs)
+     rprint(f"Finding complexes for {len(accs)} uniprot accessions")
+     results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
+     rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+     _write_complexes_csv(results, output_csv)
+
+
  def _handle_retrieve_pdbe(args):
      pdbe_csv = args.pdbe_csv
      output_dir = args.output_dir
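
The new handlers are thin wrappers around the uniprot search helpers. A sketch of calling them directly from Python, using an example accession; the return shapes (a mapping keyed by partner accession and a list of ComplexPortalEntry) are inferred from the handlers above:

    # Example accession P12345; return shapes inferred from the handlers above.
    from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes

    partners = search4interaction_partners("P12345", excludes=set(), limit=10_000, timeout=1_800)
    print(sorted(partners.keys()))  # UniProt accessions of interaction partners

    complexes = search4macromolecular_complexes(["P12345"], limit=100, timeout=1_800)
    for entry in complexes:
        print(entry.complex_id, entry.complex_title, ";".join(sorted(entry.members)))
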
@@ -620,21 +788,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
      # to get rid of duplication
      input_dir = structure(args.input_dir, Path)
      output_dir = structure(args.output_dir, Path)
-     confidence_threshold = structure(args.confidence_threshold, float)
-     # TODO add min/max
-     min_residues = structure(args.min_residues, int)
-     max_residues = structure(args.max_residues, int)
+
+     confidence_threshold = args.confidence_threshold
+     min_residues = args.min_residues
+     max_residues = args.max_residues
      stats_file: TextIOWrapper | None = args.write_stats
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

      output_dir.mkdir(parents=True, exist_ok=True)
      input_files = sorted(glob_structure_files(input_dir))
      nr_input_files = len(input_files)
      rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
-     query = structure(
+     query = converter.structure(
          {
              "confidence": confidence_threshold,
-             "min_threshold": min_residues,
-             "max_threshold": max_residues,
+             "min_residues": min_residues,
+             "max_residues": max_residues,
          },
          ConfidenceFilterQuery,
      )
@@ -643,7 +812,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
          writer.writerow(["input_file", "residue_count", "passed", "output_file"])

      passed_count = 0
-     for r in tqdm(filter_files_on_confidence(input_files, query, output_dir), total=len(input_files), unit="file"):
+     for r in tqdm(
+         filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
+         total=len(input_files),
+         unit="file",
+     ):
          if r.filtered_file:
              passed_count += 1
          if stats_file:
@@ -656,9 +829,10 @@ def _handle_filter_confidence(args: argparse.Namespace):

  def _handle_filter_chain(args):
      input_dir = args.input_dir
-     output_dir = args.output_dir
+     output_dir = structure(args.output_dir, Path)
      pdb_id2chain_mapping_file = args.chains
-     scheduler_address = args.scheduler_address
+     scheduler_address = structure(args.scheduler_address, str | None)  # pyright: ignore[reportArgumentType]
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

      # make sure files in input dir with entries in mapping file are the same
      # complain when files from mapping file are missing on disk
@@ -683,18 +857,25 @@ def _handle_filter_chain(args):
          rprint("[red]No valid structure files found. Exiting.")
          sys.exit(1)

-     results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
+     results = filter_files_on_chain(
+         file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
+     )

      nr_written = len([r for r in results if r.passed])

      rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

+     for result in results:
+         if result.discard_reason:
+             rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
+

  def _handle_filter_residue(args):
      input_dir = structure(args.input_dir, Path)
      output_dir = structure(args.output_dir, Path)
      min_residues = structure(args.min_residues, int)
      max_residues = structure(args.max_residues, int)
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
      stats_file: TextIOWrapper | None = args.write_stats

      if stats_file:
@@ -705,7 +886,9 @@ def _handle_filter_residue(args):
      input_files = sorted(glob_structure_files(input_dir))
      nr_total = len(input_files)
      rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
-     for r in filter_files_on_residues(input_files, output_dir, min_residues=min_residues, max_residues=max_residues):
+     for r in filter_files_on_residues(
+         input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
+     ):
          if stats_file:
              writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
          if r.passed:
@@ -716,6 +899,68 @@ def _handle_filter_residue(args):
          rprint(f"Statistics written to {stats_file.name}")


+ def _handle_filter_ss(args):
+     input_dir = structure(args.input_dir, Path)
+     output_dir = structure(args.output_dir, Path)
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+     stats_file: TextIOWrapper | None = args.write_stats
+
+     raw_query = {
+         "abs_min_helix_residues": args.abs_min_helix_residues,
+         "abs_max_helix_residues": args.abs_max_helix_residues,
+         "abs_min_sheet_residues": args.abs_min_sheet_residues,
+         "abs_max_sheet_residues": args.abs_max_sheet_residues,
+         "ratio_min_helix_residues": args.ratio_min_helix_residues,
+         "ratio_max_helix_residues": args.ratio_max_helix_residues,
+         "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
+         "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
+     }
+     query = converter.structure(raw_query, SecondaryStructureFilterQuery)
+     input_files = sorted(glob_structure_files(input_dir))
+     nr_total = len(input_files)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     if stats_file:
+         writer = csv.writer(stats_file)
+         writer.writerow(
+             [
+                 "input_file",
+                 "nr_residues",
+                 "nr_helix_residues",
+                 "nr_sheet_residues",
+                 "helix_ratio",
+                 "sheet_ratio",
+                 "passed",
+                 "output_file",
+             ]
+         )
+
+     rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
+     nr_passed = 0
+     for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
+         output_file: Path | None = None
+         if result.passed:
+             output_file = output_dir / input_file.name
+             copyfile(input_file, output_file, copy_method)
+             nr_passed += 1
+         if stats_file:
+             writer.writerow(
+                 [
+                     input_file,
+                     result.stats.nr_residues,
+                     result.stats.nr_helix_residues,
+                     result.stats.nr_sheet_residues,
+                     round(result.stats.helix_ratio, 3),
+                     round(result.stats.sheet_ratio, 3),
+                     result.passed,
+                     output_file,
+                 ]
+             )
+     rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
+     if stats_file:
+         rprint(f"Statistics written to {stats_file.name}")
+
+
  def _handle_mcp(args):
      if find_spec("fastmcp") is None:
          msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
@@ -736,12 +981,15 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
      ("search", "emdb"): _handle_search_emdb,
      ("search", "go"): _handle_search_go,
      ("search", "taxonomy"): _handle_search_taxonomy,
+     ("search", "interaction-partners"): _handle_search_interaction_partners,
+     ("search", "complexes"): _handle_search_complexes,
      ("retrieve", "pdbe"): _handle_retrieve_pdbe,
      ("retrieve", "alphafold"): _handle_retrieve_alphafold,
      ("retrieve", "emdb"): _handle_retrieve_emdb,
      ("filter", "confidence"): _handle_filter_confidence,
      ("filter", "chain"): _handle_filter_chain,
      ("filter", "residue"): _handle_filter_residue,
+     ("filter", "secondary-structure"): _handle_filter_ss,
      ("mcp", None): _handle_mcp,
  }

@@ -797,3 +1045,33 @@ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:

  def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
      return {row[column] for row in _iter_csv_rows(file)}
+
+
+ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
+     """Write ComplexPortal information to a CSV file.
+
+     Args:
+         complexes: List of ComplexPortalEntry objects.
+         output_csv: TextIOWrapper to write the CSV data to.
+     """
+     writer = csv.writer(output_csv)
+     writer.writerow(
+         [
+             "query_protein",
+             "complex_id",
+             "complex_url",
+             "complex_title",
+             "members",
+         ]
+     )
+     for entry in complexes:
+         members_str = ";".join(sorted(entry.members))
+         writer.writerow(
+             [
+                 entry.query_protein,
+                 entry.complex_id,
+                 entry.complex_url,
+                 entry.complex_title,
+                 members_str,
+             ]
+         )
protein_quest/converter.py ADDED
@@ -0,0 +1,46 @@
+ """Convert json or dict to Python objects."""
+
+ from cattrs.preconf.orjson import make_converter
+ from yarl import URL
+
+ type Percentage = float
+ """Type alias for percentage values (0.0-100.0)."""
+ type Ratio = float
+ """Type alias for ratio values (0.0-1.0)."""
+ type PositiveInt = int
+ """Type alias for positive integer values (>= 0)."""
+
+ converter = make_converter()
+ """cattrs converter to read JSON document or dict to Python objects."""
+ converter.register_structure_hook(URL, lambda v, _: URL(v))
+ converter.register_unstructure_hook(URL, lambda u: str(u))
+
+
+ @converter.register_structure_hook
+ def percentage_hook(val, _) -> Percentage:
+     value = float(val)
+     """Cattrs hook to validate percentage values."""
+     if not 0.0 <= value <= 100.0:
+         msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+         raise ValueError(msg)
+     return value
+
+
+ @converter.register_structure_hook
+ def ratio_hook(val, _) -> Ratio:
+     """Cattrs hook to validate ratio values."""
+     value = float(val)
+     if not 0.0 <= value <= 1.0:
+         msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+         raise ValueError(msg)
+     return value
+
+
+ @converter.register_structure_hook
+ def positive_int_hook(val, _) -> PositiveInt:
+     """Cattrs hook to validate positive integer values."""
+     value = int(val)
+     if value < 0:
+         msg = f"Value {value} is not a valid positive integer (>= 0)"
+         raise ValueError(msg)
+     return value
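
The new protein_quest.converter module centralizes validation of percentages, ratios and positive integers behind one shared cattrs converter. A small demonstration, assuming the hooks dispatch on the registered type aliases as shown above; the values are examples:

    # Values are examples; assumes the hooks dispatch on the registered aliases.
    from protein_quest.converter import Percentage, PositiveInt, Ratio, converter

    print(converter.structure(85.5, Percentage))  # 85.5
    print(converter.structure(0.25, Ratio))       # 0.25
    print(converter.structure("3", PositiveInt))  # 3

    try:
        converter.structure(150, Percentage)
    except ValueError as err:
        print(err)  # Value 150.0 is not a valid percentage (0.0-100.0)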