protein-quest 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of protein-quest might be problematic.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/confidence.py +42 -15
- protein_quest/alphafold/fetch.py +2 -4
- protein_quest/cli.py +292 -14
- protein_quest/converter.py +46 -0
- protein_quest/filters.py +39 -7
- protein_quest/go.py +1 -4
- protein_quest/mcp_server.py +14 -1
- protein_quest/pdbe/io.py +122 -41
- protein_quest/ss.py +284 -0
- protein_quest/taxonomy.py +1 -3
- protein_quest/uniprot.py +157 -4
- protein_quest/utils.py +28 -1
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/METADATA +48 -4
- protein_quest-0.4.0.dist-info/RECORD +26 -0
- protein_quest-0.3.1.dist-info/RECORD +0 -24
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/filters.py
CHANGED

@@ -4,7 +4,7 @@ import logging
 from collections.abc import Collection, Generator
 from dataclasses import dataclass
 from pathlib import Path
-from
+from typing import Literal
 
 from dask.distributed import Client
 from distributed.deploy.cluster import Cluster
@@ -15,6 +15,7 @@ from protein_quest.pdbe.io import (
     nr_residues_in_chain,
     write_single_chain_pdb_file,
 )
+from protein_quest.utils import CopyMethod, copyfile
 
 logger = logging.getLogger(__name__)
 
@@ -29,11 +30,17 @@ class ChainFilterStatistics:
 
 
 def filter_file_on_chain(
-    file_and_chain: tuple[Path, str],
+    file_and_chain: tuple[Path, str],
+    output_dir: Path,
+    out_chain: str = "A",
+    copy_method: CopyMethod = "copy",
 ) -> ChainFilterStatistics:
     input_file, chain_id = file_and_chain
+    logger.debug("Filtering %s on chain %s", input_file, chain_id)
     try:
-        output_file = write_single_chain_pdb_file(
+        output_file = write_single_chain_pdb_file(
+            input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
+        )
         return ChainFilterStatistics(
             input_file=input_file,
             chain_id=chain_id,
@@ -48,7 +55,8 @@ def filter_files_on_chain(
     file2chains: Collection[tuple[Path, str]],
     output_dir: Path,
     out_chain: str = "A",
-    scheduler_address: str | Cluster | None = None,
+    scheduler_address: str | Cluster | Literal["sequential"] | None = None,
+    copy_method: CopyMethod = "copy",
 ) -> list[ChainFilterStatistics]:
     """Filter mmcif/PDB files by chain.
 
@@ -58,19 +66,37 @@ def filter_files_on_chain(
         output_dir: The directory where the filtered files will be written.
         out_chain: Under what name to write the kept chain.
         scheduler_address: The address of the Dask scheduler.
+            If not provided, will create a local cluster.
+            If set to `sequential` will run tasks sequentially.
+        copy_method: How to copy when a direct copy is possible.
 
     Returns:
         Result of the filtering process.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
+    if scheduler_address == "sequential":
+
+        def task(file_and_chain: tuple[Path, str]) -> ChainFilterStatistics:
+            return filter_file_on_chain(file_and_chain, output_dir, out_chain=out_chain, copy_method=copy_method)
+
+        return list(map(task, file2chains))
+
+    # TODO make logger.debug in filter_file_on_chain show to user when --log
+    # GPT-5 generated a fairly difficult setup with a WorkerPlugin, need to find a simpler approach
     scheduler_address = configure_dask_scheduler(
         scheduler_address,
         name="filter-chain",
     )
 
     with Client(scheduler_address) as client:
+        client.forward_logging()
         return dask_map_with_progress(
-            client,
+            client,
+            filter_file_on_chain,
+            file2chains,
+            output_dir=output_dir,
+            out_chain=out_chain,
+            copy_method=copy_method,
         )
 
 
@@ -92,7 +118,12 @@ class ResidueFilterStatistics:
 
 
 def filter_files_on_residues(
-    input_files: list[Path],
+    input_files: list[Path],
+    output_dir: Path,
+    min_residues: int,
+    max_residues: int,
+    chain: str = "A",
+    copy_method: CopyMethod = "copy",
 ) -> Generator[ResidueFilterStatistics]:
     """Filter PDB/mmCIF files by number of residues in given chain.
 
@@ -102,6 +133,7 @@ def filter_files_on_residues(
         min_residues: The minimum number of residues in chain.
         max_residues: The maximum number of residues in chain.
         chain: The chain to count residues of.
+        copy_method: How to copy passed files to output directory:
 
     Yields:
         Objects containing information about the filtering process for each input file.
@@ -112,7 +144,7 @@ def filter_files_on_residues(
         passed = min_residues <= residue_count <= max_residues
        if passed:
            output_file = output_dir / input_file.name
-            copyfile(input_file, output_file)
+            copyfile(input_file, output_file, copy_method)
            yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
        else:
            yield ResidueFilterStatistics(input_file, residue_count, False, None)
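Net effect: the chain and residue filters can now run without a Dask cluster and expose how passing files reach the output directory. A minimal usage sketch based only on the signatures shown above; the paths and chain ids are hypothetical:

```python
# Sketch of the 0.4.0 filters API; input paths and chains are made up.
from pathlib import Path

from protein_quest.filters import filter_files_on_chain, filter_files_on_residues

file2chains = [(Path("in/3jrs.cif"), "B"), (Path("in/6t5y.cif"), "A")]

# New in 0.4.0: scheduler_address="sequential" bypasses Dask and runs in-process.
chain_stats = filter_files_on_chain(
    file2chains,
    output_dir=Path("chains/"),
    out_chain="A",
    scheduler_address="sequential",
    copy_method="copy",
)

# Keep only files whose chain A has between 50 and 500 residues.
for stat in filter_files_on_residues(
    sorted(Path("chains/").glob("*.cif")),
    output_dir=Path("sized/"),
    min_residues=50,
    max_residues=500,
    chain="A",
):
    print(stat)
```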
protein_quest/go.py
CHANGED

@@ -8,8 +8,8 @@ from io import TextIOWrapper
 from typing import Literal, get_args
 
 from cattrs.gen import make_dict_structure_fn, override
-from cattrs.preconf.orjson import make_converter
 
+from protein_quest.converter import converter
 from protein_quest.utils import friendly_session
 
 logger = logging.getLogger(__name__)
@@ -52,9 +52,6 @@ class SearchResponse:
     page_info: PageInfo
 
 
-converter = make_converter()
-
-
 def flatten_definition(definition, _context) -> str:
     return definition["text"]
 
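go.py now imports a shared converter from the new protein_quest/converter.py (+46 lines, not shown in this diff) instead of building its own. Judging from the imports here and in ss.py, that module must export converter, PositiveInt, and Ratio; everything else in the following sketch is an assumption:

```python
# Hypothetical reconstruction of protein_quest/converter.py; only the exported
# names (converter, PositiveInt, Ratio) are confirmed by imports in this diff.
from typing import Annotated

from annotated_types import Ge, Le  # assumed validation helpers
from cattrs.preconf.orjson import make_converter

# One shared cattrs/orjson converter, replacing per-module make_converter() calls.
converter = make_converter()

# Validated aliases used by ss.py; the exact bounds are assumptions.
PositiveInt = Annotated[int, Ge(0)]
Ratio = Annotated[float, Ge(0.0), Le(1.0)]
```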
protein_quest/mcp_server.py
CHANGED

@@ -46,8 +46,17 @@ from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.go import search_gene_ontology_term
 from protein_quest.pdbe.fetch import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
+from protein_quest.ss import filter_file_on_secondary_structure
 from protein_quest.taxonomy import search_taxon
-from protein_quest.uniprot import
+from protein_quest.uniprot import (
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
 
 mcp = FastMCP("protein-quest")
 
@@ -136,6 +145,7 @@ def search_alphafolds(
 
 
 mcp.tool(search4emdb, name="search_emdb")
+mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes")
 
 
 @mcp.tool
@@ -165,6 +175,9 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
     return filter_file_on_residues(file, query, filtered_dir)
 
 
+mcp.tool(filter_file_on_secondary_structure)
+
+
 @mcp.prompt
 def candidate_structures(
     species: str = "Human",
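The server gains two tools by handing existing functions straight to mcp.tool, with an optional name override. A toy version of that registration pattern; the FastMCP import path is an assumption based on the API used above:

```python
# Toy illustration of the mcp.tool registration pattern; assumes the fastmcp
# package, whose tool() works as a bare decorator and as a plain function call.
from fastmcp import FastMCP

mcp = FastMCP("demo")


@mcp.tool
def count_residues(sequence: str) -> int:
    """Count residues in a one-letter amino acid sequence."""
    return len(sequence)


def shout(text: str) -> str:
    """Upper-case some text."""
    return text.upper()


# Register an existing function under a custom tool name, mirroring
# mcp.tool(search4emdb, name="search_emdb") above.
mcp.tool(shout, name="shout_text")
```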
protein_quest/pdbe/io.py
CHANGED

@@ -2,12 +2,14 @@
 
 import gzip
 import logging
-from collections.abc import Generator
+from collections.abc import Generator, Iterable
+from datetime import UTC, datetime
 from pathlib import Path
 
 import gemmi
 
-from protein_quest import __version__
+from protein_quest.__version__ import __version__
+from protein_quest.utils import CopyMethod, copyfile
 
 logger = logging.getLogger(__name__)
 
@@ -28,14 +30,21 @@ def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
         The number of residues in the specified chain.
     """
     structure = gemmi.read_structure(str(file))
-
-    gchain = find_chain_in_model(model, chain)
+    gchain = find_chain_in_structure(structure, chain)
     if gchain is None:
         logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
         return 0
     return len(gchain)
 
 
+def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
+    for model in structure:
+        chain = find_chain_in_model(model, wanted_chain)
+        if chain is not None:
+            return chain
+    return None
+
+
 def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
     chain = model.find_chain(wanted_chain)
     if chain is None:
@@ -68,10 +77,12 @@ def write_structure(structure: gemmi.Structure, path: Path):
         with gzip.open(path, "wt") as f:
             f.write(body)
     elif path.name.endswith(".cif"):
-
+        # do not write chem_comp so it is viewable by molstar
+        # see https://github.com/project-gemmi/gemmi/discussions/362
+        doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
         doc.write_file(str(path))
     elif path.name.endswith(".cif.gz"):
-        doc = structure.make_mmcif_document()
+        doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
         cif_str = doc.as_string()
         with gzip.open(path, "wt") as f:
             f.write(cif_str)
@@ -111,14 +122,17 @@ def locate_structure_file(root: Path, pdb_id: str) -> Path:
     Raises:
         FileNotFoundError: If no structure file is found for the given PDB ID.
     """
-    exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb"]
-    # files downloaded from https://www.ebi.ac.uk/pdbe/ website
-    # have file names like pdb6t5y.ent or pdb6t5y.ent.gz for a PDB formatted file.
-    # TODO support pdb6t5y.ent or pdb6t5y.ent.gz file names
+    exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb", ".ent", ".ent.gz"]
     for ext in exts:
-
-
-
+        candidates = (
+            root / f"{pdb_id}{ext}",
+            root / f"{pdb_id.lower()}{ext}",
+            root / f"{pdb_id.upper()}{ext}",
+            root / f"pdb{pdb_id.lower()}{ext}",
+        )
+        for candidate in candidates:
+            if candidate.exists():
+                return candidate
     msg = f"No structure file found for {pdb_id} in {root}"
     raise FileNotFoundError(msg)
 
@@ -139,20 +153,84 @@ def glob_structure_files(input_dir: Path) -> Generator[Path]:
 class ChainNotFoundError(IndexError):
     """Exception raised when a chain is not found in a structure."""
 
-    def __init__(self, chain: str, file: Path | str):
-        super().__init__(f"Chain {chain} not found in {file}")
+    def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
+        super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
         self.chain_id = chain
         self.file = file
 
 
-def
+def _dedup_helices(structure: gemmi.Structure):
+    helix_starts: set[str] = set()
+    duplicate_helix_indexes: list[int] = []
+    for hindex, helix in enumerate(structure.helices):
+        if str(helix.start) in helix_starts:
+            logger.debug(f"Duplicate start helix found: {hindex} {helix.start}, removing")
+            duplicate_helix_indexes.append(hindex)
+        else:
+            helix_starts.add(str(helix.start))
+    for helix_index in reversed(duplicate_helix_indexes):
+        structure.helices.pop(helix_index)
+
+
+def _dedup_sheets(structure: gemmi.Structure, chain2keep: str):
+    duplicate_sheet_indexes: list[int] = []
+    for sindex, sheet in enumerate(structure.sheets):
+        if sheet.name != chain2keep:
+            duplicate_sheet_indexes.append(sindex)
+    for sheet_index in reversed(duplicate_sheet_indexes):
+        structure.sheets.pop(sheet_index)
+
+
+def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain: str):
+    old_id = structure.name
+    new_id = structure.name + f"{chain2keep}2{out_chain}"
+    structure.name = new_id
+    structure.info["_entry.id"] = new_id
+    new_title = f"From {old_id} chain {chain2keep} to {out_chain}"
+    structure.info["_struct.title"] = new_title
+    structure.info["_struct_keywords.pdbx_keywords"] = new_title.upper()
+    new_si = gemmi.SoftwareItem()
+    new_si.classification = gemmi.SoftwareItem.Classification.DataExtraction
+    new_si.name = "protein-quest.pdbe.io.write_single_chain_pdb_file"
+    new_si.version = str(__version__)
+    new_si.date = str(datetime.now(tz=UTC).date())
+    structure.meta.software = [*structure.meta.software, new_si]
+
+
+def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
+    """Get a list of chains in a structure."""
+    return {c for model in structure for c in model}
+
+
+def write_single_chain_pdb_file(
+    input_file: Path,
+    chain2keep: str,
+    output_dir: Path,
+    out_chain: str = "A",
+    copy_method: CopyMethod = "copy",
+) -> Path:
     """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.
 
+    Also
+
+    - removes ligands and waters
+    - renumbers atoms ids
+    - removes chem_comp section from cif files
+    - adds provenance information to the header like software and input file+chain
+
+    This function is equivalent to the following gemmi commands:
+
+    ```shell
+    gemmi convert --remove-lig-wat --select=B --to=cif chain-in/3JRS.cif - | \\
+        gemmi convert --from=cif --rename-chain=B:A - chain-out/3JRS_B2A.gemmi.cif
+    ```
+
     Args:
         input_file: Path to the input mmCIF/pdb file.
         chain2keep: The chain to keep.
         output_dir: Directory to save the output file.
         out_chain: The chain identifier for the output file.
+        copy_method: How to copy when no changes are needed to output file.
 
     Returns:
         Path to the output mmCIF/pdb file
@@ -162,39 +240,42 @@ def write_single_chain_pdb_file(input_file: Path, chain2keep: str, output_dir: P
         ChainNotFoundError: If the specified chain is not found in the input file.
     """
 
+    logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
     structure = gemmi.read_structure(str(input_file))
-
-
-    # Only count residues of polymer
-    model.remove_ligands_and_waters()
+    structure.setup_entities()
 
-    chain =
+    chain = find_chain_in_structure(structure, chain2keep)
+    chainnames_in_structure = {c.name for c in chains_in_structure(structure)}
     if chain is None:
-        raise ChainNotFoundError(chain2keep, input_file)
+        raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
+    chain_name = chain.name
     name, extension = _split_name_and_extension(input_file.name)
-    output_file = output_dir / f"{name}_{
+    output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
 
     if output_file.exists():
         logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
         return output_file
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if chain_name == out_chain and len(chainnames_in_structure) == 1:
+        logger.info(
+            "%s only has chain %s and out_chain is also %s. Copying file to %s.",
+            input_file,
+            chain_name,
+            out_chain,
+            output_file,
+        )
+        copyfile(input_file, output_file, copy_method)
+        return output_file
+
+    gemmi.Selection(chain_name).remove_not_selected(structure)
+    for m in structure:
+        m.remove_ligands_and_waters()
+    structure.setup_entities()
+    structure.rename_chain(chain_name, out_chain)
+    _dedup_helices(structure)
+    _dedup_sheets(structure, out_chain)
+    _add_provenance_info(structure, chain_name, out_chain)
+
+    write_structure(structure, output_file)
 
     return output_file
protein_quest/ss.py
ADDED

@@ -0,0 +1,284 @@
+"""Module for dealing with secondary structure."""
+
+import logging
+from collections.abc import Generator, Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from gemmi import Structure, read_structure, set_leak_warnings
+
+from protein_quest.converter import PositiveInt, Ratio, converter
+
+logger = logging.getLogger(__name__)
+
+# TODO remove once v0.7.4 of gemmi is released,
+# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
+# Swallow gemmi leaked function warnings
+set_leak_warnings(False)
+
+# TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
+# https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
+# gemmi executable is in https://pypi.org/project/gemmi-program/
+# `gemmi ss` only prints secondary structure to stdout with `-v` flag.
+
+
+def nr_of_residues_in_total(structure: Structure) -> int:
+    """Count the total number of residues in the structure.
+
+    Args:
+        structure: The gemmi Structure object to analyze.
+
+    Returns:
+        The total number of residues in the structure.
+    """
+    count = 0
+    for model in structure:
+        for chain in model:
+            count += len(chain)
+    return count
+
+
+def nr_of_residues_in_helix(structure: Structure) -> int:
+    """Count the number of residues in alpha helices.
+
+    Requires structure to have secondary structure information.
+
+    Args:
+        structure: The gemmi Structure object to analyze.
+
+    Returns:
+        The number of residues in alpha helices.
+    """
+    # For cif files from AlphaFold the helix.length is set to -1
+    # so use resid instead
+    count = 0
+    for helix in structure.helices:
+        end = helix.end.res_id.seqid.num
+        start = helix.start.res_id.seqid.num
+        if end is None or start is None:
+            logger.warning(f"Invalid helix coordinates: {helix.end} or {helix.start}")
+            continue
+        length = end - start + 1
+        count += length
+    return count
+
+
+def nr_of_residues_in_sheet(structure: Structure) -> int:
+    """Count the number of residues in beta sheets.
+
+    Requires structure to have secondary structure information.
+
+    Args:
+        structure: The gemmi Structure object to analyze.
+
+    Returns:
+        The number of residues in beta sheets.
+    """
+    count = 0
+    for sheet in structure.sheets:
+        for strand in sheet.strands:
+            end = strand.end.res_id.seqid.num
+            start = strand.start.res_id.seqid.num
+            if end is None or start is None:
+                logger.warning(f"Invalid strand coordinates: {strand.end} or {strand.start}")
+                continue
+            length = end - start + 1
+            count += length
+    return count
+
+
+@dataclass
+class SecondaryStructureFilterQuery:
+    """Query object to filter on secondary structure.
+
+    Parameters:
+        abs_min_helix_residues: Minimum number of residues in helices (absolute).
+        abs_max_helix_residues: Maximum number of residues in helices (absolute).
+        abs_min_sheet_residues: Minimum number of residues in sheets (absolute).
+        abs_max_sheet_residues: Maximum number of residues in sheets (absolute).
+        ratio_min_helix_residues: Minimum number of residues in helices (relative).
+        ratio_max_helix_residues: Maximum number of residues in helices (relative).
+        ratio_min_sheet_residues: Minimum number of residues in sheets (relative).
+        ratio_max_sheet_residues: Maximum number of residues in sheets (relative).
+    """
+
+    abs_min_helix_residues: PositiveInt | None = None
+    abs_max_helix_residues: PositiveInt | None = None
+    abs_min_sheet_residues: PositiveInt | None = None
+    abs_max_sheet_residues: PositiveInt | None = None
+    ratio_min_helix_residues: Ratio | None = None
+    ratio_max_helix_residues: Ratio | None = None
+    ratio_min_sheet_residues: Ratio | None = None
+    ratio_max_sheet_residues: Ratio | None = None
+
+    def is_actionable(self) -> bool:
+        """Check if the secondary structure query has any actionable filters.
+
+        Returns:
+            True if any of the filters are set, False otherwise.
+        """
+        return any(
+            field is not None
+            for field in [
+                self.abs_min_helix_residues,
+                self.abs_max_helix_residues,
+                self.abs_min_sheet_residues,
+                self.abs_max_sheet_residues,
+                self.ratio_min_helix_residues,
+                self.ratio_max_helix_residues,
+                self.ratio_min_sheet_residues,
+                self.ratio_max_sheet_residues,
+            ]
+        )
+
+
+def _check_range(min_val, max_val, label):
+    if min_val is not None and max_val is not None and min_val >= max_val:
+        msg = f"Invalid {label} range: min {min_val} must be smaller than max {max_val}"
+        raise ValueError(msg)
+
+
+base_query_hook = converter.get_structure_hook(SecondaryStructureFilterQuery)
+
+
+@converter.register_structure_hook
+def secondary_structure_filter_query_hook(value, _type) -> SecondaryStructureFilterQuery:
+    result: SecondaryStructureFilterQuery = base_query_hook(value, _type)
+    _check_range(result.abs_min_helix_residues, result.abs_max_helix_residues, "absolute helix residue")
+    _check_range(result.abs_min_sheet_residues, result.abs_max_sheet_residues, "absolute sheet residue")
+    _check_range(result.ratio_min_helix_residues, result.ratio_max_helix_residues, "ratio helix residue")
+    _check_range(result.ratio_min_sheet_residues, result.ratio_max_sheet_residues, "ratio sheet residue")
+    return result
+
+
+@dataclass
+class SecondaryStructureStats:
+    """Statistics about the secondary structure of a protein.
+
+    Parameters:
+        nr_residues: Total number of residues in the structure.
+        nr_helix_residues: Number of residues in helices.
+        nr_sheet_residues: Number of residues in sheets.
+        helix_ratio: Ratio of residues in helices.
+        sheet_ratio: Ratio of residues in sheets.
+    """
+
+    nr_residues: PositiveInt
+    nr_helix_residues: PositiveInt
+    nr_sheet_residues: PositiveInt
+    helix_ratio: Ratio
+    sheet_ratio: Ratio
+
+
+@dataclass
+class SecondaryStructureFilterResult:
+    """Result of filtering on secondary structure.
+
+    Parameters:
+        stats: The secondary structure statistics.
+        passed: Whether the structure passed the filtering criteria.
+    """
+
+    stats: SecondaryStructureStats
+    passed: bool = False
+
+
+def _gather_stats(structure: Structure) -> SecondaryStructureStats:
+    nr_total_residues = nr_of_residues_in_total(structure)
+    nr_helix_residues = nr_of_residues_in_helix(structure)
+    nr_sheet_residues = nr_of_residues_in_sheet(structure)
+    if nr_total_residues == 0:
+        msg = "Structure has zero residues; cannot compute secondary structure ratios."
+        raise ValueError(msg)
+    helix_ratio = nr_helix_residues / nr_total_residues
+    sheet_ratio = nr_sheet_residues / nr_total_residues
+    return SecondaryStructureStats(
+        nr_residues=nr_total_residues,
+        nr_helix_residues=nr_helix_residues,
+        nr_sheet_residues=nr_sheet_residues,
+        helix_ratio=helix_ratio,
+        sheet_ratio=sheet_ratio,
+    )
+
+
+def filter_on_secondary_structure(
+    structure: Structure,
+    query: SecondaryStructureFilterQuery,
+) -> SecondaryStructureFilterResult:
+    """Filter a structure based on secondary structure criteria.
+
+    Args:
+        structure: The gemmi Structure object to analyze.
+        query: The filtering criteria to apply.
+
+    Returns:
+        Filtering statistics and whether structure passed.
+    """
+    stats = _gather_stats(structure)
+    conditions: list[bool] = []
+
+    # Helix absolute thresholds
+    if query.abs_min_helix_residues is not None:
+        conditions.append(stats.nr_helix_residues >= query.abs_min_helix_residues)
+    if query.abs_max_helix_residues is not None:
+        conditions.append(stats.nr_helix_residues <= query.abs_max_helix_residues)
+
+    # Helix ratio thresholds
+    if query.ratio_min_helix_residues is not None:
+        conditions.append(stats.helix_ratio >= query.ratio_min_helix_residues)
+    if query.ratio_max_helix_residues is not None:
+        conditions.append(stats.helix_ratio <= query.ratio_max_helix_residues)
+
+    # Sheet absolute thresholds
+    if query.abs_min_sheet_residues is not None:
+        conditions.append(stats.nr_sheet_residues >= query.abs_min_sheet_residues)
+    if query.abs_max_sheet_residues is not None:
+        conditions.append(stats.nr_sheet_residues <= query.abs_max_sheet_residues)
+
+    # Sheet ratio thresholds
+    if query.ratio_min_sheet_residues is not None:
+        conditions.append(stats.sheet_ratio >= query.ratio_min_sheet_residues)
+    if query.ratio_max_sheet_residues is not None:
+        conditions.append(stats.sheet_ratio <= query.ratio_max_sheet_residues)
+
+    if not conditions:
+        msg = "No filtering conditions provided. Please specify at least one condition."
+        raise ValueError(msg)
+    passed = all(conditions)
+    return SecondaryStructureFilterResult(stats=stats, passed=passed)
+
+
+def filter_file_on_secondary_structure(
+    file_path: Path,
+    query: SecondaryStructureFilterQuery,
+) -> SecondaryStructureFilterResult:
+    """Filter a structure file based on secondary structure criteria.
+
+    Args:
+        file_path: The path to the structure file to analyze.
+        query: The filtering criteria to apply.
+
+    Returns:
+        Filtering statistics and whether file passed.
+    """
+    structure = read_structure(str(file_path))
+    return filter_on_secondary_structure(structure, query)
+
+
+def filter_files_on_secondary_structure(
+    file_paths: Iterable[Path],
+    query: SecondaryStructureFilterQuery,
+) -> Generator[tuple[Path, SecondaryStructureFilterResult]]:
+    """Filter multiple structure files based on secondary structure criteria.
+
+    Args:
+        file_paths: A list of paths to the structure files to analyze.
+        query: The filtering criteria to apply.
+
+    Yields:
+        For each file returns the filtering statistics and whether structure passed.
+    """
+    # TODO check if quick enough in serial mode, if not switch to dask map
+    for file_path in file_paths:
+        result = filter_file_on_secondary_structure(file_path, query)
+        yield file_path, result