PyPI - protein-quest - Versions diffs - 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of protein-quest might be problematic. Click here for more details.

Files changed (18) hide show

protein_quest/__version__.py +1 -1
protein_quest/alphafold/confidence.py +2 -2
protein_quest/alphafold/entry_summary.py +46 -22
protein_quest/alphafold/fetch.py +76 -42
protein_quest/cli.py +385 -114
protein_quest/filters.py +2 -5
protein_quest/io.py +350 -0
protein_quest/mcp_server.py +21 -7
protein_quest/ss.py +3 -7
protein_quest/{pdbe/io.py → structure.py} +77 -126
protein_quest/uniprot.py +287 -15
protein_quest/utils.py +26 -2
{protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/METADATA +42 -5
protein_quest-0.7.0.dist-info/RECORD +27 -0
protein_quest-0.5.1.dist-info/RECORD +0 -26
{protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/WHEEL +0 -0
{protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/entry_points.txt +0 -0
{protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/licenses/LICENSE +0 -0

protein_quest/uniprot.py CHANGED Viewed

@@ -1,8 +1,9 @@
 """Module for searching UniProtKB using SPARQL."""
 import logging
-from collections.abc import Collection, Iterable
+from collections.abc import Collection, Generator, Iterable
 from dataclasses import dataclass
+from functools import cached_property
 from itertools import batched
 from textwrap import dedent
@@ -24,6 +25,8 @@ class Query:
             (e.g., ["GO:0005634"]) or a collection of GO terms (e.g., ["GO:0005634", "GO:0005737"]).
         molecular_function_go: Molecular function in GO format. Can be a single GO term
             (e.g., ["GO:0003674"]) or a collection of GO terms (e.g., ["GO:0003674", "GO:0008150"]).
+        min_sequence_length: Minimum length of the canonical sequence.
+        max_sequence_length: Maximum length of the canonical sequence.
     """
     # TODO make taxon_id an int
@@ -32,6 +35,8 @@ class Query:
     subcellular_location_uniprot: str | None = None
     subcellular_location_go: list[str] | None = None
     molecular_function_go: list[str] | None = None
+    min_sequence_length: int | None = None
+    max_sequence_length: int | None = None
 def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
@@ -39,16 +44,17 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
     The UniProt chains string is formatted (with EBNF notation) as follows:
-        chain_group(=range)?(,chain_group(=range)?)*
+        chain_group=range(,chain_group=range)*
     where:
         chain_group := chain_id(/chain_id)*
-        chain_id    := [A-Za-z]+
+        chain_id    := [A-Za-z0-9]+
         range       := start-end
         start, end  := integer
     Args:
         uniprot_chains: A string representing UniProt chains, For example "B/D=1-81".
     Returns:
         The first chain identifier from the UniProt chain string. For example "B".
     """
@@ -66,6 +72,27 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
     return chain
+def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
+    """Calculates the total length of chain from a UniProt chains string.
+    See `_first_chain_from_uniprot_chains` for the format of the UniProt chains string.
+    Args:
+        uniprot_chains: A string representing UniProt chains, For example "B/D=1-81".
+    Returns:
+        The length of the chain in the UniProt chain string. For example 81 for "B/D=1-81".
+    """
+    total_length = 0
+    chains = uniprot_chains.split(",")
+    for chain in chains:
+        _, rangestr = chain.split("=")
+        start, stop = rangestr.split("-")
+        # Residue positions are 1-based so + 1
+        total_length += int(stop) - int(start) + 1
+    return total_length
 @dataclass(frozen=True)
 class PdbResult:
     """Result of a PDB search in UniProtKB.
@@ -82,11 +109,57 @@ class PdbResult:
     uniprot_chains: str
     resolution: str | None = None
-    @property
+    @cached_property
     def chain(self) -> str:
         """The first chain from the UniProt chains aka self.uniprot_chains."""
         return _first_chain_from_uniprot_chains(self.uniprot_chains)
+    @cached_property
+    def chain_length(self) -> int:
+        """The length of the chain from the UniProt chains aka self.uniprot_chains."""
+        return _chain_length_from_uniprot_chains(self.uniprot_chains)
+type PdbResults = dict[str, set[PdbResult]]
+"""Dictionary with uniprot accessions as keys and sets of PDB results as values."""
+def filter_pdb_results_on_chain_length(
+    pdb_results: PdbResults,
+    min_residues: int | None,
+    max_residues: int | None,
+) -> PdbResults:
+    """Filter PDB results based on chain length.
+    Args:
+        pdb_results: Dictionary with protein IDs as keys and sets of PDB results as values.
+        min_residues: Minimum number of residues required in the chain mapped to the UniProt accession.
+            If None, no minimum is applied.
+        max_residues: Maximum number of residues allowed in chain mapped to the UniProt accession.
+            If None, no maximum is applied.
+    Returns:
+        Filtered dictionary with protein IDs as keys and sets of PDB results as values.
+    """
+    if min_residues is None and max_residues is None:
+        # No filtering needed
+        return pdb_results
+    if min_residues is not None and max_residues is not None and max_residues <= min_residues:
+        msg = f"Maximum number of residues ({max_residues}) must be > minimum number of residues ({min_residues})"
+        raise ValueError(msg)
+    results: PdbResults = {}
+    for uniprot_accession, pdb_entries in pdb_results.items():
+        filtered_pdb_entries = {
+            pdb_entry
+            for pdb_entry in pdb_entries
+            if (min_residues is None or pdb_entry.chain_length >= min_residues)
+            and (max_residues is None or pdb_entry.chain_length <= max_residues)
+        }
+        if filtered_pdb_entries:
+            # Only include uniprot_accession if there are any pdb entries left after filtering
+            results[uniprot_accession] = filtered_pdb_entries
+    return results
 def _query2dynamic_sparql_triples(query: Query):
     parts: list[str] = []
@@ -110,6 +183,13 @@ def _query2dynamic_sparql_triples(query: Query):
         molecular_function_filter = _create_go_filter(go_terms, "Molecular function")
         parts.append(molecular_function_filter)
+    if query.min_sequence_length is not None or query.max_sequence_length is not None:
+        length_filter = _build_sparql_query_sequence_length_filter(
+            min_length=query.min_sequence_length,
+            max_length=query.max_sequence_length,
+        )
+        parts.append(length_filter)
     return "\n".join(parts)
@@ -237,6 +317,57 @@ def _build_sparql_query_uniprot(query: Query, limit=10_000) -> str:
     return _build_sparql_generic_query(select_clause, dedent(where_clause), limit)
+def _build_sparql_query_sequence_length_filter(min_length: int | None = None, max_length: int | None = None) -> str:
+    """Builds a SPARQL filter for sequence length.
+    See 107_uniprot_sequences_and_mark_which_is_cannonical_for_human
+    on https://sparql.uniprot.org/.well-known/sparql-examples/ for similar query.
+    Args:
+        min_length: Minimum sequence length. If None, no minimum is applied.
+        max_length: Maximum sequence length. If None, no maximum is applied.
+    """
+    if min_length is None and max_length is None:
+        return ""
+    # An uniprot entry can have multiple isoforms,
+    # we want to check the length of the canonical isoform
+    # We do this by selecting the isoform that is not based on another isoform
+    # and excluding isoforms from other uniprot entries.
+    # For example for http://purl.uniprot.org/uniprot/P42284:
+    # - http://purl.uniprot.org/isoforms/P42284-2 is ok
+    # - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
+    # - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
+    # TODO use same approach as in retrieve_uniprot_details function
+    header = dedent("""\
+        ?protein up:sequence ?isoform .
+        FILTER NOT EXISTS { ?isoform up:basedOn ?parent_isoform }
+        FILTER(
+            STRAFTER(STR(?protein), "http://purl.uniprot.org/uniprot/") =
+            STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-"))
+        ?isoform rdf:value ?sequence .
+        BIND (STRLEN(?sequence) AS ?seq_length)
+    """)
+    if min_length is not None and max_length is not None:
+        if max_length <= min_length:
+            msg = f"Maximum sequence length ({max_length}) must be greater than minimum sequence length ({min_length})"
+            raise ValueError(msg)
+        return dedent(f"""\
+            {header}
+            FILTER (?seq_length >= {min_length} && ?seq_length <= {max_length})
+        """)
+    if min_length is not None:
+        return dedent(f"""\
+            {header}
+            FILTER (?seq_length >= {min_length})
+        """)
+    if max_length is not None:
+        return dedent(f"""\
+            {header}
+            FILTER (?seq_length <= {max_length})
+        """)
+    return ""
 def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
     # For http://purl.uniprot.org/uniprot/O00268 + http://rdf.wwpdb.org/pdb/1H3O
     # the chainSequenceMapping are
@@ -248,7 +379,7 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
     # http://purl.uniprot.org/isoforms/O00255-2#PDB_3U84_tt2tt459
     # To get the the chain belonging to the uniprot/pdb pair we need to
     # do some string filtering.
-    # Also there can be multiple cnhins for the same uniprot/pdb pair, so we need to
+    # Also there can be multiple chains for the same uniprot/pdb pair, so we need to
     # do a group by and concat
     select_clause = dedent("""\
@@ -274,7 +405,12 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
     )
-def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
+def _build_sparql_query_af(
+    uniprot_accs: Iterable[str],
+    min_sequence_length: int | None = None,
+    max_sequence_length: int | None = None,
+    limit=10_000,
+) -> str:
     select_clause = "?protein ?af_db"
     where_clause = dedent("""
         # --- Protein Selection ---
@@ -284,6 +420,12 @@ def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
         ?protein rdfs:seeAlso ?af_db .
         ?af_db up:database <http://purl.uniprot.org/database/AlphaFoldDB> .
     """)
+    if min_sequence_length is not None or max_sequence_length is not None:
+        length_filter = _build_sparql_query_sequence_length_filter(
+            min_length=min_sequence_length,
+            max_length=max_sequence_length,
+        )
+        where_clause += "\n" + length_filter
     return _build_sparql_generic_by_uniprot_accessions_query(uniprot_accs, select_clause, dedent(where_clause), limit)
@@ -337,8 +479,8 @@ def _execute_sparql_search(
     return bindings
-def _flatten_results_pdb(rawresults: Iterable) -> dict[str, set[PdbResult]]:
-    pdb_entries: dict[str, set[PdbResult]] = {}
+def _flatten_results_pdb(rawresults: Iterable) -> PdbResults:
+    pdb_entries: PdbResults = {}
     for result in rawresults:
         protein = result["protein"]["value"].split("/")[-1]
         if "pdb_db" not in result:  # Should not happen with build_sparql_query_pdb
@@ -424,7 +566,7 @@ def search4uniprot(query: Query, limit: int = 10_000, timeout: int = 1_800) -> s
 def search4pdb(
     uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
-) -> dict[str, set[PdbResult]]:
+) -> PdbResults:
     """
     Search for PDB entries in UniProtKB accessions.
@@ -456,13 +598,20 @@ def search4pdb(
 def search4af(
-    uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
+    uniprot_accs: Collection[str],
+    min_sequence_length: int | None = None,
+    max_sequence_length: int | None = None,
+    limit: int = 10_000,
+    timeout: int = 1_800,
+    batch_size: int = 10_000,
 ) -> dict[str, set[str]]:
     """
     Search for AlphaFold entries in UniProtKB accessions.
     Args:
         uniprot_accs: UniProt accessions.
+        min_sequence_length: Minimum length of the canonical sequence.
+        max_sequence_length: Maximum length of the canonical sequence.
         limit: Maximum number of results to return.
         timeout: Timeout for the SPARQL query in seconds.
         batch_size: Size of batches to process the UniProt accessions.
@@ -474,7 +623,7 @@ def search4af(
     total = len(uniprot_accs)
     with tqdm(total=total, desc="Searching for AlphaFolds of uniprots", disable=total < batch_size, unit="acc") as pbar:
         for batch in batched(uniprot_accs, batch_size, strict=False):
-            sparql_query = _build_sparql_query_af(batch, limit)
+            sparql_query = _build_sparql_query_af(batch, min_sequence_length, max_sequence_length, limit)
             logger.info("Executing SPARQL query for AlphaFold: %s", sparql_query)
             raw_results = _execute_sparql_search(
@@ -639,12 +788,12 @@ def search4macromolecular_complexes(
 def search4interaction_partners(
-    uniprot_acc: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
+    uniprot_accession: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
 ) -> dict[str, set[str]]:
     """Search for interaction partners of a given UniProt accession using ComplexPortal database references.
     Args:
-        uniprot_acc: UniProt accession to search interaction partners for.
+        uniprot_accession: UniProt accession to search interaction partners for.
         excludes: Set of UniProt accessions to exclude from the results.
             For example already known interaction partners.
             If None then no complex members are excluded.
@@ -655,14 +804,137 @@ def search4interaction_partners(
         Dictionary with UniProt accessions of interaction partners as keys and sets of ComplexPortal entry IDs
         in which the interaction occurs as values.
     """
-    ucomplexes = search4macromolecular_complexes([uniprot_acc], limit=limit, timeout=timeout)
+    ucomplexes = search4macromolecular_complexes([uniprot_accession], limit=limit, timeout=timeout)
     hits: dict[str, set[str]] = {}
     if excludes is None:
         excludes = set()
     for ucomplex in ucomplexes:
         for member in ucomplex.members:
-            if member != uniprot_acc and member not in excludes:
+            if member != uniprot_accession and member not in excludes:
                 if member not in hits:
                     hits[member] = set()
                 hits[member].add(ucomplex.complex_id)
     return hits
+@dataclass(frozen=True)
+class UniprotDetails:
+    """Details of an UniProt entry.
+    Parameters:
+        uniprot_accession: UniProt accession.
+        uniprot_id: UniProt ID (mnemonic).
+        sequence_length: Length of the canonical sequence.
+        reviewed: Whether the entry is reviewed (Swiss-Prot) or unreviewed (TrEMBL).
+        protein_name: Recommended protein name.
+        taxon_id: NCBI Taxonomy ID of the organism.
+        taxon_name: Scientific name of the organism.
+    """
+    uniprot_accession: str
+    uniprot_id: str
+    sequence_length: int
+    reviewed: bool
+    protein_name: str
+    taxon_id: int
+    taxon_name: str
+def map_uniprot_accessions2uniprot_details(
+    uniprot_accessions: Collection[str], timeout: int = 1_800, batch_size: int = 1000
+) -> Generator[UniprotDetails]:
+    """Map UniProt accessions to UniProt details by querying the UniProt SPARQL endpoint.
+    Example:
+    SPARQL query to get details for 7 UniProt entries, run on [https://sparql.uniprot.org/sparql](https://sparql.uniprot.org/sparql).
+    ```sparql
+    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+    PREFIX up:   <http://purl.uniprot.org/core/>
+    PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    SELECT
+    (?ac AS ?uniprot_accession)
+    ?uniprot_id
+    (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
+    ?taxon_name
+    ?reviewed
+    ?protein_name
+    (STRLEN(?sequence) AS ?seq_length)
+    WHERE {
+    # Input UniProt accessions
+    VALUES (?ac) { ("P05067") ("A6NGD5") ("O14627") ("P00697") ("P42284") ("A0A0B5AC95") ("A0A0S2Z4R0")}
+    BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)
+    ?protein a up:Protein .
+    ?protein up:mnemonic ?uniprot_id .
+    ?protein up:organism ?organism .
+    ?organism up:scientificName ?taxon_name .
+    ?protein up:reviewed ?reviewed .
+    ?protein up:recommendedName/up:fullName ?protein_name .
+    ?protein up:sequence ?isoform .
+    ?isoform a up:Simple_Sequence .
+    ?isoform rdf:value ?sequence .
+    BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
+    FILTER(?ac_of_isoform = ?ac)
+    }
+    ```
+    Args:
+        uniprot_accessions: Iterable of UniProt accessions.
+        timeout: Timeout for the SPARQL query in seconds.
+        batch_size: Size of batches to process the UniProt accessions.
+    Yields:
+        UniprotDetails objects in random order.
+    """
+    select_clause = dedent("""\
+        (?ac AS ?uniprot_accession)
+        ?uniprot_id
+        (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
+        ?taxon_name
+        ?reviewed
+        ?protein_name
+        (STRLEN(?sequence) AS ?seq_length)
+    """)
+    where_clause = dedent("""
+        ?protein a up:Protein .
+        ?protein up:mnemonic ?uniprot_id .
+        ?protein up:organism ?organism .
+        ?organism up:scientificName ?taxon_name .
+        ?protein up:reviewed ?reviewed .
+        ?protein up:recommendedName/up:fullName ?protein_name .
+        ?protein up:sequence ?isoform .
+        ?isoform a up:Simple_Sequence .
+        ?isoform rdf:value ?sequence .
+        BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
+        FILTER(?ac_of_isoform = ?ac)
+    """)
+    total = len(uniprot_accessions)
+    with tqdm(
+        total=total,
+        desc="Retrieving UniProt details",
+        disable=total < batch_size,
+        unit="acc",
+    ) as pbar:
+        for batch in batched(uniprot_accessions, batch_size, strict=False):
+            sparql_query = _build_sparql_generic_by_uniprot_accessions_query(
+                batch, select_clause, where_clause, limit=batch_size
+            )
+            logger.info("Executing SPARQL query for UniProt details: %s", sparql_query)
+            raw_results = _execute_sparql_search(
+                sparql_query=sparql_query,
+                timeout=timeout,
+            )
+            for raw_result in raw_results:
+                result = UniprotDetails(
+                    uniprot_accession=raw_result["uniprot_accession"]["value"],
+                    uniprot_id=raw_result["uniprot_id"]["value"],
+                    sequence_length=int(raw_result["seq_length"]["value"]),
+                    reviewed=raw_result["reviewed"]["value"] == "true",
+                    protein_name=raw_result["protein_name"]["value"],
+                    taxon_id=int(raw_result["taxon_id"]["value"]),
+                    taxon_name=raw_result["taxon_name"]["value"],
+                )
+                yield result
+            pbar.update(len(batch))

protein_quest/utils.py CHANGED Viewed

@@ -265,6 +265,7 @@ async def retrieve_files(
     desc: str = "Downloading files",
     cacher: Cacher | None = None,
     chunk_size: int = 524288,  # 512 KiB
+    gzip_files: bool = False,
 ) -> list[Path]:
     """Retrieve files from a list of URLs and save them to a directory.
@@ -277,6 +278,7 @@ async def retrieve_files(
         desc: Description for the progress bar.
         cacher: An optional cacher to use for caching files.
         chunk_size: The size of each chunk to read from the response.
+        gzip_files: Whether to gzip the downloaded files.
     Returns:
         A list of paths to the downloaded files.
@@ -292,6 +294,7 @@ async def retrieve_files(
                 semaphore=semaphore,
                 cacher=cacher,
                 chunk_size=chunk_size,
+                gzip_files=gzip_files,
             )
             for url, filename in urls
         ]
@@ -299,6 +302,10 @@ async def retrieve_files(
         return files
+class InvalidContentEncodingError(aiohttp.ClientResponseError):
+    """Content encoding is invalid."""
 async def _retrieve_file(
     session: RetryClient,
     url: URL | str,
@@ -306,6 +313,7 @@ async def _retrieve_file(
     semaphore: asyncio.Semaphore,
     cacher: Cacher | None = None,
     chunk_size: int = 524288,  # 512 KiB
+    gzip_files: bool = False,
 ) -> Path:
     """Retrieve a single file from a URL and save it to a specified path.
@@ -316,6 +324,7 @@ async def _retrieve_file(
         semaphore: A semaphore to limit the number of concurrent downloads.
         cacher: An optional cacher to use for caching files.
         chunk_size: The size of each chunk to read from the response.
+        gzip_files: Whether to gzip the downloaded file.
     Returns:
         The path to the saved file.
@@ -330,12 +339,27 @@ async def _retrieve_file(
         logger.debug(f"File {save_path} was copied from cache {cached_file}. Skipping download from {url}.")
         return save_path
+    # Alphafold server and many other web servers can return gzipped responses,
+    # when we want to save as *.gz, we use raw stream
+    # otherwise aiohttp will decompress it automatically for us.
+    auto_decompress = not gzip_files
+    headers = {"Accept-Encoding": "gzip"}
     async with (
         semaphore,
-        session.get(url) as resp,
+        session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
     ):
         resp.raise_for_status()
-        await cacher.write_iter(save_path, resp.content.iter_chunked(chunk_size))
+        if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
+            msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
+            raise InvalidContentEncodingError(
+                request_info=resp.request_info,
+                history=resp.history,
+                status=415,
+                message=msg,
+                headers=resp.headers,
+            )
+        iterator = resp.content.iter_chunked(chunk_size)
+        await cacher.write_iter(save_path, iterator)
     return save_path

{protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.5.1
+Version: 0.7.0
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -11,12 +11,12 @@ Requires-Python: >=3.13
 Requires-Dist: aiofiles>=24.1.0
 Requires-Dist: aiohttp-retry>=2.9.1
 Requires-Dist: aiohttp[speedups]>=3.11.18
-Requires-Dist: aiopath>=0.7.7
 Requires-Dist: attrs>=25.3.0
 Requires-Dist: cattrs[orjson]>=24.1.3
 Requires-Dist: dask>=2025.5.1
 Requires-Dist: distributed>=2025.5.1
 Requires-Dist: gemmi>=0.7.3
+Requires-Dist: mmcif>=0.92.0
 Requires-Dist: platformdirs>=4.3.8
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: rich-argparse>=1.7.1
@@ -26,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
 Requires-Dist: yarl>=1.20.1
 Provides-Extra: mcp
 Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
-Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
+Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
 Description-Content-Type: text/markdown
 # protein-quest
@@ -61,6 +61,7 @@ graph TB;
     searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
     searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
     searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
+    searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
     searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
     searchcomplexes[/Search complexes/]
     searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
@@ -71,6 +72,8 @@ graph TB;
     fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
     confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
     residuefilter --> |mmcif_files| ssfilter
+    ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
+    ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
     classDef dashedBorder stroke-dasharray: 5 5;
     goterm:::dashedBorder
     taxonomy:::dashedBorder
@@ -78,6 +81,9 @@ graph TB;
     fetchemdb:::dashedBorder
     searchintactionpartners:::dashedBorder
     searchcomplexes:::dashedBorder
+    searchuniprotdetails:::dashedBorder
+    convert2cif:::dashedBorder
+    convert2uniprot_accessions:::dashedBorder
 ```
 (Dotted nodes and edges are side-quests.)
@@ -108,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
 protein-quest search uniprot \
     --taxon-id 9606 \
     --reviewed \
-    --subcellular-location-uniprot nucleus \
+    --subcellular-location-uniprot "nucleus" \
     --subcellular-location-go GO:0005634 \
     --molecular-function-go GO:0003677 \
     --limit 100 \
@@ -191,7 +197,7 @@ protein-quest filter residue  \
 ### To filter on secondary structure
-To filter on structure being mostly alpha helices and have no beta sheets.
+To filter on structure being mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
 ```shell
 protein-quest filter secondary-structure \
@@ -242,6 +248,37 @@ query_protein,complex_id,complex_url,complex_title,members
 Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
 ```
+### Search for UniProt details
+To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
+```shell
+protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
+```
+The `uniprot_details.csv` looks like:
+```csv
+uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
+A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
+```
+### Convert structure files to .cif format
+Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
+```shell
+protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
+```
+### Convert structure files to UniProt accessions
+After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
+```shell
+protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
+```
 ##  Model Context Protocol (MCP) server
 Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.

protein_quest-0.7.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,27 @@
+protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/__version__.py,sha256=F9kNagC7uEvuPDju8Gzo4Jt81LSvbf0VyONV3GMXT2M,56
+protein_quest/cli.py,sha256=082CmSSmxVZoWbnX35AmhqedA4T1dD9v-eMe0vsIDp4,55572
+protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
+protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
+protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
+protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
+protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
+protein_quest/mcp_server.py,sha256=tZkSG1yx4ocN1rlKgVlU8nUbs6LKpyLrNqP3y6fbJm0,8564
+protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
+protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
+protein_quest/structure.py,sha256=QozElPz0kbPB_HW-J1WxArTT5e-1vRyBJoBSfHnwoRM,8117
+protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
+protein_quest/uniprot.py,sha256=mODAcneCnDvinvJ3jffyR11klsgq5b96T_4aVWd-Luw,35158
+protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
+protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
+protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
+protein_quest/alphafold/entry_summary.py,sha256=Qhnw75RXFaoOU332g7axg_jYbbdZbUpsGPUOwPNDSeU,2114
+protein_quest/alphafold/fetch.py,sha256=l8pcXeuDfoXYiwpW5N_uB_9oZpomBgUeF9kROLrM11M,14038
+protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
+protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
+protein_quest-0.7.0.dist-info/METADATA,sha256=JvsZl9XGN57iJn5oSBRIVNIqL6aYEHXQlGpE87nsSvQ,10722
+protein_quest-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+protein_quest-0.7.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+protein_quest-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+protein_quest-0.7.0.dist-info/RECORD,,

protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

Potentially problematic release.

protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl