protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

protein_quest/uniprot.py CHANGED
@@ -1,8 +1,9 @@
1
1
  """Module for searching UniProtKB using SPARQL."""
2
2
 
3
3
  import logging
4
- from collections.abc import Collection, Iterable
4
+ from collections.abc import Collection, Generator, Iterable
5
5
  from dataclasses import dataclass
6
+ from functools import cached_property
6
7
  from itertools import batched
7
8
  from textwrap import dedent
8
9
 
@@ -24,6 +25,8 @@ class Query:
24
25
  (e.g., ["GO:0005634"]) or a collection of GO terms (e.g., ["GO:0005634", "GO:0005737"]).
25
26
  molecular_function_go: Molecular function in GO format. Can be a single GO term
26
27
  (e.g., ["GO:0003674"]) or a collection of GO terms (e.g., ["GO:0003674", "GO:0008150"]).
28
+ min_sequence_length: Minimum length of the canonical sequence.
29
+ max_sequence_length: Maximum length of the canonical sequence.
27
30
  """
28
31
 
29
32
  # TODO make taxon_id an int
@@ -32,6 +35,8 @@ class Query:
32
35
  subcellular_location_uniprot: str | None = None
33
36
  subcellular_location_go: list[str] | None = None
34
37
  molecular_function_go: list[str] | None = None
38
+ min_sequence_length: int | None = None
39
+ max_sequence_length: int | None = None
35
40
 
36
41
 
37
42
  def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
@@ -39,16 +44,17 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
39
44
 
40
45
  The UniProt chains string is formatted (with EBNF notation) as follows:
41
46
 
42
- chain_group(=range)?(,chain_group(=range)?)*
47
+ chain_group=range(,chain_group=range)*
43
48
 
44
49
  where:
45
50
  chain_group := chain_id(/chain_id)*
46
- chain_id := [A-Za-z]+
51
+ chain_id := [A-Za-z0-9]+
47
52
  range := start-end
48
53
  start, end := integer
49
54
 
50
55
  Args:
51
56
  uniprot_chains: A string representing UniProt chains. For example "B/D=1-81".
57
+
52
58
  Returns:
53
59
  The first chain identifier from the UniProt chain string. For example "B".
54
60
  """
@@ -66,6 +72,27 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
66
72
  return chain
67
73
 
68
74
 
75
+ def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
76
+ """Calculates the total length of chain from a UniProt chains string.
77
+
78
+ See `_first_chain_from_uniprot_chains` for the format of the UniProt chains string.
79
+
80
+ Args:
81
+ uniprot_chains: A string representing UniProt chains. For example "B/D=1-81".
82
+
83
+ Returns:
84
+ The length of the chain in the UniProt chain string. For example 81 for "B/D=1-81".
85
+ """
86
+ total_length = 0
87
+ chains = uniprot_chains.split(",")
88
+ for chain in chains:
89
+ _, rangestr = chain.split("=")
90
+ start, stop = rangestr.split("-")
91
+ # Residue positions are 1-based so + 1
92
+ total_length += int(stop) - int(start) + 1
93
+ return total_length
94
+
95
+
69
96
  @dataclass(frozen=True)
70
97
  class PdbResult:
71
98
  """Result of a PDB search in UniProtKB.
@@ -82,11 +109,57 @@ class PdbResult:
82
109
  uniprot_chains: str
83
110
  resolution: str | None = None
84
111
 
85
- @property
112
+ @cached_property
86
113
  def chain(self) -> str:
87
114
  """The first chain from the UniProt chains aka self.uniprot_chains."""
88
115
  return _first_chain_from_uniprot_chains(self.uniprot_chains)
89
116
 
117
+ @cached_property
118
+ def chain_length(self) -> int:
119
+ """The length of the chain from the UniProt chains aka self.uniprot_chains."""
120
+ return _chain_length_from_uniprot_chains(self.uniprot_chains)
121
+
122
+
123
+ type PdbResults = dict[str, set[PdbResult]]
124
+ """Dictionary with uniprot accessions as keys and sets of PDB results as values."""
125
+
126
+
127
+ def filter_pdb_results_on_chain_length(
128
+ pdb_results: PdbResults,
129
+ min_residues: int | None,
130
+ max_residues: int | None,
131
+ ) -> PdbResults:
132
+ """Filter PDB results based on chain length.
133
+
134
+ Args:
135
+ pdb_results: Dictionary with UniProt accessions as keys and sets of PDB results as values.
136
+ min_residues: Minimum number of residues required in the chain mapped to the UniProt accession.
137
+ If None, no minimum is applied.
138
+ max_residues: Maximum number of residues allowed in the chain mapped to the UniProt accession.
139
+ If None, no maximum is applied.
140
+
141
+ Returns:
142
+ Filtered dictionary with UniProt accessions as keys and sets of PDB results as values.
143
+ """
144
+ if min_residues is None and max_residues is None:
145
+ # No filtering needed
146
+ return pdb_results
147
+ if min_residues is not None and max_residues is not None and max_residues <= min_residues:
148
+ msg = f"Maximum number of residues ({max_residues}) must be > minimum number of residues ({min_residues})"
149
+ raise ValueError(msg)
150
+ results: PdbResults = {}
151
+ for uniprot_accession, pdb_entries in pdb_results.items():
152
+ filtered_pdb_entries = {
153
+ pdb_entry
154
+ for pdb_entry in pdb_entries
155
+ if (min_residues is None or pdb_entry.chain_length >= min_residues)
156
+ and (max_residues is None or pdb_entry.chain_length <= max_residues)
157
+ }
158
+ if filtered_pdb_entries:
159
+ # Only include uniprot_accession if there are any pdb entries left after filtering
160
+ results[uniprot_accession] = filtered_pdb_entries
161
+ return results
162
+
90
163
 
91
164
  def _query2dynamic_sparql_triples(query: Query):
92
165
  parts: list[str] = []
@@ -110,6 +183,13 @@ def _query2dynamic_sparql_triples(query: Query):
110
183
  molecular_function_filter = _create_go_filter(go_terms, "Molecular function")
111
184
  parts.append(molecular_function_filter)
112
185
 
186
+ if query.min_sequence_length is not None or query.max_sequence_length is not None:
187
+ length_filter = _build_sparql_query_sequence_length_filter(
188
+ min_length=query.min_sequence_length,
189
+ max_length=query.max_sequence_length,
190
+ )
191
+ parts.append(length_filter)
192
+
113
193
  return "\n".join(parts)
114
194
 
115
195
 
@@ -237,6 +317,57 @@ def _build_sparql_query_uniprot(query: Query, limit=10_000) -> str:
237
317
  return _build_sparql_generic_query(select_clause, dedent(where_clause), limit)
238
318
 
239
319
 
320
+ def _build_sparql_query_sequence_length_filter(min_length: int | None = None, max_length: int | None = None) -> str:
321
+ """Builds a SPARQL filter for sequence length.
322
+
323
+ See 107_uniprot_sequences_and_mark_which_is_cannonical_for_human
324
+ on https://sparql.uniprot.org/.well-known/sparql-examples/ for a similar query.
325
+
326
+ Args:
327
+ min_length: Minimum sequence length. If None, no minimum is applied.
328
+ max_length: Maximum sequence length. If None, no maximum is applied.
329
+ """
330
+ if min_length is None and max_length is None:
331
+ return ""
332
+ # A UniProt entry can have multiple isoforms,
333
+ # we want to check the length of the canonical isoform
334
+ # We do this by selecting the isoform that is not based on another isoform
335
+ # and excluding isoforms from other uniprot entries.
336
+ # For example for http://purl.uniprot.org/uniprot/P42284:
337
+ # - http://purl.uniprot.org/isoforms/P42284-2 is ok
338
+ # - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
339
+ # - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
340
+ # TODO use same approach as in retrieve_uniprot_details function
341
+ header = dedent("""\
342
+ ?protein up:sequence ?isoform .
343
+ FILTER NOT EXISTS { ?isoform up:basedOn ?parent_isoform }
344
+ FILTER(
345
+ STRAFTER(STR(?protein), "http://purl.uniprot.org/uniprot/") =
346
+ STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-"))
347
+ ?isoform rdf:value ?sequence .
348
+ BIND (STRLEN(?sequence) AS ?seq_length)
349
+ """)
350
+ if min_length is not None and max_length is not None:
351
+ if max_length <= min_length:
352
+ msg = f"Maximum sequence length ({max_length}) must be greater than minimum sequence length ({min_length})"
353
+ raise ValueError(msg)
354
+ return dedent(f"""\
355
+ {header}
356
+ FILTER (?seq_length >= {min_length} && ?seq_length <= {max_length})
357
+ """)
358
+ if min_length is not None:
359
+ return dedent(f"""\
360
+ {header}
361
+ FILTER (?seq_length >= {min_length})
362
+ """)
363
+ if max_length is not None:
364
+ return dedent(f"""\
365
+ {header}
366
+ FILTER (?seq_length <= {max_length})
367
+ """)
368
+ return ""
369
+
370
+
240
371
  def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
241
372
  # For http://purl.uniprot.org/uniprot/O00268 + http://rdf.wwpdb.org/pdb/1H3O
242
373
  # the chainSequenceMapping are
@@ -248,7 +379,7 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
248
379
  # http://purl.uniprot.org/isoforms/O00255-2#PDB_3U84_tt2tt459
249
380
  # To get the chain belonging to the uniprot/pdb pair we need to
250
381
  # do some string filtering.
251
- # Also there can be multiple cnhins for the same uniprot/pdb pair, so we need to
382
+ # Also there can be multiple chains for the same uniprot/pdb pair, so we need to
252
383
  # do a group by and concat
253
384
 
254
385
  select_clause = dedent("""\
@@ -274,7 +405,12 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
274
405
  )
275
406
 
276
407
 
277
- def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
408
+ def _build_sparql_query_af(
409
+ uniprot_accs: Iterable[str],
410
+ min_sequence_length: int | None = None,
411
+ max_sequence_length: int | None = None,
412
+ limit=10_000,
413
+ ) -> str:
278
414
  select_clause = "?protein ?af_db"
279
415
  where_clause = dedent("""
280
416
  # --- Protein Selection ---
@@ -284,6 +420,12 @@ def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
284
420
  ?protein rdfs:seeAlso ?af_db .
285
421
  ?af_db up:database <http://purl.uniprot.org/database/AlphaFoldDB> .
286
422
  """)
423
+ if min_sequence_length is not None or max_sequence_length is not None:
424
+ length_filter = _build_sparql_query_sequence_length_filter(
425
+ min_length=min_sequence_length,
426
+ max_length=max_sequence_length,
427
+ )
428
+ where_clause += "\n" + length_filter
287
429
  return _build_sparql_generic_by_uniprot_accessions_query(uniprot_accs, select_clause, dedent(where_clause), limit)
288
430
 
289
431
 
@@ -337,8 +479,8 @@ def _execute_sparql_search(
337
479
  return bindings
338
480
 
339
481
 
340
- def _flatten_results_pdb(rawresults: Iterable) -> dict[str, set[PdbResult]]:
341
- pdb_entries: dict[str, set[PdbResult]] = {}
482
+ def _flatten_results_pdb(rawresults: Iterable) -> PdbResults:
483
+ pdb_entries: PdbResults = {}
342
484
  for result in rawresults:
343
485
  protein = result["protein"]["value"].split("/")[-1]
344
486
  if "pdb_db" not in result: # Should not happen with build_sparql_query_pdb
@@ -424,7 +566,7 @@ def search4uniprot(query: Query, limit: int = 10_000, timeout: int = 1_800) -> s
424
566
 
425
567
  def search4pdb(
426
568
  uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
427
- ) -> dict[str, set[PdbResult]]:
569
+ ) -> PdbResults:
428
570
  """
429
571
  Search for PDB entries in UniProtKB accessions.
430
572
 
@@ -456,13 +598,20 @@ def search4pdb(
456
598
 
457
599
 
458
600
  def search4af(
459
- uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
601
+ uniprot_accs: Collection[str],
602
+ min_sequence_length: int | None = None,
603
+ max_sequence_length: int | None = None,
604
+ limit: int = 10_000,
605
+ timeout: int = 1_800,
606
+ batch_size: int = 10_000,
460
607
  ) -> dict[str, set[str]]:
461
608
  """
462
609
  Search for AlphaFold entries in UniProtKB accessions.
463
610
 
464
611
  Args:
465
612
  uniprot_accs: UniProt accessions.
613
+ min_sequence_length: Minimum length of the canonical sequence.
614
+ max_sequence_length: Maximum length of the canonical sequence.
466
615
  limit: Maximum number of results to return.
467
616
  timeout: Timeout for the SPARQL query in seconds.
468
617
  batch_size: Size of batches to process the UniProt accessions.
@@ -474,7 +623,7 @@ def search4af(
474
623
  total = len(uniprot_accs)
475
624
  with tqdm(total=total, desc="Searching for AlphaFolds of uniprots", disable=total < batch_size, unit="acc") as pbar:
476
625
  for batch in batched(uniprot_accs, batch_size, strict=False):
477
- sparql_query = _build_sparql_query_af(batch, limit)
626
+ sparql_query = _build_sparql_query_af(batch, min_sequence_length, max_sequence_length, limit)
478
627
  logger.info("Executing SPARQL query for AlphaFold: %s", sparql_query)
479
628
 
480
629
  raw_results = _execute_sparql_search(
@@ -639,12 +788,12 @@ def search4macromolecular_complexes(
639
788
 
640
789
 
641
790
  def search4interaction_partners(
642
- uniprot_acc: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
791
+ uniprot_accession: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
643
792
  ) -> dict[str, set[str]]:
644
793
  """Search for interaction partners of a given UniProt accession using ComplexPortal database references.
645
794
 
646
795
  Args:
647
- uniprot_acc: UniProt accession to search interaction partners for.
796
+ uniprot_accession: UniProt accession to search interaction partners for.
648
797
  excludes: Set of UniProt accessions to exclude from the results.
649
798
  For example already known interaction partners.
650
799
  If None then no complex members are excluded.
@@ -655,14 +804,137 @@ def search4interaction_partners(
655
804
  Dictionary with UniProt accessions of interaction partners as keys and sets of ComplexPortal entry IDs
656
805
  in which the interaction occurs as values.
657
806
  """
658
- ucomplexes = search4macromolecular_complexes([uniprot_acc], limit=limit, timeout=timeout)
807
+ ucomplexes = search4macromolecular_complexes([uniprot_accession], limit=limit, timeout=timeout)
659
808
  hits: dict[str, set[str]] = {}
660
809
  if excludes is None:
661
810
  excludes = set()
662
811
  for ucomplex in ucomplexes:
663
812
  for member in ucomplex.members:
664
- if member != uniprot_acc and member not in excludes:
813
+ if member != uniprot_accession and member not in excludes:
665
814
  if member not in hits:
666
815
  hits[member] = set()
667
816
  hits[member].add(ucomplex.complex_id)
668
817
  return hits
818
+
819
+
820
+ @dataclass(frozen=True)
821
+ class UniprotDetails:
822
+ """Details of an UniProt entry.
823
+
824
+ Parameters:
825
+ uniprot_accession: UniProt accession.
826
+ uniprot_id: UniProt ID (mnemonic).
827
+ sequence_length: Length of the canonical sequence.
828
+ reviewed: Whether the entry is reviewed (Swiss-Prot) or unreviewed (TrEMBL).
829
+ protein_name: Recommended protein name.
830
+ taxon_id: NCBI Taxonomy ID of the organism.
831
+ taxon_name: Scientific name of the organism.
832
+ """
833
+
834
+ uniprot_accession: str
835
+ uniprot_id: str
836
+ sequence_length: int
837
+ reviewed: bool
838
+ protein_name: str
839
+ taxon_id: int
840
+ taxon_name: str
841
+
842
+
843
+ def map_uniprot_accessions2uniprot_details(
844
+ uniprot_accessions: Collection[str], timeout: int = 1_800, batch_size: int = 1000
845
+ ) -> Generator[UniprotDetails]:
846
+ """Map UniProt accessions to UniProt details by querying the UniProt SPARQL endpoint.
847
+
848
+ Example:
849
+
850
+ SPARQL query to get details for 7 UniProt entries, run on [https://sparql.uniprot.org/sparql](https://sparql.uniprot.org/sparql).
851
+
852
+ ```sparql
853
+ PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
854
+ PREFIX up: <http://purl.uniprot.org/core/>
855
+ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
856
+
857
+ SELECT
858
+ (?ac AS ?uniprot_accession)
859
+ ?uniprot_id
860
+ (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
861
+ ?taxon_name
862
+ ?reviewed
863
+ ?protein_name
864
+ (STRLEN(?sequence) AS ?seq_length)
865
+ WHERE {
866
+ # Input UniProt accessions
867
+ VALUES (?ac) { ("P05067") ("A6NGD5") ("O14627") ("P00697") ("P42284") ("A0A0B5AC95") ("A0A0S2Z4R0")}
868
+ BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)
869
+ ?protein a up:Protein .
870
+ ?protein up:mnemonic ?uniprot_id .
871
+ ?protein up:organism ?organism .
872
+ ?organism up:scientificName ?taxon_name .
873
+ ?protein up:reviewed ?reviewed .
874
+ ?protein up:recommendedName/up:fullName ?protein_name .
875
+ ?protein up:sequence ?isoform .
876
+ ?isoform a up:Simple_Sequence .
877
+ ?isoform rdf:value ?sequence .
878
+ BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
879
+ FILTER(?ac_of_isoform = ?ac)
880
+ }
881
+ ```
882
+
883
+ Args:
884
+ uniprot_accessions: Collection of UniProt accessions (must support len()).
885
+ timeout: Timeout for the SPARQL query in seconds.
886
+ batch_size: Size of batches to process the UniProt accessions.
887
+
888
+ Yields:
889
+ UniprotDetails objects in random order.
890
+ """
891
+ select_clause = dedent("""\
892
+ (?ac AS ?uniprot_accession)
893
+ ?uniprot_id
894
+ (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
895
+ ?taxon_name
896
+ ?reviewed
897
+ ?protein_name
898
+ (STRLEN(?sequence) AS ?seq_length)
899
+ """)
900
+ where_clause = dedent("""
901
+ ?protein a up:Protein .
902
+ ?protein up:mnemonic ?uniprot_id .
903
+ ?protein up:organism ?organism .
904
+ ?organism up:scientificName ?taxon_name .
905
+ ?protein up:reviewed ?reviewed .
906
+ ?protein up:recommendedName/up:fullName ?protein_name .
907
+ ?protein up:sequence ?isoform .
908
+ ?isoform a up:Simple_Sequence .
909
+ ?isoform rdf:value ?sequence .
910
+ BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
911
+ FILTER(?ac_of_isoform = ?ac)
912
+ """)
913
+ total = len(uniprot_accessions)
914
+ with tqdm(
915
+ total=total,
916
+ desc="Retrieving UniProt details",
917
+ disable=total < batch_size,
918
+ unit="acc",
919
+ ) as pbar:
920
+ for batch in batched(uniprot_accessions, batch_size, strict=False):
921
+ sparql_query = _build_sparql_generic_by_uniprot_accessions_query(
922
+ batch, select_clause, where_clause, limit=batch_size
923
+ )
924
+ logger.info("Executing SPARQL query for UniProt details: %s", sparql_query)
925
+ raw_results = _execute_sparql_search(
926
+ sparql_query=sparql_query,
927
+ timeout=timeout,
928
+ )
929
+ for raw_result in raw_results:
930
+ result = UniprotDetails(
931
+ uniprot_accession=raw_result["uniprot_accession"]["value"],
932
+ uniprot_id=raw_result["uniprot_id"]["value"],
933
+ sequence_length=int(raw_result["seq_length"]["value"]),
934
+ reviewed=raw_result["reviewed"]["value"] == "true",
935
+ protein_name=raw_result["protein_name"]["value"],
936
+ taxon_id=int(raw_result["taxon_id"]["value"]),
937
+ taxon_name=raw_result["taxon_name"]["value"],
938
+ )
939
+ yield result
940
+ pbar.update(len(batch))
protein_quest/utils.py CHANGED
@@ -265,6 +265,7 @@ async def retrieve_files(
265
265
  desc: str = "Downloading files",
266
266
  cacher: Cacher | None = None,
267
267
  chunk_size: int = 524288, # 512 KiB
268
+ gzip_files: bool = False,
268
269
  ) -> list[Path]:
269
270
  """Retrieve files from a list of URLs and save them to a directory.
270
271
 
@@ -277,6 +278,7 @@ async def retrieve_files(
277
278
  desc: Description for the progress bar.
278
279
  cacher: An optional cacher to use for caching files.
279
280
  chunk_size: The size of each chunk to read from the response.
281
+ gzip_files: Whether to gzip the downloaded files.
280
282
 
281
283
  Returns:
282
284
  A list of paths to the downloaded files.
@@ -292,6 +294,7 @@ async def retrieve_files(
292
294
  semaphore=semaphore,
293
295
  cacher=cacher,
294
296
  chunk_size=chunk_size,
297
+ gzip_files=gzip_files,
295
298
  )
296
299
  for url, filename in urls
297
300
  ]
@@ -299,6 +302,10 @@ async def retrieve_files(
299
302
  return files
300
303
 
301
304
 
305
+ class InvalidContentEncodingError(aiohttp.ClientResponseError):
306
+ """Content encoding is invalid."""
307
+
308
+
302
309
  async def _retrieve_file(
303
310
  session: RetryClient,
304
311
  url: URL | str,
@@ -306,6 +313,7 @@ async def _retrieve_file(
306
313
  semaphore: asyncio.Semaphore,
307
314
  cacher: Cacher | None = None,
308
315
  chunk_size: int = 524288, # 512 KiB
316
+ gzip_files: bool = False,
309
317
  ) -> Path:
310
318
  """Retrieve a single file from a URL and save it to a specified path.
311
319
 
@@ -316,6 +324,7 @@ async def _retrieve_file(
316
324
  semaphore: A semaphore to limit the number of concurrent downloads.
317
325
  cacher: An optional cacher to use for caching files.
318
326
  chunk_size: The size of each chunk to read from the response.
327
+ gzip_files: Whether to gzip the downloaded file.
319
328
 
320
329
  Returns:
321
330
  The path to the saved file.
@@ -330,12 +339,27 @@ async def _retrieve_file(
330
339
  logger.debug(f"File {save_path} was copied from cache {cached_file}. Skipping download from {url}.")
331
340
  return save_path
332
341
 
342
+ # The AlphaFold server and many other web servers can return gzipped responses.
343
+ # When we want to save as *.gz, we use the raw stream;
344
+ # otherwise aiohttp will decompress it automatically for us.
345
+ auto_decompress = not gzip_files
346
+ headers = {"Accept-Encoding": "gzip"}
333
347
  async with (
334
348
  semaphore,
335
- session.get(url) as resp,
349
+ session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
336
350
  ):
337
351
  resp.raise_for_status()
338
- await cacher.write_iter(save_path, resp.content.iter_chunked(chunk_size))
352
+ if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
353
+ msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
354
+ raise InvalidContentEncodingError(
355
+ request_info=resp.request_info,
356
+ history=resp.history,
357
+ status=415,
358
+ message=msg,
359
+ headers=resp.headers,
360
+ )
361
+ iterator = resp.content.iter_chunked(chunk_size)
362
+ await cacher.write_iter(save_path, iterator)
339
363
  return save_path
340
364
 
341
365
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.5.1
3
+ Version: 0.7.0
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -11,12 +11,12 @@ Requires-Python: >=3.13
11
11
  Requires-Dist: aiofiles>=24.1.0
12
12
  Requires-Dist: aiohttp-retry>=2.9.1
13
13
  Requires-Dist: aiohttp[speedups]>=3.11.18
14
- Requires-Dist: aiopath>=0.7.7
15
14
  Requires-Dist: attrs>=25.3.0
16
15
  Requires-Dist: cattrs[orjson]>=24.1.3
17
16
  Requires-Dist: dask>=2025.5.1
18
17
  Requires-Dist: distributed>=2025.5.1
19
18
  Requires-Dist: gemmi>=0.7.3
19
+ Requires-Dist: mmcif>=0.92.0
20
20
  Requires-Dist: platformdirs>=4.3.8
21
21
  Requires-Dist: psutil>=7.0.0
22
22
  Requires-Dist: rich-argparse>=1.7.1
@@ -26,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
26
26
  Requires-Dist: yarl>=1.20.1
27
27
  Provides-Extra: mcp
28
28
  Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
29
- Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
29
+ Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
30
30
  Description-Content-Type: text/markdown
31
31
 
32
32
  # protein-quest
@@ -61,6 +61,7 @@ graph TB;
61
61
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
62
62
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
63
63
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
64
+ searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
64
65
  searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
65
66
  searchcomplexes[/Search complexes/]
66
67
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
@@ -71,6 +72,8 @@ graph TB;
71
72
  fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
72
73
  confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
73
74
  residuefilter --> |mmcif_files| ssfilter
75
+ ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
76
+ ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
74
77
  classDef dashedBorder stroke-dasharray: 5 5;
75
78
  goterm:::dashedBorder
76
79
  taxonomy:::dashedBorder
@@ -78,6 +81,9 @@ graph TB;
78
81
  fetchemdb:::dashedBorder
79
82
  searchintactionpartners:::dashedBorder
80
83
  searchcomplexes:::dashedBorder
84
+ searchuniprotdetails:::dashedBorder
85
+ convert2cif:::dashedBorder
86
+ convert2uniprot_accessions:::dashedBorder
81
87
  ```
82
88
 
83
89
  (Dotted nodes and edges are side-quests.)
@@ -108,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
108
114
  protein-quest search uniprot \
109
115
  --taxon-id 9606 \
110
116
  --reviewed \
111
- --subcellular-location-uniprot nucleus \
117
+ --subcellular-location-uniprot "nucleus" \
112
118
  --subcellular-location-go GO:0005634 \
113
119
  --molecular-function-go GO:0003677 \
114
120
  --limit 100 \
@@ -191,7 +197,7 @@ protein-quest filter residue \
191
197
 
192
198
  ### To filter on secondary structure
193
199
 
194
- To filter on structure being mostly alpha helices and have no beta sheets.
200
+ To filter on structures that are mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
195
201
 
196
202
  ```shell
197
203
  protein-quest filter secondary-structure \
@@ -242,6 +248,37 @@ query_protein,complex_id,complex_url,complex_title,members
242
248
  Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
243
249
  ```
244
250
 
251
+ ### Search for UniProt details
252
+
253
+ To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
254
+
255
+ ```shell
256
+ protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
257
+ ```
258
+
259
+ The `uniprot_details.csv` looks like:
260
+
261
+ ```csv
262
+ uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
263
+ A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
264
+ ```
265
+
266
+ ### Convert structure files to .cif format
267
+
268
+ Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
269
+
270
+ ```shell
271
+ protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
272
+ ```
273
+
274
+ ### Convert structure files to UniProt accessions
275
+
276
+ After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
277
+
278
+ ```shell
279
+ protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
280
+ ```
281
+
245
282
  ## Model Context Protocol (MCP) server
246
283
 
247
284
  Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
@@ -0,0 +1,27 @@
1
+ protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ protein_quest/__version__.py,sha256=F9kNagC7uEvuPDju8Gzo4Jt81LSvbf0VyONV3GMXT2M,56
3
+ protein_quest/cli.py,sha256=082CmSSmxVZoWbnX35AmhqedA4T1dD9v-eMe0vsIDp4,55572
4
+ protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
5
+ protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
6
+ protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
7
+ protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
8
+ protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
9
+ protein_quest/mcp_server.py,sha256=tZkSG1yx4ocN1rlKgVlU8nUbs6LKpyLrNqP3y6fbJm0,8564
10
+ protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
11
+ protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
13
+ protein_quest/structure.py,sha256=QozElPz0kbPB_HW-J1WxArTT5e-1vRyBJoBSfHnwoRM,8117
14
+ protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
15
+ protein_quest/uniprot.py,sha256=mODAcneCnDvinvJ3jffyR11klsgq5b96T_4aVWd-Luw,35158
16
+ protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
17
+ protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
18
+ protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
19
+ protein_quest/alphafold/entry_summary.py,sha256=Qhnw75RXFaoOU332g7axg_jYbbdZbUpsGPUOwPNDSeU,2114
20
+ protein_quest/alphafold/fetch.py,sha256=l8pcXeuDfoXYiwpW5N_uB_9oZpomBgUeF9kROLrM11M,14038
21
+ protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
22
+ protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
23
+ protein_quest-0.7.0.dist-info/METADATA,sha256=JvsZl9XGN57iJn5oSBRIVNIqL6aYEHXQlGpE87nsSvQ,10722
24
+ protein_quest-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ protein_quest-0.7.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
26
+ protein_quest-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
+ protein_quest-0.7.0.dist-info/RECORD,,