protein-quest 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,10 +45,14 @@ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
45
45
  from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
46
46
  from protein_quest.emdb import fetch as emdb_fetch
47
47
  from protein_quest.go import search_gene_ontology_term
48
- from protein_quest.io import convert_to_cif_file, glob_structure_files
48
+ from protein_quest.io import convert_to_cif_file, glob_structure_files, read_structure
49
49
  from protein_quest.pdbe.fetch import fetch as pdbe_fetch
50
50
  from protein_quest.ss import filter_file_on_secondary_structure
51
- from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
51
+ from protein_quest.structure import (
52
+ nr_residues_in_chain,
53
+ structure2uniprot_accessions,
54
+ write_single_chain_structure_file,
55
+ )
52
56
  from protein_quest.taxonomy import search_taxon
53
57
  from protein_quest.uniprot import (
54
58
  PdbResult,
@@ -129,7 +133,7 @@ def extract_single_chain_from_structure(
129
133
 
130
134
  @mcp.tool
131
135
  def list_structure_files(path: Path) -> list[Path]:
132
- """List structure files (.pdb, .pdb.gz, .cif, .cif.gz) in the specified directory."""
136
+ """List structure files (.pdb, .pdb.gz, .cif, .cif.gz, .bcif) in the specified directory."""
133
137
  return list(glob_structure_files(path))
134
138
 
135
139
 
@@ -150,7 +154,7 @@ def search_alphafolds(
150
154
  Field(description="Set of uniprot accessions which have an AlphaFold entry"),
151
155
  ]:
152
156
  """Search for AlphaFold entries in UniProtKB accessions."""
153
- # each uniprot accesion can have one or more AlphaFold IDs
157
+ # each uniprot accession can have one or more AlphaFold IDs
154
158
  # an AlphaFold ID is the same as the uniprot accession
155
159
  # so we return a subset of uniprot_accs
156
160
  results = search4af(uniprot_accs, limit)
@@ -163,7 +167,7 @@ mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes"
163
167
 
164
168
  @mcp.tool
165
169
  def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[AlphaFoldEntry]:
166
- """Fetch the AlphaFold summary and mmcif file for given UniProt accessions.
170
+ """Fetch the AlphaFold mmCIF file for given UniProt accessions.
167
171
 
168
172
  Args:
169
173
  uniprot_accs: A set of UniProt accessions.
@@ -172,8 +176,8 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
172
176
  Returns:
173
177
  A list of AlphaFold entries.
174
178
  """
175
- what: set[DownloadableFormat] = {"summary", "cif"}
176
- return alphafold_fetch(uniprot_accs, save_dir, what)
179
+ formats: set[DownloadableFormat] = {"cif"}
180
+ return alphafold_fetch(uniprot_accs, save_dir, formats)
177
181
 
178
182
 
179
183
  @mcp.tool
@@ -203,6 +207,13 @@ mcp.tool(filter_file_on_secondary_structure)
203
207
  mcp.tool(convert_to_cif_file)
204
208
 
205
209
 
210
@mcp.tool
def uniprot_accessions_of_structure_file(file: Path) -> set[str]:
    """Extract UniProt accessions from structure file."""
    # NOTE(review): the docstring is presumably exposed as the tool description by
    # `mcp.tool`, so it is kept verbatim -- confirm before rewording it.
    # Read the file into a structure and hand it straight to the extractor.
    return structure2uniprot_accessions(read_structure(file))
215
+
216
+
206
217
  @mcp.prompt
207
218
  def candidate_structures(
208
219
  species: str = "Human",
@@ -206,3 +206,27 @@ def write_single_chain_structure_file(
206
206
  write_structure(structure, output_file)
207
207
 
208
208
  return output_file
209
+
210
+
211
def structure2uniprot_accessions(structure: gemmi.Structure) -> set[str]:
    """Extract UniProt accessions from a gemmi Structure object.

    The structure is rendered as an mmCIF block restricted to the `struct_ref` output group,
    and the `pdbx_db_accession` of every `_struct_ref` row whose `db_name` is "UNP" is collected.

    Logs a warning and returns an empty set if no accessions are found in structure.

    Args:
        structure: The gemmi Structure object to extract UniProt accessions from.

    Returns:
        A set of UniProt accessions found in the structure.
    """
    # Switch every output group off except struct_ref, so only that category is generated.
    block = structure.make_mmcif_block(gemmi.MmcifOutputGroups(False, struct_ref=True))
    struct_ref = block.get_mmcif_category("_struct_ref.")
    # Use .get() so that a structure without database references (category or column absent)
    # takes the documented "warn and return an empty set" path instead of raising KeyError.
    db_names = struct_ref.get("db_name", [])
    accessions = struct_ref.get("pdbx_db_accession", [])
    uniprot_accessions: set[str] = {
        accession
        for db_name, accession in zip(db_names, accessions)
        # NOTE(review): gemmi presumably maps the mmCIF placeholders '?' and '.' to non-string
        # values; only real, non-empty strings are kept so the result stays a set[str].
        if db_name == "UNP" and isinstance(accession, str) and accession
    }
    if not uniprot_accessions:
        logger.warning("No UniProt accessions found in structure %s", structure.name)
    return uniprot_accessions
protein_quest/uniprot.py CHANGED
@@ -1,8 +1,9 @@
1
1
  """Module for searching UniProtKB using SPARQL."""
2
2
 
3
3
  import logging
4
- from collections.abc import Collection, Iterable
4
+ from collections.abc import Collection, Generator, Iterable
5
5
  from dataclasses import dataclass
6
+ from functools import cached_property
6
7
  from itertools import batched
7
8
  from textwrap import dedent
8
9
 
@@ -24,6 +25,8 @@ class Query:
24
25
  (e.g., ["GO:0005634"]) or a collection of GO terms (e.g., ["GO:0005634", "GO:0005737"]).
25
26
  molecular_function_go: Molecular function in GO format. Can be a single GO term
26
27
  (e.g., ["GO:0003674"]) or a collection of GO terms (e.g., ["GO:0003674", "GO:0008150"]).
28
+ min_sequence_length: Minimum length of the canonical sequence.
29
+ max_sequence_length: Maximum length of the canonical sequence.
27
30
  """
28
31
 
29
32
  # TODO make taxon_id an int
@@ -32,6 +35,8 @@ class Query:
32
35
  subcellular_location_uniprot: str | None = None
33
36
  subcellular_location_go: list[str] | None = None
34
37
  molecular_function_go: list[str] | None = None
38
+ min_sequence_length: int | None = None
39
+ max_sequence_length: int | None = None
35
40
 
36
41
 
37
42
  def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
@@ -39,16 +44,17 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
39
44
 
40
45
  The UniProt chains string is formatted (with EBNF notation) as follows:
41
46
 
42
- chain_group(=range)?(,chain_group(=range)?)*
47
+ chain_group=range(,chain_group=range)*
43
48
 
44
49
  where:
45
50
  chain_group := chain_id(/chain_id)*
46
- chain_id := [A-Za-z]+
51
+ chain_id := [A-Za-z0-9]+
47
52
  range := start-end
48
53
  start, end := integer
49
54
 
50
55
  Args:
51
56
  uniprot_chains: A string representing UniProt chains. For example "B/D=1-81".
57
+
52
58
  Returns:
53
59
  The first chain identifier from the UniProt chain string. For example "B".
54
60
  """
@@ -66,6 +72,35 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
66
72
  return chain
67
73
 
68
74
 
75
+ def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
76
+ """Calculates the total length of chain from a UniProt chains string.
77
+
78
+ See `_first_chain_from_uniprot_chains` for the format of the UniProt chains string.
79
+
80
+ Args:
81
+ uniprot_chains: A string representing UniProt chains, For example "B/D=1-81".
82
+
83
+ Returns:
84
+ The length of the chain in the UniProt chain string. For example 81 for "B/D=1-81".
85
+ """
86
+ total_length = 0
87
+ chains = uniprot_chains.split(",")
88
+ for chain in chains:
89
+ _, rangestr = chain.split("=")
90
+ start, stop = rangestr.split("-")
91
+ # Residue positions are 1-based so + 1
92
+ total_length += int(stop) - int(start) + 1
93
+ return total_length
94
+
95
+
96
class PdbChainLengthError(ValueError):
    """Error for a PDB entry whose UniProt chains string does not yield a chain length.

    Args:
        pdb_id: Identifier of the PDB entry.
        uniprot_chains: The UniProt chains string that could not be interpreted.
    """

    def __init__(self, pdb_id: str, uniprot_chains: str):
        # The formatted message is the only argument handed to ValueError.
        super().__init__(f"Could not determine chain length of '{pdb_id}' from '{uniprot_chains}'")
102
+
103
+
69
104
  @dataclass(frozen=True)
70
105
  class PdbResult:
71
106
  """Result of a PDB search in UniProtKB.
@@ -82,11 +117,78 @@ class PdbResult:
82
117
  uniprot_chains: str
83
118
  resolution: str | None = None
84
119
 
85
- @property
120
+ @cached_property
86
121
  def chain(self) -> str:
87
122
  """The first chain from the UniProt chains aka self.uniprot_chains."""
88
123
  return _first_chain_from_uniprot_chains(self.uniprot_chains)
89
124
 
125
+ @cached_property
126
+ def chain_length(self) -> int:
127
+ """The length of the chain from the UniProt chains aka self.uniprot_chains."""
128
+ try:
129
+ return _chain_length_from_uniprot_chains(self.uniprot_chains)
130
+ except ValueError as e:
131
+ raise PdbChainLengthError(self.id, self.uniprot_chains) from e
132
+
133
+
134
+ type PdbResults = dict[str, set[PdbResult]]
135
+ """Dictionary with uniprot accessions as keys and sets of PDB results as values."""
136
+
137
+
138
def filter_pdb_results_on_chain_length(
    pdb_results: PdbResults,
    min_residues: int | None,
    max_residues: int | None,
    keep_invalid: bool = False,
) -> PdbResults:
    """Filter PDB results based on chain length.

    Args:
        pdb_results: Dictionary with UniProt accessions as keys and sets of PDB results as values.
        min_residues: Minimum number of residues required in the chain mapped to the UniProt accession.
            If None, no minimum is applied.
        max_residues: Maximum number of residues allowed in chain mapped to the UniProt accession.
            If None, no maximum is applied.
        keep_invalid: If True, PDB results with invalid chain length (could not be determined) are kept.
            If False, PDB results with invalid chain length are filtered out.
            Warnings are logged when length can not be determined.

    Returns:
        Filtered dictionary with UniProt accessions as keys and sets of PDB results as values.
        Accessions without any remaining PDB result are left out.
        When both bounds are None the input dictionary itself is returned.

    Raises:
        ValueError: If both bounds are given and `max_residues` is not greater than `min_residues`.
    """
    if min_residues is None and max_residues is None:
        # No filtering needed
        return pdb_results
    if min_residues is not None and max_residues is not None and max_residues <= min_residues:
        msg = f"Maximum number of residues ({max_residues}) must be > minimum number of residues ({min_residues})"
        raise ValueError(msg)
    results: PdbResults = {}
    for uniprot_accession, pdb_entries in pdb_results.items():
        filtered_pdb_entries: set[PdbResult] = set()
        for pdb_entry in pdb_entries:
            # Only the property access can raise, so keep the try body down to that.
            try:
                chain_length = pdb_entry.chain_length
            except PdbChainLengthError:
                if keep_invalid:
                    logger.warning(
                        "Could not determine chain length of '%s' from '%s' "
                        "belonging to uniprot accession '%s', "
                        "for completeness not filtering it out",
                        pdb_entry.id,
                        pdb_entry.uniprot_chains,
                        uniprot_accession,
                    )
                    filtered_pdb_entries.add(pdb_entry)
                else:
                    logger.warning(
                        "Filtering out PDB entry '%s' belonging to uniprot accession "
                        "'%s' due to invalid chain length from '%s'",
                        pdb_entry.id,
                        uniprot_accession,
                        pdb_entry.uniprot_chains,
                    )
                continue
            if min_residues is not None and chain_length < min_residues:
                continue
            if max_residues is not None and chain_length > max_residues:
                continue
            filtered_pdb_entries.add(pdb_entry)
        if filtered_pdb_entries:
            # Only include uniprot_accession if there are any pdb entries left after filtering
            results[uniprot_accession] = filtered_pdb_entries
    return results
191
+
90
192
 
91
193
  def _query2dynamic_sparql_triples(query: Query):
92
194
  parts: list[str] = []
@@ -110,6 +212,13 @@ def _query2dynamic_sparql_triples(query: Query):
110
212
  molecular_function_filter = _create_go_filter(go_terms, "Molecular function")
111
213
  parts.append(molecular_function_filter)
112
214
 
215
+ if query.min_sequence_length is not None or query.max_sequence_length is not None:
216
+ length_filter = _build_sparql_query_sequence_length_filter(
217
+ min_length=query.min_sequence_length,
218
+ max_length=query.max_sequence_length,
219
+ )
220
+ parts.append(length_filter)
221
+
113
222
  return "\n".join(parts)
114
223
 
115
224
 
@@ -237,6 +346,57 @@ def _build_sparql_query_uniprot(query: Query, limit=10_000) -> str:
237
346
  return _build_sparql_generic_query(select_clause, dedent(where_clause), limit)
238
347
 
239
348
 
349
+ def _build_sparql_query_sequence_length_filter(min_length: int | None = None, max_length: int | None = None) -> str:
350
+ """Builds a SPARQL filter for sequence length.
351
+
352
+ See 107_uniprot_sequences_and_mark_which_is_cannonical_for_human
353
+ on https://sparql.uniprot.org/.well-known/sparql-examples/ for similar query.
354
+
355
+ Args:
356
+ min_length: Minimum sequence length. If None, no minimum is applied.
357
+ max_length: Maximum sequence length. If None, no maximum is applied.
358
+ """
359
+ if min_length is None and max_length is None:
360
+ return ""
361
+ # An uniprot entry can have multiple isoforms,
362
+ # we want to check the length of the canonical isoform
363
+ # We do this by selecting the isoform that is not based on another isoform
364
+ # and excluding isoforms from other uniprot entries.
365
+ # For example for http://purl.uniprot.org/uniprot/P42284:
366
+ # - http://purl.uniprot.org/isoforms/P42284-2 is ok
367
+ # - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
368
+ # - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
369
+ header = dedent("""\
370
+ ?protein up:sequence ?isoform .
371
+ ?isoform a up:Simple_Sequence .
372
+ BIND (IRI(STRBEFORE(REPLACE(
373
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
374
+ ), "-")) AS ?ac_of_isoform)
375
+ FILTER (?protein = ?ac_of_isoform)
376
+ ?isoform rdf:value ?sequence .
377
+ BIND (STRLEN(?sequence) AS ?seq_length)
378
+ """)
379
+ if min_length is not None and max_length is not None:
380
+ if max_length <= min_length:
381
+ msg = f"Maximum sequence length ({max_length}) must be greater than minimum sequence length ({min_length})"
382
+ raise ValueError(msg)
383
+ return dedent(f"""\
384
+ {header}
385
+ FILTER (?seq_length >= {min_length} && ?seq_length <= {max_length})
386
+ """)
387
+ if min_length is not None:
388
+ return dedent(f"""\
389
+ {header}
390
+ FILTER (?seq_length >= {min_length})
391
+ """)
392
+ if max_length is not None:
393
+ return dedent(f"""\
394
+ {header}
395
+ FILTER (?seq_length <= {max_length})
396
+ """)
397
+ return ""
398
+
399
+
240
400
  def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
241
401
  # For http://purl.uniprot.org/uniprot/O00268 + http://rdf.wwpdb.org/pdb/1H3O
242
402
  # the chainSequenceMapping are
@@ -248,7 +408,7 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
248
408
  # http://purl.uniprot.org/isoforms/O00255-2#PDB_3U84_tt2tt459
249
409
  # To get the chain belonging to the uniprot/pdb pair we need to
250
410
  # do some string filtering.
251
- # Also there can be multiple cnhins for the same uniprot/pdb pair, so we need to
411
+ # Also there can be multiple chains for the same uniprot/pdb pair, so we need to
252
412
  # do a group by and concat
253
413
 
254
414
  select_clause = dedent("""\
@@ -274,7 +434,12 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
274
434
  )
275
435
 
276
436
 
277
- def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
437
+ def _build_sparql_query_af(
438
+ uniprot_accs: Iterable[str],
439
+ min_sequence_length: int | None = None,
440
+ max_sequence_length: int | None = None,
441
+ limit=10_000,
442
+ ) -> str:
278
443
  select_clause = "?protein ?af_db"
279
444
  where_clause = dedent("""
280
445
  # --- Protein Selection ---
@@ -284,6 +449,12 @@ def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
284
449
  ?protein rdfs:seeAlso ?af_db .
285
450
  ?af_db up:database <http://purl.uniprot.org/database/AlphaFoldDB> .
286
451
  """)
452
+ if min_sequence_length is not None or max_sequence_length is not None:
453
+ length_filter = _build_sparql_query_sequence_length_filter(
454
+ min_length=min_sequence_length,
455
+ max_length=max_sequence_length,
456
+ )
457
+ where_clause += "\n" + length_filter
287
458
  return _build_sparql_generic_by_uniprot_accessions_query(uniprot_accs, select_clause, dedent(where_clause), limit)
288
459
 
289
460
 
@@ -337,8 +508,8 @@ def _execute_sparql_search(
337
508
  return bindings
338
509
 
339
510
 
340
- def _flatten_results_pdb(rawresults: Iterable) -> dict[str, set[PdbResult]]:
341
- pdb_entries: dict[str, set[PdbResult]] = {}
511
+ def _flatten_results_pdb(rawresults: Iterable) -> PdbResults:
512
+ pdb_entries: PdbResults = {}
342
513
  for result in rawresults:
343
514
  protein = result["protein"]["value"].split("/")[-1]
344
515
  if "pdb_db" not in result: # Should not happen with build_sparql_query_pdb
@@ -424,7 +595,7 @@ def search4uniprot(query: Query, limit: int = 10_000, timeout: int = 1_800) -> s
424
595
 
425
596
  def search4pdb(
426
597
  uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
427
- ) -> dict[str, set[PdbResult]]:
598
+ ) -> PdbResults:
428
599
  """
429
600
  Search for PDB entries in UniProtKB accessions.
430
601
 
@@ -456,13 +627,20 @@ def search4pdb(
456
627
 
457
628
 
458
629
  def search4af(
459
- uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
630
+ uniprot_accs: Collection[str],
631
+ min_sequence_length: int | None = None,
632
+ max_sequence_length: int | None = None,
633
+ limit: int = 10_000,
634
+ timeout: int = 1_800,
635
+ batch_size: int = 10_000,
460
636
  ) -> dict[str, set[str]]:
461
637
  """
462
638
  Search for AlphaFold entries in UniProtKB accessions.
463
639
 
464
640
  Args:
465
641
  uniprot_accs: UniProt accessions.
642
+ min_sequence_length: Minimum length of the canonical sequence.
643
+ max_sequence_length: Maximum length of the canonical sequence.
466
644
  limit: Maximum number of results to return.
467
645
  timeout: Timeout for the SPARQL query in seconds.
468
646
  batch_size: Size of batches to process the UniProt accessions.
@@ -474,7 +652,7 @@ def search4af(
474
652
  total = len(uniprot_accs)
475
653
  with tqdm(total=total, desc="Searching for AlphaFolds of uniprots", disable=total < batch_size, unit="acc") as pbar:
476
654
  for batch in batched(uniprot_accs, batch_size, strict=False):
477
- sparql_query = _build_sparql_query_af(batch, limit)
655
+ sparql_query = _build_sparql_query_af(batch, min_sequence_length, max_sequence_length, limit)
478
656
  logger.info("Executing SPARQL query for AlphaFold: %s", sparql_query)
479
657
 
480
658
  raw_results = _execute_sparql_search(
@@ -639,12 +817,12 @@ def search4macromolecular_complexes(
639
817
 
640
818
 
641
819
  def search4interaction_partners(
642
- uniprot_acc: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
820
+ uniprot_accession: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
643
821
  ) -> dict[str, set[str]]:
644
822
  """Search for interaction partners of a given UniProt accession using ComplexPortal database references.
645
823
 
646
824
  Args:
647
- uniprot_acc: UniProt accession to search interaction partners for.
825
+ uniprot_accession: UniProt accession to search interaction partners for.
648
826
  excludes: Set of UniProt accessions to exclude from the results.
649
827
  For example already known interaction partners.
650
828
  If None then no complex members are excluded.
@@ -655,14 +833,143 @@ def search4interaction_partners(
655
833
  Dictionary with UniProt accessions of interaction partners as keys and sets of ComplexPortal entry IDs
656
834
  in which the interaction occurs as values.
657
835
  """
658
- ucomplexes = search4macromolecular_complexes([uniprot_acc], limit=limit, timeout=timeout)
836
+ ucomplexes = search4macromolecular_complexes([uniprot_accession], limit=limit, timeout=timeout)
659
837
  hits: dict[str, set[str]] = {}
660
838
  if excludes is None:
661
839
  excludes = set()
662
840
  for ucomplex in ucomplexes:
663
841
  for member in ucomplex.members:
664
- if member != uniprot_acc and member not in excludes:
842
+ if member != uniprot_accession and member not in excludes:
665
843
  if member not in hits:
666
844
  hits[member] = set()
667
845
  hits[member].add(ucomplex.complex_id)
668
846
  return hits
847
+
848
+
849
@dataclass(frozen=True)
class UniprotDetails:
    """Immutable record with the details of a single UniProt entry.

    Attributes:
        uniprot_accession: UniProt accession.
        uniprot_id: UniProt ID (mnemonic).
        sequence_length: Length of the canonical sequence.
        reviewed: Whether the entry is reviewed (Swiss-Prot) or unreviewed (TrEMBL).
        protein_name: Recommended protein name.
        taxon_id: NCBI Taxonomy ID of the organism.
        taxon_name: Scientific name of the organism.
    """

    # Identification of the entry
    uniprot_accession: str
    uniprot_id: str
    # Properties of the entry itself
    sequence_length: int
    reviewed: bool
    protein_name: str
    # Organism the entry belongs to
    taxon_id: int
    taxon_name: str
870
+
871
+
872
def _raw_result2uniprot_details(raw_result: dict) -> UniprotDetails:
    """Convert a single SPARQL result binding into a UniprotDetails object."""
    # ?protein_name is bound inside an OPTIONAL block, so it can be missing from the binding.
    protein_name = raw_result.get("protein_name", {}).get("value", "")
    return UniprotDetails(
        uniprot_accession=raw_result["uniprot_accession"]["value"],
        uniprot_id=raw_result["uniprot_id"]["value"],
        sequence_length=int(raw_result["seq_length"]["value"]),
        reviewed=raw_result["reviewed"]["value"] == "true",
        protein_name=protein_name,
        taxon_id=int(raw_result["taxon_id"]["value"]),
        taxon_name=raw_result["taxon_name"]["value"],
    )


def map_uniprot_accessions2uniprot_details(
    uniprot_accessions: Collection[str], timeout: int = 1_800, batch_size: int = 1000
) -> Generator[UniprotDetails]:
    """Map UniProt accessions to UniProt details by querying the UniProt SPARQL endpoint.

    Example:

    SPARQL query to get details for 7 UniProt entries, run on [https://sparql.uniprot.org/sparql](https://sparql.uniprot.org/sparql).

    ```sparql
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX up: <http://purl.uniprot.org/core/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    SELECT
    (?ac AS ?uniprot_accession)
    ?uniprot_id
    (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
    ?taxon_name
    ?reviewed
    ?protein_name
    (STRLEN(?sequence) AS ?seq_length)
    WHERE {
        # Input UniProt accessions
        VALUES (?ac) { ("P05067") ("A6NGD5") ("O14627") ("P00697") ("P42284") ("A0A0B5AC95") ("A0A0S2Z4R0")}
        BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)
        ?protein a up:Protein .
        ?protein up:mnemonic ?uniprot_id .
        ?protein up:organism ?organism .
        ?organism up:scientificName ?taxon_name .
        ?protein up:reviewed ?reviewed .
        OPTIONAL {
            ?protein up:recommendedName/up:fullName ?protein_name .
        }
        ?protein up:sequence ?isoform .
        ?isoform a up:Simple_Sequence .
        ?isoform rdf:value ?sequence .
        BIND (IRI(STRBEFORE(REPLACE(
            STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
        ), "-")) AS ?ac_of_isoform)
        FILTER(?ac_of_isoform = ?protein)
    }
    ```

    Args:
        uniprot_accessions: Collection of UniProt accessions.
            Its length is used as the total of the progress bar.
        timeout: Timeout for the SPARQL query in seconds.
        batch_size: Size of batches to process the UniProt accessions.
            Also used as the result limit of each batch query.

    Yields:
        UniprotDetails objects in random order.
        The protein name is an empty string when the entry has no recommended name.
    """
    select_clause = dedent("""\
        (?ac AS ?uniprot_accession)
        ?uniprot_id
        (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
        ?taxon_name
        ?reviewed
        ?protein_name
        (STRLEN(?sequence) AS ?seq_length)
    """)
    # The recommended name is optional so entries without one are still returned.
    # The isoform triples keep only the sequence that belongs to the entry itself.
    where_clause = dedent("""
        ?protein up:mnemonic ?uniprot_id .
        ?protein up:organism ?organism .
        ?organism up:scientificName ?taxon_name .
        ?protein up:reviewed ?reviewed .
        OPTIONAL {
            ?protein up:recommendedName/up:fullName ?protein_name .
        }
        ?protein up:sequence ?isoform .
        ?isoform a up:Simple_Sequence .
        ?isoform rdf:value ?sequence .
        BIND (IRI(STRBEFORE(REPLACE(
            STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
        ), "-")) AS ?ac_of_isoform)
        FILTER(?ac_of_isoform = ?protein)
    """)
    total = len(uniprot_accessions)
    with tqdm(
        total=total,
        desc="Retrieving UniProt details",
        # No progress bar when everything fits in a single batch.
        disable=total < batch_size,
        unit="acc",
    ) as pbar:
        for batch in batched(uniprot_accessions, batch_size, strict=False):
            sparql_query = _build_sparql_generic_by_uniprot_accessions_query(
                batch, select_clause, where_clause, limit=batch_size
            )
            logger.info("Executing SPARQL query for UniProt details: %s", sparql_query)
            raw_results = _execute_sparql_search(
                sparql_query=sparql_query,
                timeout=timeout,
            )
            for raw_result in raw_results:
                yield _raw_result2uniprot_details(raw_result)
            # Progress is counted in requested accessions, not in returned rows.
            pbar.update(len(batch))
protein_quest/utils.py CHANGED
@@ -266,6 +266,7 @@ async def retrieve_files(
266
266
  cacher: Cacher | None = None,
267
267
  chunk_size: int = 524288, # 512 KiB
268
268
  gzip_files: bool = False,
269
+ raise_for_not_found: bool = True,
269
270
  ) -> list[Path]:
270
271
  """Retrieve files from a list of URLs and save them to a directory.
271
272
 
@@ -279,6 +280,9 @@ async def retrieve_files(
279
280
  cacher: An optional cacher to use for caching files.
280
281
  chunk_size: The size of each chunk to read from the response.
281
282
  gzip_files: Whether to gzip the downloaded files.
283
+ This requires that the server can send gzip encoded content.
284
+ raise_for_not_found: Whether to raise an error for HTTP 404 errors.
285
+ If False, URLs that gave an HTTP 404 error are left out of the returned paths and logged as debug messages.
282
286
 
283
287
  Returns:
284
288
  A list of paths to the downloaded files.
@@ -295,11 +299,12 @@ async def retrieve_files(
295
299
  cacher=cacher,
296
300
  chunk_size=chunk_size,
297
301
  gzip_files=gzip_files,
302
+ raise_for_not_found=raise_for_not_found,
298
303
  )
299
304
  for url, filename in urls
300
305
  ]
301
- files: list[Path] = await tqdm.gather(*tasks, desc=desc)
302
- return files
306
+ raw_files: list[Path | None] = await tqdm.gather(*tasks, desc=desc)
307
+ return [f for f in raw_files if f is not None]
303
308
 
304
309
 
305
310
  class InvalidContentEncodingError(aiohttp.ClientResponseError):
@@ -314,7 +319,8 @@ async def _retrieve_file(
314
319
  cacher: Cacher | None = None,
315
320
  chunk_size: int = 524288, # 512 KiB
316
321
  gzip_files: bool = False,
317
- ) -> Path:
322
+ raise_for_not_found=True,
323
+ ) -> Path | None:
318
324
  """Retrieve a single file from a URL and save it to a specified path.
319
325
 
320
326
  Args:
@@ -325,6 +331,9 @@ async def _retrieve_file(
325
331
  cacher: An optional cacher to use for caching files.
326
332
  chunk_size: The size of each chunk to read from the response.
327
333
  gzip_files: Whether to gzip the downloaded file.
334
+ This requires that the server can send gzip encoded content.
335
+ raise_for_not_found: Whether to raise an error for HTTP 404 errors.
336
+ If False, the function returns None on an HTTP 404 error and logs a debug message.
328
337
 
329
338
  Returns:
330
339
  The path to the saved file.
@@ -348,6 +357,9 @@ async def _retrieve_file(
348
357
  semaphore,
349
358
  session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
350
359
  ):
360
+ if not raise_for_not_found and resp.status == 404:
361
+ logger.debug(f"File not found at {url}, skipping download.")
362
+ return None
351
363
  resp.raise_for_status()
352
364
  if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
353
365
  msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."