protein-quest 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__init__.py +0 -0
- protein_quest/__version__.py +2 -0
- protein_quest/alphafold/__init__.py +1 -0
- protein_quest/alphafold/confidence.py +226 -0
- protein_quest/alphafold/entry_summary.py +64 -0
- protein_quest/alphafold/fetch.py +534 -0
- protein_quest/cli.py +1428 -0
- protein_quest/converter.py +46 -0
- protein_quest/emdb.py +37 -0
- protein_quest/filters.py +163 -0
- protein_quest/go.py +165 -0
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +256 -0
- protein_quest/parallel.py +104 -0
- protein_quest/pdbe/__init__.py +1 -0
- protein_quest/pdbe/fetch.py +68 -0
- protein_quest/py.typed +0 -0
- protein_quest/ss.py +280 -0
- protein_quest/structure.py +232 -0
- protein_quest/taxonomy.py +149 -0
- protein_quest/uniprot.py +975 -0
- protein_quest/utils.py +547 -0
- protein_quest-0.9.0.dist-info/METADATA +325 -0
- protein_quest-0.9.0.dist-info/RECORD +27 -0
- protein_quest-0.9.0.dist-info/WHEEL +4 -0
- protein_quest-0.9.0.dist-info/entry_points.txt +2 -0
- protein_quest-0.9.0.dist-info/licenses/LICENSE +201 -0
protein_quest/uniprot.py
ADDED
|
@@ -0,0 +1,975 @@
|
|
|
1
|
+
"""Module for searching UniProtKB using SPARQL."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Collection, Generator, Iterable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from functools import cached_property
|
|
7
|
+
from itertools import batched
|
|
8
|
+
from textwrap import dedent
|
|
9
|
+
|
|
10
|
+
from SPARQLWrapper import JSON, SPARQLWrapper
|
|
11
|
+
from tqdm.auto import tqdm
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
|
|
17
|
+
class Query:
|
|
18
|
+
"""Search query for UniProtKB.
|
|
19
|
+
|
|
20
|
+
Parameters:
|
|
21
|
+
taxon_id: NCBI Taxon ID to filter results by organism (e.g., "9606" for human).
|
|
22
|
+
reviewed: Whether to filter results by reviewed status (True for reviewed, False for unreviewed).
|
|
23
|
+
subcellular_location_uniprot: Subcellular location in UniProt format (e.g., "nucleus").
|
|
24
|
+
subcellular_location_go: Subcellular location in GO format. Can be a single GO term
|
|
25
|
+
(e.g., ["GO:0005634"]) or a collection of GO terms (e.g., ["GO:0005634", "GO:0005737"]).
|
|
26
|
+
molecular_function_go: Molecular function in GO format. Can be a single GO term
|
|
27
|
+
(e.g., ["GO:0003674"]) or a collection of GO terms (e.g., ["GO:0003674", "GO:0008150"]).
|
|
28
|
+
min_sequence_length: Minimum length of the canonical sequence.
|
|
29
|
+
max_sequence_length: Maximum length of the canonical sequence.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
# TODO make taxon_id an int
|
|
33
|
+
taxon_id: str | None
|
|
34
|
+
reviewed: bool | None = None
|
|
35
|
+
subcellular_location_uniprot: str | None = None
|
|
36
|
+
subcellular_location_go: list[str] | None = None
|
|
37
|
+
molecular_function_go: list[str] | None = None
|
|
38
|
+
min_sequence_length: int | None = None
|
|
39
|
+
max_sequence_length: int | None = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
    """Return the first usable chain identifier of a UniProt chains string.

    The UniProt chains string is formatted (with EBNF notation) as follows:

        chain_group=range(,chain_group=range)*

    where:
        chain_group := chain_id(/chain_id)*
        chain_id    := [A-Za-z0-9]+
        range       := start-end
        start, end  := integer

    Args:
        uniprot_chains: A string representing UniProt chains, For example "B/D=1-81".

    Returns:
        The first chain identifier from the UniProt chain string. For example "B".
        When the first identifier parses as an integer and the group lists more identifiers,
        the second identifier is returned instead.
    """
    # Only the text before the first "=" (the first chain group) is relevant.
    first_group, _, _ = uniprot_chains.partition("=")
    chain_ids = first_group.split("/")
    candidate = chain_ids[0]
    try:
        # Workaround for Q9Y2Q5 │ 5YK3 │ 1/B/G=1-124, 1 does not exist but B does
        int(candidate)
    except ValueError:
        # Not a number, so use it as is
        return candidate
    return chain_ids[1] if len(chain_ids) > 1 else candidate
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
    """Sum the lengths of all residue ranges in a UniProt chains string.

    See `_first_chain_from_uniprot_chains` for the format of the UniProt chains string.

    Args:
        uniprot_chains: A string representing UniProt chains, For example "B/D=1-81".

    Returns:
        The length of the chain in the UniProt chain string. For example 81 for "B/D=1-81".

    Raises:
        ValueError: If a comma separated part is not of the form `chain_group=start-end`
            with integer start and end.
    """
    range_lengths = []
    for mapping in uniprot_chains.split(","):
        _chain_group, span = mapping.split("=")
        first, last = span.split("-")
        # Both ends of the range are included, hence the + 1
        range_lengths.append(int(last) - int(first) + 1)
    return sum(range_lengths)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class PdbChainLengthError(ValueError):
    """Signals that no chain length could be derived from a UniProt chains string."""

    def __init__(self, pdb_id: str, uniprot_chains: str):
        """Compose the error message.

        Args:
            pdb_id: Identifier of the PDB entry the chains string belongs to.
            uniprot_chains: The chains string that could not be interpreted.
        """
        super().__init__(f"Could not determine chain length of '{pdb_id}' from '{uniprot_chains}'")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass(frozen=True)
|
|
105
|
+
class PdbResult:
|
|
106
|
+
"""Result of a PDB search in UniProtKB.
|
|
107
|
+
|
|
108
|
+
Parameters:
|
|
109
|
+
id: PDB ID (e.g., "1H3O").
|
|
110
|
+
method: Method used for the PDB entry (e.g., "X-ray diffraction").
|
|
111
|
+
uniprot_chains: Chains in UniProt format (e.g., "A/B=1-42,A/B=50-99").
|
|
112
|
+
resolution: Resolution of the PDB entry (e.g., "2.0" for 2.0 Å). Optional.
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
id: str
|
|
116
|
+
method: str
|
|
117
|
+
uniprot_chains: str
|
|
118
|
+
resolution: str | None = None
|
|
119
|
+
|
|
120
|
+
@cached_property
|
|
121
|
+
def chain(self) -> str:
|
|
122
|
+
"""The first chain from the UniProt chains aka self.uniprot_chains."""
|
|
123
|
+
return _first_chain_from_uniprot_chains(self.uniprot_chains)
|
|
124
|
+
|
|
125
|
+
@cached_property
|
|
126
|
+
def chain_length(self) -> int:
|
|
127
|
+
"""The length of the chain from the UniProt chains aka self.uniprot_chains."""
|
|
128
|
+
try:
|
|
129
|
+
return _chain_length_from_uniprot_chains(self.uniprot_chains)
|
|
130
|
+
except ValueError as e:
|
|
131
|
+
raise PdbChainLengthError(self.id, self.uniprot_chains) from e
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Type alias for PDB search results grouped per protein.
type PdbResults = dict[str, set[PdbResult]]
"""Dictionary with uniprot accessions as keys and sets of PDB results as values."""
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def filter_pdb_results_on_chain_length(
    pdb_results: PdbResults,
    min_residues: int | None,
    max_residues: int | None,
    keep_invalid: bool = False,
) -> PdbResults:
    """Keep only the PDB results whose chain length lies within the given bounds.

    Args:
        pdb_results: Dictionary with protein IDs as keys and sets of PDB results as values.
        min_residues: Minimum number of residues required in the chain mapped to the UniProt accession.
            If None, no minimum is applied.
        max_residues: Maximum number of residues allowed in chain mapped to the UniProt accession.
            If None, no maximum is applied.
        keep_invalid: If True, PDB results whose chain length could not be determined are kept.
            If False, those PDB results are dropped.
            In both cases a warning is logged.

    Returns:
        Filtered dictionary with protein IDs as keys and sets of PDB results as values.
        Accessions without any remaining PDB result are left out.
        When both bounds are None the input dictionary itself is returned.

    Raises:
        ValueError: If both bounds are given and max_residues is not greater than min_residues.
    """
    if min_residues is None and max_residues is None:
        # Nothing to filter on
        return pdb_results
    if min_residues is not None and max_residues is not None and max_residues <= min_residues:
        msg = f"Maximum number of residues ({max_residues}) must be > minimum number of residues ({min_residues})"
        raise ValueError(msg)

    kept_by_accession: PdbResults = {}
    for uniprot_accession, pdb_entries in pdb_results.items():
        kept = set()
        for pdb_entry in pdb_entries:
            try:
                length = pdb_entry.chain_length
            except PdbChainLengthError:
                if not keep_invalid:
                    logger.warning(
                        f"Filtering out PDB entry '{pdb_entry.id}' belonging to uniprot accession "
                        f"'{uniprot_accession}' due to invalid chain length from '{pdb_entry.uniprot_chains}'"
                    )
                    continue
                logger.warning(
                    f"Could not determine chain length of '{pdb_entry.id}' from '{pdb_entry.uniprot_chains}' "
                    f"belonging to uniprot accession '{uniprot_accession}', "
                    "for completeness not filtering it out"
                )
                kept.add(pdb_entry)
                continue
            too_short = min_residues is not None and length < min_residues
            too_long = max_residues is not None and length > max_residues
            if not (too_short or too_long):
                kept.add(pdb_entry)
        if kept:
            # Accessions that lost all their pdb entries are not reported
            kept_by_accession[uniprot_accession] = kept
    return kept_by_accession
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _query2dynamic_sparql_triples(query: Query):
    """Translate the criteria of a query into SPARQL graph patterns on `?protein`.

    Args:
        query: The search criteria.

    Returns:
        The patterns for all criteria that are set, joined by newlines.
    """
    triples: list[str] = []

    if query.taxon_id:
        triples.append(f"?protein up:organism taxon:{query.taxon_id} .")

    # None (or any other falsy value that is not False) means: do not filter on review status
    if query.reviewed or query.reviewed is False:
        flag = "true" if query.reviewed else "false"
        triples.append(f"?protein up:reviewed {flag} .")

    # Added unconditionally, it is an empty string when no location was requested
    triples.append(_append_subcellular_location_filters(query))

    functions = query.molecular_function_go
    if functions:
        # A bare string is treated as a single GO term
        go_terms = [functions] if isinstance(functions, str) else functions
        triples.append(_create_go_filter(go_terms, "Molecular function"))

    if query.min_sequence_length is not None or query.max_sequence_length is not None:
        triples.append(
            _build_sparql_query_sequence_length_filter(
                min_length=query.min_sequence_length,
                max_length=query.max_sequence_length,
            )
        )

    return "\n".join(triples)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _create_go_filter(go_terms: Collection[str], term_type: str) -> str:
    """Build the SPARQL pattern that matches proteins classified with GO terms.

    A protein matches a term when it is classified with the term itself
    or with something that is a (transitive) subclass of the term.

    Args:
        go_terms: Collection of GO terms to filter by.
        term_type: Type of GO terms for error messages (e.g., "Molecular function", "Subcellular location").

    Returns:
        SPARQL pattern. A single pattern for one term,
        patterns combined with UNION for several terms and an empty string for no terms.

    Raises:
        ValueError: If a term does not start with "GO:".
    """
    for candidate in go_terms:
        if candidate.startswith("GO:"):
            continue
        msg = f"{term_type} GO term must start with 'GO:', got: {candidate}"
        raise ValueError(msg)

    if len(go_terms) != 1:
        # Several terms, a protein has to match at least one of them
        alternatives = []
        for go_term in go_terms:
            group = dedent(f"""
                {{ ?protein up:classifiedWith|(up:classifiedWith/rdfs:subClassOf+) {go_term} . }}
            """)
            alternatives.append(group.strip())
        return " UNION ".join(alternatives)

    (only_term,) = go_terms
    return dedent(f"""
        ?protein up:classifiedWith|(up:classifiedWith/rdfs:subClassOf+) {only_term} .
    """)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _append_subcellular_location_filters(query: Query) -> str:
    """Build the SPARQL pattern for the subcellular location criteria of a query.

    Args:
        query: The search criteria, only `subcellular_location_uniprot`
            and `subcellular_location_go` are read.

    Returns:
        The pattern for the UniProt location label, the pattern for the GO terms,
        a UNION of both when both are set, or an empty string when neither is set.
    """
    subcellular_location_uniprot_part = ""
    subcellular_location_go_part = ""

    if query.subcellular_location_uniprot:
        # NOTE(review): the label is placed between double quotes without escaping,
        # presumably labels never contain a double quote - confirm
        subcellular_location_uniprot_part = dedent(f"""
            ?protein up:annotation ?subcellAnnotation .
            ?subcellAnnotation up:locatedIn/up:cellularComponent ?cellcmpt .
            ?cellcmpt skos:prefLabel "{query.subcellular_location_uniprot}" .
        """)

    if query.subcellular_location_go:
        # Handle both single GO term (string) and multiple GO terms (list)
        if isinstance(query.subcellular_location_go, str):
            go_terms = [query.subcellular_location_go]
        else:
            go_terms = query.subcellular_location_go

        subcellular_location_go_part = _create_go_filter(go_terms, "Subcellular location")

    if subcellular_location_uniprot_part and subcellular_location_go_part:
        # If both are provided include results for both with logical OR
        return dedent(f"""
            {{
                {subcellular_location_uniprot_part}
            }} UNION {{
                {subcellular_location_go_part}
            }}
        """)

    # At most one part is non-empty here, `or` picks it (or yields "" when both are empty)
    return subcellular_location_uniprot_part or subcellular_location_go_part
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _build_sparql_generic_query(
    select_clause: str, where_clause: str, limit: int = 10_000, groupby_clause: str = ""
) -> str:
    """
    Builds a generic SPARQL query with the given select and where clauses.

    The query declares the up, taxon, rdf, rdfs, skos and GO prefixes.

    Args:
        select_clause: Text placed after SELECT.
        where_clause: Text placed inside the WHERE block.
        limit: Value placed after LIMIT.
        groupby_clause: Text placed after GROUP BY. The GROUP BY is left out when empty.

    Returns:
        The SPARQL query.
    """
    groupby = f" GROUP BY {groupby_clause}" if groupby_clause else ""
    return dedent(f"""
        PREFIX up: <http://purl.uniprot.org/core/>
        PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX GO:<http://purl.obolibrary.org/obo/GO_>

        SELECT {select_clause}
        WHERE {{
            {where_clause}
        }}
        {groupby}
        LIMIT {limit}
    """)
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _build_sparql_generic_by_uniprot_accessions_query(
    uniprot_accs: Iterable[str], select_clause: str, where_clause: str, limit: int = 10_000, groupby_clause: str = ""
) -> str:
    """Build a SPARQL query that is restricted to the given UniProt accessions.

    The accessions are bound to `?ac` with a VALUES clause and
    `?protein` is bound to the IRI made from the UniProt prefix and `?ac`.

    Args:
        uniprot_accs: UniProt accessions to restrict the query to.
        select_clause: Text placed after SELECT.
        where_clause: Patterns placed after the protein selection inside the WHERE block.
        limit: Value placed after LIMIT.
        groupby_clause: Text placed after GROUP BY. The GROUP BY is left out when empty.

    Returns:
        The SPARQL query.
    """
    # NOTE(review): accessions are wrapped in double quotes without escaping,
    # presumably they never contain a double quote - confirm
    values = " ".join(f'("{ac}")' for ac in uniprot_accs)
    where_clause2 = dedent(f"""
        # --- Protein Selection ---
        VALUES (?ac) {{ {values}}}
        BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/",?ac)) AS ?protein)
        ?protein a up:Protein .

        {where_clause}
    """)
    return _build_sparql_generic_query(
        select_clause=select_clause,
        where_clause=where_clause2,
        limit=limit,
        groupby_clause=groupby_clause,
    )
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _build_sparql_query_uniprot(query: Query, limit: int = 10_000) -> str:
    """Build the SPARQL query that selects the distinct proteins matching a query.

    Args:
        query: The search criteria.
        limit: Value placed after LIMIT.

    Returns:
        The SPARQL query.
    """
    dynamic_triples = _query2dynamic_sparql_triples(query)
    # TODO add useful columns that have 1:1 mapping to protein
    # like uniprot_id with `?protein up:mnemonic ?mnemonic .`
    # and sequence, take care to take first isoform
    # ?protein up:sequence ?isoform .
    # ?isoform rdf:value ?sequence .
    select_clause = "DISTINCT ?protein"
    where_clause = dedent(f"""
        # --- Protein Selection ---
        ?protein a up:Protein .
        {dynamic_triples}
    """)
    return _build_sparql_generic_query(select_clause, dedent(where_clause), limit)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _build_sparql_query_sequence_length_filter(min_length: int | None = None, max_length: int | None = None) -> str:
    """Builds a SPARQL filter for sequence length.

    See 107_uniprot_sequences_and_mark_which_is_cannonical_for_human
    on https://sparql.uniprot.org/.well-known/sparql-examples/ for similar query.

    Args:
        min_length: Minimum sequence length. If None, no minimum is applied.
        max_length: Maximum sequence length. If None, no maximum is applied.

    Returns:
        SPARQL patterns that bind `?seq_length` followed by a FILTER on it,
        or an empty string when both bounds are None.

    Raises:
        ValueError: If both bounds are given and max_length is not greater than min_length.
    """
    if min_length is None and max_length is None:
        return ""
    # An uniprot entry can have multiple isoforms,
    # we want to check the length of the canonical isoform
    # We do this by selecting the isoform that is not based on another isoform
    # and excluding isoforms from other uniprot entries.
    # For example for http://purl.uniprot.org/uniprot/P42284:
    # - http://purl.uniprot.org/isoforms/P42284-2 is ok
    # - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
    # - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
    header = dedent("""\
        ?protein up:sequence ?isoform .
        ?isoform a up:Simple_Sequence .
        BIND (IRI(STRBEFORE(REPLACE(
            STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
        ), "-")) AS ?ac_of_isoform)
        FILTER (?protein = ?ac_of_isoform)
        ?isoform rdf:value ?sequence .
        BIND (STRLEN(?sequence) AS ?seq_length)
    """)
    if min_length is not None and max_length is not None:
        if max_length <= min_length:
            msg = f"Maximum sequence length ({max_length}) must be greater than minimum sequence length ({min_length})"
            raise ValueError(msg)
        return dedent(f"""\
            {header}
            FILTER (?seq_length >= {min_length} && ?seq_length <= {max_length})
        """)
    if min_length is not None:
        return dedent(f"""\
            {header}
            FILTER (?seq_length >= {min_length})
        """)
    if max_length is not None:
        return dedent(f"""\
            {header}
            FILTER (?seq_length <= {max_length})
        """)
    # NOTE(review): not reachable, the both-None case already returned at the top
    return ""
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit: int = 10_000) -> str:
    """Build the SPARQL query that retrieves the PDB cross-references of the given UniProt accessions.

    Each result row holds the protein, the PDB cross-reference, its method, its optional resolution
    and the comma separated distinct chains of the protein in that PDB entry.

    Args:
        uniprot_accs: UniProt accessions to retrieve PDB cross-references for.
        limit: Value placed after LIMIT.

    Returns:
        The SPARQL query.
    """
    # For http://purl.uniprot.org/uniprot/O00268 + http://rdf.wwpdb.org/pdb/1H3O
    # the chainSequenceMapping are
    # http://purl.uniprot.org/isoforms/O00268-1#PDB_1H3O_tt872tt945
    # http://purl.uniprot.org/isoforms/Q16514-1#PDB_1H3O_tt57tt128
    # For http://purl.uniprot.org/uniprot/O00255 + http://rdf.wwpdb.org/pdb/3U84
    # the chainSequenceMapping are
    # http://purl.uniprot.org/isoforms/O00255-2#PDB_3U84_tt520tt610
    # http://purl.uniprot.org/isoforms/O00255-2#PDB_3U84_tt2tt459
    # To get the chain belonging to the uniprot/pdb pair we need to
    # do some string filtering.
    # Also there can be multiple chains for the same uniprot/pdb pair, so we need to
    # do a group by and concat

    select_clause = dedent("""\
        ?protein ?pdb_db ?pdb_method ?pdb_resolution
        (GROUP_CONCAT(DISTINCT ?pdb_chain; separator=",") AS ?pdb_chains)
    """)

    # ?ac is bound by the VALUES clause that
    # _build_sparql_generic_by_uniprot_accessions_query puts in front of this where clause
    where_clause = dedent("""
        # --- PDB Info ---
        ?protein rdfs:seeAlso ?pdb_db .
        ?pdb_db up:database <http://purl.uniprot.org/database/PDB> .
        ?pdb_db up:method ?pdb_method .
        ?pdb_db up:chainSequenceMapping ?chainSequenceMapping .
        BIND(STRAFTER(STR(?chainSequenceMapping), "isoforms/") AS ?isoformPart)
        FILTER(STRSTARTS(?isoformPart, CONCAT(?ac, "-")))
        ?chainSequenceMapping up:chain ?pdb_chain .
        OPTIONAL { ?pdb_db up:resolution ?pdb_resolution . }
    """)

    groupby_clause = "?protein ?pdb_db ?pdb_method ?pdb_resolution"
    return _build_sparql_generic_by_uniprot_accessions_query(
        uniprot_accs, select_clause, where_clause, limit, groupby_clause
    )
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _build_sparql_query_af(
    uniprot_accs: Iterable[str],
    min_sequence_length: int | None = None,
    max_sequence_length: int | None = None,
    limit: int = 10_000,
) -> str:
    """Build the SPARQL query that retrieves the AlphaFoldDB cross-references of the given UniProt accessions.

    Args:
        uniprot_accs: UniProt accessions to retrieve AlphaFoldDB cross-references for.
        min_sequence_length: Minimum sequence length. If None, no minimum is applied.
        max_sequence_length: Maximum sequence length. If None, no maximum is applied.
        limit: Value placed after LIMIT.

    Returns:
        The SPARQL query.
    """
    select_clause = "?protein ?af_db"
    where_clause = dedent("""
        # --- Protein Selection ---
        ?protein a up:Protein .

        # --- AlphaFoldDB Info ---
        ?protein rdfs:seeAlso ?af_db .
        ?af_db up:database <http://purl.uniprot.org/database/AlphaFoldDB> .
    """)
    # Only extend the where clause when at least one bound was requested
    if min_sequence_length is not None or max_sequence_length is not None:
        length_filter = _build_sparql_query_sequence_length_filter(
            min_length=min_sequence_length,
            max_length=max_sequence_length,
        )
        where_clause += "\n" + length_filter
    return _build_sparql_generic_by_uniprot_accessions_query(uniprot_accs, select_clause, dedent(where_clause), limit)
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _build_sparql_query_emdb(uniprot_accs: Iterable[str], limit: int = 10_000) -> str:
    """Build the SPARQL query that retrieves the EMDB cross-references of the given UniProt accessions.

    Args:
        uniprot_accs: UniProt accessions to retrieve EMDB cross-references for.
        limit: Value placed after LIMIT.

    Returns:
        The SPARQL query.
    """
    select_clause = "?protein ?emdb_db"
    where_clause = dedent("""
        # --- Protein Selection ---
        ?protein a up:Protein .

        # --- EMDB Info ---
        ?protein rdfs:seeAlso ?emdb_db .
        ?emdb_db up:database <http://purl.uniprot.org/database/EMDB> .
    """)
    return _build_sparql_generic_by_uniprot_accessions_query(uniprot_accs, select_clause, dedent(where_clause), limit)
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _execute_sparql_search(
    sparql_query: str,
    timeout: int,
) -> list:
    """
    Run a SPARQL query on the UniProt SPARQL endpoint.

    Args:
        sparql_query: The complete SPARQL query.
        timeout: Timeout for the SPARQL query in seconds. Can not be more than 2700.

    Returns:
        The list stored under `results` -> `bindings` of the JSON response.
        An empty list, after logging a warning, when the response has no such list.

    Raises:
        ValueError: If the timeout is more than 2700 seconds.
        TypeError: If the converted response is not a dictionary.
    """
    max_timeout = 2_700
    if timeout > max_timeout:
        msg = "Uniprot SPARQL timeout is limited to 2700 seconds (45 minutes)."
        raise ValueError(msg)

    endpoint = SPARQLWrapper("https://sparql.uniprot.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setTimeout(timeout)

    # The default GET method can be cached by the server, so it is preferred.
    # To prevent URITooLong errors, large queries are sent with the POST method instead.
    get_length_threshold = 5_000
    if len(sparql_query) > get_length_threshold:
        endpoint.setMethod("POST")

    endpoint.setQuery(sparql_query)
    response = endpoint.queryAndConvert()
    if not isinstance(response, dict):
        msg = f"Expected rawresults to be a dict, but got {type(response)}"
        raise TypeError(msg)

    bindings = response.get("results", {}).get("bindings")
    if isinstance(bindings, list):
        logger.debug(bindings)
        return bindings

    logger.warning("SPARQL query did not return 'bindings' list as expected.")
    return []
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _flatten_results_pdb(rawresults: Iterable) -> PdbResults:
    """Group the result rows of the PDB query per UniProt accession.

    Args:
        rawresults: Result bindings of the query made by `_build_sparql_query_pdb`.

    Returns:
        Dictionary with uniprot accessions as keys and sets of PDB results as values.
    """
    grouped: PdbResults = {}
    for row in rawresults:
        # Identifiers are the last path segment of the returned IRIs
        accession = row["protein"]["value"].split("/")[-1]
        if "pdb_db" not in row:  # Should not happen with build_sparql_query_pdb
            continue
        fields = {
            "id": row["pdb_db"]["value"].split("/")[-1],
            "method": row["pdb_method"]["value"].split("/")[-1],
            "uniprot_chains": row["pdb_chains"]["value"],
        }
        if "pdb_resolution" in row:
            # Resolution is optional in the query
            fields["resolution"] = row["pdb_resolution"]["value"]
        grouped.setdefault(accession, set()).add(PdbResult(**fields))

    return grouped
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def _flatten_results_af(rawresults: Iterable) -> dict[str, set[str]]:
    """Group the result rows of the AlphaFold query per UniProt accession.

    Args:
        rawresults: Result bindings, each with a `protein` and optionally an `af_db` binding.

    Returns:
        Dictionary with uniprot accessions as keys and sets of AlphaFold IDs as values.
        Rows without an `af_db` binding are ignored.
    """
    grouped: dict[str, set[str]] = {}
    for row in rawresults:
        # Identifiers are the text after the last slash of the returned IRIs
        accession = row["protein"]["value"].rpartition("/")[2]
        if "af_db" not in row:
            continue
        alphafold_id = row["af_db"]["value"].rpartition("/")[2]
        grouped.setdefault(accession, set()).add(alphafold_id)
    return grouped
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def _flatten_results_emdb(rawresults: Iterable) -> dict[str, set[str]]:
    """Group the result rows of the EMDB query per UniProt accession.

    Args:
        rawresults: Result bindings, each with a `protein` and optionally an `emdb_db` binding.

    Returns:
        Dictionary with uniprot accessions as keys and sets of EMDB IDs as values.
        Rows without an `emdb_db` binding are ignored.
    """
    grouped: dict[str, set[str]] = {}
    for row in rawresults:
        # Identifiers are the text after the last slash of the returned IRIs
        accession = row["protein"]["value"].rpartition("/")[2]
        if "emdb_db" not in row:
            continue
        emdb_id = row["emdb_db"]["value"].rpartition("/")[2]
        grouped.setdefault(accession, set()).add(emdb_id)
    return grouped
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def limit_check(what: str, limit: int, len_raw_results: int):
    """Log a warning when a search returned at least as many results as its limit.

    Args:
        what: Description of the search, used at the start of the warning.
        limit: The limit that was applied to the search.
        len_raw_results: Number of results the search returned.
    """
    if len_raw_results < limit:
        # Fewer results than the limit, so nothing was cut off
        return
    logger.warning(
        "%s returned %d results. "
        "There may be more results available, "
        "but they are not returned due to the limit of %d. "
        "Consider increasing the limit to get more results.",
        what,
        len_raw_results,
        limit,
    )
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def search4uniprot(query: Query, limit: int = 10_000, timeout: int = 1_800) -> set[str]:
    """
    Find the UniProtKB entries that match the given query.

    Args:
        query: Query object containing search parameters.
        limit: Maximum number of results to return.
        timeout: Timeout for the SPARQL query in seconds.

    Returns:
        Set of uniprot accessions.
    """
    sparql_query = _build_sparql_query_uniprot(query, limit)
    logger.info("Executing SPARQL query for UniProt: %s", sparql_query)

    bindings = _execute_sparql_search(sparql_query=sparql_query, timeout=timeout)
    limit_check("Search for uniprot accessions", limit, len(bindings))

    accessions = set()
    for binding in bindings:
        # The accession is the last path segment of the protein IRI
        accessions.add(binding["protein"]["value"].split("/")[-1])
    return accessions
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
def search4pdb(
    uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
) -> PdbResults:
    """
    Find the PDB entries that UniProtKB lists for the given accessions.

    The accessions are queried in batches, each batch with its own SPARQL query.

    Args:
        uniprot_accs: UniProt accessions.
        limit: Maximum number of results to return. Passed to the query of each batch.
        timeout: Timeout for the SPARQL query in seconds. Passed to the query of each batch.
        batch_size: Size of batches to process the UniProt accessions.

    Returns:
        Dictionary with protein IDs as keys and sets of PDB results as values.
    """
    nr_accessions = len(uniprot_accs)
    collected = []
    progress = tqdm(
        total=nr_accessions,
        desc="Searching for PDBs of uniprots",
        # A progress bar is only shown when the accessions do not fit in a single partial batch
        disable=nr_accessions < batch_size,
        unit="acc",
    )
    with progress as pbar:
        for batch in batched(uniprot_accs, batch_size, strict=False):
            sparql_query = _build_sparql_query_pdb(batch, limit)
            logger.info("Executing SPARQL query for PDB: %s", sparql_query)

            collected += _execute_sparql_search(sparql_query=sparql_query, timeout=timeout)
            pbar.update(len(batch))

    # NOTE(review): the limit is applied to each batch, while the combined number of results is checked here,
    # so with several batches a warning can be logged although no batch reached the limit.
    limit_check("Search for pdbs on uniprot", limit, len(collected))
    return _flatten_results_pdb(collected)
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def search4af(
    uniprot_accs: Collection[str],
    min_sequence_length: int | None = None,
    max_sequence_length: int | None = None,
    limit: int = 10_000,
    timeout: int = 1_800,
    batch_size: int = 10_000,
) -> dict[str, set[str]]:
    """
    Find the AlphaFold entries that UniProtKB lists for the given accessions.

    The accessions are queried in batches, each batch with its own SPARQL query.

    Args:
        uniprot_accs: UniProt accessions.
        min_sequence_length: Minimum length of the canonical sequence.
        max_sequence_length: Maximum length of the canonical sequence.
        limit: Maximum number of results to return. Passed to the query of each batch.
        timeout: Timeout for the SPARQL query in seconds. Passed to the query of each batch.
        batch_size: Size of batches to process the UniProt accessions.

    Returns:
        Dictionary with protein IDs as keys and sets of AlphaFold IDs as values.
    """
    nr_accessions = len(uniprot_accs)
    collected = []
    progress = tqdm(
        total=nr_accessions,
        desc="Searching for AlphaFolds of uniprots",
        # A progress bar is only shown when the accessions do not fit in a single partial batch
        disable=nr_accessions < batch_size,
        unit="acc",
    )
    with progress as pbar:
        for batch in batched(uniprot_accs, batch_size, strict=False):
            sparql_query = _build_sparql_query_af(batch, min_sequence_length, max_sequence_length, limit)
            logger.info("Executing SPARQL query for AlphaFold: %s", sparql_query)

            collected += _execute_sparql_search(sparql_query=sparql_query, timeout=timeout)
            pbar.update(len(batch))

    # NOTE(review): the limit is applied to each batch, while the combined number of results is checked here,
    # so with several batches a warning can be logged although no batch reached the limit.
    limit_check("Search for alphafold entries on uniprot", limit, len(collected))
    return _flatten_results_af(collected)
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def search4emdb(uniprot_accs: Iterable[str], limit: int = 10_000, timeout: int = 1_800) -> dict[str, set[str]]:
    """
    Find the EMDB entries that UniProtKB lists for the given accessions.

    All accessions are sent in a single SPARQL query.

    Args:
        uniprot_accs: UniProt accessions.
        limit: Maximum number of results to return.
        timeout: Timeout for the SPARQL query in seconds.

    Returns:
        Dictionary with protein IDs as keys and sets of EMDB IDs as values.
    """
    sparql_query = _build_sparql_query_emdb(uniprot_accs, limit)
    logger.info("Executing SPARQL query for EMDB: %s", sparql_query)

    bindings = _execute_sparql_search(sparql_query=sparql_query, timeout=timeout)
    limit_check("Search for EMDB entries on uniprot", limit, len(bindings))
    return _flatten_results_emdb(bindings)
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
def _build_complex_sparql_query(uniprot_accs: Iterable[str], limit: int) -> str:
    """Compose the SPARQL query that finds ComplexPortal complexes for UniProt accessions.

    Only the SELECT, WHERE and GROUP BY fragments are defined here; they are
    handed to `_build_sparql_generic_by_uniprot_accessions_query` which
    produces the final query string.

    Example:

        Illustrative query. Note that the fragments defined in this function
        do not select the `?member_count` column shown below.

        ```sparql
        PREFIX up: <http://purl.uniprot.org/core/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT
            ?protein
            ?cp_db
            ?cp_comment
            (GROUP_CONCAT(
                DISTINCT STRAFTER(STR(?member), "http://purl.uniprot.org/uniprot/"); separator=","
            ) AS ?complex_members)
            (COUNT(DISTINCT ?member) AS ?member_count)
        WHERE {
            # Input UniProt accessions
            VALUES (?ac) { ("P05067") ("P60709") ("Q05471")}
            BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)

            # ComplexPortal cross-reference for each input protein
            ?protein a up:Protein ;
                rdfs:seeAlso ?cp_db .
            ?cp_db up:database <http://purl.uniprot.org/database/ComplexPortal> .
            OPTIONAL { ?cp_db rdfs:comment ?cp_comment . }

            # All member proteins of the same ComplexPortal complex
            ?member a up:Protein ;
                rdfs:seeAlso ?cp_db .
        }
        GROUP BY ?protein ?cp_db ?cp_comment
        ORDER BY ?protein ?cp_db
        LIMIT 500
        ```

    Args:
        uniprot_accs: UniProt accessions to look up.
        limit: Limit forwarded to the generic query builder.

    Returns:
        The SPARQL query as a string.
    """
    # Columns returned per (protein, complex) pair; members are comma-joined accessions.
    projection = dedent("""\
        ?protein ?cp_db ?cp_comment
        (GROUP_CONCAT(
        DISTINCT STRAFTER(STR(?member), "http://purl.uniprot.org/uniprot/"); separator=","
        ) AS ?complex_members)
    """)
    # Graph pattern: the query protein and every member share the same ComplexPortal xref.
    graph_pattern = dedent("""
        # --- Complex Info ---
        ?protein a up:Protein ;
        rdfs:seeAlso ?cp_db .
        ?cp_db up:database <http://purl.uniprot.org/database/ComplexPortal> .
        OPTIONAL { ?cp_db rdfs:comment ?cp_comment . }
        # All member proteins of the same ComplexPortal complex
        ?member a up:Protein ;
        rdfs:seeAlso ?cp_db .
    """)
    # Grouping must list every non-aggregated column of the projection.
    grouping = dedent("""
        ?protein ?cp_db ?cp_comment
    """)
    return _build_sparql_generic_by_uniprot_accessions_query(
        uniprot_accs, projection, graph_pattern, limit=limit, groupby_clause=grouping
    )
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
@dataclass(frozen=True)
class ComplexPortalEntry:
    """One ComplexPortal complex found for a queried UniProt accession.

    Instances are immutable (frozen). Because `members` is a regular `set`,
    instances can be compared for equality but cannot be hashed.

    Parameters:
        query_protein: UniProt accession through which the complex was found.
        complex_id: ComplexPortal identifier, for example "CPX-1234".
        complex_url: URL to the ComplexPortal entry.
        complex_title: Title of the complex.
        members: UniProt accessions of the proteins that are members of the complex.
    """

    query_protein: str
    complex_id: str
    complex_url: str
    complex_title: str
    members: set[str]
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
def _flatten_results_complex(raw_results) -> list[ComplexPortalEntry]:
    """Turn raw SPARQL result rows into `ComplexPortalEntry` objects.

    Each row is read as a mapping with "protein", "cp_db" and
    "complex_members" bindings (each holding a "value") and an optional
    "cp_comment" binding.

    Args:
        raw_results: Iterable of raw result rows.

    Returns:
        One entry per row, in the order of the rows.
    """

    def to_entry(row) -> ComplexPortalEntry:
        # The accession and complex identifier are the last path segment of their IRIs.
        accession = row["protein"]["value"].rpartition("/")[2]
        identifier = row["cp_db"]["value"].rpartition("/")[2]
        url = f"https://www.ebi.ac.uk/complexportal/complex/{identifier}"
        # The comment binding is optional in the query, so fall back to an empty title.
        title = row.get("cp_comment", {}).get("value", "")
        # Members arrive as one comma separated string.
        member_accessions = set(row["complex_members"]["value"].split(","))
        return ComplexPortalEntry(
            query_protein=accession,
            complex_id=identifier,
            complex_url=url,
            complex_title=title,
            members=member_accessions,
        )

    return [to_entry(row) for row in raw_results]
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def search4macromolecular_complexes(
    uniprot_accs: Iterable[str], limit: int = 10_000, timeout: int = 1_800
) -> list[ComplexPortalEntry]:
    """Find the macromolecular complexes that the given UniProtKB accessions take part in.

    Uses the cross-references to the https://www.ebi.ac.uk/complexportal/ database
    that are available in the Uniprot SPARQL endpoint.

    Args:
        uniprot_accs: UniProt accessions.
        limit: Maximum number of results to return. The number of raw rows
            is also handed to `limit_check` together with this value.
        timeout: Timeout for the SPARQL query in seconds.

    Returns:
        List of ComplexPortalEntry objects.
    """
    query = _build_complex_sparql_query(uniprot_accs, limit)
    logger.info("Executing SPARQL query for macromolecular complexes: %s", query)
    rows = _execute_sparql_search(sparql_query=query, timeout=timeout)
    limit_check("Search for complexes", limit, len(rows))
    return _flatten_results_complex(rows)
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def search4interaction_partners(
    uniprot_accession: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
) -> dict[str, set[str]]:
    """Find proteins that share a ComplexPortal complex with the given UniProt accession.

    Args:
        uniprot_accession: UniProt accession to search interaction partners for.
        excludes: Set of UniProt accessions to leave out of the results,
            for example interaction partners that are already known.
            If None then no complex members are excluded.
        limit: Maximum number of results to return.
        timeout: Timeout for the SPARQL query in seconds.

    Returns:
        Dictionary with UniProt accessions of interaction partners as keys and sets of ComplexPortal entry IDs
        in which the interaction occurs as values.
    """
    complexes = search4macromolecular_complexes([uniprot_accession], limit=limit, timeout=timeout)
    skipped = set() if excludes is None else excludes
    partners: dict[str, set[str]] = {}
    for entry in complexes:
        for member in entry.members:
            # The query protein itself and excluded accessions are not partners.
            if member == uniprot_accession or member in skipped:
                continue
            partners.setdefault(member, set()).add(entry.complex_id)
    return partners
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
@dataclass(frozen=True)
class UniprotDetails:
    """Summary information about a single UniProt entry.

    Instances are immutable (frozen).

    Parameters:
        uniprot_accession: Accession of the entry.
        uniprot_id: UniProt ID of the entry, also known as its mnemonic.
        sequence_length: Number of residues in the canonical sequence.
        reviewed: True for a reviewed (Swiss-Prot) entry, False for an unreviewed (TrEMBL) entry.
        protein_name: Recommended name of the protein.
        taxon_id: NCBI Taxonomy ID of the organism.
        taxon_name: Scientific name of the organism.
    """

    uniprot_accession: str
    uniprot_id: str
    sequence_length: int
    reviewed: bool
    protein_name: str
    taxon_id: int
    taxon_name: str
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def map_uniprot_accessions2uniprot_details(
    uniprot_accessions: Collection[str], timeout: int = 1_800, batch_size: int = 1000
) -> Generator[UniprotDetails]:
    """Retrieve details of UniProt entries from the UniProt SPARQL endpoint.

    The accessions are processed in batches of `batch_size`; one SPARQL query
    is executed per batch and its rows are yielded before the next batch is
    requested. A progress bar is shown unless there are fewer accessions than
    `batch_size`.

    Example:

        SPARQL query to get details for 7 UniProt entries, run on [https://sparql.uniprot.org/sparql](https://sparql.uniprot.org/sparql).
        The query built by this function differs in that the recommended name
        lookup is wrapped in `OPTIONAL`, so entries without one get an empty
        `protein_name`.

        ```sparql
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX up: <http://purl.uniprot.org/core/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

        SELECT
            (?ac AS ?uniprot_accession)
            ?uniprot_id
            (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
            ?taxon_name
            ?reviewed
            ?protein_name
            (STRLEN(?sequence) AS ?seq_length)
        WHERE {
            # Input UniProt accessions
            VALUES (?ac) { ("P05067") ("A6NGD5") ("O14627") ("P00697") ("P42284") ("A0A0B5AC95") ("A0A0S2Z4R0")}
            BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)
            ?protein a up:Protein .
            ?protein up:mnemonic ?uniprot_id .
            ?protein up:organism ?organism .
            ?organism up:scientificName ?taxon_name .
            ?protein up:reviewed ?reviewed .
            ?protein up:recommendedName/up:fullName ?protein_name .
            ?protein up:sequence ?isoform .
            ?isoform a up:Simple_Sequence .
            ?isoform rdf:value ?sequence .
            BIND (IRI(STRBEFORE(REPLACE(
                STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
            ), "-")) AS ?ac_of_isoform)
            FILTER(?ac_of_isoform = ?protein)
        }
        ```

    Args:
        uniprot_accessions: Collection of UniProt accessions; its length is used for the progress bar.
        timeout: Timeout for the SPARQL query in seconds.
        batch_size: Size of batches to process the UniProt accessions.
            Also used as the limit of each batch query.

    Yields:
        UniprotDetails objects in random order.
    """
    # Columns of every result row; the names are the keys read from the rows below.
    projection = dedent("""\
        (?ac AS ?uniprot_accession)
        ?uniprot_id
        (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
        ?taxon_name
        ?reviewed
        ?protein_name
        (STRLEN(?sequence) AS ?seq_length)
    """)
    # The BIND/FILTER pair keeps only sequences whose isoform IRI maps back to the queried protein.
    graph_pattern = dedent("""
        ?protein up:mnemonic ?uniprot_id .
        ?protein up:organism ?organism .
        ?organism up:scientificName ?taxon_name .
        ?protein up:reviewed ?reviewed .
        OPTIONAL {
        ?protein up:recommendedName/up:fullName ?protein_name .
        }
        ?protein up:sequence ?isoform .
        ?isoform a up:Simple_Sequence .
        ?isoform rdf:value ?sequence .
        BIND (IRI(STRBEFORE(REPLACE(
        STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
        ), "-")) AS ?ac_of_isoform)
        FILTER(?ac_of_isoform = ?protein)
    """)
    nr_accessions = len(uniprot_accessions)
    with tqdm(
        total=nr_accessions,
        desc="Retrieving UniProt details",
        disable=nr_accessions < batch_size,
        unit="acc",
    ) as progress:
        for chunk in batched(uniprot_accessions, batch_size, strict=False):
            query = _build_sparql_generic_by_uniprot_accessions_query(
                chunk, projection, graph_pattern, limit=batch_size
            )
            logger.info("Executing SPARQL query for UniProt details: %s", query)
            rows = _execute_sparql_search(sparql_query=query, timeout=timeout)
            for row in rows:
                # The recommended name is optional in the query, so the binding may be absent.
                name_binding = row.get("protein_name", {})
                recommended_name = name_binding.get("value", "")
                yield UniprotDetails(
                    uniprot_accession=row["uniprot_accession"]["value"],
                    uniprot_id=row["uniprot_id"]["value"],
                    sequence_length=int(row["seq_length"]["value"]),
                    reviewed=row["reviewed"]["value"] == "true",
                    protein_name=recommended_name,
                    taxon_id=int(row["taxon_id"]["value"]),
                    taxon_name=row["taxon_name"]["value"],
                )
            # Progress advances only after every row of the batch has been consumed.
            progress.update(len(chunk))
|