protein-quest 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/entry_summary.py +46 -22
- protein_quest/alphafold/fetch.py +53 -28
- protein_quest/cli.py +263 -57
- protein_quest/mcp_server.py +15 -4
- protein_quest/structure.py +24 -0
- protein_quest/uniprot.py +287 -15
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/METADATA +32 -6
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/RECORD +12 -12
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/mcp_server.py
CHANGED
|
@@ -45,10 +45,14 @@ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
|
|
|
45
45
|
from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
|
|
46
46
|
from protein_quest.emdb import fetch as emdb_fetch
|
|
47
47
|
from protein_quest.go import search_gene_ontology_term
|
|
48
|
-
from protein_quest.io import convert_to_cif_file, glob_structure_files
|
|
48
|
+
from protein_quest.io import convert_to_cif_file, glob_structure_files, read_structure
|
|
49
49
|
from protein_quest.pdbe.fetch import fetch as pdbe_fetch
|
|
50
50
|
from protein_quest.ss import filter_file_on_secondary_structure
|
|
51
|
-
from protein_quest.structure import
|
|
51
|
+
from protein_quest.structure import (
|
|
52
|
+
nr_residues_in_chain,
|
|
53
|
+
structure2uniprot_accessions,
|
|
54
|
+
write_single_chain_structure_file,
|
|
55
|
+
)
|
|
52
56
|
from protein_quest.taxonomy import search_taxon
|
|
53
57
|
from protein_quest.uniprot import (
|
|
54
58
|
PdbResult,
|
|
@@ -129,7 +133,7 @@ def extract_single_chain_from_structure(
|
|
|
129
133
|
|
|
130
134
|
@mcp.tool
|
|
131
135
|
def list_structure_files(path: Path) -> list[Path]:
|
|
132
|
-
"""List structure files (.pdb, .pdb.gz, .cif, .cif.gz) in the specified directory."""
|
|
136
|
+
"""List structure files (.pdb, .pdb.gz, .cif, .cif.gz, .bcif) in the specified directory."""
|
|
133
137
|
return list(glob_structure_files(path))
|
|
134
138
|
|
|
135
139
|
|
|
@@ -150,7 +154,7 @@ def search_alphafolds(
|
|
|
150
154
|
Field(description="Set of uniprot accessions which have an AlphaFold entry"),
|
|
151
155
|
]:
|
|
152
156
|
"""Search for AlphaFold entries in UniProtKB accessions."""
|
|
153
|
-
# each uniprot
|
|
157
|
+
# each uniprot accession can have one or more AlphaFold IDs
|
|
154
158
|
# an AlphaFold ID is the same as the uniprot accession
|
|
155
159
|
# so we return a subset of uniprot_accs
|
|
156
160
|
results = search4af(uniprot_accs, limit)
|
|
@@ -203,6 +207,13 @@ mcp.tool(filter_file_on_secondary_structure)
|
|
|
203
207
|
mcp.tool(convert_to_cif_file)
|
|
204
208
|
|
|
205
209
|
|
|
210
|
+
@mcp.tool
def uniprot_accessions_of_structure_file(file: Path) -> set[str]:
    """Extract UniProt accessions from structure file."""
    # Parse the file first, then collect the accessions from the parsed structure.
    return structure2uniprot_accessions(read_structure(file))
|
|
215
|
+
|
|
216
|
+
|
|
206
217
|
@mcp.prompt
|
|
207
218
|
def candidate_structures(
|
|
208
219
|
species: str = "Human",
|
protein_quest/structure.py
CHANGED
|
@@ -206,3 +206,27 @@ def write_single_chain_structure_file(
|
|
|
206
206
|
write_structure(structure, output_file)
|
|
207
207
|
|
|
208
208
|
return output_file
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def structure2uniprot_accessions(structure: gemmi.Structure) -> set[str]:
    """Extract UniProt accessions from a gemmi Structure object.

    The accessions are read from the `_struct_ref` category of the mmCIF block
    generated from the structure; only rows whose `db_name` equals "UNP" are used.

    Logs a warning and returns an empty set if no accessions are found in structure.

    Args:
        structure: The gemmi Structure object to extract UniProt accessions from.

    Returns:
        A set of UniProt accessions found in the structure.
    """
    # Only the struct_ref output group is requested; all other groups are switched off.
    block = structure.make_mmcif_block(gemmi.MmcifOutputGroups(False, struct_ref=True))
    struct_ref = block.get_mmcif_category("_struct_ref.")
    # Fall back to empty columns so that a structure without database references
    # reaches the warning below instead of failing on a missing key.
    db_names = struct_ref.get("db_name", [])
    accessions = struct_ref.get("pdbx_db_accession", [])
    uniprot_accessions: set[str] = {
        accession
        for db_name, accession in zip(db_names, accessions)
        # NOTE(review): gemmi is assumed to hand back None/False for '?'/'.' cells;
        # such placeholders are skipped so the result only holds real accessions — confirm.
        if db_name == "UNP" and accession
    }
    if not uniprot_accessions:
        logger.warning("No UniProt accessions found in structure %s", structure.name)
    return uniprot_accessions
|
protein_quest/uniprot.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"""Module for searching UniProtKB using SPARQL."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from collections.abc import Collection, Iterable
|
|
4
|
+
from collections.abc import Collection, Generator, Iterable
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
+
from functools import cached_property
|
|
6
7
|
from itertools import batched
|
|
7
8
|
from textwrap import dedent
|
|
8
9
|
|
|
@@ -24,6 +25,8 @@ class Query:
|
|
|
24
25
|
(e.g., ["GO:0005634"]) or a collection of GO terms (e.g., ["GO:0005634", "GO:0005737"]).
|
|
25
26
|
molecular_function_go: Molecular function in GO format. Can be a single GO term
|
|
26
27
|
(e.g., ["GO:0003674"]) or a collection of GO terms (e.g., ["GO:0003674", "GO:0008150"]).
|
|
28
|
+
min_sequence_length: Minimum length of the canonical sequence.
|
|
29
|
+
max_sequence_length: Maximum length of the canonical sequence.
|
|
27
30
|
"""
|
|
28
31
|
|
|
29
32
|
# TODO make taxon_id an int
|
|
@@ -32,6 +35,8 @@ class Query:
|
|
|
32
35
|
subcellular_location_uniprot: str | None = None
|
|
33
36
|
subcellular_location_go: list[str] | None = None
|
|
34
37
|
molecular_function_go: list[str] | None = None
|
|
38
|
+
min_sequence_length: int | None = None
|
|
39
|
+
max_sequence_length: int | None = None
|
|
35
40
|
|
|
36
41
|
|
|
37
42
|
def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
|
|
@@ -39,16 +44,17 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
|
|
|
39
44
|
|
|
40
45
|
The UniProt chains string is formatted (with EBNF notation) as follows:
|
|
41
46
|
|
|
42
|
-
chain_group
|
|
47
|
+
chain_group=range(,chain_group=range)*
|
|
43
48
|
|
|
44
49
|
where:
|
|
45
50
|
chain_group := chain_id(/chain_id)*
|
|
46
|
-
chain_id := [A-Za-
|
|
51
|
+
chain_id := [A-Za-z0-9]+
|
|
47
52
|
range := start-end
|
|
48
53
|
start, end := integer
|
|
49
54
|
|
|
50
55
|
Args:
|
|
51
56
|
uniprot_chains: A string representing UniProt chains, for example "B/D=1-81".
|
|
57
|
+
|
|
52
58
|
Returns:
|
|
53
59
|
The first chain identifier from the UniProt chain string. For example "B".
|
|
54
60
|
"""
|
|
@@ -66,6 +72,27 @@ def _first_chain_from_uniprot_chains(uniprot_chains: str) -> str:
|
|
|
66
72
|
return chain
|
|
67
73
|
|
|
68
74
|
|
|
75
|
+
def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
    """Sum the lengths of all residue ranges in a UniProt chains string.

    The format of the string is described in `_first_chain_from_uniprot_chains`.

    Args:
        uniprot_chains: UniProt chains string, for example "B/D=1-81".

    Returns:
        Number of residues covered by all ranges together.
        For example 81 for "B/D=1-81".
    """

    def _range_length(mapping: str) -> int:
        # A mapping looks like "<chain ids>=<start>-<end>"; only the range matters here.
        _chain_ids, residue_range = mapping.split("=")
        first, last = residue_range.split("-")
        # Both ends are inclusive 1-based positions, hence the + 1.
        return int(last) - int(first) + 1

    return sum(_range_length(mapping) for mapping in uniprot_chains.split(","))
|
|
94
|
+
|
|
95
|
+
|
|
69
96
|
@dataclass(frozen=True)
|
|
70
97
|
class PdbResult:
|
|
71
98
|
"""Result of a PDB search in UniProtKB.
|
|
@@ -82,11 +109,57 @@ class PdbResult:
|
|
|
82
109
|
uniprot_chains: str
|
|
83
110
|
resolution: str | None = None
|
|
84
111
|
|
|
85
|
-
@
|
|
112
|
+
@cached_property
|
|
86
113
|
def chain(self) -> str:
|
|
87
114
|
"""The first chain from the UniProt chains aka self.uniprot_chains."""
|
|
88
115
|
return _first_chain_from_uniprot_chains(self.uniprot_chains)
|
|
89
116
|
|
|
117
|
+
@cached_property
|
|
118
|
+
def chain_length(self) -> int:
|
|
119
|
+
"""The length of the chain from the UniProt chains aka self.uniprot_chains."""
|
|
120
|
+
return _chain_length_from_uniprot_chains(self.uniprot_chains)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
type PdbResults = dict[str, set[PdbResult]]
|
|
124
|
+
"""Dictionary with uniprot accessions as keys and sets of PDB results as values."""
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def filter_pdb_results_on_chain_length(
    pdb_results: PdbResults,
    min_residues: int | None,
    max_residues: int | None,
) -> PdbResults:
    """Keep only the PDB results whose chain length lies within the given bounds.

    Args:
        pdb_results: Dictionary with protein IDs as keys and sets of PDB results as values.
        min_residues: Smallest accepted chain length (inclusive) of the chain mapped
            to the UniProt accession. If None, there is no lower bound.
        max_residues: Largest accepted chain length (inclusive) of the chain mapped
            to the UniProt accession. If None, there is no upper bound.

    Returns:
        Dictionary with protein IDs as keys and the surviving PDB results as values.
        Protein IDs without any surviving PDB result are left out.
        When both bounds are None the input dictionary itself is returned.

    Raises:
        ValueError: If both bounds are given and max_residues is not greater than min_residues.
    """
    has_min = min_residues is not None
    has_max = max_residues is not None
    if not (has_min or has_max):
        # Nothing to filter on, hand back the input untouched.
        return pdb_results
    if has_min and has_max and max_residues <= min_residues:
        msg = f"Maximum number of residues ({max_residues}) must be > minimum number of residues ({min_residues})"
        raise ValueError(msg)

    def _length_ok(length: int) -> bool:
        too_short = has_min and length < min_residues
        too_long = has_max and length > max_residues
        return not (too_short or too_long)

    filtered = (
        (accession, {entry for entry in entries if _length_ok(entry.chain_length)})
        for accession, entries in pdb_results.items()
    )
    # Accessions that lost all their entries are dropped from the result.
    return {accession: kept for accession, kept in filtered if kept}
|
|
162
|
+
|
|
90
163
|
|
|
91
164
|
def _query2dynamic_sparql_triples(query: Query):
|
|
92
165
|
parts: list[str] = []
|
|
@@ -110,6 +183,13 @@ def _query2dynamic_sparql_triples(query: Query):
|
|
|
110
183
|
molecular_function_filter = _create_go_filter(go_terms, "Molecular function")
|
|
111
184
|
parts.append(molecular_function_filter)
|
|
112
185
|
|
|
186
|
+
if query.min_sequence_length is not None or query.max_sequence_length is not None:
|
|
187
|
+
length_filter = _build_sparql_query_sequence_length_filter(
|
|
188
|
+
min_length=query.min_sequence_length,
|
|
189
|
+
max_length=query.max_sequence_length,
|
|
190
|
+
)
|
|
191
|
+
parts.append(length_filter)
|
|
192
|
+
|
|
113
193
|
return "\n".join(parts)
|
|
114
194
|
|
|
115
195
|
|
|
@@ -237,6 +317,57 @@ def _build_sparql_query_uniprot(query: Query, limit=10_000) -> str:
|
|
|
237
317
|
return _build_sparql_generic_query(select_clause, dedent(where_clause), limit)
|
|
238
318
|
|
|
239
319
|
|
|
320
|
+
def _build_sparql_query_sequence_length_filter(min_length: int | None = None, max_length: int | None = None) -> str:
|
|
321
|
+
"""Builds a SPARQL filter for sequence length.
|
|
322
|
+
|
|
323
|
+
See 107_uniprot_sequences_and_mark_which_is_cannonical_for_human
|
|
324
|
+
on https://sparql.uniprot.org/.well-known/sparql-examples/ for similar query.
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
min_length: Minimum sequence length. If None, no minimum is applied.
|
|
328
|
+
max_length: Maximum sequence length. If None, no maximum is applied.
|
|
329
|
+
"""
|
|
330
|
+
if min_length is None and max_length is None:
|
|
331
|
+
return ""
|
|
332
|
+
# An uniprot entry can have multiple isoforms,
|
|
333
|
+
# we want to check the length of the canonical isoform
|
|
334
|
+
# We do this by selecting the isoform that is not based on another isoform
|
|
335
|
+
# and excluding isoforms from other uniprot entries.
|
|
336
|
+
# For example for http://purl.uniprot.org/uniprot/P42284:
|
|
337
|
+
# - http://purl.uniprot.org/isoforms/P42284-2 is ok
|
|
338
|
+
# - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
|
|
339
|
+
# - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
|
|
340
|
+
# TODO use same approach as in retrieve_uniprot_details function
|
|
341
|
+
header = dedent("""\
|
|
342
|
+
?protein up:sequence ?isoform .
|
|
343
|
+
FILTER NOT EXISTS { ?isoform up:basedOn ?parent_isoform }
|
|
344
|
+
FILTER(
|
|
345
|
+
STRAFTER(STR(?protein), "http://purl.uniprot.org/uniprot/") =
|
|
346
|
+
STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-"))
|
|
347
|
+
?isoform rdf:value ?sequence .
|
|
348
|
+
BIND (STRLEN(?sequence) AS ?seq_length)
|
|
349
|
+
""")
|
|
350
|
+
if min_length is not None and max_length is not None:
|
|
351
|
+
if max_length <= min_length:
|
|
352
|
+
msg = f"Maximum sequence length ({max_length}) must be greater than minimum sequence length ({min_length})"
|
|
353
|
+
raise ValueError(msg)
|
|
354
|
+
return dedent(f"""\
|
|
355
|
+
{header}
|
|
356
|
+
FILTER (?seq_length >= {min_length} && ?seq_length <= {max_length})
|
|
357
|
+
""")
|
|
358
|
+
if min_length is not None:
|
|
359
|
+
return dedent(f"""\
|
|
360
|
+
{header}
|
|
361
|
+
FILTER (?seq_length >= {min_length})
|
|
362
|
+
""")
|
|
363
|
+
if max_length is not None:
|
|
364
|
+
return dedent(f"""\
|
|
365
|
+
{header}
|
|
366
|
+
FILTER (?seq_length <= {max_length})
|
|
367
|
+
""")
|
|
368
|
+
return ""
|
|
369
|
+
|
|
370
|
+
|
|
240
371
|
def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
|
|
241
372
|
# For http://purl.uniprot.org/uniprot/O00268 + http://rdf.wwpdb.org/pdb/1H3O
|
|
242
373
|
# the chainSequenceMapping are
|
|
@@ -248,7 +379,7 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
|
|
|
248
379
|
# http://purl.uniprot.org/isoforms/O00255-2#PDB_3U84_tt2tt459
|
|
249
380
|
# To get the chain belonging to the uniprot/pdb pair we need to
|
|
250
381
|
# do some string filtering.
|
|
251
|
-
# Also there can be multiple
|
|
382
|
+
# Also there can be multiple chains for the same uniprot/pdb pair, so we need to
|
|
252
383
|
# do a group by and concat
|
|
253
384
|
|
|
254
385
|
select_clause = dedent("""\
|
|
@@ -274,7 +405,12 @@ def _build_sparql_query_pdb(uniprot_accs: Iterable[str], limit=10_000) -> str:
|
|
|
274
405
|
)
|
|
275
406
|
|
|
276
407
|
|
|
277
|
-
def _build_sparql_query_af(
|
|
408
|
+
def _build_sparql_query_af(
|
|
409
|
+
uniprot_accs: Iterable[str],
|
|
410
|
+
min_sequence_length: int | None = None,
|
|
411
|
+
max_sequence_length: int | None = None,
|
|
412
|
+
limit=10_000,
|
|
413
|
+
) -> str:
|
|
278
414
|
select_clause = "?protein ?af_db"
|
|
279
415
|
where_clause = dedent("""
|
|
280
416
|
# --- Protein Selection ---
|
|
@@ -284,6 +420,12 @@ def _build_sparql_query_af(uniprot_accs: Iterable[str], limit=10_000) -> str:
|
|
|
284
420
|
?protein rdfs:seeAlso ?af_db .
|
|
285
421
|
?af_db up:database <http://purl.uniprot.org/database/AlphaFoldDB> .
|
|
286
422
|
""")
|
|
423
|
+
if min_sequence_length is not None or max_sequence_length is not None:
|
|
424
|
+
length_filter = _build_sparql_query_sequence_length_filter(
|
|
425
|
+
min_length=min_sequence_length,
|
|
426
|
+
max_length=max_sequence_length,
|
|
427
|
+
)
|
|
428
|
+
where_clause += "\n" + length_filter
|
|
287
429
|
return _build_sparql_generic_by_uniprot_accessions_query(uniprot_accs, select_clause, dedent(where_clause), limit)
|
|
288
430
|
|
|
289
431
|
|
|
@@ -337,8 +479,8 @@ def _execute_sparql_search(
|
|
|
337
479
|
return bindings
|
|
338
480
|
|
|
339
481
|
|
|
340
|
-
def _flatten_results_pdb(rawresults: Iterable) ->
|
|
341
|
-
pdb_entries:
|
|
482
|
+
def _flatten_results_pdb(rawresults: Iterable) -> PdbResults:
|
|
483
|
+
pdb_entries: PdbResults = {}
|
|
342
484
|
for result in rawresults:
|
|
343
485
|
protein = result["protein"]["value"].split("/")[-1]
|
|
344
486
|
if "pdb_db" not in result: # Should not happen with build_sparql_query_pdb
|
|
@@ -424,7 +566,7 @@ def search4uniprot(query: Query, limit: int = 10_000, timeout: int = 1_800) -> s
|
|
|
424
566
|
|
|
425
567
|
def search4pdb(
|
|
426
568
|
uniprot_accs: Collection[str], limit: int = 10_000, timeout: int = 1_800, batch_size: int = 10_000
|
|
427
|
-
) ->
|
|
569
|
+
) -> PdbResults:
|
|
428
570
|
"""
|
|
429
571
|
Search for PDB entries in UniProtKB accessions.
|
|
430
572
|
|
|
@@ -456,13 +598,20 @@ def search4pdb(
|
|
|
456
598
|
|
|
457
599
|
|
|
458
600
|
def search4af(
|
|
459
|
-
uniprot_accs: Collection[str],
|
|
601
|
+
uniprot_accs: Collection[str],
|
|
602
|
+
min_sequence_length: int | None = None,
|
|
603
|
+
max_sequence_length: int | None = None,
|
|
604
|
+
limit: int = 10_000,
|
|
605
|
+
timeout: int = 1_800,
|
|
606
|
+
batch_size: int = 10_000,
|
|
460
607
|
) -> dict[str, set[str]]:
|
|
461
608
|
"""
|
|
462
609
|
Search for AlphaFold entries in UniProtKB accessions.
|
|
463
610
|
|
|
464
611
|
Args:
|
|
465
612
|
uniprot_accs: UniProt accessions.
|
|
613
|
+
min_sequence_length: Minimum length of the canonical sequence.
|
|
614
|
+
max_sequence_length: Maximum length of the canonical sequence.
|
|
466
615
|
limit: Maximum number of results to return.
|
|
467
616
|
timeout: Timeout for the SPARQL query in seconds.
|
|
468
617
|
batch_size: Size of batches to process the UniProt accessions.
|
|
@@ -474,7 +623,7 @@ def search4af(
|
|
|
474
623
|
total = len(uniprot_accs)
|
|
475
624
|
with tqdm(total=total, desc="Searching for AlphaFolds of uniprots", disable=total < batch_size, unit="acc") as pbar:
|
|
476
625
|
for batch in batched(uniprot_accs, batch_size, strict=False):
|
|
477
|
-
sparql_query = _build_sparql_query_af(batch, limit)
|
|
626
|
+
sparql_query = _build_sparql_query_af(batch, min_sequence_length, max_sequence_length, limit)
|
|
478
627
|
logger.info("Executing SPARQL query for AlphaFold: %s", sparql_query)
|
|
479
628
|
|
|
480
629
|
raw_results = _execute_sparql_search(
|
|
@@ -639,12 +788,12 @@ def search4macromolecular_complexes(
|
|
|
639
788
|
|
|
640
789
|
|
|
641
790
|
def search4interaction_partners(
|
|
642
|
-
|
|
791
|
+
uniprot_accession: str, excludes: set[str] | None = None, limit: int = 10_000, timeout: int = 1_800
|
|
643
792
|
) -> dict[str, set[str]]:
|
|
644
793
|
"""Search for interaction partners of a given UniProt accession using ComplexPortal database references.
|
|
645
794
|
|
|
646
795
|
Args:
|
|
647
|
-
|
|
796
|
+
uniprot_accession: UniProt accession to search interaction partners for.
|
|
648
797
|
excludes: Set of UniProt accessions to exclude from the results.
|
|
649
798
|
For example already known interaction partners.
|
|
650
799
|
If None then no complex members are excluded.
|
|
@@ -655,14 +804,137 @@ def search4interaction_partners(
|
|
|
655
804
|
Dictionary with UniProt accessions of interaction partners as keys and sets of ComplexPortal entry IDs
|
|
656
805
|
in which the interaction occurs as values.
|
|
657
806
|
"""
|
|
658
|
-
ucomplexes = search4macromolecular_complexes([
|
|
807
|
+
ucomplexes = search4macromolecular_complexes([uniprot_accession], limit=limit, timeout=timeout)
|
|
659
808
|
hits: dict[str, set[str]] = {}
|
|
660
809
|
if excludes is None:
|
|
661
810
|
excludes = set()
|
|
662
811
|
for ucomplex in ucomplexes:
|
|
663
812
|
for member in ucomplex.members:
|
|
664
|
-
if member !=
|
|
813
|
+
if member != uniprot_accession and member not in excludes:
|
|
665
814
|
if member not in hits:
|
|
666
815
|
hits[member] = set()
|
|
667
816
|
hits[member].add(ucomplex.complex_id)
|
|
668
817
|
return hits
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
@dataclass(frozen=True)
class UniprotDetails:
    """Immutable record with the details of a single UniProt entry.

    Parameters:
        uniprot_accession: Accession of the UniProt entry.
        uniprot_id: ID (mnemonic) of the UniProt entry.
        sequence_length: Number of residues in the canonical sequence.
        reviewed: True for a reviewed (Swiss-Prot) entry, False for an unreviewed (TrEMBL) entry.
        protein_name: Recommended name of the protein.
        taxon_id: NCBI Taxonomy ID of the organism.
        taxon_name: Scientific name of the organism.
    """

    uniprot_accession: str
    uniprot_id: str
    sequence_length: int
    reviewed: bool
    protein_name: str
    taxon_id: int
    taxon_name: str
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
def map_uniprot_accessions2uniprot_details(
    uniprot_accessions: Collection[str], timeout: int = 1_800, batch_size: int = 1000
) -> Generator[UniprotDetails]:
    """Look up the details of UniProt entries through the UniProt SPARQL endpoint.

    The accessions are queried in batches of `batch_size`; every returned row
    is converted into a `UniprotDetails` object.

    Example:

        SPARQL query to get details for 7 UniProt entries, run on [https://sparql.uniprot.org/sparql](https://sparql.uniprot.org/sparql).

        ```sparql
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX up: <http://purl.uniprot.org/core/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

        SELECT
        (?ac AS ?uniprot_accession)
        ?uniprot_id
        (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
        ?taxon_name
        ?reviewed
        ?protein_name
        (STRLEN(?sequence) AS ?seq_length)
        WHERE {
            # Input UniProt accessions
            VALUES (?ac) { ("P05067") ("A6NGD5") ("O14627") ("P00697") ("P42284") ("A0A0B5AC95") ("A0A0S2Z4R0")}
            BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)
            ?protein a up:Protein .
            ?protein up:mnemonic ?uniprot_id .
            ?protein up:organism ?organism .
            ?organism up:scientificName ?taxon_name .
            ?protein up:reviewed ?reviewed .
            ?protein up:recommendedName/up:fullName ?protein_name .
            ?protein up:sequence ?isoform .
            ?isoform a up:Simple_Sequence .
            ?isoform rdf:value ?sequence .
            BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
            FILTER(?ac_of_isoform = ?ac)
        }
        ```

    Args:
        uniprot_accessions: Collection of UniProt accessions; its length is used
            as the total of the progress bar.
        timeout: Timeout for the SPARQL query in seconds.
        batch_size: Size of batches to process the UniProt accessions.
            Also passed as the result limit of each batch query.

    Yields:
        UniprotDetails objects in random order.
    """
    selected_variables = dedent("""\
        (?ac AS ?uniprot_accession)
        ?uniprot_id
        (STRAFTER(STR(?organism), "taxonomy/") AS ?taxon_id)
        ?taxon_name
        ?reviewed
        ?protein_name
        (STRLEN(?sequence) AS ?seq_length)
    """)
    graph_patterns = dedent("""
        ?protein a up:Protein .
        ?protein up:mnemonic ?uniprot_id .
        ?protein up:organism ?organism .
        ?organism up:scientificName ?taxon_name .
        ?protein up:reviewed ?reviewed .
        ?protein up:recommendedName/up:fullName ?protein_name .
        ?protein up:sequence ?isoform .
        ?isoform a up:Simple_Sequence .
        ?isoform rdf:value ?sequence .
        BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
        FILTER(?ac_of_isoform = ?ac)
    """)

    def _text(row, variable: str):
        # Each selected variable of a result row is looked up under its "value" key.
        return row[variable]["value"]

    nr_accessions = len(uniprot_accessions)
    with tqdm(
        total=nr_accessions,
        desc="Retrieving UniProt details",
        # No progress bar when everything fits in a single batch.
        disable=nr_accessions < batch_size,
        unit="acc",
    ) as pbar:
        for accessions_batch in batched(uniprot_accessions, batch_size, strict=False):
            sparql_query = _build_sparql_generic_by_uniprot_accessions_query(
                accessions_batch, selected_variables, graph_patterns, limit=batch_size
            )
            logger.info("Executing SPARQL query for UniProt details: %s", sparql_query)
            rows = _execute_sparql_search(
                sparql_query=sparql_query,
                timeout=timeout,
            )
            for row in rows:
                yield UniprotDetails(
                    uniprot_accession=_text(row, "uniprot_accession"),
                    uniprot_id=_text(row, "uniprot_id"),
                    sequence_length=int(_text(row, "seq_length")),
                    # The endpoint returns the boolean as text.
                    reviewed=_text(row, "reviewed") == "true",
                    protein_name=_text(row, "protein_name"),
                    taxon_id=int(_text(row, "taxon_id")),
                    taxon_name=_text(row, "taxon_name"),
                )
            # Progress advances per finished batch, not per yielded row.
            pbar.update(len(accessions_batch))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -11,7 +11,6 @@ Requires-Python: >=3.13
|
|
|
11
11
|
Requires-Dist: aiofiles>=24.1.0
|
|
12
12
|
Requires-Dist: aiohttp-retry>=2.9.1
|
|
13
13
|
Requires-Dist: aiohttp[speedups]>=3.11.18
|
|
14
|
-
Requires-Dist: aiopath>=0.7.7
|
|
15
14
|
Requires-Dist: attrs>=25.3.0
|
|
16
15
|
Requires-Dist: cattrs[orjson]>=24.1.3
|
|
17
16
|
Requires-Dist: dask>=2025.5.1
|
|
@@ -27,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
|
|
|
27
26
|
Requires-Dist: yarl>=1.20.1
|
|
28
27
|
Provides-Extra: mcp
|
|
29
28
|
Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
|
|
30
|
-
Requires-Dist: pydantic>=2.
|
|
29
|
+
Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
|
|
31
30
|
Description-Content-Type: text/markdown
|
|
32
31
|
|
|
33
32
|
# protein-quest
|
|
@@ -62,6 +61,7 @@ graph TB;
|
|
|
62
61
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
63
62
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
64
63
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
64
|
+
searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
|
|
65
65
|
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
66
66
|
searchcomplexes[/Search complexes/]
|
|
67
67
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
@@ -73,6 +73,7 @@ graph TB;
|
|
|
73
73
|
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
74
74
|
residuefilter --> |mmcif_files| ssfilter
|
|
75
75
|
ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
|
|
76
|
+
ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
|
|
76
77
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
77
78
|
goterm:::dashedBorder
|
|
78
79
|
taxonomy:::dashedBorder
|
|
@@ -80,7 +81,9 @@ graph TB;
|
|
|
80
81
|
fetchemdb:::dashedBorder
|
|
81
82
|
searchintactionpartners:::dashedBorder
|
|
82
83
|
searchcomplexes:::dashedBorder
|
|
84
|
+
searchuniprotdetails:::dashedBorder
|
|
83
85
|
convert2cif:::dashedBorder
|
|
86
|
+
convert2uniprot_accessions:::dashedBorder
|
|
84
87
|
```
|
|
85
88
|
|
|
86
89
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -111,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
|
|
|
111
114
|
protein-quest search uniprot \
|
|
112
115
|
--taxon-id 9606 \
|
|
113
116
|
--reviewed \
|
|
114
|
-
--subcellular-location-uniprot nucleus \
|
|
117
|
+
--subcellular-location-uniprot "nucleus" \
|
|
115
118
|
--subcellular-location-go GO:0005634 \
|
|
116
119
|
--molecular-function-go GO:0003677 \
|
|
117
120
|
--limit 100 \
|
|
@@ -194,7 +197,7 @@ protein-quest filter residue \
|
|
|
194
197
|
|
|
195
198
|
### To filter on secondary structure
|
|
196
199
|
|
|
197
|
-
To filter on structure being mostly alpha helices and have no beta sheets.
|
|
200
|
+
To filter on structures that are mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
|
|
198
201
|
|
|
199
202
|
```shell
|
|
200
203
|
protein-quest filter secondary-structure \
|
|
@@ -245,12 +248,35 @@ query_protein,complex_id,complex_url,complex_title,members
|
|
|
245
248
|
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
246
249
|
```
|
|
247
250
|
|
|
251
|
+
### Search for UniProt details
|
|
252
|
+
|
|
253
|
+
To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
|
|
254
|
+
|
|
255
|
+
```shell
|
|
256
|
+
protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
The `uniprot_details.csv` looks like:
|
|
260
|
+
|
|
261
|
+
```csv
|
|
262
|
+
uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
|
|
263
|
+
A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
|
|
264
|
+
```
|
|
265
|
+
|
|
248
266
|
### Convert structure files to .cif format
|
|
249
267
|
|
|
250
268
|
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
251
269
|
|
|
252
270
|
```shell
|
|
253
|
-
protein-quest convert --output-dir ./filtered-cif ./filtered-ss
|
|
271
|
+
protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Convert structure files to UniProt accessions
|
|
275
|
+
|
|
276
|
+
After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
|
|
277
|
+
|
|
278
|
+
```shell
|
|
279
|
+
protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
|
|
254
280
|
```
|
|
255
281
|
|
|
256
282
|
## Model Context Protocol (MCP) server
|
|
@@ -1,27 +1,27 @@
|
|
|
1
1
|
protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
protein_quest/__version__.py,sha256=
|
|
3
|
-
protein_quest/cli.py,sha256=
|
|
2
|
+
protein_quest/__version__.py,sha256=F9kNagC7uEvuPDju8Gzo4Jt81LSvbf0VyONV3GMXT2M,56
|
|
3
|
+
protein_quest/cli.py,sha256=082CmSSmxVZoWbnX35AmhqedA4T1dD9v-eMe0vsIDp4,55572
|
|
4
4
|
protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
|
|
5
5
|
protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
|
|
6
6
|
protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
|
|
7
7
|
protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
|
|
8
8
|
protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
|
|
9
|
-
protein_quest/mcp_server.py,sha256=
|
|
9
|
+
protein_quest/mcp_server.py,sha256=tZkSG1yx4ocN1rlKgVlU8nUbs6LKpyLrNqP3y6fbJm0,8564
|
|
10
10
|
protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
|
|
11
11
|
protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
|
|
13
|
-
protein_quest/structure.py,sha256=
|
|
13
|
+
protein_quest/structure.py,sha256=QozElPz0kbPB_HW-J1WxArTT5e-1vRyBJoBSfHnwoRM,8117
|
|
14
14
|
protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
|
|
15
|
-
protein_quest/uniprot.py,sha256=
|
|
15
|
+
protein_quest/uniprot.py,sha256=mODAcneCnDvinvJ3jffyR11klsgq5b96T_4aVWd-Luw,35158
|
|
16
16
|
protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
|
|
17
17
|
protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
|
|
18
18
|
protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
|
|
19
|
-
protein_quest/alphafold/entry_summary.py,sha256=
|
|
20
|
-
protein_quest/alphafold/fetch.py,sha256=
|
|
19
|
+
protein_quest/alphafold/entry_summary.py,sha256=Qhnw75RXFaoOU332g7axg_jYbbdZbUpsGPUOwPNDSeU,2114
|
|
20
|
+
protein_quest/alphafold/fetch.py,sha256=l8pcXeuDfoXYiwpW5N_uB_9oZpomBgUeF9kROLrM11M,14038
|
|
21
21
|
protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
|
|
22
22
|
protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
|
|
23
|
-
protein_quest-0.
|
|
24
|
-
protein_quest-0.
|
|
25
|
-
protein_quest-0.
|
|
26
|
-
protein_quest-0.
|
|
27
|
-
protein_quest-0.
|
|
23
|
+
protein_quest-0.7.0.dist-info/METADATA,sha256=JvsZl9XGN57iJn5oSBRIVNIqL6aYEHXQlGpE87nsSvQ,10722
|
|
24
|
+
protein_quest-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
25
|
+
protein_quest-0.7.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
|
|
26
|
+
protein_quest-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
27
|
+
protein_quest-0.7.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|