protein-quest 0.10.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__version__.py +1 -1
- protein_quest/cli.py +47 -3
- protein_quest/converter.py +1 -1
- protein_quest/mcp_server.py +2 -2
- protein_quest/parallel.py +22 -5
- protein_quest/uniprot.py +1 -1
- {protein_quest-0.10.1.dist-info → protein_quest-1.1.0.dist-info}/METADATA +82 -30
- {protein_quest-0.10.1.dist-info → protein_quest-1.1.0.dist-info}/RECORD +11 -11
- {protein_quest-0.10.1.dist-info → protein_quest-1.1.0.dist-info}/WHEEL +1 -1
- {protein_quest-0.10.1.dist-info → protein_quest-1.1.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.10.1.dist-info → protein_quest-1.1.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
__version__ = "
|
|
1
|
+
__version__ = "1.1.0"
|
|
2
2
|
"""The version of the package."""
|
protein_quest/cli.py
CHANGED
|
@@ -8,6 +8,7 @@ import os
|
|
|
8
8
|
import sys
|
|
9
9
|
from collections.abc import Callable, Generator, Iterable, Sequence
|
|
10
10
|
from contextlib import suppress
|
|
11
|
+
from functools import lru_cache
|
|
11
12
|
from importlib.util import find_spec
|
|
12
13
|
from io import BytesIO, TextIOWrapper
|
|
13
14
|
from pathlib import Path
|
|
@@ -20,6 +21,7 @@ from rich.logging import RichHandler
|
|
|
20
21
|
from rich.markdown import Markdown
|
|
21
22
|
from rich.panel import Panel
|
|
22
23
|
from rich_argparse import ArgumentDefaultsRichHelpFormatter
|
|
24
|
+
from rocrate_action_recorder import recorded_argparse
|
|
23
25
|
from tqdm.rich import tqdm
|
|
24
26
|
|
|
25
27
|
from protein_quest.__version__ import __version__
|
|
@@ -797,12 +799,18 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
|
|
|
797
799
|
parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
|
|
798
800
|
|
|
799
801
|
|
|
802
|
+
@lru_cache(maxsize=1)
|
|
800
803
|
def make_parser() -> argparse.ArgumentParser:
|
|
801
804
|
parser = argparse.ArgumentParser(
|
|
802
805
|
description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
|
|
803
806
|
)
|
|
804
807
|
parser.add_argument("--log-level", default="WARNING", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
|
|
805
808
|
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
|
|
809
|
+
parser.add_argument(
|
|
810
|
+
"--prov",
|
|
811
|
+
action="store_true",
|
|
812
|
+
help="Whether to write provenance information about the command execution to ro-crate-metadata.json file.",
|
|
813
|
+
)
|
|
806
814
|
shtab.add_argument_to(parser, ["--print-completion"])
|
|
807
815
|
|
|
808
816
|
subparsers = parser.add_subparsers(dest="command", required=True)
|
|
@@ -824,7 +832,26 @@ def _name_of(file: TextIOWrapper | BytesIO) -> str:
|
|
|
824
832
|
return "<stdout>"
|
|
825
833
|
|
|
826
834
|
|
|
827
|
-
def
|
|
835
|
+
def prov(
|
|
836
|
+
input_dirs: list[str] | None = None,
|
|
837
|
+
output_dirs: list[str] | None = None,
|
|
838
|
+
input_files: list[str] | None = None,
|
|
839
|
+
output_files: list[str] | None = None,
|
|
840
|
+
):
|
|
841
|
+
"""Decorator to record provenance for protein-quest commands."""
|
|
842
|
+
return recorded_argparse(
|
|
843
|
+
parser=make_parser(),
|
|
844
|
+
input_dirs=input_dirs,
|
|
845
|
+
output_dirs=output_dirs,
|
|
846
|
+
input_files=input_files,
|
|
847
|
+
output_files=output_files,
|
|
848
|
+
enabled_argument="prov",
|
|
849
|
+
dataset_license="CC BY 4.0",
|
|
850
|
+
)
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
@prov(output_files=["output"])
|
|
854
|
+
def _handle_search_uniprot(args: argparse.Namespace):
|
|
828
855
|
taxon_id = args.taxon_id
|
|
829
856
|
reviewed = args.reviewed
|
|
830
857
|
subcellular_location_uniprot = args.subcellular_location_uniprot
|
|
@@ -854,7 +881,8 @@ def _handle_search_uniprot(args):
|
|
|
854
881
|
_write_lines(output_file, sorted(accs))
|
|
855
882
|
|
|
856
883
|
|
|
857
|
-
|
|
884
|
+
@prov(input_files=["uniprot_accessions"], output_files=["output_csv"])
|
|
885
|
+
def _handle_search_pdbe(args: argparse.Namespace):
|
|
858
886
|
uniprot_accessions = args.uniprot_accessions
|
|
859
887
|
limit = args.limit
|
|
860
888
|
timeout = args.timeout
|
|
@@ -884,6 +912,7 @@ def _handle_search_pdbe(args):
|
|
|
884
912
|
rprint(f"Written to {_name_of(output_csv)}")
|
|
885
913
|
|
|
886
914
|
|
|
915
|
+
@prov(input_files=["uniprot_accessions"], output_files=["output_csv"])
|
|
887
916
|
def _handle_search_alphafold(args):
|
|
888
917
|
uniprot_accessions = args.uniprot_accessions
|
|
889
918
|
min_sequence_length = converter.structure(args.min_sequence_length, PositiveInt | None) # pyright: ignore[reportArgumentType]
|
|
@@ -905,6 +934,7 @@ def _handle_search_alphafold(args):
|
|
|
905
934
|
_write_dict_of_sets2csv(output_csv, results, "af_id")
|
|
906
935
|
|
|
907
936
|
|
|
937
|
+
@prov(input_files=["uniprot_accessions"], output_files=["output_csv"])
|
|
908
938
|
def _handle_search_emdb(args):
|
|
909
939
|
uniprot_accessions = args.uniprot_accessions
|
|
910
940
|
limit = args.limit
|
|
@@ -919,6 +949,7 @@ def _handle_search_emdb(args):
|
|
|
919
949
|
_write_dict_of_sets2csv(output_csv, results, "emdb_id")
|
|
920
950
|
|
|
921
951
|
|
|
952
|
+
@prov(output_files=["output_csv"])
|
|
922
953
|
def _handle_search_go(args):
|
|
923
954
|
term = structure(args.term, str)
|
|
924
955
|
aspect: Aspect | None = args.aspect
|
|
@@ -934,6 +965,7 @@ def _handle_search_go(args):
|
|
|
934
965
|
write_go_terms_to_csv(results, output_csv)
|
|
935
966
|
|
|
936
967
|
|
|
968
|
+
@prov(output_files=["output_csv"])
|
|
937
969
|
def _handle_search_taxonomy(args):
|
|
938
970
|
query: str = args.query
|
|
939
971
|
field: SearchField | None = args.field
|
|
@@ -949,6 +981,7 @@ def _handle_search_taxonomy(args):
|
|
|
949
981
|
_write_taxonomy_csv(results, output_csv)
|
|
950
982
|
|
|
951
983
|
|
|
984
|
+
@prov(input_files=["uniprot_accession"], output_files=["output_csv"])
|
|
952
985
|
def _handle_search_interaction_partners(args: argparse.Namespace):
|
|
953
986
|
uniprot_accession: str = args.uniprot_accession
|
|
954
987
|
excludes: set[str] = set(args.exclude) if args.exclude else set()
|
|
@@ -962,6 +995,7 @@ def _handle_search_interaction_partners(args: argparse.Namespace):
|
|
|
962
995
|
_write_lines(output_csv, results.keys())
|
|
963
996
|
|
|
964
997
|
|
|
998
|
+
@prov(input_files=["uniprot_accessions"], output_files=["output_csv"])
|
|
965
999
|
def _handle_search_complexes(args: argparse.Namespace):
|
|
966
1000
|
uniprot_accessions = args.uniprot_accessions
|
|
967
1001
|
limit = args.limit
|
|
@@ -975,6 +1009,7 @@ def _handle_search_complexes(args: argparse.Namespace):
|
|
|
975
1009
|
_write_complexes_csv(results, output_csv)
|
|
976
1010
|
|
|
977
1011
|
|
|
1012
|
+
@prov(input_files=["uniprot_accessions"], output_files=["output_csv"])
|
|
978
1013
|
def _handle_search_uniprot_details(args: argparse.Namespace):
|
|
979
1014
|
uniprot_accessions = args.uniprot_accessions
|
|
980
1015
|
timeout = args.timeout
|
|
@@ -997,6 +1032,7 @@ def _initialize_cacher(args: argparse.Namespace) -> Cacher:
|
|
|
997
1032
|
)
|
|
998
1033
|
|
|
999
1034
|
|
|
1035
|
+
@prov(input_files=["pdbe_csv"], output_dirs=["output_dir"])
|
|
1000
1036
|
def _handle_retrieve_pdbe(args: argparse.Namespace):
|
|
1001
1037
|
pdbe_csv = args.pdbe_csv
|
|
1002
1038
|
output_dir = args.output_dir
|
|
@@ -1011,6 +1047,7 @@ def _handle_retrieve_pdbe(args: argparse.Namespace):
|
|
|
1011
1047
|
rprint(f"Retrieved {len(result)} PDBe entries")
|
|
1012
1048
|
|
|
1013
1049
|
|
|
1050
|
+
@prov(input_files=["alphafold_csv"], output_dirs=["output_dir"])
|
|
1014
1051
|
def _handle_retrieve_alphafold(args):
|
|
1015
1052
|
download_dir = args.output_dir
|
|
1016
1053
|
raw_formats = args.format
|
|
@@ -1042,6 +1079,7 @@ def _handle_retrieve_alphafold(args):
|
|
|
1042
1079
|
rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
|
|
1043
1080
|
|
|
1044
1081
|
|
|
1082
|
+
@prov(input_files=["emdb_csv"], output_dirs=["output_dir"])
|
|
1045
1083
|
def _handle_retrieve_emdb(args):
|
|
1046
1084
|
emdb_csv = args.emdb_csv
|
|
1047
1085
|
output_dir = args.output_dir
|
|
@@ -1053,6 +1091,7 @@ def _handle_retrieve_emdb(args):
|
|
|
1053
1091
|
rprint(f"Retrieved {len(result)} EMDB entries")
|
|
1054
1092
|
|
|
1055
1093
|
|
|
1094
|
+
@prov(input_dirs=["input_dir"], output_dirs=["output_dir"], output_files=["write_stats"])
|
|
1056
1095
|
def _handle_filter_confidence(args: argparse.Namespace):
|
|
1057
1096
|
# we are repeating types here and in add_argument call
|
|
1058
1097
|
# TODO replace argparse with modern alternative like cyclopts
|
|
@@ -1097,6 +1136,7 @@ def _handle_filter_confidence(args: argparse.Namespace):
|
|
|
1097
1136
|
rprint(f"Statistics written to {_name_of(stats_file)}")
|
|
1098
1137
|
|
|
1099
1138
|
|
|
1139
|
+
@prov(input_dirs=["input_dir"], output_dirs=["output_dir"], output_files=["write_stats"])
|
|
1100
1140
|
def _handle_filter_chain(args):
|
|
1101
1141
|
input_dir = args.input_dir
|
|
1102
1142
|
output_dir = structure(args.output_dir, Path)
|
|
@@ -1140,6 +1180,7 @@ def _handle_filter_chain(args):
|
|
|
1140
1180
|
rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
|
|
1141
1181
|
|
|
1142
1182
|
|
|
1183
|
+
@prov(input_dirs=["input_dir"], output_dirs=["output_dir"], output_files=["write_stats"])
|
|
1143
1184
|
def _handle_filter_residue(args):
|
|
1144
1185
|
input_dir = structure(args.input_dir, Path)
|
|
1145
1186
|
output_dir = structure(args.output_dir, Path)
|
|
@@ -1169,6 +1210,7 @@ def _handle_filter_residue(args):
|
|
|
1169
1210
|
rprint(f"Statistics written to {_name_of(stats_file)}")
|
|
1170
1211
|
|
|
1171
1212
|
|
|
1213
|
+
@prov(input_dirs=["input_dir"], output_dirs=["output_dir"], output_files=["write_stats"])
|
|
1172
1214
|
def _handle_filter_ss(args):
|
|
1173
1215
|
input_dir = structure(args.input_dir, Path)
|
|
1174
1216
|
output_dir = structure(args.output_dir, Path)
|
|
@@ -1236,7 +1278,7 @@ def _handle_mcp(args):
|
|
|
1236
1278
|
msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
|
|
1237
1279
|
raise ImportError(msg)
|
|
1238
1280
|
|
|
1239
|
-
from protein_quest.mcp_server import mcp # noqa: PLC0415
|
|
1281
|
+
from protein_quest.mcp_server import mcp # noqa: PLC0415 fastmcp is an extra dependency
|
|
1240
1282
|
|
|
1241
1283
|
if args.transport == "stdio":
|
|
1242
1284
|
mcp.run(transport=args.transport)
|
|
@@ -1244,6 +1286,7 @@ def _handle_mcp(args):
|
|
|
1244
1286
|
mcp.run(transport=args.transport, host=args.host, port=args.port)
|
|
1245
1287
|
|
|
1246
1288
|
|
|
1289
|
+
@prov(input_dirs=["input_dir"], output_files=["output"])
|
|
1247
1290
|
def _handle_convert_uniprot(args):
|
|
1248
1291
|
input_dir = structure(args.input_dir, Path)
|
|
1249
1292
|
output_file: TextIOWrapper = args.output
|
|
@@ -1264,6 +1307,7 @@ def _handle_convert_uniprot(args):
|
|
|
1264
1307
|
_write_lines(output_file, sorted(uniprot_accessions))
|
|
1265
1308
|
|
|
1266
1309
|
|
|
1310
|
+
@prov(input_dirs=["input_dir"], output_dirs=["output_dir"])
|
|
1267
1311
|
def _handle_convert_structures(args):
|
|
1268
1312
|
input_dir = structure(args.input_dir, Path)
|
|
1269
1313
|
output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
|
protein_quest/converter.py
CHANGED
|
@@ -13,7 +13,7 @@ type PositiveInt = int
|
|
|
13
13
|
converter = make_converter()
|
|
14
14
|
"""cattrs converter to read JSON document or dict to Python objects."""
|
|
15
15
|
converter.register_structure_hook(URL, lambda v, _: URL(v))
|
|
16
|
-
converter.register_unstructure_hook(URL,
|
|
16
|
+
converter.register_unstructure_hook(URL, str)
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
@converter.register_structure_hook
|
protein_quest/mcp_server.py
CHANGED
|
@@ -7,8 +7,8 @@ Can be run with:
|
|
|
7
7
|
fastmcp dev src/protein_quest/mcp_server.py
|
|
8
8
|
# or from inspector
|
|
9
9
|
npx @modelcontextprotocol/inspector
|
|
10
|
-
#
|
|
11
|
-
#
|
|
10
|
+
# transport type: stdio
|
|
11
|
+
# command: protein-quest
|
|
12
12
|
# arguments: mcp
|
|
13
13
|
|
|
14
14
|
# or with server and inspector
|
protein_quest/parallel.py
CHANGED
|
@@ -86,12 +86,15 @@ def _configure_cpu_dask_scheduler(nproc: int, name: str) -> LocalCluster:
|
|
|
86
86
|
return LocalCluster(name=name, threads_per_worker=1, n_workers=n_workers)
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
|
|
90
|
-
|
|
89
|
+
class MyProgressBar(ProgressBar):
|
|
90
|
+
"""Show progress of Dask computations.
|
|
91
91
|
|
|
92
|
+
Copy of distributed.diagnostics.progressbar.TextProgressBar that:
|
|
92
93
|
|
|
93
|
-
|
|
94
|
-
|
|
94
|
+
- prints to stderr instead of stdout
|
|
95
|
+
- Can have its interval (in seconds) set with `TQDM_MININTERVAL` environment variable
|
|
96
|
+
|
|
97
|
+
"""
|
|
95
98
|
|
|
96
99
|
__loop: IOLoop | None = None
|
|
97
100
|
|
|
@@ -107,6 +110,11 @@ class _StderrTextProgressBar(ProgressBar):
|
|
|
107
110
|
**kwargs, # noqa: ARG002
|
|
108
111
|
):
|
|
109
112
|
self._loop_runner = loop_runner = LoopRunner(loop=loop)
|
|
113
|
+
if interval == "100ms":
|
|
114
|
+
interval_env = os.getenv("TQDM_MININTERVAL")
|
|
115
|
+
if interval_env is not None:
|
|
116
|
+
interval = interval_env + "s"
|
|
117
|
+
|
|
110
118
|
super().__init__(keys, scheduler, interval, complete)
|
|
111
119
|
self.width = width
|
|
112
120
|
|
|
@@ -144,6 +152,10 @@ class _StderrTextProgressBar(ProgressBar):
|
|
|
144
152
|
sys.stderr.flush()
|
|
145
153
|
|
|
146
154
|
|
|
155
|
+
# Generic type parameters used across helpers
|
|
156
|
+
P = ParamSpec("P")
|
|
157
|
+
|
|
158
|
+
|
|
147
159
|
def dask_map_with_progress[T, R, **P](
|
|
148
160
|
client: Client,
|
|
149
161
|
func: Callable[Concatenate[T, P], R],
|
|
@@ -154,6 +166,10 @@ def dask_map_with_progress[T, R, **P](
|
|
|
154
166
|
"""
|
|
155
167
|
Wrapper for map, progress, and gather of Dask that returns a correctly typed list.
|
|
156
168
|
|
|
169
|
+
Environment variables:
|
|
170
|
+
- Set interval (in seconds) of progress updates with `TQDM_MININTERVAL`
|
|
171
|
+
- Disabled by setting `TQDM_DISABLE` to any value
|
|
172
|
+
|
|
157
173
|
Args:
|
|
158
174
|
client: Dask client.
|
|
159
175
|
func: Function to map; first parameter comes from ``iterable`` and any
|
|
@@ -169,6 +185,7 @@ def dask_map_with_progress[T, R, **P](
|
|
|
169
185
|
if client.dashboard_link:
|
|
170
186
|
logger.info(f"Follow progress on dask dashboard at: {client.dashboard_link}")
|
|
171
187
|
futures = client.map(func, iterable, *args, **kwargs)
|
|
172
|
-
|
|
188
|
+
if not os.getenv("TQDM_DISABLE"):
|
|
189
|
+
MyProgressBar(futures)
|
|
173
190
|
results = client.gather(futures)
|
|
174
191
|
return cast("list[R]", results)
|
protein_quest/uniprot.py
CHANGED
|
@@ -332,7 +332,7 @@ def _build_sparql_generic_by_uniprot_accessions_query(
|
|
|
332
332
|
|
|
333
333
|
def _build_sparql_query_uniprot(query: Query, limit=10_000) -> str:
|
|
334
334
|
dynamic_triples = _query2dynamic_sparql_triples(query)
|
|
335
|
-
# TODO add
|
|
335
|
+
# TODO add useful columns that have 1:1 mapping to protein
|
|
336
336
|
# like uniprot_id with `?protein up:mnemonic ?mnemonic .`
|
|
337
337
|
# and sequence, take care to take first isoform
|
|
338
338
|
# ?protein up:sequence ?isoform .
|
|
@@ -1,12 +1,27 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
7
7
|
Project-URL: Documentation, https://www.bonvinlab.org/protein-quest/
|
|
8
8
|
Project-URL: Source, https://github.com/haddocking/protein-quest
|
|
9
9
|
License-File: LICENSE
|
|
10
|
+
Keywords: alphafold,mmcif,pdb,protein,protein structure,uniprot
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Framework :: AsyncIO
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Natural Language :: English
|
|
17
|
+
Classifier: Operating System :: MacOS
|
|
18
|
+
Classifier: Operating System :: POSIX
|
|
19
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
24
|
+
Classifier: Typing :: Typed
|
|
10
25
|
Requires-Python: >=3.13
|
|
11
26
|
Requires-Dist: aiofiles>=24.1.0
|
|
12
27
|
Requires-Dist: aiohttp-retry>=2.9.1
|
|
@@ -21,6 +36,7 @@ Requires-Dist: platformdirs>=4.3.8
|
|
|
21
36
|
Requires-Dist: psutil>=7.0.0
|
|
22
37
|
Requires-Dist: rich-argparse>=1.7.1
|
|
23
38
|
Requires-Dist: rich>=14.0.0
|
|
39
|
+
Requires-Dist: rocrate-action-recorder>=0.2.0
|
|
24
40
|
Requires-Dist: shtab>=1.7.2
|
|
25
41
|
Requires-Dist: sparqlwrapper>=2.0.0
|
|
26
42
|
Requires-Dist: tqdm>=4.67.1
|
|
@@ -38,19 +54,22 @@ Description-Content-Type: text/markdown
|
|
|
38
54
|
[](https://bio.tools/protein-quest)
|
|
39
55
|
[](https://pypi.org/project/protein-quest/)
|
|
40
56
|
[](https://doi.org/10.5281/zenodo.16941288)
|
|
57
|
+
[](https://doi.org/10.5281/zenodo.17910832)
|
|
41
58
|
[](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)
|
|
42
59
|
[](https://fairsoftwarechecklist.net/v0.2?f=31&a=32113&i=32121&r=133)
|
|
43
60
|
[](https://fair-software.eu)
|
|
44
61
|
[](https://github.com/kucherenko/jscpd/)
|
|
45
62
|
|
|
46
|
-
|
|
47
63
|
Python package to search/retrieve/filter proteins and protein structures.
|
|
48
64
|
|
|
49
65
|
It uses
|
|
50
66
|
|
|
51
|
-
- [Uniprot Sparql endpoint](https://sparql.uniprot.org/) to search for proteins
|
|
52
|
-
|
|
53
|
-
- [
|
|
67
|
+
- [Uniprot Sparql endpoint](https://sparql.uniprot.org/) to search for proteins
|
|
68
|
+
and their measured or predicted 3D structures.
|
|
69
|
+
- [Uniprot taxonomy](https://www.uniprot.org/taxonomy?query=*) to search for
|
|
70
|
+
taxonomy.
|
|
71
|
+
- [QuickGO](https://www.ebi.ac.uk/QuickGO/api/index.html) to search for Gene
|
|
72
|
+
Ontology terms.
|
|
54
73
|
- [gemmi](https://project-gemmi.github.io/) to work with macromolecular models.
|
|
55
74
|
- [dask-distributed](https://docs.dask.org/en/latest/) to compute in parallel.
|
|
56
75
|
|
|
@@ -101,18 +120,24 @@ pip install protein-quest
|
|
|
101
120
|
```
|
|
102
121
|
|
|
103
122
|
Or to use the latest development version:
|
|
104
|
-
|
|
123
|
+
|
|
124
|
+
```shell
|
|
105
125
|
pip install git+https://github.com/haddocking/protein-quest.git
|
|
106
126
|
```
|
|
107
127
|
|
|
108
128
|
## Usage
|
|
109
129
|
|
|
110
|
-
The main entry point is the `protein-quest` command line tool which has multiple
|
|
130
|
+
The main entry point is the `protein-quest` command line tool which has multiple
|
|
131
|
+
subcommands to perform actions.
|
|
111
132
|
|
|
112
|
-
To use programmatically, see the
|
|
133
|
+
To use programmatically, see the
|
|
134
|
+
[Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and
|
|
135
|
+
[API documentation](https://www.bonvinlab.org/protein-quest/autoapi/protein_quest/).
|
|
113
136
|
|
|
114
|
-
While downloading or copying files it uses a global cache (located at
|
|
115
|
-
|
|
137
|
+
While downloading or copying files it uses a global cache (located at
|
|
138
|
+
`~/.cache/protein-quest`) and hardlinks to save disk space and improve speed.
|
|
139
|
+
This behavior can be customized with the `--no-cache`, `--cache-dir`, and
|
|
140
|
+
`--copy-method` command line arguments.
|
|
116
141
|
|
|
117
142
|
### Search Uniprot accessions
|
|
118
143
|
|
|
@@ -126,7 +151,9 @@ protein-quest search uniprot \
|
|
|
126
151
|
--limit 100 \
|
|
127
152
|
uniprot_accs.txt
|
|
128
153
|
```
|
|
129
|
-
|
|
154
|
+
|
|
155
|
+
([GO:0005634](https://www.ebi.ac.uk/QuickGO/term/GO:0005634) is "Nucleus" and
|
|
156
|
+
[GO:0003677](https://www.ebi.ac.uk/QuickGO/term/GO:0003677) is "DNA binding")
|
|
130
157
|
|
|
131
158
|
### Search for PDBe structures of uniprot accessions
|
|
132
159
|
|
|
@@ -134,7 +161,8 @@ protein-quest search uniprot \
|
|
|
134
161
|
protein-quest search pdbe uniprot_accs.txt pdbe.csv
|
|
135
162
|
```
|
|
136
163
|
|
|
137
|
-
`pdbe.csv` file is written containing the PDB id and chain of each uniprot
|
|
164
|
+
`pdbe.csv` file is written containing the PDB id and chain of each uniprot
|
|
165
|
+
accession.
|
|
138
166
|
|
|
139
167
|
### Search for Alphafold structures of uniprot accessions
|
|
140
168
|
|
|
@@ -170,8 +198,8 @@ protein-quest retrieve emdb emdbs.csv downloads-emdb/
|
|
|
170
198
|
|
|
171
199
|
### To filter AlphaFold structures on confidence
|
|
172
200
|
|
|
173
|
-
Filter AlphaFoldDB structures based on confidence (pLDDT).
|
|
174
|
-
|
|
201
|
+
Filter AlphaFoldDB structures based on confidence (pLDDT). Keeps entries with
|
|
202
|
+
requested number of residues which have a confidence score above the threshold.
|
|
175
203
|
Also writes pdb files with only those residues.
|
|
176
204
|
|
|
177
205
|
```shell
|
|
@@ -184,7 +212,8 @@ protein-quest filter confidence \
|
|
|
184
212
|
|
|
185
213
|
### To filter PDBe files on chain of uniprot accession
|
|
186
214
|
|
|
187
|
-
Make PDBe files smaller by only keeping first chain of found uniprot entry and
|
|
215
|
+
Make PDBe files smaller by only keeping first chain of found uniprot entry and
|
|
216
|
+
renaming to chain A.
|
|
188
217
|
|
|
189
218
|
```shell
|
|
190
219
|
protein-quest filter chain \
|
|
@@ -203,7 +232,10 @@ protein-quest filter residue \
|
|
|
203
232
|
|
|
204
233
|
### To filter on secondary structure
|
|
205
234
|
|
|
206
|
-
To filter on structure being mostly alpha helices and have no beta sheets. See
|
|
235
|
+
To filter on structure being mostly alpha helices and have no beta sheets. See
|
|
236
|
+
the following
|
|
237
|
+
[notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to
|
|
238
|
+
determine the ratio of secondary structure elements.
|
|
207
239
|
|
|
208
240
|
```shell
|
|
209
241
|
protein-quest filter secondary-structure \
|
|
@@ -221,8 +253,10 @@ protein-quest search taxonomy "Homo sapiens" -
|
|
|
221
253
|
|
|
222
254
|
### Search Gene Ontology (GO)
|
|
223
255
|
|
|
224
|
-
You might not know what the identifier of a
|
|
225
|
-
|
|
256
|
+
You might not know what the identifier of a
|
|
257
|
+
[Gene Ontology](https://geneontology.org/) term is at
|
|
258
|
+
`protein-quest search uniprot`. You can use following command to search for a
|
|
259
|
+
Gene Ontology (GO) term.
|
|
226
260
|
|
|
227
261
|
```shell
|
|
228
262
|
protein-quest search go --limit 5 --aspect cellular_component apoptosome -
|
|
@@ -230,18 +264,21 @@ protein-quest search go --limit 5 --aspect cellular_component apoptosome -
|
|
|
230
264
|
|
|
231
265
|
### Search for interaction partners
|
|
232
266
|
|
|
233
|
-
Use https://www.ebi.ac.uk/complexportal to find interaction partners of given
|
|
267
|
+
Use <https://www.ebi.ac.uk/complexportal> to find interaction partners of given
|
|
268
|
+
UniProt accession.
|
|
234
269
|
|
|
235
270
|
```shell
|
|
236
271
|
protein-quest search interaction-partners Q05471 interaction-partners-of-Q05471.txt
|
|
237
272
|
```
|
|
238
273
|
|
|
239
|
-
The `interaction-partners-of-Q05471.txt` file contains uniprot accessions (one
|
|
274
|
+
The `interaction-partners-of-Q05471.txt` file contains uniprot accessions (one
|
|
275
|
+
per line).
|
|
240
276
|
|
|
241
277
|
### Search for complexes
|
|
242
278
|
|
|
243
|
-
Given Uniprot accessions search for macromolecular complexes at
|
|
244
|
-
and return the complex entries and their
|
|
279
|
+
Given Uniprot accessions search for macromolecular complexes at
|
|
280
|
+
<https://www.ebi.ac.uk/complexportal> and return the complex entries and their
|
|
281
|
+
members.
|
|
245
282
|
|
|
246
283
|
```shell
|
|
247
284
|
echo Q05471 | protein-quest search complexes - complexes.csv
|
|
@@ -256,7 +293,8 @@ Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chroma
|
|
|
256
293
|
|
|
257
294
|
### Search for UniProt details
|
|
258
295
|
|
|
259
|
-
To get details (like protein name, sequence length, organism) for a list of
|
|
296
|
+
To get details (like protein name, sequence length, organism) for a list of
|
|
297
|
+
UniProt accessions.
|
|
260
298
|
|
|
261
299
|
```shell
|
|
262
300
|
protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
|
|
@@ -271,7 +309,8 @@ A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
|
|
|
271
309
|
|
|
272
310
|
### Convert structure files to .cif format
|
|
273
311
|
|
|
274
|
-
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only
|
|
312
|
+
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only
|
|
313
|
+
work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
275
314
|
|
|
276
315
|
```shell
|
|
277
316
|
protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
|
|
@@ -279,15 +318,25 @@ protein-quest convert structures --format cif --output-dir ./filtered-cif ./filt
|
|
|
279
318
|
|
|
280
319
|
### Convert structure files to UniProt accessions
|
|
281
320
|
|
|
282
|
-
After running some filters you might want to know which UniProt accessions are
|
|
321
|
+
After running some filters you might want to know which UniProt accessions are
|
|
322
|
+
still present in the filtered structures.
|
|
283
323
|
|
|
284
324
|
```shell
|
|
285
325
|
protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
|
|
286
326
|
```
|
|
287
327
|
|
|
288
|
-
##
|
|
328
|
+
## Provenance
|
|
329
|
+
|
|
330
|
+
You can use `protein-quest --prov ...` to store provenance information of your
|
|
331
|
+
CLI invocations in a
|
|
332
|
+
[Research Object crate](https://www.researchobject.org/ro-crate/) file called
|
|
333
|
+
ro-crate-metadata.json.
|
|
334
|
+
|
|
335
|
+
## Model Context Protocol (MCP) server
|
|
289
336
|
|
|
290
|
-
Protein quest can also help LLMs like Claude Sonnet 4 by providing a
|
|
337
|
+
Protein quest can also help LLMs like Claude Sonnet 4 by providing a
|
|
338
|
+
[set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions)
|
|
339
|
+
for protein structures.
|
|
291
340
|
|
|
292
341
|

|
|
293
342
|
|
|
@@ -303,11 +352,13 @@ The server can be started with:
|
|
|
303
352
|
protein-quest mcp
|
|
304
353
|
```
|
|
305
354
|
|
|
306
|
-
The mcp server contains a prompt template to search/retrieve/filter candidate
|
|
355
|
+
The mcp server contains a prompt template to search/retrieve/filter candidate
|
|
356
|
+
structures.
|
|
307
357
|
|
|
308
358
|
## Shell autocompletion
|
|
309
359
|
|
|
310
|
-
The `protein-quest` command line tool supports shell autocompletion using
|
|
360
|
+
The `protein-quest` command line tool supports shell autocompletion using
|
|
361
|
+
[shtab](https://docs.iterative.ai/shtab).
|
|
311
362
|
|
|
312
363
|
Initialize for bash shell with:
|
|
313
364
|
|
|
@@ -327,4 +378,5 @@ autoload -Uz compinit && compinit
|
|
|
327
378
|
|
|
328
379
|
## Contributing
|
|
329
380
|
|
|
330
|
-
For development information and contribution guidelines, please see
|
|
381
|
+
For development information and contribution guidelines, please see
|
|
382
|
+
[CONTRIBUTING.md](CONTRIBUTING.md).
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
protein_quest/__version__.py,sha256=
|
|
3
|
-
protein_quest/cli.py,sha256=
|
|
4
|
-
protein_quest/converter.py,sha256=
|
|
2
|
+
protein_quest/__version__.py,sha256=1-Y-bSMxz0yut5o_jEVM46EG2KW008II37JW_koD3Oc,56
|
|
3
|
+
protein_quest/cli.py,sha256=2_bEP7gGYxvxaqlcwEaiJc9i6hf_HtMj_xucNeaOqv4,59587
|
|
4
|
+
protein_quest/converter.py,sha256=Qk-hIyp-YGUK4vvOZlES3BktZsK14-ShgBvVyo9Wjh8,1428
|
|
5
5
|
protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
|
|
6
6
|
protein_quest/filters.py,sha256=em1FYD7Y9z98ZSaJGYCv1VCGRADLbat8FfSOlNJNAJM,5663
|
|
7
7
|
protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
|
|
8
8
|
protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
|
|
9
|
-
protein_quest/mcp_server.py,sha256=
|
|
10
|
-
protein_quest/parallel.py,sha256=
|
|
9
|
+
protein_quest/mcp_server.py,sha256=N22DT8g6i1EXI2bunpPppLbwsGkBBOdKpmtTuooXuOk,8553
|
|
10
|
+
protein_quest/parallel.py,sha256=hmwjv-KeiC7qSs5xApAvh3ZKkJ9HDW5zmr1zuwOzFpg,6367
|
|
11
11
|
protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
|
|
13
13
|
protein_quest/structure.py,sha256=3TdzrXbGpmnskp3gjwVevwD1tfhKfAUPOHWi9ViaheM,9101
|
|
14
14
|
protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
|
|
15
|
-
protein_quest/uniprot.py,sha256=
|
|
15
|
+
protein_quest/uniprot.py,sha256=1tqAQqnQIH7OV0dhjWv8TJIIrY6sXgrfFvlf-OieP1s,36797
|
|
16
16
|
protein_quest/utils.py,sha256=5Ncdid-dslggy-Ti1yhOHwdAM7Bxpyia7Re-xDkc2P0,19909
|
|
17
17
|
protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
|
|
18
18
|
protein_quest/alphafold/confidence.py,sha256=UtS2MJEReaZ1kTXbQf8Vrc9gzGjAOiGLYs4glqN-1do,8098
|
|
@@ -20,8 +20,8 @@ protein_quest/alphafold/entry_summary.py,sha256=Qhnw75RXFaoOU332g7axg_jYbbdZbUps
|
|
|
20
20
|
protein_quest/alphafold/fetch.py,sha256=D-RWKWo5kWpCko_LNT_sslzrpeR3HX9nu5F4MUOFRtI,21979
|
|
21
21
|
protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
|
|
22
22
|
protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
|
|
23
|
-
protein_quest-
|
|
24
|
-
protein_quest-
|
|
25
|
-
protein_quest-
|
|
26
|
-
protein_quest-
|
|
27
|
-
protein_quest-
|
|
23
|
+
protein_quest-1.1.0.dist-info/METADATA,sha256=BnrYu853g1P2RJom3E13vTWsumEurdo1XwdEJ2b7wJE,13045
|
|
24
|
+
protein_quest-1.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
25
|
+
protein_quest-1.1.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
|
|
26
|
+
protein_quest-1.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
27
|
+
protein_quest-1.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|