protein_quest-0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__init__.py +0 -0
- protein_quest/__version__.py +2 -0
- protein_quest/alphafold/__init__.py +1 -0
- protein_quest/alphafold/confidence.py +226 -0
- protein_quest/alphafold/entry_summary.py +64 -0
- protein_quest/alphafold/fetch.py +534 -0
- protein_quest/cli.py +1428 -0
- protein_quest/converter.py +46 -0
- protein_quest/emdb.py +37 -0
- protein_quest/filters.py +163 -0
- protein_quest/go.py +165 -0
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +256 -0
- protein_quest/parallel.py +104 -0
- protein_quest/pdbe/__init__.py +1 -0
- protein_quest/pdbe/fetch.py +68 -0
- protein_quest/py.typed +0 -0
- protein_quest/ss.py +280 -0
- protein_quest/structure.py +232 -0
- protein_quest/taxonomy.py +149 -0
- protein_quest/uniprot.py +975 -0
- protein_quest/utils.py +547 -0
- protein_quest-0.9.0.dist-info/METADATA +325 -0
- protein_quest-0.9.0.dist-info/RECORD +27 -0
- protein_quest-0.9.0.dist-info/WHEEL +4 -0
- protein_quest-0.9.0.dist-info/entry_points.txt +2 -0
- protein_quest-0.9.0.dist-info/licenses/LICENSE +201 -0
protein_quest/cli.py
ADDED
|
@@ -0,0 +1,1428 @@
|
|
|
1
|
+
"""Module for cli parsers and handlers."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import csv
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from collections.abc import Callable, Generator, Iterable, Sequence
|
|
10
|
+
from contextlib import suppress
|
|
11
|
+
from importlib.util import find_spec
|
|
12
|
+
from io import BytesIO, TextIOWrapper
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from textwrap import dedent
|
|
15
|
+
|
|
16
|
+
import shtab
|
|
17
|
+
from cattrs import structure
|
|
18
|
+
from rich.console import Console
|
|
19
|
+
from rich.logging import RichHandler
|
|
20
|
+
from rich.markdown import Markdown
|
|
21
|
+
from rich.panel import Panel
|
|
22
|
+
from rich_argparse import ArgumentDefaultsRichHelpFormatter
|
|
23
|
+
from tqdm.rich import tqdm
|
|
24
|
+
|
|
25
|
+
from protein_quest.__version__ import __version__
|
|
26
|
+
from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
|
|
27
|
+
from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
|
|
28
|
+
from protein_quest.alphafold.fetch import fetch_many as af_fetch
|
|
29
|
+
from protein_quest.converter import PositiveInt, converter
|
|
30
|
+
from protein_quest.emdb import fetch as emdb_fetch
|
|
31
|
+
from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
|
|
32
|
+
from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
|
|
33
|
+
from protein_quest.io import (
|
|
34
|
+
convert_to_cif_files,
|
|
35
|
+
glob_structure_files,
|
|
36
|
+
locate_structure_file,
|
|
37
|
+
read_structure,
|
|
38
|
+
valid_structure_file_extensions,
|
|
39
|
+
)
|
|
40
|
+
from protein_quest.pdbe import fetch as pdbe_fetch
|
|
41
|
+
from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
|
|
42
|
+
from protein_quest.structure import structure2uniprot_accessions
|
|
43
|
+
from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
|
|
44
|
+
from protein_quest.uniprot import (
|
|
45
|
+
ComplexPortalEntry,
|
|
46
|
+
PdbResults,
|
|
47
|
+
Query,
|
|
48
|
+
UniprotDetails,
|
|
49
|
+
filter_pdb_results_on_chain_length,
|
|
50
|
+
map_uniprot_accessions2uniprot_details,
|
|
51
|
+
search4af,
|
|
52
|
+
search4emdb,
|
|
53
|
+
search4interaction_partners,
|
|
54
|
+
search4macromolecular_complexes,
|
|
55
|
+
search4pdb,
|
|
56
|
+
search4uniprot,
|
|
57
|
+
)
|
|
58
|
+
from protein_quest.utils import (
|
|
59
|
+
Cacher,
|
|
60
|
+
CopyMethod,
|
|
61
|
+
DirectoryCacher,
|
|
62
|
+
PassthroughCacher,
|
|
63
|
+
copy_methods,
|
|
64
|
+
copyfile,
|
|
65
|
+
user_cache_root_dir,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
console = Console(stderr=True)
|
|
69
|
+
rprint = console.print
|
|
70
|
+
logger = logging.getLogger(__name__)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _add_search_uniprot_parser(subparsers: argparse._SubParsersAction):
|
|
74
|
+
"""Add search uniprot subcommand parser."""
|
|
75
|
+
parser = subparsers.add_parser(
|
|
76
|
+
"uniprot",
|
|
77
|
+
help="Search UniProt accessions",
|
|
78
|
+
description="Search for UniProt accessions based on various criteria in the Uniprot SPARQL endpoint.",
|
|
79
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
80
|
+
)
|
|
81
|
+
parser.add_argument(
|
|
82
|
+
"output",
|
|
83
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
84
|
+
help="Output text file for UniProt accessions (one per line). Use `-` for stdout.",
|
|
85
|
+
).complete = shtab.FILE
|
|
86
|
+
parser.add_argument("--taxon-id", type=str, help="NCBI Taxon ID, e.g. 9606 for Homo Sapiens")
|
|
87
|
+
parser.add_argument(
|
|
88
|
+
"--reviewed",
|
|
89
|
+
action=argparse.BooleanOptionalAction,
|
|
90
|
+
help="Reviewed=swissprot, no-reviewed=trembl. Default is uniprot=swissprot+trembl.",
|
|
91
|
+
default=None,
|
|
92
|
+
)
|
|
93
|
+
parser.add_argument(
|
|
94
|
+
"--subcellular-location-uniprot",
|
|
95
|
+
type=str,
|
|
96
|
+
help="Subcellular location label as used by UniProt (e.g. nucleus)",
|
|
97
|
+
)
|
|
98
|
+
parser.add_argument(
|
|
99
|
+
"--subcellular-location-go",
|
|
100
|
+
dest="subcellular_location_go",
|
|
101
|
+
action="append",
|
|
102
|
+
help="GO term(s) for subcellular location (e.g. GO:0005634). Can be given multiple times.",
|
|
103
|
+
)
|
|
104
|
+
parser.add_argument(
|
|
105
|
+
"--molecular-function-go",
|
|
106
|
+
dest="molecular_function_go",
|
|
107
|
+
action="append",
|
|
108
|
+
help="GO term(s) for molecular function (e.g. GO:0003677). Can be given multiple times.",
|
|
109
|
+
)
|
|
110
|
+
parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
|
|
111
|
+
parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
|
|
112
|
+
parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of uniprot accessions to return")
|
|
113
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
|
|
117
|
+
"""Add search pdbe subcommand parser."""
|
|
118
|
+
parser = subparsers.add_parser(
|
|
119
|
+
"pdbe",
|
|
120
|
+
help="Search PDBe structures of given UniProt accessions",
|
|
121
|
+
description="Search for PDB structures of given UniProt accessions in the Uniprot SPARQL endpoint.",
|
|
122
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
123
|
+
)
|
|
124
|
+
parser.add_argument(
|
|
125
|
+
"uniprot_accessions",
|
|
126
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
127
|
+
help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
|
|
128
|
+
).complete = shtab.FILE
|
|
129
|
+
parser.add_argument(
|
|
130
|
+
"output_csv",
|
|
131
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
132
|
+
help=dedent("""\
|
|
133
|
+
Output CSV with following columns:
|
|
134
|
+
`uniprot_accession`, `pdb_id`, `method`, `resolution`, `uniprot_chains`, `chain`, `chain_length`.
|
|
135
|
+
Where `uniprot_chains` is the raw UniProt chain string, for example `A=1-100`.
|
|
136
|
+
and where `chain` is the first chain from `uniprot_chains`, for example `A`
|
|
137
|
+
and `chain_length` is the length of the chain, for example `100`.
|
|
138
|
+
Use `-` for stdout.
|
|
139
|
+
"""),
|
|
140
|
+
).complete = shtab.FILE
|
|
141
|
+
parser.add_argument(
|
|
142
|
+
"--limit", type=int, default=10_000, help="Maximum number of PDB uniprot accessions combinations to return"
|
|
143
|
+
)
|
|
144
|
+
parser.add_argument(
|
|
145
|
+
"--min-residues",
|
|
146
|
+
type=int,
|
|
147
|
+
help="Minimum number of residues required in the chain mapped to the UniProt accession.",
|
|
148
|
+
)
|
|
149
|
+
parser.add_argument(
|
|
150
|
+
"--max-residues",
|
|
151
|
+
type=int,
|
|
152
|
+
help="Maximum number of residues allowed in chain mapped to the UniProt accession.",
|
|
153
|
+
)
|
|
154
|
+
parser.add_argument(
|
|
155
|
+
"--keep-invalid",
|
|
156
|
+
action="store_true",
|
|
157
|
+
help=dedent("""\
|
|
158
|
+
Keep PDB results when chain length could not be determined.
|
|
159
|
+
If not given, such results are dropped.
|
|
160
|
+
Only applies if min/max residues arguments are set.
|
|
161
|
+
"""),
|
|
162
|
+
)
|
|
163
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _add_search_alphafold_parser(subparsers: argparse._SubParsersAction):
|
|
167
|
+
"""Add search alphafold subcommand parser."""
|
|
168
|
+
parser = subparsers.add_parser(
|
|
169
|
+
"alphafold",
|
|
170
|
+
help="Search AlphaFold structures of given UniProt accessions",
|
|
171
|
+
description="Search for AlphaFold structures of given UniProt accessions in the Uniprot SPARQL endpoint.",
|
|
172
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
173
|
+
)
|
|
174
|
+
parser.add_argument(
|
|
175
|
+
"uniprot_accessions",
|
|
176
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
177
|
+
help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
|
|
178
|
+
).complete = shtab.FILE
|
|
179
|
+
parser.add_argument(
|
|
180
|
+
"output_csv",
|
|
181
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
182
|
+
help="Output CSV with AlphaFold IDs per UniProt accession. Use `-` for stdout.",
|
|
183
|
+
).complete = shtab.FILE
|
|
184
|
+
parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
|
|
185
|
+
parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
|
|
186
|
+
parser.add_argument(
|
|
187
|
+
"--limit", type=int, default=10_000, help="Maximum number of Alphafold entry identifiers to return"
|
|
188
|
+
)
|
|
189
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _add_search_emdb_parser(subparsers: argparse._SubParsersAction):
|
|
193
|
+
"""Add search emdb subcommand parser."""
|
|
194
|
+
parser = subparsers.add_parser(
|
|
195
|
+
"emdb",
|
|
196
|
+
help="Search Electron Microscopy Data Bank (EMDB) identifiers of given UniProt accessions",
|
|
197
|
+
description=dedent("""\
|
|
198
|
+
Search for Electron Microscopy Data Bank (EMDB) identifiers of given UniProt accessions
|
|
199
|
+
in the Uniprot SPARQL endpoint.
|
|
200
|
+
"""),
|
|
201
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
202
|
+
)
|
|
203
|
+
parser.add_argument(
|
|
204
|
+
"uniprot_accs",
|
|
205
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
206
|
+
help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
|
|
207
|
+
).complete = shtab.FILE
|
|
208
|
+
parser.add_argument(
|
|
209
|
+
"output_csv",
|
|
210
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
211
|
+
help="Output CSV with EMDB IDs per UniProt accession. Use `-` for stdout.",
|
|
212
|
+
).complete = shtab.FILE
|
|
213
|
+
parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of EMDB entry identifiers to return")
|
|
214
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _add_search_go_parser(subparsers: argparse._SubParsersAction):
|
|
218
|
+
"""Add search go subcommand parser"""
|
|
219
|
+
parser = subparsers.add_parser(
|
|
220
|
+
"go",
|
|
221
|
+
help="Search for Gene Ontology (GO) terms",
|
|
222
|
+
description="Search for Gene Ontology (GO) terms in the EBI QuickGO API.",
|
|
223
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
224
|
+
)
|
|
225
|
+
parser.add_argument(
|
|
226
|
+
"term",
|
|
227
|
+
type=str,
|
|
228
|
+
help="GO term to search for. For example `apoptosome`.",
|
|
229
|
+
)
|
|
230
|
+
parser.add_argument("--aspect", type=str, choices=allowed_aspects, help="Filter on aspect.")
|
|
231
|
+
parser.add_argument(
|
|
232
|
+
"output_csv",
|
|
233
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
234
|
+
help="Output CSV with GO term results. Use `-` for stdout.",
|
|
235
|
+
).complete = shtab.FILE
|
|
236
|
+
parser.add_argument("--limit", type=int, default=100, help="Maximum number of GO term results to return")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
|
|
240
|
+
"""Add search taxonomy subcommand parser."""
|
|
241
|
+
parser = subparser.add_parser(
|
|
242
|
+
"taxonomy",
|
|
243
|
+
help="Search for taxon information in UniProt",
|
|
244
|
+
description=dedent("""\
|
|
245
|
+
Search for taxon information in UniProt.
|
|
246
|
+
Uses https://www.uniprot.org/taxonomy?query=*.
|
|
247
|
+
"""),
|
|
248
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
249
|
+
)
|
|
250
|
+
parser.add_argument(
|
|
251
|
+
"query", type=str, help="Search query for the taxon. Surround multiple words with quotes (' or \")."
|
|
252
|
+
)
|
|
253
|
+
parser.add_argument(
|
|
254
|
+
"output_csv",
|
|
255
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
256
|
+
help="Output CSV with taxonomy results. Use `-` for stdout.",
|
|
257
|
+
).complete = shtab.FILE
|
|
258
|
+
parser.add_argument(
|
|
259
|
+
"--field",
|
|
260
|
+
type=str,
|
|
261
|
+
choices=search_fields,
|
|
262
|
+
help=dedent("""\
|
|
263
|
+
Field to search in. If not given then searches all fields.
|
|
264
|
+
If "tax_id" then searches by taxon ID.
|
|
265
|
+
If "parent" then given a parent taxon ID returns all its children.
|
|
266
|
+
For example, if the parent taxon ID is 9606 (Human), it will return Neanderthal and Denisovan.
|
|
267
|
+
"""),
|
|
268
|
+
)
|
|
269
|
+
parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
|
|
273
|
+
"""Add search interaction partners subcommand parser."""
|
|
274
|
+
parser = subparsers.add_parser(
|
|
275
|
+
"interaction-partners",
|
|
276
|
+
help="Search for interaction partners of given UniProt accession",
|
|
277
|
+
description=dedent("""\
|
|
278
|
+
Search for interaction partners of given UniProt accession
|
|
279
|
+
in the Uniprot SPARQL endpoint and Complex Portal.
|
|
280
|
+
"""),
|
|
281
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
282
|
+
)
|
|
283
|
+
parser.add_argument(
|
|
284
|
+
"uniprot_accession",
|
|
285
|
+
type=str,
|
|
286
|
+
help="UniProt accession (for example P12345).",
|
|
287
|
+
)
|
|
288
|
+
parser.add_argument(
|
|
289
|
+
"--exclude",
|
|
290
|
+
type=str,
|
|
291
|
+
action="append",
|
|
292
|
+
help="UniProt accessions to exclude from the results. For example already known interaction partners.",
|
|
293
|
+
)
|
|
294
|
+
parser.add_argument(
|
|
295
|
+
"output_csv",
|
|
296
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
297
|
+
help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
|
|
298
|
+
).complete = shtab.FILE
|
|
299
|
+
parser.add_argument(
|
|
300
|
+
"--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
|
|
301
|
+
)
|
|
302
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
|
|
306
|
+
"""Add search complexes subcommand parser."""
|
|
307
|
+
description = dedent("""\
|
|
308
|
+
Search for complexes in the Complex Portal.
|
|
309
|
+
https://www.ebi.ac.uk/complexportal/
|
|
310
|
+
|
|
311
|
+
The output CSV file has the following columns:
|
|
312
|
+
|
|
313
|
+
- query_protein: UniProt accession used as query
|
|
314
|
+
- complex_id: Complex Portal identifier
|
|
315
|
+
- complex_url: URL to the Complex Portal entry
|
|
316
|
+
- complex_title: Title of the complex
|
|
317
|
+
- members: Semicolon-separated list of UniProt accessions of complex members
|
|
318
|
+
""")
|
|
319
|
+
parser = subparsers.add_parser(
|
|
320
|
+
"complexes",
|
|
321
|
+
help="Search for complexes in the Complex Portal",
|
|
322
|
+
description=Markdown(description, style="argparse.text"), # type: ignore using rich formatter makes this OK
|
|
323
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
324
|
+
)
|
|
325
|
+
parser.add_argument(
|
|
326
|
+
"uniprot_accessions",
|
|
327
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
328
|
+
help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
|
|
329
|
+
).complete = shtab.FILE
|
|
330
|
+
parser.add_argument(
|
|
331
|
+
"output_csv",
|
|
332
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
333
|
+
help="Output CSV file with complex results. Use `-` for stdout.",
|
|
334
|
+
).complete = shtab.FILE
|
|
335
|
+
parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
|
|
336
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _add_search_uniprot_details_parser(subparsers: argparse._SubParsersAction):
|
|
340
|
+
"""Add search uniprot details subcommand parser."""
|
|
341
|
+
description = dedent("""\
|
|
342
|
+
Retrieve UniProt details for given UniProt accessions
|
|
343
|
+
from the Uniprot SPARQL endpoint.
|
|
344
|
+
|
|
345
|
+
The output CSV file has the following columns:
|
|
346
|
+
|
|
347
|
+
- uniprot_accession: UniProt accession.
|
|
348
|
+
- uniprot_id: UniProt ID (mnemonic).
|
|
349
|
+
- sequence_length: Length of the canonical sequence.
|
|
350
|
+
- reviewed: Whether the entry is reviewed (Swiss-Prot) or unreviewed (TrEMBL).
|
|
351
|
+
- protein_name: Recommended protein name.
|
|
352
|
+
- taxon_id: NCBI Taxonomy ID of the organism.
|
|
353
|
+
- taxon_name: Scientific name of the organism.
|
|
354
|
+
|
|
355
|
+
The order of the output CSV can be different from the input order.
|
|
356
|
+
""")
|
|
357
|
+
parser = subparsers.add_parser(
|
|
358
|
+
"uniprot-details",
|
|
359
|
+
help="Retrieve UniProt details for given UniProt accessions",
|
|
360
|
+
description=Markdown(description, style="argparse.text"), # type: ignore using rich formatter makes this OK
|
|
361
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
362
|
+
)
|
|
363
|
+
parser.add_argument(
|
|
364
|
+
"uniprot_accessions",
|
|
365
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
366
|
+
help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
|
|
367
|
+
).complete = shtab.FILE
|
|
368
|
+
parser.add_argument(
|
|
369
|
+
"output_csv",
|
|
370
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
371
|
+
help="Output CSV with UniProt details. Use `-` for stdout.",
|
|
372
|
+
).complete = shtab.FILE
|
|
373
|
+
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
374
|
+
parser.add_argument("--batch-size", type=int, default=1_000, help="Number of accessions to query per batch")
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _add_copy_method_arguments(parser):
|
|
378
|
+
parser.add_argument(
|
|
379
|
+
"--copy-method",
|
|
380
|
+
type=str,
|
|
381
|
+
choices=copy_methods,
|
|
382
|
+
default="hardlink",
|
|
383
|
+
help=dedent("""\
|
|
384
|
+
How to make target file be same file as source file.
|
|
385
|
+
By default uses hardlinks to save disk space.
|
|
386
|
+
Note that hardlinks only work within the same filesystem and are harder to track.
|
|
387
|
+
If you want to track cached files easily then use 'symlink'.
|
|
388
|
+
On Windows you need developer mode or admin privileges to create symlinks.
|
|
389
|
+
"""),
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _add_cacher_arguments(parser: argparse.ArgumentParser):
|
|
394
|
+
"""Add cacher arguments to parser."""
|
|
395
|
+
parser.add_argument(
|
|
396
|
+
"--no-cache",
|
|
397
|
+
action="store_true",
|
|
398
|
+
help="Disable caching of files to central location.",
|
|
399
|
+
)
|
|
400
|
+
cache_dir_action = parser.add_argument(
|
|
401
|
+
"--cache-dir",
|
|
402
|
+
type=Path,
|
|
403
|
+
default=user_cache_root_dir(),
|
|
404
|
+
help="Directory to use as cache for files.",
|
|
405
|
+
)
|
|
406
|
+
cache_dir_action.complete = shtab.DIRECTORY # type: ignore[missing-attribute]
|
|
407
|
+
_add_copy_method_arguments(parser)
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
|
|
411
|
+
"""Add retrieve pdbe subcommand parser."""
|
|
412
|
+
parser = subparsers.add_parser(
|
|
413
|
+
"pdbe",
|
|
414
|
+
help="Retrieve PDBe gzipped mmCIF files for PDB IDs in CSV.",
|
|
415
|
+
description=dedent("""\
|
|
416
|
+
Retrieve mmCIF files from Protein Data Bank in Europe Knowledge Base (PDBe) website
|
|
417
|
+
for unique PDB IDs listed in a CSV file.
|
|
418
|
+
"""),
|
|
419
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
420
|
+
)
|
|
421
|
+
parser.add_argument(
|
|
422
|
+
"pdbe_csv",
|
|
423
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
424
|
+
help="CSV file with `pdb_id` column. Other columns are ignored. Use `-` for stdin.",
|
|
425
|
+
).complete = shtab.FILE
|
|
426
|
+
parser.add_argument(
|
|
427
|
+
"output_dir", type=Path, help="Directory to store downloaded PDBe mmCIF files"
|
|
428
|
+
).complete = shtab.DIRECTORY
|
|
429
|
+
parser.add_argument(
|
|
430
|
+
"--max-parallel-downloads",
|
|
431
|
+
type=int,
|
|
432
|
+
default=5,
|
|
433
|
+
help="Maximum number of parallel downloads",
|
|
434
|
+
)
|
|
435
|
+
_add_cacher_arguments(parser)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
|
|
439
|
+
"""Add retrieve alphafold subcommand parser."""
|
|
440
|
+
parser = subparsers.add_parser(
|
|
441
|
+
"alphafold",
|
|
442
|
+
help="Retrieve AlphaFold files for IDs in CSV",
|
|
443
|
+
description="Retrieve AlphaFold files from the AlphaFold Protein Structure Database.",
|
|
444
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
445
|
+
)
|
|
446
|
+
parser.add_argument(
|
|
447
|
+
"alphafold_csv",
|
|
448
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
449
|
+
help="CSV file with `af_id` column. Other columns are ignored. Use `-` for stdin.",
|
|
450
|
+
).complete = shtab.FILE
|
|
451
|
+
parser.add_argument(
|
|
452
|
+
"output_dir", type=Path, help="Directory to store downloaded AlphaFold files"
|
|
453
|
+
).complete = shtab.DIRECTORY
|
|
454
|
+
parser.add_argument(
|
|
455
|
+
"--format",
|
|
456
|
+
type=str,
|
|
457
|
+
action="append",
|
|
458
|
+
choices=sorted(downloadable_formats),
|
|
459
|
+
help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
|
|
460
|
+
Default is 'cif'."""),
|
|
461
|
+
)
|
|
462
|
+
parser.add_argument(
|
|
463
|
+
"--db-version",
|
|
464
|
+
type=str,
|
|
465
|
+
help="AlphaFold database version to use. If not given, the latest version is used. For example '6'.",
|
|
466
|
+
)
|
|
467
|
+
parser.add_argument(
|
|
468
|
+
"--gzip-files",
|
|
469
|
+
action="store_true",
|
|
470
|
+
help="Whether to gzip the downloaded files. Excludes summary files, they are always uncompressed.",
|
|
471
|
+
)
|
|
472
|
+
parser.add_argument(
|
|
473
|
+
"--all-isoforms",
|
|
474
|
+
action="store_true",
|
|
475
|
+
help=(
|
|
476
|
+
"Whether to return all isoforms of each uniprot entry. "
|
|
477
|
+
"If not given then only the Alphafold entry for the canonical sequence is returned."
|
|
478
|
+
),
|
|
479
|
+
)
|
|
480
|
+
parser.add_argument(
|
|
481
|
+
"--max-parallel-downloads",
|
|
482
|
+
type=int,
|
|
483
|
+
default=5,
|
|
484
|
+
help="Maximum number of parallel downloads",
|
|
485
|
+
)
|
|
486
|
+
_add_cacher_arguments(parser)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
|
|
490
|
+
"""Add retrieve emdb subcommand parser."""
|
|
491
|
+
parser = subparsers.add_parser(
|
|
492
|
+
"emdb",
|
|
493
|
+
help="Retrieve Electron Microscopy Data Bank (EMDB) gzipped 3D volume files for EMDB IDs in CSV.",
|
|
494
|
+
description=dedent("""\
|
|
495
|
+
Retrieve volume files from Electron Microscopy Data Bank (EMDB) website
|
|
496
|
+
for unique EMDB IDs listed in a CSV file.
|
|
497
|
+
"""),
|
|
498
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
499
|
+
)
|
|
500
|
+
parser.add_argument(
|
|
501
|
+
"emdb_csv",
|
|
502
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
503
|
+
help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
|
|
504
|
+
).complete = shtab.FILE
|
|
505
|
+
parser.add_argument(
|
|
506
|
+
"output_dir", type=Path, help="Directory to store downloaded EMDB volume files"
|
|
507
|
+
).complete = shtab.DIRECTORY
|
|
508
|
+
_add_cacher_arguments(parser)
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _add_scheduler_address_argument(parser):
|
|
512
|
+
parser.add_argument(
|
|
513
|
+
"--scheduler-address",
|
|
514
|
+
help=dedent("""Address of the Dask scheduler to connect to.
|
|
515
|
+
If not provided, will create a local cluster.
|
|
516
|
+
If set to `sequential` will run tasks sequentially."""),
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
|
|
521
|
+
"""Add filter confidence subcommand parser."""
|
|
522
|
+
parser = subparsers.add_parser(
|
|
523
|
+
"confidence",
|
|
524
|
+
help="Filter AlphaFold mmcif/PDB files by confidence",
|
|
525
|
+
description=dedent("""\
|
|
526
|
+
Filter AlphaFold mmcif/PDB files by confidence (plDDT).
|
|
527
|
+
Passed files are written with residues below threshold removed."""),
|
|
528
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
529
|
+
)
|
|
530
|
+
parser.add_argument(
|
|
531
|
+
"input_dir", type=Path, help="Directory with AlphaFold mmcif/PDB files"
|
|
532
|
+
).complete = shtab.DIRECTORY
|
|
533
|
+
parser.add_argument(
|
|
534
|
+
"output_dir", type=Path, help="Directory to write filtered mmcif/PDB files"
|
|
535
|
+
).complete = shtab.DIRECTORY
|
|
536
|
+
parser.add_argument("--confidence-threshold", type=float, default=70, help="pLDDT confidence threshold (0-100)")
|
|
537
|
+
parser.add_argument(
|
|
538
|
+
"--min-residues", type=int, default=0, help="Minimum number of high-confidence residues a structure should have"
|
|
539
|
+
)
|
|
540
|
+
parser.add_argument(
|
|
541
|
+
"--max-residues",
|
|
542
|
+
type=int,
|
|
543
|
+
default=10_000_000,
|
|
544
|
+
help="Maximum number of high-confidence residues a structure should have",
|
|
545
|
+
)
|
|
546
|
+
parser.add_argument(
|
|
547
|
+
"--write-stats",
|
|
548
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
549
|
+
help=dedent("""\
|
|
550
|
+
Write filter statistics to file.
|
|
551
|
+
In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
|
|
552
|
+
Use `-` for stdout."""),
|
|
553
|
+
).complete = shtab.FILE
|
|
554
|
+
_add_scheduler_address_argument(parser)
|
|
555
|
+
_add_copy_method_arguments(parser)
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
|
|
559
|
+
"""Add filter chain subcommand parser."""
|
|
560
|
+
parser = subparsers.add_parser(
|
|
561
|
+
"chain",
|
|
562
|
+
help="Filter on chain.",
|
|
563
|
+
description=dedent("""\
|
|
564
|
+
For each input PDB/mmCIF and chain combination
|
|
565
|
+
write a PDB/mmCIF file with just the given chain
|
|
566
|
+
and rename it to chain `A`.
|
|
567
|
+
Filtering is done in parallel using a Dask cluster."""),
|
|
568
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
569
|
+
)
|
|
570
|
+
parser.add_argument(
|
|
571
|
+
"chains",
|
|
572
|
+
type=argparse.FileType("r", encoding="UTF-8"),
|
|
573
|
+
help="CSV file with `pdb_id` and `chain` columns. Other columns are ignored.",
|
|
574
|
+
).complete = shtab.FILE
|
|
575
|
+
parser.add_argument(
|
|
576
|
+
"input_dir",
|
|
577
|
+
type=Path,
|
|
578
|
+
help=dedent("""\
|
|
579
|
+
Directory with PDB/mmCIF files.
|
|
580
|
+
Expected filenames are `{pdb_id}.cif.gz`, `{pdb_id}.cif`, `{pdb_id}.pdb.gz` or `{pdb_id}.pdb`.
|
|
581
|
+
"""),
|
|
582
|
+
).complete = shtab.DIRECTORY
|
|
583
|
+
parser.add_argument(
|
|
584
|
+
"output_dir",
|
|
585
|
+
type=Path,
|
|
586
|
+
help=dedent("""\
|
|
587
|
+
Directory to write the single-chain PDB/mmCIF files. Output files are in same format as input files."""),
|
|
588
|
+
).complete = shtab.DIRECTORY
|
|
589
|
+
_add_scheduler_address_argument(parser)
|
|
590
|
+
_add_copy_method_arguments(parser)
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
|
|
594
|
+
"""Add filter residue subcommand parser."""
|
|
595
|
+
parser = subparsers.add_parser(
|
|
596
|
+
"residue",
|
|
597
|
+
help="Filter PDB/mmCIF files by number of residues in chain A",
|
|
598
|
+
description=dedent("""\
|
|
599
|
+
Filter PDB/mmCIF files by number of residues in chain A.
|
|
600
|
+
"""),
|
|
601
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
602
|
+
)
|
|
603
|
+
parser.add_argument(
|
|
604
|
+
"input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
|
|
605
|
+
).complete = shtab.DIRECTORY
|
|
606
|
+
parser.add_argument(
|
|
607
|
+
"output_dir",
|
|
608
|
+
type=Path,
|
|
609
|
+
help=dedent("""\
|
|
610
|
+
Directory to write filtered PDB/mmCIF files. Files are copied without modification.
|
|
611
|
+
"""),
|
|
612
|
+
).complete = shtab.DIRECTORY
|
|
613
|
+
parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
|
|
614
|
+
parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
|
|
615
|
+
parser.add_argument(
|
|
616
|
+
"--write-stats",
|
|
617
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
618
|
+
help=dedent("""\
|
|
619
|
+
Write filter statistics to file.
|
|
620
|
+
In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
|
|
621
|
+
Use `-` for stdout."""),
|
|
622
|
+
).complete = shtab.FILE
|
|
623
|
+
_add_copy_method_arguments(parser)
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
|
|
627
|
+
"""Add filter secondary structure subcommand parser."""
|
|
628
|
+
parser = subparsers.add_parser(
|
|
629
|
+
"secondary-structure",
|
|
630
|
+
help="Filter PDB/mmCIF files by secondary structure",
|
|
631
|
+
description="Filter PDB/mmCIF files by secondary structure",
|
|
632
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
633
|
+
)
|
|
634
|
+
parser.add_argument(
|
|
635
|
+
"input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
|
|
636
|
+
).complete = shtab.DIRECTORY
|
|
637
|
+
parser.add_argument(
|
|
638
|
+
"output_dir",
|
|
639
|
+
type=Path,
|
|
640
|
+
help=dedent("""\
|
|
641
|
+
Directory to write filtered PDB/mmCIF files. Files are copied without modification.
|
|
642
|
+
"""),
|
|
643
|
+
).complete = shtab.DIRECTORY
|
|
644
|
+
parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
|
|
645
|
+
parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
|
|
646
|
+
parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
|
|
647
|
+
parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
|
|
648
|
+
parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
|
|
649
|
+
parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
|
|
650
|
+
parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
|
|
651
|
+
parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
|
|
652
|
+
parser.add_argument(
|
|
653
|
+
"--write-stats",
|
|
654
|
+
type=argparse.FileType("w", encoding="UTF-8"),
|
|
655
|
+
help=dedent("""
|
|
656
|
+
Write filter statistics to file. In CSV format with columns:
|
|
657
|
+
`<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
|
|
658
|
+
<helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
|
|
659
|
+
Use `-` for stdout.
|
|
660
|
+
"""),
|
|
661
|
+
).complete = shtab.FILE
|
|
662
|
+
_add_copy_method_arguments(parser)
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
def _add_search_subcommands(subparsers: argparse._SubParsersAction):
|
|
666
|
+
"""Add search command and its subcommands."""
|
|
667
|
+
parser = subparsers.add_parser(
|
|
668
|
+
"search",
|
|
669
|
+
help="Search data sources",
|
|
670
|
+
description="Search various things online.",
|
|
671
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
672
|
+
)
|
|
673
|
+
subsubparsers = parser.add_subparsers(dest="search_cmd", required=True)
|
|
674
|
+
|
|
675
|
+
_add_search_uniprot_parser(subsubparsers)
|
|
676
|
+
_add_search_pdbe_parser(subsubparsers)
|
|
677
|
+
_add_search_alphafold_parser(subsubparsers)
|
|
678
|
+
_add_search_emdb_parser(subsubparsers)
|
|
679
|
+
_add_search_go_parser(subsubparsers)
|
|
680
|
+
_add_search_taxonomy_parser(subsubparsers)
|
|
681
|
+
_add_search_interaction_partners_parser(subsubparsers)
|
|
682
|
+
_add_search_complexes_parser(subsubparsers)
|
|
683
|
+
_add_search_uniprot_details_parser(subsubparsers)
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
|
|
687
|
+
"""Add retrieve command and its subcommands."""
|
|
688
|
+
parser = subparsers.add_parser(
|
|
689
|
+
"retrieve",
|
|
690
|
+
help="Retrieve structure files",
|
|
691
|
+
description="Retrieve structure files from online resources.",
|
|
692
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
693
|
+
)
|
|
694
|
+
subsubparsers = parser.add_subparsers(dest="retrieve_cmd", required=True)
|
|
695
|
+
|
|
696
|
+
_add_retrieve_pdbe_parser(subsubparsers)
|
|
697
|
+
_add_retrieve_alphafold_parser(subsubparsers)
|
|
698
|
+
_add_retrieve_emdb_parser(subsubparsers)
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
|
|
702
|
+
"""Add filter command and its subcommands."""
|
|
703
|
+
parser = subparsers.add_parser("filter", help="Filter files", formatter_class=ArgumentDefaultsRichHelpFormatter)
|
|
704
|
+
subsubparsers = parser.add_subparsers(dest="filter_cmd", required=True)
|
|
705
|
+
|
|
706
|
+
_add_filter_confidence_parser(subsubparsers)
|
|
707
|
+
_add_filter_chain_parser(subsubparsers)
|
|
708
|
+
_add_filter_residue_parser(subsubparsers)
|
|
709
|
+
_add_filter_ss_parser(subsubparsers)
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def _add_convert_uniprot_parser(subparsers: argparse._SubParsersAction):
|
|
713
|
+
"""Add convert uniprot subcommand parser."""
|
|
714
|
+
parser = subparsers.add_parser(
|
|
715
|
+
"uniprot",
|
|
716
|
+
help="Convert structure files to list of UniProt accessions.",
|
|
717
|
+
description="Convert structure files to list of UniProt accessions. "
|
|
718
|
+
"Uniprot accessions are read from database reference of each structure.",
|
|
719
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
720
|
+
)
|
|
721
|
+
parser.add_argument(
|
|
722
|
+
"input_dir",
|
|
723
|
+
type=Path,
|
|
724
|
+
help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
|
|
725
|
+
).complete = shtab.DIRECTORY
|
|
726
|
+
parser.add_argument(
|
|
727
|
+
"output",
|
|
728
|
+
type=argparse.FileType("wt", encoding="UTF-8"),
|
|
729
|
+
help="Output text file with UniProt accessions (one per line). Use '-' for stdout.",
|
|
730
|
+
).complete = shtab.FILE
|
|
731
|
+
parser.add_argument(
|
|
732
|
+
"--grouped",
|
|
733
|
+
action="store_true",
|
|
734
|
+
help="Whether to group accessions by structure file. "
|
|
735
|
+
"If set output changes to `<structure_file1>,<acc1>\\n<structure_file1>,<acc2>` format.",
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def _add_convert_structures_parser(subparsers: argparse._SubParsersAction):
|
|
740
|
+
"""Add convert structures subcommand parser."""
|
|
741
|
+
parser = subparsers.add_parser(
|
|
742
|
+
"structures",
|
|
743
|
+
help="Convert structure files between formats",
|
|
744
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
745
|
+
)
|
|
746
|
+
parser.add_argument(
|
|
747
|
+
"input_dir",
|
|
748
|
+
type=Path,
|
|
749
|
+
help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
|
|
750
|
+
).complete = shtab.DIRECTORY
|
|
751
|
+
parser.add_argument(
|
|
752
|
+
"--output-dir",
|
|
753
|
+
type=Path,
|
|
754
|
+
help=dedent("""\
|
|
755
|
+
Directory to write converted structure files. If not given, files are written to `input_dir`.
|
|
756
|
+
"""),
|
|
757
|
+
).complete = shtab.DIRECTORY
|
|
758
|
+
parser.add_argument(
|
|
759
|
+
"--format",
|
|
760
|
+
type=str,
|
|
761
|
+
choices=("cif",),
|
|
762
|
+
default="cif",
|
|
763
|
+
help="Output format to convert to.",
|
|
764
|
+
)
|
|
765
|
+
_add_copy_method_arguments(parser)
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
def _add_convert_subcommands(subparsers: argparse._SubParsersAction):
|
|
769
|
+
"""Add convert command and its subcommands."""
|
|
770
|
+
parser = subparsers.add_parser(
|
|
771
|
+
"convert",
|
|
772
|
+
help="Convert files between formats",
|
|
773
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
774
|
+
)
|
|
775
|
+
subsubparsers = parser.add_subparsers(dest="convert_cmd", required=True)
|
|
776
|
+
|
|
777
|
+
_add_convert_structures_parser(subsubparsers)
|
|
778
|
+
_add_convert_uniprot_parser(subsubparsers)
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def _add_mcp_command(subparsers: argparse._SubParsersAction):
|
|
782
|
+
"""Add MCP command."""
|
|
783
|
+
|
|
784
|
+
parser = subparsers.add_parser(
|
|
785
|
+
"mcp",
|
|
786
|
+
help="Run Model Context Protocol (MCP) server",
|
|
787
|
+
description=(
|
|
788
|
+
"Run Model Context Protocol (MCP) server. "
|
|
789
|
+
"Can be used by agentic LLMs like Claude Sonnet 4 as a set of tools."
|
|
790
|
+
),
|
|
791
|
+
formatter_class=ArgumentDefaultsRichHelpFormatter,
|
|
792
|
+
)
|
|
793
|
+
parser.add_argument(
|
|
794
|
+
"--transport", default="stdio", choices=["stdio", "http", "streamable-http"], help="Transport protocol to use"
|
|
795
|
+
)
|
|
796
|
+
parser.add_argument("--host", default="127.0.0.1", help="Host to bind the server to")
|
|
797
|
+
parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def make_parser() -> argparse.ArgumentParser:
|
|
801
|
+
parser = argparse.ArgumentParser(
|
|
802
|
+
description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
|
|
803
|
+
)
|
|
804
|
+
parser.add_argument("--log-level", default="WARNING", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
|
|
805
|
+
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
|
|
806
|
+
shtab.add_argument_to(parser, ["--print-completion"])
|
|
807
|
+
|
|
808
|
+
subparsers = parser.add_subparsers(dest="command", required=True)
|
|
809
|
+
|
|
810
|
+
_add_search_subcommands(subparsers)
|
|
811
|
+
_add_retrieve_subcommands(subparsers)
|
|
812
|
+
_add_filter_subcommands(subparsers)
|
|
813
|
+
_add_convert_subcommands(subparsers)
|
|
814
|
+
_add_mcp_command(subparsers)
|
|
815
|
+
|
|
816
|
+
return parser
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def _name_of(file: TextIOWrapper | BytesIO) -> str:
|
|
820
|
+
try:
|
|
821
|
+
return file.name
|
|
822
|
+
except AttributeError:
|
|
823
|
+
# In pytest BytesIO is used stdout which has no 'name' attribute
|
|
824
|
+
return "<stdout>"
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
def _handle_search_uniprot(args):
|
|
828
|
+
taxon_id = args.taxon_id
|
|
829
|
+
reviewed = args.reviewed
|
|
830
|
+
subcellular_location_uniprot = args.subcellular_location_uniprot
|
|
831
|
+
subcellular_location_go = args.subcellular_location_go
|
|
832
|
+
molecular_function_go = args.molecular_function_go
|
|
833
|
+
min_sequence_length = args.min_sequence_length
|
|
834
|
+
max_sequence_length = args.max_sequence_length
|
|
835
|
+
limit = args.limit
|
|
836
|
+
timeout = args.timeout
|
|
837
|
+
output_file = args.output
|
|
838
|
+
|
|
839
|
+
query = structure(
|
|
840
|
+
{
|
|
841
|
+
"taxon_id": taxon_id,
|
|
842
|
+
"reviewed": reviewed,
|
|
843
|
+
"subcellular_location_uniprot": subcellular_location_uniprot,
|
|
844
|
+
"subcellular_location_go": subcellular_location_go,
|
|
845
|
+
"molecular_function_go": molecular_function_go,
|
|
846
|
+
"min_sequence_length": min_sequence_length,
|
|
847
|
+
"max_sequence_length": max_sequence_length,
|
|
848
|
+
},
|
|
849
|
+
Query,
|
|
850
|
+
)
|
|
851
|
+
rprint("Searching for UniProt accessions")
|
|
852
|
+
accs = search4uniprot(query=query, limit=limit, timeout=timeout)
|
|
853
|
+
rprint(f"Found {len(accs)} UniProt accessions, written to {_name_of(output_file)}")
|
|
854
|
+
_write_lines(output_file, sorted(accs))
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def _handle_search_pdbe(args):
|
|
858
|
+
uniprot_accessions = args.uniprot_accessions
|
|
859
|
+
limit = args.limit
|
|
860
|
+
timeout = args.timeout
|
|
861
|
+
output_csv = args.output_csv
|
|
862
|
+
min_residues = converter.structure(args.min_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
|
|
863
|
+
max_residues = converter.structure(args.max_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
|
|
864
|
+
keep_invalid = args.keep_invalid
|
|
865
|
+
|
|
866
|
+
accs = set(_read_lines(uniprot_accessions))
|
|
867
|
+
rprint(f"Finding PDB entries for {len(accs)} uniprot accessions")
|
|
868
|
+
results = search4pdb(accs, limit=limit, timeout=timeout)
|
|
869
|
+
|
|
870
|
+
raw_nr_results = len(results)
|
|
871
|
+
raw_total_pdbs = sum([len(v) for v in results.values()])
|
|
872
|
+
if min_residues or max_residues:
|
|
873
|
+
results = filter_pdb_results_on_chain_length(results, min_residues, max_residues, keep_invalid=keep_invalid)
|
|
874
|
+
total_pdbs = sum([len(v) for v in results.values()])
|
|
875
|
+
rprint(f"Before filtering found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions.")
|
|
876
|
+
rprint(
|
|
877
|
+
f"After filtering on chain length ({min_residues}, {max_residues}) "
|
|
878
|
+
f"remained {total_pdbs} PDB entries for {len(results)} uniprot accessions."
|
|
879
|
+
)
|
|
880
|
+
else:
|
|
881
|
+
rprint(f"Found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions")
|
|
882
|
+
|
|
883
|
+
_write_pdbe_csv(output_csv, results)
|
|
884
|
+
rprint(f"Written to {_name_of(output_csv)}")
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def _handle_search_alphafold(args):
|
|
888
|
+
uniprot_accessions = args.uniprot_accessions
|
|
889
|
+
min_sequence_length = converter.structure(args.min_sequence_length, PositiveInt | None) # pyright: ignore[reportArgumentType]
|
|
890
|
+
max_sequence_length = converter.structure(args.max_sequence_length, PositiveInt | None) # pyright: ignore[reportArgumentType]
|
|
891
|
+
limit = args.limit
|
|
892
|
+
timeout = args.timeout
|
|
893
|
+
output_csv = args.output_csv
|
|
894
|
+
|
|
895
|
+
accs = _read_lines(uniprot_accessions)
|
|
896
|
+
rprint(f"Finding AlphaFold entries for {len(accs)} uniprot accessions")
|
|
897
|
+
results = search4af(
|
|
898
|
+
accs,
|
|
899
|
+
min_sequence_length=min_sequence_length,
|
|
900
|
+
max_sequence_length=max_sequence_length,
|
|
901
|
+
limit=limit,
|
|
902
|
+
timeout=timeout,
|
|
903
|
+
)
|
|
904
|
+
rprint(f"Found {len(results)} AlphaFold entries, written to {_name_of(output_csv)}")
|
|
905
|
+
_write_dict_of_sets2csv(output_csv, results, "af_id")
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def _handle_search_emdb(args):
|
|
909
|
+
uniprot_accessions = args.uniprot_accessions
|
|
910
|
+
limit = args.limit
|
|
911
|
+
timeout = args.timeout
|
|
912
|
+
output_csv = args.output_csv
|
|
913
|
+
|
|
914
|
+
accs = _read_lines(uniprot_accessions)
|
|
915
|
+
rprint(f"Finding EMDB entries for {len(accs)} uniprot accessions")
|
|
916
|
+
results = search4emdb(accs, limit=limit, timeout=timeout)
|
|
917
|
+
total_emdbs = sum([len(v) for v in results.values()])
|
|
918
|
+
rprint(f"Found {total_emdbs} EMDB entries, written to {_name_of(output_csv)}")
|
|
919
|
+
_write_dict_of_sets2csv(output_csv, results, "emdb_id")
|
|
920
|
+
|
|
921
|
+
|
|
922
|
+
def _handle_search_go(args):
|
|
923
|
+
term = structure(args.term, str)
|
|
924
|
+
aspect: Aspect | None = args.aspect
|
|
925
|
+
limit = structure(args.limit, int)
|
|
926
|
+
output_csv: TextIOWrapper = args.output_csv
|
|
927
|
+
|
|
928
|
+
if aspect:
|
|
929
|
+
rprint(f"Searching for GO terms matching '{term}' with aspect '{aspect}'")
|
|
930
|
+
else:
|
|
931
|
+
rprint(f"Searching for GO terms matching '{term}'")
|
|
932
|
+
results = asyncio.run(search_gene_ontology_term(term, aspect=aspect, limit=limit))
|
|
933
|
+
rprint(f"Found {len(results)} GO terms, written to {_name_of(output_csv)}")
|
|
934
|
+
write_go_terms_to_csv(results, output_csv)
|
|
935
|
+
|
|
936
|
+
|
|
937
|
+
def _handle_search_taxonomy(args):
|
|
938
|
+
query: str = args.query
|
|
939
|
+
field: SearchField | None = args.field
|
|
940
|
+
limit: int = args.limit
|
|
941
|
+
output_csv: TextIOWrapper = args.output_csv
|
|
942
|
+
|
|
943
|
+
if field:
|
|
944
|
+
rprint(f"Searching for taxon information matching '{query}' in field '{field}'")
|
|
945
|
+
else:
|
|
946
|
+
rprint(f"Searching for taxon information matching '{query}'")
|
|
947
|
+
results = asyncio.run(search_taxon(query=query, field=field, limit=limit))
|
|
948
|
+
rprint(f"Found {len(results)} taxons, written to {_name_of(output_csv)}")
|
|
949
|
+
_write_taxonomy_csv(results, output_csv)
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def _handle_search_interaction_partners(args: argparse.Namespace):
|
|
953
|
+
uniprot_accession: str = args.uniprot_accession
|
|
954
|
+
excludes: set[str] = set(args.exclude) if args.exclude else set()
|
|
955
|
+
limit: int = args.limit
|
|
956
|
+
timeout: int = args.timeout
|
|
957
|
+
output_csv: TextIOWrapper = args.output_csv
|
|
958
|
+
|
|
959
|
+
rprint(f"Searching for interaction partners of '{uniprot_accession}'")
|
|
960
|
+
results = search4interaction_partners(uniprot_accession, excludes=excludes, limit=limit, timeout=timeout)
|
|
961
|
+
rprint(f"Found {len(results)} interaction partners, written to {_name_of(output_csv)}")
|
|
962
|
+
_write_lines(output_csv, results.keys())
|
|
963
|
+
|
|
964
|
+
|
|
965
|
+
def _handle_search_complexes(args: argparse.Namespace):
|
|
966
|
+
uniprot_accessions = args.uniprot_accessions
|
|
967
|
+
limit = args.limit
|
|
968
|
+
timeout = args.timeout
|
|
969
|
+
output_csv = args.output_csv
|
|
970
|
+
|
|
971
|
+
accs = _read_lines(uniprot_accessions)
|
|
972
|
+
rprint(f"Finding complexes for {len(accs)} uniprot accessions")
|
|
973
|
+
results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
|
|
974
|
+
rprint(f"Found {len(results)} complexes, written to {_name_of(output_csv)}")
|
|
975
|
+
_write_complexes_csv(results, output_csv)
|
|
976
|
+
|
|
977
|
+
|
|
978
|
+
def _handle_search_uniprot_details(args: argparse.Namespace):
|
|
979
|
+
uniprot_accessions = args.uniprot_accessions
|
|
980
|
+
timeout = args.timeout
|
|
981
|
+
batch_size = args.batch_size
|
|
982
|
+
output_csv: TextIOWrapper = args.output_csv
|
|
983
|
+
|
|
984
|
+
accs = _read_lines(uniprot_accessions)
|
|
985
|
+
rprint(f"Retrieving UniProt entry details for {len(accs)} uniprot accessions")
|
|
986
|
+
results = list(map_uniprot_accessions2uniprot_details(accs, timeout=timeout, batch_size=batch_size))
|
|
987
|
+
_write_uniprot_details_csv(output_csv, results)
|
|
988
|
+
rprint(f"Retrieved details for {len(results)} UniProt entries, written to {_name_of(output_csv)}")
|
|
989
|
+
|
|
990
|
+
|
|
991
|
+
def _initialize_cacher(args: argparse.Namespace) -> Cacher:
|
|
992
|
+
if args.no_cache:
|
|
993
|
+
return PassthroughCacher()
|
|
994
|
+
return DirectoryCacher(
|
|
995
|
+
cache_dir=args.cache_dir,
|
|
996
|
+
copy_method=args.copy_method,
|
|
997
|
+
)
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def _handle_retrieve_pdbe(args: argparse.Namespace):
|
|
1001
|
+
pdbe_csv = args.pdbe_csv
|
|
1002
|
+
output_dir = args.output_dir
|
|
1003
|
+
max_parallel_downloads = args.max_parallel_downloads
|
|
1004
|
+
cacher = _initialize_cacher(args)
|
|
1005
|
+
|
|
1006
|
+
pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
|
|
1007
|
+
rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
|
|
1008
|
+
result = asyncio.run(
|
|
1009
|
+
pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
|
|
1010
|
+
)
|
|
1011
|
+
rprint(f"Retrieved {len(result)} PDBe entries")
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
def _handle_retrieve_alphafold(args):
|
|
1015
|
+
download_dir = args.output_dir
|
|
1016
|
+
raw_formats = args.format
|
|
1017
|
+
alphafold_csv = args.alphafold_csv
|
|
1018
|
+
max_parallel_downloads = args.max_parallel_downloads
|
|
1019
|
+
cacher = _initialize_cacher(args)
|
|
1020
|
+
gzip_files = args.gzip_files
|
|
1021
|
+
all_isoforms = args.all_isoforms
|
|
1022
|
+
db_version = args.db_version
|
|
1023
|
+
|
|
1024
|
+
if raw_formats is None:
|
|
1025
|
+
raw_formats = {"cif"}
|
|
1026
|
+
|
|
1027
|
+
# TODO besides `uniprot_accession,af_id\n` csv also allow headless single column format
|
|
1028
|
+
af_ids = _read_column_from_csv(alphafold_csv, "af_id")
|
|
1029
|
+
formats: set[DownloadableFormat] = structure(raw_formats, set[DownloadableFormat])
|
|
1030
|
+
rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {formats}")
|
|
1031
|
+
afs = af_fetch(
|
|
1032
|
+
af_ids,
|
|
1033
|
+
download_dir,
|
|
1034
|
+
formats=formats,
|
|
1035
|
+
db_version=db_version,
|
|
1036
|
+
max_parallel_downloads=max_parallel_downloads,
|
|
1037
|
+
cacher=cacher,
|
|
1038
|
+
gzip_files=gzip_files,
|
|
1039
|
+
all_isoforms=all_isoforms,
|
|
1040
|
+
)
|
|
1041
|
+
total_nr_files = sum(af.nr_of_files() for af in afs)
|
|
1042
|
+
rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
|
|
1043
|
+
|
|
1044
|
+
|
|
1045
|
+
def _handle_retrieve_emdb(args):
|
|
1046
|
+
emdb_csv = args.emdb_csv
|
|
1047
|
+
output_dir = args.output_dir
|
|
1048
|
+
cacher = _initialize_cacher(args)
|
|
1049
|
+
|
|
1050
|
+
emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
|
|
1051
|
+
rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
|
|
1052
|
+
result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
|
|
1053
|
+
rprint(f"Retrieved {len(result)} EMDB entries")
|
|
1054
|
+
|
|
1055
|
+
|
|
1056
|
+
def _handle_filter_confidence(args: argparse.Namespace):
|
|
1057
|
+
# we are repeating types here and in add_argument call
|
|
1058
|
+
# TODO replace argparse with modern alternative like cyclopts
|
|
1059
|
+
# to get rid of duplication
|
|
1060
|
+
input_dir = structure(args.input_dir, Path)
|
|
1061
|
+
output_dir = structure(args.output_dir, Path)
|
|
1062
|
+
|
|
1063
|
+
confidence_threshold = args.confidence_threshold
|
|
1064
|
+
min_residues = args.min_residues
|
|
1065
|
+
max_residues = args.max_residues
|
|
1066
|
+
stats_file: TextIOWrapper | None = args.write_stats
|
|
1067
|
+
copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
|
|
1068
|
+
scheduler_address = structure(args.scheduler_address, str | None) # pyright: ignore[reportArgumentType]
|
|
1069
|
+
|
|
1070
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
1071
|
+
input_files = sorted(glob_structure_files(input_dir))
|
|
1072
|
+
nr_input_files = len(input_files)
|
|
1073
|
+
rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
|
|
1074
|
+
query = converter.structure(
|
|
1075
|
+
{
|
|
1076
|
+
"confidence": confidence_threshold,
|
|
1077
|
+
"min_residues": min_residues,
|
|
1078
|
+
"max_residues": max_residues,
|
|
1079
|
+
},
|
|
1080
|
+
ConfidenceFilterQuery,
|
|
1081
|
+
)
|
|
1082
|
+
if stats_file:
|
|
1083
|
+
writer = csv.writer(stats_file)
|
|
1084
|
+
writer.writerow(["input_file", "residue_count", "passed", "output_file"])
|
|
1085
|
+
|
|
1086
|
+
passed_count = 0
|
|
1087
|
+
results = filter_files_on_confidence(
|
|
1088
|
+
input_files, query, output_dir, copy_method=copy_method, scheduler_address=scheduler_address
|
|
1089
|
+
)
|
|
1090
|
+
for r in results:
|
|
1091
|
+
if r.filtered_file:
|
|
1092
|
+
passed_count += 1
|
|
1093
|
+
if stats_file:
|
|
1094
|
+
writer.writerow([r.input_file, r.count, r.filtered_file is not None, r.filtered_file]) # pyright: ignore[reportPossiblyUnboundVariable]
|
|
1095
|
+
rprint(f"Filtered {passed_count} mmcif/PDB files by confidence, written to {output_dir} directory")
|
|
1096
|
+
if stats_file:
|
|
1097
|
+
rprint(f"Statistics written to {_name_of(stats_file)}")
|
|
1098
|
+
|
|
1099
|
+
|
|
1100
|
+
def _handle_filter_chain(args):
|
|
1101
|
+
input_dir = args.input_dir
|
|
1102
|
+
output_dir = structure(args.output_dir, Path)
|
|
1103
|
+
pdb_id2chain_mapping_file = args.chains
|
|
1104
|
+
scheduler_address = structure(args.scheduler_address, str | None) # pyright: ignore[reportArgumentType]
|
|
1105
|
+
copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
|
|
1106
|
+
|
|
1107
|
+
# make sure files in input dir with entries in mapping file are the same
|
|
1108
|
+
# complain when files from mapping file are missing on disk
|
|
1109
|
+
rows = list(_iter_csv_rows(pdb_id2chain_mapping_file))
|
|
1110
|
+
file2chain: set[tuple[Path, str]] = set()
|
|
1111
|
+
errors: list[FileNotFoundError] = []
|
|
1112
|
+
|
|
1113
|
+
for row in rows:
|
|
1114
|
+
pdb_id = row["pdb_id"]
|
|
1115
|
+
chain = row["chain"]
|
|
1116
|
+
try:
|
|
1117
|
+
f = locate_structure_file(input_dir, pdb_id)
|
|
1118
|
+
file2chain.add((f, chain))
|
|
1119
|
+
except FileNotFoundError as e:
|
|
1120
|
+
errors.append(e)
|
|
1121
|
+
|
|
1122
|
+
if errors:
|
|
1123
|
+
msg = f"Some structure files could not be found ({len(errors)} missing), skipping them"
|
|
1124
|
+
rprint(Panel(os.linesep.join(map(str, errors)), title=msg, style="red"))
|
|
1125
|
+
|
|
1126
|
+
if not file2chain:
|
|
1127
|
+
rprint("[red]No valid structure files found. Exiting.")
|
|
1128
|
+
sys.exit(1)
|
|
1129
|
+
|
|
1130
|
+
results = filter_files_on_chain(
|
|
1131
|
+
file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
|
|
1132
|
+
)
|
|
1133
|
+
|
|
1134
|
+
nr_written = len([r for r in results if r.passed])
|
|
1135
|
+
|
|
1136
|
+
rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")
|
|
1137
|
+
|
|
1138
|
+
for result in results:
|
|
1139
|
+
if result.discard_reason:
|
|
1140
|
+
rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
|
|
1141
|
+
|
|
1142
|
+
|
|
1143
|
+
def _handle_filter_residue(args):
    input_dir = structure(args.input_dir, Path)
    output_dir = structure(args.output_dir, Path)
    min_residues = structure(args.min_residues, int)
    max_residues = structure(args.max_residues, int)
    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
    stats_file: TextIOWrapper | None = args.write_stats

    if stats_file:
        writer = csv.writer(stats_file)
        writer.writerow(["input_file", "residue_count", "passed", "output_file"])

    nr_passed = 0
    input_files = sorted(glob_structure_files(input_dir))
    nr_total = len(input_files)
    rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
    for r in filter_files_on_residues(
        input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
    ):
        if stats_file:
            writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])  # pyright: ignore[reportPossiblyUnboundVariable]
        if r.passed:
            nr_passed += 1

    rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
    if stats_file:
        rprint(f"Statistics written to {_name_of(stats_file)}")

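# For reference, the stats CSV written above (when args.write_stats is set) looks
# like this; the paths and counts are hypothetical:
#
#   input_file,residue_count,passed,output_file
#   in/1abc.cif,245,True,out/1abc.cif
#   in/2xyz.cif,1205,False,
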
def _handle_filter_ss(args):
    input_dir = structure(args.input_dir, Path)
    output_dir = structure(args.output_dir, Path)
    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
    stats_file: TextIOWrapper | None = args.write_stats

    raw_query = {
        "abs_min_helix_residues": args.abs_min_helix_residues,
        "abs_max_helix_residues": args.abs_max_helix_residues,
        "abs_min_sheet_residues": args.abs_min_sheet_residues,
        "abs_max_sheet_residues": args.abs_max_sheet_residues,
        "ratio_min_helix_residues": args.ratio_min_helix_residues,
        "ratio_max_helix_residues": args.ratio_max_helix_residues,
        "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
        "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
    }
    query = converter.structure(raw_query, SecondaryStructureFilterQuery)
    input_files = sorted(glob_structure_files(input_dir))
    nr_total = len(input_files)
    output_dir.mkdir(parents=True, exist_ok=True)

    if stats_file:
        writer = csv.writer(stats_file)
        writer.writerow(
            [
                "input_file",
                "nr_residues",
                "nr_helix_residues",
                "nr_sheet_residues",
                "helix_ratio",
                "sheet_ratio",
                "passed",
                "output_file",
            ]
        )

    rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
    nr_passed = 0
    for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
        output_file: Path | None = None
        if result.passed:
            output_file = output_dir / input_file.name
            copyfile(input_file, output_file, copy_method)
            nr_passed += 1
        if stats_file:
            writer.writerow(  # pyright: ignore[reportPossiblyUnboundVariable]
                [
                    input_file,
                    result.stats.nr_residues,
                    result.stats.nr_helix_residues,
                    result.stats.nr_sheet_residues,
                    round(result.stats.helix_ratio, 3),
                    round(result.stats.sheet_ratio, 3),
                    result.passed,
                    output_file,
                ]
            )
    rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
    if stats_file:
        rprint(f"Statistics written to {_name_of(stats_file)}")

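# Illustrative sketch: the same cattrs converter can build a query from a plain
# dict outside the CLI as well. The two thresholds below are arbitrary example
# values; the None entries mirror how the handler above passes unset arguments.
def _example_ss_query() -> SecondaryStructureFilterQuery:
    raw = {
        "abs_min_helix_residues": 10,  # keep files with at least 10 helix residues
        "abs_max_helix_residues": None,
        "abs_min_sheet_residues": None,
        "abs_max_sheet_residues": None,
        "ratio_min_helix_residues": None,
        "ratio_max_helix_residues": None,
        "ratio_min_sheet_residues": None,
        "ratio_max_sheet_residues": 0.25,  # ...and at most 25% sheet residues
    }
    return converter.structure(raw, SecondaryStructureFilterQuery)
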
def _handle_mcp(args):
    if find_spec("fastmcp") is None:
        msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
        raise ImportError(msg)

    from protein_quest.mcp_server import mcp  # noqa: PLC0415

    if args.transport == "stdio":
        mcp.run(transport=args.transport)
    else:
        mcp.run(transport=args.transport, host=args.host, port=args.port)

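# The find_spec() check above fails fast with a helpful message when an optional
# extra is not installed, instead of a bare ModuleNotFoundError from a deep import.
# A generic version of the same guard would look like this (sketch; the helper
# name is made up):
def _require_extra(module: str, extra: str) -> None:
    if find_spec(module) is None:
        msg = f"Unable to import `{module}`, please install `protein-quest[{extra}]`."
        raise ImportError(msg)
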
def _handle_convert_uniprot(args):
    input_dir = structure(args.input_dir, Path)
    output_file: TextIOWrapper = args.output
    grouped: bool = args.grouped
    input_files = sorted(glob_structure_files(input_dir))
    if grouped:
        for input_file in tqdm(input_files, unit="file"):
            s = read_structure(input_file)
            uniprot_accessions = structure2uniprot_accessions(s)
            _write_lines(
                output_file, [f"{input_file},{uniprot_accession}" for uniprot_accession in sorted(uniprot_accessions)]
            )
    else:
        uniprot_accessions: set[str] = set()
        for input_file in tqdm(input_files, unit="file"):
            s = read_structure(input_file)
            uniprot_accessions.update(structure2uniprot_accessions(s))
        _write_lines(output_file, sorted(uniprot_accessions))

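# For reference, the two output shapes produced above (accessions and paths are
# hypothetical). With args.grouped set, one "<file>,<accession>" line per pair:
#
#   in/1abc.cif,P12345
#   in/1abc.cif,Q99999
#   in/2xyz.cif,P12345
#
# Without it, a deduplicated sorted list of bare accessions:
#
#   P12345
#   Q99999
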
def _handle_convert_structures(args):
    input_dir = structure(args.input_dir, Path)
    output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
    output_dir.mkdir(parents=True, exist_ok=True)
    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

    input_files = sorted(glob_structure_files(input_dir))
    rprint(f"Converting {len(input_files)} files in {input_dir} directory to cif format.")
    # Drain the lazy conversion through tqdm so the work is done with a progress bar.
    for _ in tqdm(
        convert_to_cif_files(
            input_files,
            output_dir,
            copy_method=copy_method,
        ),
        total=len(input_files),
        unit="file",
    ):
        pass
    rprint(f"Converted {len(input_files)} files into {output_dir}.")

def _read_lines(file: TextIOWrapper) -> list[str]:
    return [line.strip() for line in file]


def _make_sure_parent_exists(file: TextIOWrapper):
    # Can not create dir for stdout
    with suppress(AttributeError):
        Path(file.name).parent.mkdir(parents=True, exist_ok=True)


def _write_lines(file: TextIOWrapper, lines: Iterable[str]):
    _make_sure_parent_exists(file)
    file.writelines(line + os.linesep for line in lines)


def _write_pdbe_csv(path: TextIOWrapper, data: PdbResults):
    _make_sure_parent_exists(path)
    fieldnames = ["uniprot_accession", "pdb_id", "method", "resolution", "uniprot_chains", "chain", "chain_length"]
    writer = csv.DictWriter(path, fieldnames=fieldnames)
    writer.writeheader()
    for uniprot_accession, entries in sorted(data.items()):
        for e in sorted(entries, key=lambda x: (x.id, x.method)):
            writer.writerow(
                {
                    "uniprot_accession": uniprot_accession,
                    "pdb_id": e.id,
                    "method": e.method,
                    "resolution": e.resolution or "",
                    "uniprot_chains": e.uniprot_chains,
                    "chain": e.chain,
                    "chain_length": e.chain_length,
                }
            )


def _write_dict_of_sets2csv(file: TextIOWrapper, data: dict[str, set[str]], ref_id_field: str):
    _make_sure_parent_exists(file)
    fieldnames = ["uniprot_accession", ref_id_field]

    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for uniprot_accession, ref_ids in sorted(data.items()):
        for ref_id in sorted(ref_ids):
            writer.writerow({"uniprot_accession": uniprot_accession, ref_id_field: ref_id})


def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
    reader = csv.DictReader(file)
    yield from reader


def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
    return {row[column] for row in _iter_csv_rows(file)}

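# Usage sketch for the CSV helpers above; the file name is hypothetical:
def _example_read_accessions(path: Path) -> set[str]:
    """Collect the uniprot_accession column from a CSV written by the helpers above."""
    with path.open(newline="") as fh:
        return _read_column_from_csv(fh, "uniprot_accession")
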
def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
    """Write ComplexPortal information to a CSV file.

    Args:
        complexes: List of ComplexPortalEntry objects.
        output_csv: TextIOWrapper to write the CSV data to.
    """
    writer = csv.writer(output_csv)
    writer.writerow(
        [
            "query_protein",
            "complex_id",
            "complex_url",
            "complex_title",
            "members",
        ]
    )
    for entry in complexes:
        members_str = ";".join(sorted(entry.members))
        writer.writerow(
            [
                entry.query_protein,
                entry.complex_id,
                entry.complex_url,
                entry.complex_title,
                members_str,
            ]
        )

def _write_uniprot_details_csv(
    output_csv: TextIOWrapper,
    uniprot_details_list: Iterable[UniprotDetails],
) -> None:
    # Materialize first: a truthiness test on a lazy iterable would always pass,
    # and the intended ValueError would surface as an IndexError on rows[0] below.
    details = list(uniprot_details_list)
    if not details:
        msg = "No UniProt entries found for given accessions"
        raise ValueError(msg)
    # As all props of UniprotDetails are scalar, we can directly unstructure to dicts
    rows = converter.unstructure(details)
    fieldnames = rows[0].keys()
    writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

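# Sketch of the unstructure-to-rows idea used above. cattrs turns each record into
# a flat dict, so the CSV header can be derived from the first row:
#
#   rows = converter.unstructure(records)  # -> list of dicts with scalar values
#   writer = csv.DictWriter(fh, fieldnames=rows[0].keys())
#   writer.writeheader()
#   writer.writerows(rows)
#
# This relies on every field being scalar; a nested attrs field would unstructure
# to a nested dict, which csv.DictWriter cannot serialize.
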
HANDLERS: dict[tuple[str, str | None], Callable] = {
    ("search", "uniprot"): _handle_search_uniprot,
    ("search", "pdbe"): _handle_search_pdbe,
    ("search", "alphafold"): _handle_search_alphafold,
    ("search", "emdb"): _handle_search_emdb,
    ("search", "go"): _handle_search_go,
    ("search", "taxonomy"): _handle_search_taxonomy,
    ("search", "interaction-partners"): _handle_search_interaction_partners,
    ("search", "complexes"): _handle_search_complexes,
    ("search", "uniprot-details"): _handle_search_uniprot_details,
    ("retrieve", "pdbe"): _handle_retrieve_pdbe,
    ("retrieve", "alphafold"): _handle_retrieve_alphafold,
    ("retrieve", "emdb"): _handle_retrieve_emdb,
    ("filter", "confidence"): _handle_filter_confidence,
    ("filter", "chain"): _handle_filter_chain,
    ("filter", "residue"): _handle_filter_residue,
    ("filter", "secondary-structure"): _handle_filter_ss,
    ("mcp", None): _handle_mcp,
    ("convert", "structures"): _handle_convert_structures,
    ("convert", "uniprot"): _handle_convert_uniprot,
}


def main(argv: Sequence[str] | None = None):
    """Main entry point for the CLI.

    Args:
        argv: List of command line arguments. If None, uses sys.argv.
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False, console=console)])

    # Dispatch table to reduce complexity
    cmd = args.command
    sub = getattr(args, f"{cmd}_cmd", None)
    handler = HANDLERS.get((cmd, sub))
    if handler is None:
        msg = f"Unknown command: {cmd} {sub}"
        raise SystemExit(msg)
    handler(args)
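# The (command, subcommand) dispatch keeps main() flat: adding a subcommand is one
# parser definition plus one HANDLERS entry. Resolving a handler by hand, with the
# keys as defined above:
#
#   HANDLERS[("filter", "chain")]  # -> _handle_filter_chain
#   HANDLERS[("mcp", None)]        # commands without a subcommand use None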