protein-quest 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
protein_quest/cli.py ADDED
@@ -0,0 +1,1428 @@
1
+ """Module for cli parsers and handlers."""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import csv
6
+ import logging
7
+ import os
8
+ import sys
9
+ from collections.abc import Callable, Generator, Iterable, Sequence
10
+ from contextlib import suppress
11
+ from importlib.util import find_spec
12
+ from io import BytesIO, TextIOWrapper
13
+ from pathlib import Path
14
+ from textwrap import dedent
15
+
16
+ import shtab
17
+ from cattrs import structure
18
+ from rich.console import Console
19
+ from rich.logging import RichHandler
20
+ from rich.markdown import Markdown
21
+ from rich.panel import Panel
22
+ from rich_argparse import ArgumentDefaultsRichHelpFormatter
23
+ from tqdm.rich import tqdm
24
+
25
+ from protein_quest.__version__ import __version__
26
+ from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
27
+ from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
28
+ from protein_quest.alphafold.fetch import fetch_many as af_fetch
29
+ from protein_quest.converter import PositiveInt, converter
30
+ from protein_quest.emdb import fetch as emdb_fetch
31
+ from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
32
+ from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
33
+ from protein_quest.io import (
34
+ convert_to_cif_files,
35
+ glob_structure_files,
36
+ locate_structure_file,
37
+ read_structure,
38
+ valid_structure_file_extensions,
39
+ )
40
+ from protein_quest.pdbe import fetch as pdbe_fetch
41
+ from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
42
+ from protein_quest.structure import structure2uniprot_accessions
43
+ from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
44
+ from protein_quest.uniprot import (
45
+ ComplexPortalEntry,
46
+ PdbResults,
47
+ Query,
48
+ UniprotDetails,
49
+ filter_pdb_results_on_chain_length,
50
+ map_uniprot_accessions2uniprot_details,
51
+ search4af,
52
+ search4emdb,
53
+ search4interaction_partners,
54
+ search4macromolecular_complexes,
55
+ search4pdb,
56
+ search4uniprot,
57
+ )
58
+ from protein_quest.utils import (
59
+ Cacher,
60
+ CopyMethod,
61
+ DirectoryCacher,
62
+ PassthroughCacher,
63
+ copy_methods,
64
+ copyfile,
65
+ user_cache_root_dir,
66
+ )
67
+
68
+ console = Console(stderr=True)
69
+ rprint = console.print
70
+ logger = logging.getLogger(__name__)
71
+
72
+
73
+ def _add_search_uniprot_parser(subparsers: argparse._SubParsersAction):
74
+ """Add search uniprot subcommand parser."""
75
+ parser = subparsers.add_parser(
76
+ "uniprot",
77
+ help="Search UniProt accessions",
78
+ description="Search for UniProt accessions based on various criteria in the Uniprot SPARQL endpoint.",
79
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
80
+ )
81
+ parser.add_argument(
82
+ "output",
83
+ type=argparse.FileType("w", encoding="UTF-8"),
84
+ help="Output text file for UniProt accessions (one per line). Use `-` for stdout.",
85
+ ).complete = shtab.FILE
86
+ parser.add_argument("--taxon-id", type=str, help="NCBI Taxon ID, e.g. 9606 for Homo Sapiens")
87
+ parser.add_argument(
88
+ "--reviewed",
89
+ action=argparse.BooleanOptionalAction,
90
+ help="Reviewed=swissprot, no-reviewed=trembl. Default is uniprot=swissprot+trembl.",
91
+ default=None,
92
+ )
93
+ parser.add_argument(
94
+ "--subcellular-location-uniprot",
95
+ type=str,
96
+ help="Subcellular location label as used by UniProt (e.g. nucleus)",
97
+ )
98
+ parser.add_argument(
99
+ "--subcellular-location-go",
100
+ dest="subcellular_location_go",
101
+ action="append",
102
+ help="GO term(s) for subcellular location (e.g. GO:0005634). Can be given multiple times.",
103
+ )
104
+ parser.add_argument(
105
+ "--molecular-function-go",
106
+ dest="molecular_function_go",
107
+ action="append",
108
+ help="GO term(s) for molecular function (e.g. GO:0003677). Can be given multiple times.",
109
+ )
110
+ parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
111
+ parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
112
+ parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of uniprot accessions to return")
113
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
114
+
115
+
116
+ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
117
+ """Add search pdbe subcommand parser."""
118
+ parser = subparsers.add_parser(
119
+ "pdbe",
120
+ help="Search PDBe structures of given UniProt accessions",
121
+ description="Search for PDB structures of given UniProt accessions in the Uniprot SPARQL endpoint.",
122
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
123
+ )
124
+ parser.add_argument(
125
+ "uniprot_accessions",
126
+ type=argparse.FileType("r", encoding="UTF-8"),
127
+ help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
128
+ ).complete = shtab.FILE
129
+ parser.add_argument(
130
+ "output_csv",
131
+ type=argparse.FileType("w", encoding="UTF-8"),
132
+ help=dedent("""\
133
+ Output CSV with following columns:
134
+ `uniprot_accession`, `pdb_id`, `method`, `resolution`, `uniprot_chains`, `chain`, `chain_length`.
135
+ Here `uniprot_chains` is the raw UniProt chain string, for example `A=1-100`,
136
+ `chain` is the first chain from `uniprot_chains`, for example `A`,
137
+ and `chain_length` is the length of that chain, for example `100`.
138
+ Use `-` for stdout.
139
+ """),
140
+ ).complete = shtab.FILE
141
+ parser.add_argument(
142
+ "--limit", type=int, default=10_000, help="Maximum number of PDB uniprot accessions combinations to return"
143
+ )
144
+ parser.add_argument(
145
+ "--min-residues",
146
+ type=int,
147
+ help="Minimum number of residues required in the chain mapped to the UniProt accession.",
148
+ )
149
+ parser.add_argument(
150
+ "--max-residues",
151
+ type=int,
152
+ help="Maximum number of residues allowed in chain mapped to the UniProt accession.",
153
+ )
154
+ parser.add_argument(
155
+ "--keep-invalid",
156
+ action="store_true",
157
+ help=dedent("""\
158
+ Keep PDB results when chain length could not be determined.
159
+ If not given, such results are dropped.
160
+ Only applies if min/max residues arguments are set.
161
+ """),
162
+ )
163
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
164
+
165
+
166
+ def _add_search_alphafold_parser(subparsers: argparse._SubParsersAction):
167
+ """Add search alphafold subcommand parser."""
168
+ parser = subparsers.add_parser(
169
+ "alphafold",
170
+ help="Search AlphaFold structures of given UniProt accessions",
171
+ description="Search for AlphaFold structures of given UniProt accessions in the Uniprot SPARQL endpoint.",
172
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
173
+ )
174
+ parser.add_argument(
175
+ "uniprot_accessions",
176
+ type=argparse.FileType("r", encoding="UTF-8"),
177
+ help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
178
+ ).complete = shtab.FILE
179
+ parser.add_argument(
180
+ "output_csv",
181
+ type=argparse.FileType("w", encoding="UTF-8"),
182
+ help="Output CSV with AlphaFold IDs per UniProt accession. Use `-` for stdout.",
183
+ ).complete = shtab.FILE
184
+ parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
185
+ parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
186
+ parser.add_argument(
187
+ "--limit", type=int, default=10_000, help="Maximum number of Alphafold entry identifiers to return"
188
+ )
189
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
190
+
191
+
192
+ def _add_search_emdb_parser(subparsers: argparse._SubParsersAction):
193
+ """Add search emdb subcommand parser."""
194
+ parser = subparsers.add_parser(
195
+ "emdb",
196
+ help="Search Electron Microscopy Data Bank (EMDB) identifiers of given UniProt accessions",
197
+ description=dedent("""\
198
+ Search for Electron Microscopy Data Bank (EMDB) identifiers of given UniProt accessions
199
+ in the UniProt SPARQL endpoint.
200
+ """),
201
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
202
+ )
203
+ parser.add_argument(
204
+ "uniprot_accs",
205
+ type=argparse.FileType("r", encoding="UTF-8"),
206
+ help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
207
+ ).complete = shtab.FILE
208
+ parser.add_argument(
209
+ "output_csv",
210
+ type=argparse.FileType("w", encoding="UTF-8"),
211
+ help="Output CSV with EMDB IDs per UniProt accession. Use `-` for stdout.",
212
+ ).complete = shtab.FILE
213
+ parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of EMDB entry identifiers to return")
214
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
215
+
216
+
217
+ def _add_search_go_parser(subparsers: argparse._SubParsersAction):
218
+ """Add search go subcommand parser"""
219
+ parser = subparsers.add_parser(
220
+ "go",
221
+ help="Search for Gene Ontology (GO) terms",
222
+ description="Search for Gene Ontology (GO) terms in the EBI QuickGO API.",
223
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
224
+ )
225
+ parser.add_argument(
226
+ "term",
227
+ type=str,
228
+ help="GO term to search for. For example `apoptosome`.",
229
+ )
230
+ parser.add_argument("--aspect", type=str, choices=allowed_aspects, help="Filter on aspect.")
231
+ parser.add_argument(
232
+ "output_csv",
233
+ type=argparse.FileType("w", encoding="UTF-8"),
234
+ help="Output CSV with GO term results. Use `-` for stdout.",
235
+ ).complete = shtab.FILE
236
+ parser.add_argument("--limit", type=int, default=100, help="Maximum number of GO term results to return")
237
+
238
+
239
+ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
240
+ """Add search taxonomy subcommand parser."""
241
+ parser = subparser.add_parser(
242
+ "taxonomy",
243
+ help="Search for taxon information in UniProt",
244
+ description=dedent("""\
245
+ Search for taxon information in UniProt.
246
+ Uses https://www.uniprot.org/taxonomy?query=*.
247
+ """),
248
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
249
+ )
250
+ parser.add_argument(
251
+ "query", type=str, help="Search query for the taxon. Surround multiple words with quotes (' or \")."
252
+ )
253
+ parser.add_argument(
254
+ "output_csv",
255
+ type=argparse.FileType("w", encoding="UTF-8"),
256
+ help="Output CSV with taxonomy results. Use `-` for stdout.",
257
+ ).complete = shtab.FILE
258
+ parser.add_argument(
259
+ "--field",
260
+ type=str,
261
+ choices=search_fields,
262
+ help=dedent("""\
263
+ Field to search in. If not given then searches all fields.
264
+ If "tax_id" then searches by taxon ID.
265
+ If "parent" then given a parent taxon ID returns all its children.
266
+ For example, if the parent taxon ID is 9606 (Human), it will return Neanderthal and Denisovan.
267
+ """),
268
+ )
269
+ parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")
270
+
271
+
272
+ def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
273
+ """Add search interaction partners subcommand parser."""
274
+ parser = subparsers.add_parser(
275
+ "interaction-partners",
276
+ help="Search for interaction partners of given UniProt accession",
277
+ description=dedent("""\
278
+ Search for interaction partners of given UniProt accession
279
+ in the UniProt SPARQL endpoint and Complex Portal.
280
+ """),
281
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
282
+ )
283
+ parser.add_argument(
284
+ "uniprot_accession",
285
+ type=str,
286
+ help="UniProt accession (for example P12345).",
287
+ )
288
+ parser.add_argument(
289
+ "--exclude",
290
+ type=str,
291
+ action="append",
292
+ help="UniProt accessions to exclude from the results. For example already known interaction partners.",
293
+ )
294
+ parser.add_argument(
295
+ "output_csv",
296
+ type=argparse.FileType("w", encoding="UTF-8"),
297
+ help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
298
+ ).complete = shtab.FILE
299
+ parser.add_argument(
300
+ "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
301
+ )
302
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
303
+
304
+
305
+ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
306
+ """Add search complexes subcommand parser."""
307
+ description = dedent("""\
308
+ Search for complexes in the Complex Portal.
309
+ https://www.ebi.ac.uk/complexportal/
310
+
311
+ The output CSV file has the following columns:
312
+
313
+ - query_protein: UniProt accession used as query
314
+ - complex_id: Complex Portal identifier
315
+ - complex_url: URL to the Complex Portal entry
316
+ - complex_title: Title of the complex
317
+ - members: Semicolon-separated list of UniProt accessions of complex members
318
+ """)
319
+ parser = subparsers.add_parser(
320
+ "complexes",
321
+ help="Search for complexes in the Complex Portal",
322
+ description=Markdown(description, style="argparse.text"), # type: ignore using rich formatter makes this OK
323
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
324
+ )
325
+ parser.add_argument(
326
+ "uniprot_accessions",
327
+ type=argparse.FileType("r", encoding="UTF-8"),
328
+ help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
329
+ ).complete = shtab.FILE
330
+ parser.add_argument(
331
+ "output_csv",
332
+ type=argparse.FileType("w", encoding="UTF-8"),
333
+ help="Output CSV file with complex results. Use `-` for stdout.",
334
+ ).complete = shtab.FILE
335
+ parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
336
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
337
+
338
+
339
+ def _add_search_uniprot_details_parser(subparsers: argparse._SubParsersAction):
340
+ """Add search uniprot details subcommand parser."""
341
+ description = dedent("""\
342
+ Retrieve UniProt details for given UniProt accessions
343
+ from the UniProt SPARQL endpoint.
344
+
345
+ The output CSV file has the following columns:
346
+
347
+ - uniprot_accession: UniProt accession.
348
+ - uniprot_id: UniProt ID (mnemonic).
349
+ - sequence_length: Length of the canonical sequence.
350
+ - reviewed: Whether the entry is reviewed (Swiss-Prot) or unreviewed (TrEMBL).
351
+ - protein_name: Recommended protein name.
352
+ - taxon_id: NCBI Taxonomy ID of the organism.
353
+ - taxon_name: Scientific name of the organism.
354
+
355
+ The order of the output CSV can be different from the input order.
356
+ """)
357
+ parser = subparsers.add_parser(
358
+ "uniprot-details",
359
+ help="Retrieve UniProt details for given UniProt accessions",
360
+ description=Markdown(description, style="argparse.text"), # type: ignore using rich formatter makes this OK
361
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
362
+ )
363
+ parser.add_argument(
364
+ "uniprot_accessions",
365
+ type=argparse.FileType("r", encoding="UTF-8"),
366
+ help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
367
+ ).complete = shtab.FILE
368
+ parser.add_argument(
369
+ "output_csv",
370
+ type=argparse.FileType("w", encoding="UTF-8"),
371
+ help="Output CSV with UniProt details. Use `-` for stdout.",
372
+ ).complete = shtab.FILE
373
+ parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
374
+ parser.add_argument("--batch-size", type=int, default=1_000, help="Number of accessions to query per batch")
375
+
376
+
377
+ def _add_copy_method_arguments(parser):
378
+ parser.add_argument(
379
+ "--copy-method",
380
+ type=str,
381
+ choices=copy_methods,
382
+ default="hardlink",
383
+ help=dedent("""\
384
+ How to make target file be same file as source file.
385
+ By default uses hardlinks to save disk space.
386
+ Note that hardlinks only work within the same filesystem and are harder to track.
387
+ If you want to track cached files easily then use 'symlink'.
388
+ On Windows you need developer mode or admin privileges to create symlinks.
389
+ """),
390
+ )
391
+
392
+
393
+ def _add_cacher_arguments(parser: argparse.ArgumentParser):
394
+ """Add cacher arguments to parser."""
395
+ parser.add_argument(
396
+ "--no-cache",
397
+ action="store_true",
398
+ help="Disable caching of files to central location.",
399
+ )
400
+ cache_dir_action = parser.add_argument(
401
+ "--cache-dir",
402
+ type=Path,
403
+ default=user_cache_root_dir(),
404
+ help="Directory to use as cache for files.",
405
+ )
406
+ cache_dir_action.complete = shtab.DIRECTORY # type: ignore[missing-attribute]
407
+ _add_copy_method_arguments(parser)
408
+
409
+
410
+ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
411
+ """Add retrieve pdbe subcommand parser."""
412
+ parser = subparsers.add_parser(
413
+ "pdbe",
414
+ help="Retrieve PDBe gzipped mmCIF files for PDB IDs in CSV.",
415
+ description=dedent("""\
416
+ Retrieve mmCIF files from Protein Data Bank in Europe Knowledge Base (PDBe) website
417
+ for unique PDB IDs listed in a CSV file.
418
+ """),
419
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
420
+ )
421
+ parser.add_argument(
422
+ "pdbe_csv",
423
+ type=argparse.FileType("r", encoding="UTF-8"),
424
+ help="CSV file with `pdb_id` column. Other columns are ignored. Use `-` for stdin.",
425
+ ).complete = shtab.FILE
426
+ parser.add_argument(
427
+ "output_dir", type=Path, help="Directory to store downloaded PDBe mmCIF files"
428
+ ).complete = shtab.DIRECTORY
429
+ parser.add_argument(
430
+ "--max-parallel-downloads",
431
+ type=int,
432
+ default=5,
433
+ help="Maximum number of parallel downloads",
434
+ )
435
+ _add_cacher_arguments(parser)
436
+
437
+
438
+ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
439
+ """Add retrieve alphafold subcommand parser."""
440
+ parser = subparsers.add_parser(
441
+ "alphafold",
442
+ help="Retrieve AlphaFold files for IDs in CSV",
443
+ description="Retrieve AlphaFold files from the AlphaFold Protein Structure Database.",
444
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
445
+ )
446
+ parser.add_argument(
447
+ "alphafold_csv",
448
+ type=argparse.FileType("r", encoding="UTF-8"),
449
+ help="CSV file with `af_id` column. Other columns are ignored. Use `-` for stdin.",
450
+ ).complete = shtab.FILE
451
+ parser.add_argument(
452
+ "output_dir", type=Path, help="Directory to store downloaded AlphaFold files"
453
+ ).complete = shtab.DIRECTORY
454
+ parser.add_argument(
455
+ "--format",
456
+ type=str,
457
+ action="append",
458
+ choices=sorted(downloadable_formats),
459
+ help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
460
+ Default is 'cif'."""),
461
+ )
462
+ parser.add_argument(
463
+ "--db-version",
464
+ type=str,
465
+ help="AlphaFold database version to use. If not given, the latest version is used. For example '6'.",
466
+ )
467
+ parser.add_argument(
468
+ "--gzip-files",
469
+ action="store_true",
470
+ help="Whether to gzip the downloaded files. Excludes summary files, they are always uncompressed.",
471
+ )
472
+ parser.add_argument(
473
+ "--all-isoforms",
474
+ action="store_true",
475
+ help=(
476
+ "Whether to return all isoforms of each uniprot entry. "
477
+ "If not given then only the Alphafold entry for the canonical sequence is returned."
478
+ ),
479
+ )
480
+ parser.add_argument(
481
+ "--max-parallel-downloads",
482
+ type=int,
483
+ default=5,
484
+ help="Maximum number of parallel downloads",
485
+ )
486
+ _add_cacher_arguments(parser)
487
+
488
+
489
+ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
490
+ """Add retrieve emdb subcommand parser."""
491
+ parser = subparsers.add_parser(
492
+ "emdb",
493
+ help="Retrieve Electron Microscopy Data Bank (EMDB) gzipped 3D volume files for EMDB IDs in CSV.",
494
+ description=dedent("""\
495
+ Retrieve volume files from Electron Microscopy Data Bank (EMDB) website
496
+ for unique EMDB IDs listed in a CSV file.
497
+ """),
498
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
499
+ )
500
+ parser.add_argument(
501
+ "emdb_csv",
502
+ type=argparse.FileType("r", encoding="UTF-8"),
503
+ help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
504
+ ).complete = shtab.FILE
505
+ parser.add_argument(
506
+ "output_dir", type=Path, help="Directory to store downloaded EMDB volume files"
507
+ ).complete = shtab.DIRECTORY
508
+ _add_cacher_arguments(parser)
509
+
510
+
511
+ def _add_scheduler_address_argument(parser):
512
+ parser.add_argument(
513
+ "--scheduler-address",
514
+ help=dedent("""Address of the Dask scheduler to connect to.
515
+ If not provided, will create a local cluster.
516
+ If set to `sequential` will run tasks sequentially."""),
517
+ )
518
+
519
+
520
+ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
521
+ """Add filter confidence subcommand parser."""
522
+ parser = subparsers.add_parser(
523
+ "confidence",
524
+ help="Filter AlphaFold mmcif/PDB files by confidence",
525
+ description=dedent("""\
526
+ Filter AlphaFold mmcif/PDB files by confidence (pLDDT).
527
+ Files that pass are written with residues below the threshold removed."""),
528
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
529
+ )
530
+ parser.add_argument(
531
+ "input_dir", type=Path, help="Directory with AlphaFold mmcif/PDB files"
532
+ ).complete = shtab.DIRECTORY
533
+ parser.add_argument(
534
+ "output_dir", type=Path, help="Directory to write filtered mmcif/PDB files"
535
+ ).complete = shtab.DIRECTORY
536
+ parser.add_argument("--confidence-threshold", type=float, default=70, help="pLDDT confidence threshold (0-100)")
537
+ parser.add_argument(
538
+ "--min-residues", type=int, default=0, help="Minimum number of high-confidence residues a structure should have"
539
+ )
540
+ parser.add_argument(
541
+ "--max-residues",
542
+ type=int,
543
+ default=10_000_000,
544
+ help="Maximum number of high-confidence residues a structure should have",
545
+ )
546
+ parser.add_argument(
547
+ "--write-stats",
548
+ type=argparse.FileType("w", encoding="UTF-8"),
549
+ help=dedent("""\
550
+ Write filter statistics to file.
551
+ In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
552
+ Use `-` for stdout."""),
553
+ ).complete = shtab.FILE
554
+ _add_scheduler_address_argument(parser)
555
+ _add_copy_method_arguments(parser)
556
+
557
+
558
+ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
559
+ """Add filter chain subcommand parser."""
560
+ parser = subparsers.add_parser(
561
+ "chain",
562
+ help="Filter on chain.",
563
+ description=dedent("""\
564
+ For each input PDB/mmCIF and chain combination
565
+ write a PDB/mmCIF file with just the given chain
566
+ and rename that chain to `A`.
567
+ Filtering is done in parallel using a Dask cluster."""),
568
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
569
+ )
570
+ parser.add_argument(
571
+ "chains",
572
+ type=argparse.FileType("r", encoding="UTF-8"),
573
+ help="CSV file with `pdb_id` and `chain` columns. Other columns are ignored.",
574
+ ).complete = shtab.FILE
575
+ parser.add_argument(
576
+ "input_dir",
577
+ type=Path,
578
+ help=dedent("""\
579
+ Directory with PDB/mmCIF files.
580
+ Expected filenames are `{pdb_id}.cif.gz`, `{pdb_id}.cif`, `{pdb_id}.pdb.gz` or `{pdb_id}.pdb`.
581
+ """),
582
+ ).complete = shtab.DIRECTORY
583
+ parser.add_argument(
584
+ "output_dir",
585
+ type=Path,
586
+ help=dedent("""\
587
+ Directory to write the single-chain PDB/mmCIF files. Output files are in the same format as the input files."""),
588
+ ).complete = shtab.DIRECTORY
589
+ _add_scheduler_address_argument(parser)
590
+ _add_copy_method_arguments(parser)
591
+
592
+
593
+ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
594
+ """Add filter residue subcommand parser."""
595
+ parser = subparsers.add_parser(
596
+ "residue",
597
+ help="Filter PDB/mmCIF files by number of residues in chain A",
598
+ description=dedent("""\
599
+ Filter PDB/mmCIF files by number of residues in chain A.
600
+ """),
601
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
602
+ )
603
+ parser.add_argument(
604
+ "input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
605
+ ).complete = shtab.DIRECTORY
606
+ parser.add_argument(
607
+ "output_dir",
608
+ type=Path,
609
+ help=dedent("""\
610
+ Directory to write filtered PDB/mmCIF files. Files are copied without modification.
611
+ """),
612
+ ).complete = shtab.DIRECTORY
613
+ parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
614
+ parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
615
+ parser.add_argument(
616
+ "--write-stats",
617
+ type=argparse.FileType("w", encoding="UTF-8"),
618
+ help=dedent("""\
619
+ Write filter statistics to file.
620
+ In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
621
+ Use `-` for stdout."""),
622
+ ).complete = shtab.FILE
623
+ _add_copy_method_arguments(parser)
624
+
625
+
626
+ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
627
+ """Add filter secondary structure subcommand parser."""
628
+ parser = subparsers.add_parser(
629
+ "secondary-structure",
630
+ help="Filter PDB/mmCIF files by secondary structure",
631
+ description="Filter PDB/mmCIF files by secondary structure",
632
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
633
+ )
634
+ parser.add_argument(
635
+ "input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
636
+ ).complete = shtab.DIRECTORY
637
+ parser.add_argument(
638
+ "output_dir",
639
+ type=Path,
640
+ help=dedent("""\
641
+ Directory to write filtered PDB/mmCIF files. Files are copied without modification.
642
+ """),
643
+ ).complete = shtab.DIRECTORY
644
+ parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
645
+ parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
646
+ parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
647
+ parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
648
+ parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
649
+ parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
650
+ parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
651
+ parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
652
+ parser.add_argument(
653
+ "--write-stats",
654
+ type=argparse.FileType("w", encoding="UTF-8"),
655
+ help=dedent("""
656
+ Write filter statistics to file. In CSV format with columns:
657
+ `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
658
+ <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
659
+ Use `-` for stdout.
660
+ """),
661
+ ).complete = shtab.FILE
662
+ _add_copy_method_arguments(parser)
663
+
664
+
665
+ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
666
+ """Add search command and its subcommands."""
667
+ parser = subparsers.add_parser(
668
+ "search",
669
+ help="Search data sources",
670
+ description="Search various things online.",
671
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
672
+ )
673
+ subsubparsers = parser.add_subparsers(dest="search_cmd", required=True)
674
+
675
+ _add_search_uniprot_parser(subsubparsers)
676
+ _add_search_pdbe_parser(subsubparsers)
677
+ _add_search_alphafold_parser(subsubparsers)
678
+ _add_search_emdb_parser(subsubparsers)
679
+ _add_search_go_parser(subsubparsers)
680
+ _add_search_taxonomy_parser(subsubparsers)
681
+ _add_search_interaction_partners_parser(subsubparsers)
682
+ _add_search_complexes_parser(subsubparsers)
683
+ _add_search_uniprot_details_parser(subsubparsers)
684
+
685
+
686
+ def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
687
+ """Add retrieve command and its subcommands."""
688
+ parser = subparsers.add_parser(
689
+ "retrieve",
690
+ help="Retrieve structure files",
691
+ description="Retrieve structure files from online resources.",
692
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
693
+ )
694
+ subsubparsers = parser.add_subparsers(dest="retrieve_cmd", required=True)
695
+
696
+ _add_retrieve_pdbe_parser(subsubparsers)
697
+ _add_retrieve_alphafold_parser(subsubparsers)
698
+ _add_retrieve_emdb_parser(subsubparsers)
699
+
700
+
701
+ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
702
+ """Add filter command and its subcommands."""
703
+ parser = subparsers.add_parser("filter", help="Filter files", formatter_class=ArgumentDefaultsRichHelpFormatter)
704
+ subsubparsers = parser.add_subparsers(dest="filter_cmd", required=True)
705
+
706
+ _add_filter_confidence_parser(subsubparsers)
707
+ _add_filter_chain_parser(subsubparsers)
708
+ _add_filter_residue_parser(subsubparsers)
709
+ _add_filter_ss_parser(subsubparsers)
710
+
711
+
712
+ def _add_convert_uniprot_parser(subparsers: argparse._SubParsersAction):
713
+ """Add convert uniprot subcommand parser."""
714
+ parser = subparsers.add_parser(
715
+ "uniprot",
716
+ help="Convert structure files to list of UniProt accessions.",
717
+ description="Convert structure files to list of UniProt accessions. "
718
+ "Uniprot accessions are read from database reference of each structure.",
719
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
720
+ )
721
+ parser.add_argument(
722
+ "input_dir",
723
+ type=Path,
724
+ help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
725
+ ).complete = shtab.DIRECTORY
726
+ parser.add_argument(
727
+ "output",
728
+ type=argparse.FileType("wt", encoding="UTF-8"),
729
+ help="Output text file with UniProt accessions (one per line). Use '-' for stdout.",
730
+ ).complete = shtab.FILE
731
+ parser.add_argument(
732
+ "--grouped",
733
+ action="store_true",
734
+ help="Whether to group accessions by structure file. "
735
+ "If set output changes to `<structure_file1>,<acc1>\\n<structure_file1>,<acc2>` format.",
736
+ )
737
+
738
+
739
+ def _add_convert_structures_parser(subparsers: argparse._SubParsersAction):
740
+ """Add convert structures subcommand parser."""
741
+ parser = subparsers.add_parser(
742
+ "structures",
743
+ help="Convert structure files between formats",
744
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
745
+ )
746
+ parser.add_argument(
747
+ "input_dir",
748
+ type=Path,
749
+ help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
750
+ ).complete = shtab.DIRECTORY
751
+ parser.add_argument(
752
+ "--output-dir",
753
+ type=Path,
754
+ help=dedent("""\
755
+ Directory to write converted structure files. If not given, files are written to `input_dir`.
756
+ """),
757
+ ).complete = shtab.DIRECTORY
758
+ parser.add_argument(
759
+ "--format",
760
+ type=str,
761
+ choices=("cif",),
762
+ default="cif",
763
+ help="Output format to convert to.",
764
+ )
765
+ _add_copy_method_arguments(parser)
766
+
767
+
768
+ def _add_convert_subcommands(subparsers: argparse._SubParsersAction):
769
+ """Add convert command and its subcommands."""
770
+ parser = subparsers.add_parser(
771
+ "convert",
772
+ help="Convert files between formats",
773
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
774
+ )
775
+ subsubparsers = parser.add_subparsers(dest="convert_cmd", required=True)
776
+
777
+ _add_convert_structures_parser(subsubparsers)
778
+ _add_convert_uniprot_parser(subsubparsers)
779
+
780
+
781
+ def _add_mcp_command(subparsers: argparse._SubParsersAction):
782
+ """Add MCP command."""
783
+
784
+ parser = subparsers.add_parser(
785
+ "mcp",
786
+ help="Run Model Context Protocol (MCP) server",
787
+ description=(
788
+ "Run Model Context Protocol (MCP) server. "
789
+ "Can be used by agentic LLMs like Claude Sonnet 4 as a set of tools."
790
+ ),
791
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
792
+ )
793
+ parser.add_argument(
794
+ "--transport", default="stdio", choices=["stdio", "http", "streamable-http"], help="Transport protocol to use"
795
+ )
796
+ parser.add_argument("--host", default="127.0.0.1", help="Host to bind the server to")
797
+ parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
798
+
799
+
800
+ def make_parser() -> argparse.ArgumentParser:
801
+ parser = argparse.ArgumentParser(
802
+ description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
803
+ )
804
+ parser.add_argument("--log-level", default="WARNING", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
805
+ parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
806
+ shtab.add_argument_to(parser, ["--print-completion"])
807
+
808
+ subparsers = parser.add_subparsers(dest="command", required=True)
809
+
810
+ _add_search_subcommands(subparsers)
811
+ _add_retrieve_subcommands(subparsers)
812
+ _add_filter_subcommands(subparsers)
813
+ _add_convert_subcommands(subparsers)
814
+ _add_mcp_command(subparsers)
815
+
816
+ return parser
817
+
818
+
819
+ def _name_of(file: TextIOWrapper | BytesIO) -> str:
820
+ try:
821
+ return file.name
822
+ except AttributeError:
823
+ # In pytest a BytesIO is used as stdout, which has no 'name' attribute
824
+ return "<stdout>"
825
+
826
+
827
+ def _handle_search_uniprot(args):
828
+ taxon_id = args.taxon_id
829
+ reviewed = args.reviewed
830
+ subcellular_location_uniprot = args.subcellular_location_uniprot
831
+ subcellular_location_go = args.subcellular_location_go
832
+ molecular_function_go = args.molecular_function_go
833
+ min_sequence_length = args.min_sequence_length
834
+ max_sequence_length = args.max_sequence_length
835
+ limit = args.limit
836
+ timeout = args.timeout
837
+ output_file = args.output
838
+
839
+ query = structure(
840
+ {
841
+ "taxon_id": taxon_id,
842
+ "reviewed": reviewed,
843
+ "subcellular_location_uniprot": subcellular_location_uniprot,
844
+ "subcellular_location_go": subcellular_location_go,
845
+ "molecular_function_go": molecular_function_go,
846
+ "min_sequence_length": min_sequence_length,
847
+ "max_sequence_length": max_sequence_length,
848
+ },
849
+ Query,
850
+ )
851
+ rprint("Searching for UniProt accessions")
852
+ accs = search4uniprot(query=query, limit=limit, timeout=timeout)
853
+ rprint(f"Found {len(accs)} UniProt accessions, written to {_name_of(output_file)}")
854
+ _write_lines(output_file, sorted(accs))
855
+
856
+
857
+ def _handle_search_pdbe(args):
858
+ uniprot_accessions = args.uniprot_accessions
859
+ limit = args.limit
860
+ timeout = args.timeout
861
+ output_csv = args.output_csv
862
+ min_residues = converter.structure(args.min_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
863
+ max_residues = converter.structure(args.max_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
864
+ keep_invalid = args.keep_invalid
865
+
866
+ accs = set(_read_lines(uniprot_accessions))
867
+ rprint(f"Finding PDB entries for {len(accs)} uniprot accessions")
868
+ results = search4pdb(accs, limit=limit, timeout=timeout)
869
+
870
+ raw_nr_results = len(results)
871
+ raw_total_pdbs = sum([len(v) for v in results.values()])
872
+ if min_residues or max_residues:
873
+ results = filter_pdb_results_on_chain_length(results, min_residues, max_residues, keep_invalid=keep_invalid)
874
+ total_pdbs = sum([len(v) for v in results.values()])
875
+ rprint(f"Before filtering found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions.")
876
+ rprint(
877
+ f"After filtering on chain length ({min_residues}, {max_residues}) "
878
+ f"remained {total_pdbs} PDB entries for {len(results)} uniprot accessions."
879
+ )
880
+ else:
881
+ rprint(f"Found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions")
882
+
883
+ _write_pdbe_csv(output_csv, results)
884
+ rprint(f"Written to {_name_of(output_csv)}")
885
+
886
+
887
+ def _handle_search_alphafold(args):
888
+ uniprot_accessions = args.uniprot_accessions
889
+ min_sequence_length = converter.structure(args.min_sequence_length, PositiveInt | None) # pyright: ignore[reportArgumentType]
890
+ max_sequence_length = converter.structure(args.max_sequence_length, PositiveInt | None) # pyright: ignore[reportArgumentType]
891
+ limit = args.limit
892
+ timeout = args.timeout
893
+ output_csv = args.output_csv
894
+
895
+ accs = _read_lines(uniprot_accessions)
896
+ rprint(f"Finding AlphaFold entries for {len(accs)} uniprot accessions")
897
+ results = search4af(
898
+ accs,
899
+ min_sequence_length=min_sequence_length,
900
+ max_sequence_length=max_sequence_length,
901
+ limit=limit,
902
+ timeout=timeout,
903
+ )
904
+ rprint(f"Found {len(results)} AlphaFold entries, written to {_name_of(output_csv)}")
905
+ _write_dict_of_sets2csv(output_csv, results, "af_id")
906
+
907
+
908
+ def _handle_search_emdb(args):
909
+ uniprot_accessions = args.uniprot_accessions
910
+ limit = args.limit
911
+ timeout = args.timeout
912
+ output_csv = args.output_csv
913
+
914
+ accs = _read_lines(uniprot_accessions)
915
+ rprint(f"Finding EMDB entries for {len(accs)} uniprot accessions")
916
+ results = search4emdb(accs, limit=limit, timeout=timeout)
917
+ total_emdbs = sum([len(v) for v in results.values()])
918
+ rprint(f"Found {total_emdbs} EMDB entries, written to {_name_of(output_csv)}")
919
+ _write_dict_of_sets2csv(output_csv, results, "emdb_id")
920
+
921
+
922
+ def _handle_search_go(args):
923
+ term = structure(args.term, str)
924
+ aspect: Aspect | None = args.aspect
925
+ limit = structure(args.limit, int)
926
+ output_csv: TextIOWrapper = args.output_csv
927
+
928
+ if aspect:
929
+ rprint(f"Searching for GO terms matching '{term}' with aspect '{aspect}'")
930
+ else:
931
+ rprint(f"Searching for GO terms matching '{term}'")
932
+ results = asyncio.run(search_gene_ontology_term(term, aspect=aspect, limit=limit))
933
+ rprint(f"Found {len(results)} GO terms, written to {_name_of(output_csv)}")
934
+ write_go_terms_to_csv(results, output_csv)
935
+
936
+
937
+ def _handle_search_taxonomy(args):
938
+ query: str = args.query
939
+ field: SearchField | None = args.field
940
+ limit: int = args.limit
941
+ output_csv: TextIOWrapper = args.output_csv
942
+
943
+ if field:
944
+ rprint(f"Searching for taxon information matching '{query}' in field '{field}'")
945
+ else:
946
+ rprint(f"Searching for taxon information matching '{query}'")
947
+ results = asyncio.run(search_taxon(query=query, field=field, limit=limit))
948
+ rprint(f"Found {len(results)} taxons, written to {_name_of(output_csv)}")
949
+ _write_taxonomy_csv(results, output_csv)
950
+
951
+
952
+ def _handle_search_interaction_partners(args: argparse.Namespace):
953
+ uniprot_accession: str = args.uniprot_accession
954
+ excludes: set[str] = set(args.exclude) if args.exclude else set()
955
+ limit: int = args.limit
956
+ timeout: int = args.timeout
957
+ output_csv: TextIOWrapper = args.output_csv
958
+
959
+ rprint(f"Searching for interaction partners of '{uniprot_accession}'")
960
+ results = search4interaction_partners(uniprot_accession, excludes=excludes, limit=limit, timeout=timeout)
961
+ rprint(f"Found {len(results)} interaction partners, written to {_name_of(output_csv)}")
962
+ _write_lines(output_csv, results.keys())
963
+
964
+
965
+ def _handle_search_complexes(args: argparse.Namespace):
966
+ uniprot_accessions = args.uniprot_accessions
967
+ limit = args.limit
968
+ timeout = args.timeout
969
+ output_csv = args.output_csv
970
+
971
+ accs = _read_lines(uniprot_accessions)
972
+ rprint(f"Finding complexes for {len(accs)} uniprot accessions")
973
+ results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
974
+ rprint(f"Found {len(results)} complexes, written to {_name_of(output_csv)}")
975
+ _write_complexes_csv(results, output_csv)
976
+
977
+
978
+ def _handle_search_uniprot_details(args: argparse.Namespace):
979
+ uniprot_accessions = args.uniprot_accessions
980
+ timeout = args.timeout
981
+ batch_size = args.batch_size
982
+ output_csv: TextIOWrapper = args.output_csv
983
+
984
+ accs = _read_lines(uniprot_accessions)
985
+ rprint(f"Retrieving UniProt entry details for {len(accs)} uniprot accessions")
986
+ results = list(map_uniprot_accessions2uniprot_details(accs, timeout=timeout, batch_size=batch_size))
987
+ _write_uniprot_details_csv(output_csv, results)
988
+ rprint(f"Retrieved details for {len(results)} UniProt entries, written to {_name_of(output_csv)}")
989
+
990
+
991
+ def _initialize_cacher(args: argparse.Namespace) -> Cacher:
992
+ if args.no_cache:
993
+ return PassthroughCacher()
994
+ return DirectoryCacher(
995
+ cache_dir=args.cache_dir,
996
+ copy_method=args.copy_method,
997
+ )
998
+
999
+
1000
+ def _handle_retrieve_pdbe(args: argparse.Namespace):
1001
+ pdbe_csv = args.pdbe_csv
1002
+ output_dir = args.output_dir
1003
+ max_parallel_downloads = args.max_parallel_downloads
1004
+ cacher = _initialize_cacher(args)
1005
+
1006
+ pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
1007
+ rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
1008
+ result = asyncio.run(
1009
+ pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
1010
+ )
1011
+ rprint(f"Retrieved {len(result)} PDBe entries")
1012
+
1013
+
1014
+ def _handle_retrieve_alphafold(args):
1015
+ download_dir = args.output_dir
1016
+ raw_formats = args.format
1017
+ alphafold_csv = args.alphafold_csv
1018
+ max_parallel_downloads = args.max_parallel_downloads
1019
+ cacher = _initialize_cacher(args)
1020
+ gzip_files = args.gzip_files
1021
+ all_isoforms = args.all_isoforms
1022
+ db_version = args.db_version
1023
+
1024
+ if raw_formats is None:
1025
+ raw_formats = {"cif"}
1026
+
1027
+ # TODO besides `uniprot_accession,af_id\n` CSV also allow a headerless single-column format
1028
+ af_ids = _read_column_from_csv(alphafold_csv, "af_id")
1029
+ formats: set[DownloadableFormat] = structure(raw_formats, set[DownloadableFormat])
1030
+ rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {formats}")
1031
+ afs = af_fetch(
1032
+ af_ids,
1033
+ download_dir,
1034
+ formats=formats,
1035
+ db_version=db_version,
1036
+ max_parallel_downloads=max_parallel_downloads,
1037
+ cacher=cacher,
1038
+ gzip_files=gzip_files,
1039
+ all_isoforms=all_isoforms,
1040
+ )
1041
+ total_nr_files = sum(af.nr_of_files() for af in afs)
1042
+ rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
1043
+
1044
+
1045
+ def _handle_retrieve_emdb(args):
1046
+ emdb_csv = args.emdb_csv
1047
+ output_dir = args.output_dir
1048
+ cacher = _initialize_cacher(args)
1049
+
1050
+ emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
1051
+ rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
1052
+ result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
1053
+ rprint(f"Retrieved {len(result)} EMDB entries")
1054
+
1055
+
1056
+ def _handle_filter_confidence(args: argparse.Namespace):
1057
+ # we are repeating types here and in the add_argument call
1058
+ # TODO replace argparse with modern alternative like cyclopts
1059
+ # to get rid of duplication
1060
+ input_dir = structure(args.input_dir, Path)
1061
+ output_dir = structure(args.output_dir, Path)
1062
+
1063
+ confidence_threshold = args.confidence_threshold
1064
+ min_residues = args.min_residues
1065
+ max_residues = args.max_residues
1066
+ stats_file: TextIOWrapper | None = args.write_stats
1067
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
1068
+ scheduler_address = structure(args.scheduler_address, str | None) # pyright: ignore[reportArgumentType]
1069
+
1070
+ output_dir.mkdir(parents=True, exist_ok=True)
1071
+ input_files = sorted(glob_structure_files(input_dir))
1072
+ nr_input_files = len(input_files)
1073
+ rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
1074
+ query = converter.structure(
1075
+ {
1076
+ "confidence": confidence_threshold,
1077
+ "min_residues": min_residues,
1078
+ "max_residues": max_residues,
1079
+ },
1080
+ ConfidenceFilterQuery,
1081
+ )
1082
+ if stats_file:
1083
+ writer = csv.writer(stats_file)
1084
+ writer.writerow(["input_file", "residue_count", "passed", "output_file"])
1085
+
1086
+ passed_count = 0
1087
+ results = filter_files_on_confidence(
1088
+ input_files, query, output_dir, copy_method=copy_method, scheduler_address=scheduler_address
1089
+ )
1090
+ for r in results:
1091
+ if r.filtered_file:
1092
+ passed_count += 1
1093
+ if stats_file:
1094
+ writer.writerow([r.input_file, r.count, r.filtered_file is not None, r.filtered_file]) # pyright: ignore[reportPossiblyUnboundVariable]
1095
+ rprint(f"Filtered {passed_count} mmcif/PDB files by confidence, written to {output_dir} directory")
1096
+ if stats_file:
1097
+ rprint(f"Statistics written to {_name_of(stats_file)}")
1098
+
1099
+
1100
+ def _handle_filter_chain(args):
1101
+ input_dir = args.input_dir
1102
+ output_dir = structure(args.output_dir, Path)
1103
+ pdb_id2chain_mapping_file = args.chains
1104
+ scheduler_address = structure(args.scheduler_address, str | None) # pyright: ignore[reportArgumentType]
1105
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
1106
+
1107
+ # make sure the files in the input dir match the entries in the mapping file
1108
+ # and complain when files from the mapping file are missing on disk
1109
+ rows = list(_iter_csv_rows(pdb_id2chain_mapping_file))
1110
+ file2chain: set[tuple[Path, str]] = set()
1111
+ errors: list[FileNotFoundError] = []
1112
+
1113
+ for row in rows:
1114
+ pdb_id = row["pdb_id"]
1115
+ chain = row["chain"]
1116
+ try:
1117
+ f = locate_structure_file(input_dir, pdb_id)
1118
+ file2chain.add((f, chain))
1119
+ except FileNotFoundError as e:
1120
+ errors.append(e)
1121
+
1122
+ if errors:
1123
+ msg = f"Some structure files could not be found ({len(errors)} missing), skipping them"
1124
+ rprint(Panel(os.linesep.join(map(str, errors)), title=msg, style="red"))
1125
+
1126
+ if not file2chain:
1127
+ rprint("[red]No valid structure files found. Exiting.")
1128
+ sys.exit(1)
1129
+
1130
+ results = filter_files_on_chain(
1131
+ file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
1132
+ )
1133
+
1134
+ nr_written = len([r for r in results if r.passed])
1135
+
1136
+ rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")
1137
+
1138
+ for result in results:
1139
+ if result.discard_reason:
1140
+ rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
1141
+
1142
+
1143
+ def _handle_filter_residue(args):
1144
+ input_dir = structure(args.input_dir, Path)
1145
+ output_dir = structure(args.output_dir, Path)
1146
+ min_residues = structure(args.min_residues, int)
1147
+ max_residues = structure(args.max_residues, int)
1148
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
1149
+ stats_file: TextIOWrapper | None = args.write_stats
1150
+
1151
+ if stats_file:
1152
+ writer = csv.writer(stats_file)
1153
+ writer.writerow(["input_file", "residue_count", "passed", "output_file"])
1154
+
1155
+ nr_passed = 0
1156
+ input_files = sorted(glob_structure_files(input_dir))
1157
+ nr_total = len(input_files)
1158
+ rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
1159
+ for r in filter_files_on_residues(
1160
+ input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
1161
+ ):
1162
+ if stats_file:
1163
+ writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file]) # pyright: ignore[reportPossiblyUnboundVariable]
1164
+ if r.passed:
1165
+ nr_passed += 1
1166
+
1167
+ rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
1168
+ if stats_file:
1169
+ rprint(f"Statistics written to {_name_of(stats_file)}")
1170
+
1171
+
1172
+ def _handle_filter_ss(args):
1173
+ input_dir = structure(args.input_dir, Path)
1174
+ output_dir = structure(args.output_dir, Path)
1175
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
1176
+ stats_file: TextIOWrapper | None = args.write_stats
1177
+
1178
+ raw_query = {
1179
+ "abs_min_helix_residues": args.abs_min_helix_residues,
1180
+ "abs_max_helix_residues": args.abs_max_helix_residues,
1181
+ "abs_min_sheet_residues": args.abs_min_sheet_residues,
1182
+ "abs_max_sheet_residues": args.abs_max_sheet_residues,
1183
+ "ratio_min_helix_residues": args.ratio_min_helix_residues,
1184
+ "ratio_max_helix_residues": args.ratio_max_helix_residues,
1185
+ "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
1186
+ "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
1187
+ }
1188
+ query = converter.structure(raw_query, SecondaryStructureFilterQuery)
1189
+ input_files = sorted(glob_structure_files(input_dir))
1190
+ nr_total = len(input_files)
1191
+ output_dir.mkdir(parents=True, exist_ok=True)
1192
+
1193
+ if stats_file:
1194
+ writer = csv.writer(stats_file)
1195
+ writer.writerow(
1196
+ [
1197
+ "input_file",
1198
+ "nr_residues",
1199
+ "nr_helix_residues",
1200
+ "nr_sheet_residues",
1201
+ "helix_ratio",
1202
+ "sheet_ratio",
1203
+ "passed",
1204
+ "output_file",
1205
+ ]
1206
+ )
1207
+
1208
+ rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
1209
+ nr_passed = 0
1210
+ for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
1211
+ output_file: Path | None = None
1212
+ if result.passed:
1213
+ output_file = output_dir / input_file.name
1214
+ copyfile(input_file, output_file, copy_method)
1215
+ nr_passed += 1
1216
+ if stats_file:
1217
+ writer.writerow( # pyright: ignore[reportPossiblyUnboundVariable]
1218
+ [
1219
+ input_file,
1220
+ result.stats.nr_residues,
1221
+ result.stats.nr_helix_residues,
1222
+ result.stats.nr_sheet_residues,
1223
+ round(result.stats.helix_ratio, 3),
1224
+ round(result.stats.sheet_ratio, 3),
1225
+ result.passed,
1226
+ output_file,
1227
+ ]
1228
+ )
1229
+ rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
1230
+ if stats_file:
1231
+ rprint(f"Statistics written to {_name_of(stats_file)}")
1232
+
1233
+
1234
+ def _handle_mcp(args):
1235
+ if find_spec("fastmcp") is None:
1236
+ msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
1237
+ raise ImportError(msg)
1238
+
1239
+ from protein_quest.mcp_server import mcp # noqa: PLC0415
1240
+
1241
+ if args.transport == "stdio":
1242
+ mcp.run(transport=args.transport)
1243
+ else:
1244
+ mcp.run(transport=args.transport, host=args.host, port=args.port)
1245
+
1246
+
1247
+ def _handle_convert_uniprot(args):
1248
+ input_dir = structure(args.input_dir, Path)
1249
+ output_file: TextIOWrapper = args.output
1250
+ grouped: bool = args.grouped
1251
+ input_files = sorted(glob_structure_files(input_dir))
1252
+ if grouped:
1253
+ for input_file in tqdm(input_files, unit="file"):
1254
+ s = read_structure(input_file)
1255
+ uniprot_accessions = structure2uniprot_accessions(s)
1256
+ _write_lines(
1257
+ output_file, [f"{input_file},{uniprot_accession}" for uniprot_accession in sorted(uniprot_accessions)]
1258
+ )
1259
+ else:
1260
+ uniprot_accessions: set[str] = set()
1261
+ for input_file in tqdm(input_files, unit="file"):
1262
+ s = read_structure(input_file)
1263
+ uniprot_accessions.update(structure2uniprot_accessions(s))
1264
+ _write_lines(output_file, sorted(uniprot_accessions))
1265
+
1266
+
1267
+ def _handle_convert_structures(args):
1268
+ input_dir = structure(args.input_dir, Path)
1269
+ output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
1270
+ output_dir.mkdir(parents=True, exist_ok=True)
1271
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
1272
+
1273
+ input_files = sorted(glob_structure_files(input_dir))
1274
+ rprint(f"Converting {len(input_files)} files in {input_dir} directory to cif format.")
1275
+ for _ in tqdm(
1276
+ convert_to_cif_files(
1277
+ input_files,
1278
+ output_dir,
1279
+ copy_method=copy_method,
1280
+ ),
1281
+ total=len(input_files),
1282
+ unit="file",
1283
+ ):
1284
+ pass
1285
+ rprint(f"Converted {len(input_files)} files into {output_dir}.")
1286
+
1287
+
1288
+ def _read_lines(file: TextIOWrapper) -> list[str]:
1289
+ return [line.strip() for line in file]
1290
+
1291
+
1292
+ def _make_sure_parent_exists(file: TextIOWrapper):
1293
+ # Cannot create a parent directory for stdout
1294
+ with suppress(AttributeError):
1295
+ Path(file.name).parent.mkdir(parents=True, exist_ok=True)
1296
+
1297
+
1298
+ def _write_lines(file: TextIOWrapper, lines: Iterable[str]):
1299
+ _make_sure_parent_exists(file)
1300
+ file.writelines(line + os.linesep for line in lines)
1301
+
1302
+
1303
+ def _write_pdbe_csv(path: TextIOWrapper, data: PdbResults):
1304
+ _make_sure_parent_exists(path)
1305
+ fieldnames = ["uniprot_accession", "pdb_id", "method", "resolution", "uniprot_chains", "chain", "chain_length"]
1306
+ writer = csv.DictWriter(path, fieldnames=fieldnames)
1307
+ writer.writeheader()
1308
+ for uniprot_accession, entries in sorted(data.items()):
1309
+ for e in sorted(entries, key=lambda x: (x.id, x.method)):
1310
+ writer.writerow(
1311
+ {
1312
+ "uniprot_accession": uniprot_accession,
1313
+ "pdb_id": e.id,
1314
+ "method": e.method,
1315
+ "resolution": e.resolution or "",
1316
+ "uniprot_chains": e.uniprot_chains,
1317
+ "chain": e.chain,
1318
+ "chain_length": e.chain_length,
1319
+ }
1320
+ )
1321
+
1322
+
1323
+ def _write_dict_of_sets2csv(file: TextIOWrapper, data: dict[str, set[str]], ref_id_field: str):
1324
+ _make_sure_parent_exists(file)
1325
+ fieldnames = ["uniprot_accession", ref_id_field]
1326
+
1327
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
1328
+ writer.writeheader()
1329
+ for uniprot_accession, ref_ids in sorted(data.items()):
1330
+ for ref_id in sorted(ref_ids):
1331
+ writer.writerow({"uniprot_accession": uniprot_accession, ref_id_field: ref_id})
1332
+
1333
+
1334
+ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
1335
+ reader = csv.DictReader(file)
1336
+ yield from reader
1337
+
1338
+
1339
+ def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
1340
+ return {row[column] for row in _iter_csv_rows(file)}
1341
+
1342
+
1343
+ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
1344
+ """Write ComplexPortal information to a CSV file.
1345
+
1346
+ Args:
1347
+ complexes: List of ComplexPortalEntry objects.
1348
+ output_csv: TextIOWrapper to write the CSV data to.
1349
+ """
1350
+ writer = csv.writer(output_csv)
1351
+ writer.writerow(
1352
+ [
1353
+ "query_protein",
1354
+ "complex_id",
1355
+ "complex_url",
1356
+ "complex_title",
1357
+ "members",
1358
+ ]
1359
+ )
1360
+ for entry in complexes:
1361
+ members_str = ";".join(sorted(entry.members))
1362
+ writer.writerow(
1363
+ [
1364
+ entry.query_protein,
1365
+ entry.complex_id,
1366
+ entry.complex_url,
1367
+ entry.complex_title,
1368
+ members_str,
1369
+ ]
1370
+ )
1371
+
1372
+
1373
+ def _write_uniprot_details_csv(
1374
+ output_csv: TextIOWrapper,
1375
+ uniprot_details_list: Iterable[UniprotDetails],
1376
+ ) -> None:
1377
+ if not uniprot_details_list:
1378
+ msg = "No UniProt entries found for given accessions"
1379
+ raise ValueError(msg)
1380
+ # As all props of UniprotDetails are scalar, we can directly unstructure to dicts
1381
+ rows = converter.unstructure(uniprot_details_list)
1382
+ fieldnames = rows[0].keys()
1383
+ writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
1384
+ writer.writeheader()
1385
+ writer.writerows(rows)
1386
+
1387
+
1388
+ HANDLERS: dict[tuple[str, str | None], Callable] = {
1389
+ ("search", "uniprot"): _handle_search_uniprot,
1390
+ ("search", "pdbe"): _handle_search_pdbe,
1391
+ ("search", "alphafold"): _handle_search_alphafold,
1392
+ ("search", "emdb"): _handle_search_emdb,
1393
+ ("search", "go"): _handle_search_go,
1394
+ ("search", "taxonomy"): _handle_search_taxonomy,
1395
+ ("search", "interaction-partners"): _handle_search_interaction_partners,
1396
+ ("search", "complexes"): _handle_search_complexes,
1397
+ ("search", "uniprot-details"): _handle_search_uniprot_details,
1398
+ ("retrieve", "pdbe"): _handle_retrieve_pdbe,
1399
+ ("retrieve", "alphafold"): _handle_retrieve_alphafold,
1400
+ ("retrieve", "emdb"): _handle_retrieve_emdb,
1401
+ ("filter", "confidence"): _handle_filter_confidence,
1402
+ ("filter", "chain"): _handle_filter_chain,
1403
+ ("filter", "residue"): _handle_filter_residue,
1404
+ ("filter", "secondary-structure"): _handle_filter_ss,
1405
+ ("mcp", None): _handle_mcp,
1406
+ ("convert", "structures"): _handle_convert_structures,
1407
+ ("convert", "uniprot"): _handle_convert_uniprot,
1408
+ }
1409
+
1410
+
1411
+ def main(argv: Sequence[str] | None = None):
1412
+ """Main entry point for the CLI.
1413
+
1414
+ Args:
1415
+ argv: List of command line arguments. If None, uses sys.argv.
1416
+ """
1417
+ parser = make_parser()
1418
+ args = parser.parse_args(argv)
1419
+ logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False, console=console)])
1420
+
1421
+ # Dispatch table to reduce complexity
1422
+ cmd = args.command
1423
+ sub = getattr(args, f"{cmd}_cmd", None)
1424
+ handler = HANDLERS.get((cmd, sub))
1425
+ if handler is None:
1426
+ msg = f"Unknown command: {cmd} {sub}"
1427
+ raise SystemExit(msg)
1428
+ handler(args)
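
As a brief illustration of how the subcommands defined above chain together, the sketch below drives the CLI in-process via main(argv). It is not part of the package; the file names (accessions.txt, pdbe.csv, downloads/) are illustrative assumptions, and every flag used appears in the parsers above.

# Minimal usage sketch, assuming protein-quest 0.9.0 is installed.
from protein_quest.cli import main

# Search UniProt for reviewed human entries located in the nucleus (GO:0005634)
# and write the accessions to a text file (one per line).
main([
    "search", "uniprot", "accessions.txt",
    "--taxon-id", "9606",
    "--reviewed",
    "--subcellular-location-go", "GO:0005634",
    "--limit", "100",
])

# Find PDB entries for those accessions, keeping chains with at least 100 residues.
main(["search", "pdbe", "accessions.txt", "pdbe.csv", "--min-residues", "100"])

# Download the matching gzipped mmCIF files from PDBe into a local directory.
main(["retrieve", "pdbe", "pdbe.csv", "downloads/"])

The same steps can equally be run from a shell as `protein-quest search uniprot ...`, `protein-quest search pdbe ...`, and `protein-quest retrieve pdbe ...`, since main() simply parses argv and dispatches through the HANDLERS table.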