protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


protein_quest/cli.py CHANGED
@@ -6,14 +6,15 @@ import csv
 import logging
 import os
 import sys
-from collections.abc import Callable, Generator, Iterable
+from collections.abc import Callable, Generator, Iterable, Sequence
+from contextlib import suppress
 from importlib.util import find_spec
-from io import TextIOWrapper
+from io import BytesIO, TextIOWrapper
 from pathlib import Path
 from textwrap import dedent

 from cattrs import structure
-from rich import print as rprint
+from rich.console import Console
 from rich.logging import RichHandler
 from rich.markdown import Markdown
 from rich.panel import Panel
@@ -24,18 +25,28 @@ from protein_quest.__version__ import __version__
 from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
 from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
 from protein_quest.alphafold.fetch import fetch_many as af_fetch
-from protein_quest.converter import converter
+from protein_quest.converter import PositiveInt, converter
 from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
+from protein_quest.io import (
+    convert_to_cif_files,
+    glob_structure_files,
+    locate_structure_file,
+    read_structure,
+    valid_structure_file_extensions,
+)
 from protein_quest.pdbe import fetch as pdbe_fetch
-from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
+from protein_quest.structure import structure2uniprot_accessions
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
 from protein_quest.uniprot import (
     ComplexPortalEntry,
-    PdbResult,
+    PdbResults,
     Query,
+    UniprotDetails,
+    filter_pdb_results_on_chain_length,
+    map_uniprot_accessions2uniprot_details,
     search4af,
     search4emdb,
     search4interaction_partners,
@@ -53,6 +64,8 @@ from protein_quest.utils import (
     user_cache_root_dir,
 )

+console = Console(stderr=True)
+rprint = console.print
 logger = logging.getLogger(__name__)


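
Note: routing rich output through a stderr console, as the two added lines above do, keeps stdout clean for subcommands that write their results to `-`. A minimal sketch of the pattern, independent of protein-quest and assuming rich is installed:

import sys
from rich.console import Console

console = Console(stderr=True)  # status messages go to stderr
console.print("Searching...")   # visible in the terminal, absent from pipes
sys.stdout.write("P12345\n")    # only the data reaches `... - | wc -l`
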
@@ -93,6 +106,8 @@ def _add_search_uniprot_parser(subparsers: argparse._SubParsersAction):
         action="append",
         help="GO term(s) for molecular function (e.g. GO:0003677). Can be given multiple times.",
     )
+    parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
+    parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
     parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of uniprot accessions to return")
     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")

@@ -106,7 +121,7 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
         formatter_class=ArgumentDefaultsRichHelpFormatter,
     )
     parser.add_argument(
-        "uniprot_accs",
+        "uniprot_accessions",
         type=argparse.FileType("r", encoding="UTF-8"),
         help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
     )
@@ -114,15 +129,27 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
         "output_csv",
         type=argparse.FileType("w", encoding="UTF-8"),
         help=dedent("""\
-            Output CSV with `uniprot_acc`, `pdb_id`, `method`, `resolution`, `uniprot_chains`, `chain` columns.
+            Output CSV with following columns:
+            `uniprot_accession`, `pdb_id`, `method`, `resolution`, `uniprot_chains`, `chain`, `chain_length`.
             Where `uniprot_chains` is the raw UniProt chain string, for example `A=1-100`.
-            and where `chain` is the first chain from `uniprot_chains`, for example `A`.
+            and where `chain` is the first chain from `uniprot_chains`, for example `A`
+            and `chain_length` is the length of the chain, for example `100`.
             Use `-` for stdout.
         """),
     )
     parser.add_argument(
         "--limit", type=int, default=10_000, help="Maximum number of PDB uniprot accessions combinations to return"
     )
+    parser.add_argument(
+        "--min-residues",
+        type=int,
+        help="Minimum number of residues required in the chain mapped to the UniProt accession.",
+    )
+    parser.add_argument(
+        "--max-residues",
+        type=int,
+        help="Maximum number of residues allowed in chain mapped to the UniProt accession.",
+    )
     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")


@@ -135,7 +162,7 @@ def _add_search_alphafold_parser(subparsers: argparse._SubParsersAction):
         formatter_class=ArgumentDefaultsRichHelpFormatter,
     )
     parser.add_argument(
-        "uniprot_accs",
+        "uniprot_accessions",
         type=argparse.FileType("r", encoding="UTF-8"),
         help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
     )
@@ -144,6 +171,8 @@ def _add_search_alphafold_parser(subparsers: argparse._SubParsersAction):
         type=argparse.FileType("w", encoding="UTF-8"),
         help="Output CSV with AlphaFold IDs per UniProt accession. Use `-` for stdout.",
     )
+    parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
+    parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
     parser.add_argument(
         "--limit", type=int, default=10_000, help="Maximum number of Alphafold entry identifiers to return"
     )
@@ -242,7 +271,7 @@ def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
         formatter_class=ArgumentDefaultsRichHelpFormatter,
     )
     parser.add_argument(
-        "uniprot_acc",
+        "uniprot_accession",
         type=str,
         help="UniProt accession (for example P12345).",
     )
@@ -284,7 +313,7 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
         formatter_class=ArgumentDefaultsRichHelpFormatter,
     )
     parser.add_argument(
-        "uniprot_accs",
+        "uniprot_accessions",
         type=argparse.FileType("r", encoding="UTF-8"),
         help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
     )
@@ -297,6 +326,76 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")


+def _add_search_uniprot_details_parser(subparsers: argparse._SubParsersAction):
+    """Add search uniprot details subcommand parser."""
+    description = dedent("""\
+        Retrieve UniProt details for given UniProt accessions
+        from the Uniprot SPARQL endpoint.
+
+        The output CSV file has the following columns:
+
+        - uniprot_accession: UniProt accession.
+        - uniprot_id: UniProt ID (mnemonic).
+        - sequence_length: Length of the canonical sequence.
+        - reviewed: Whether the entry is reviewed (Swiss-Prot) or unreviewed (TrEMBL).
+        - protein_name: Recommended protein name.
+        - taxon_id: NCBI Taxonomy ID of the organism.
+        - taxon_name: Scientific name of the organism.
+
+        The order of the output CSV can be different from the input order.
+    """)
+    parser = subparsers.add_parser(
+        "uniprot-details",
+        help="Retrieve UniProt details for given UniProt accessions",
+        description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_accessions",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV with UniProt details. Use `-` for stdout.",
+    )
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+    parser.add_argument("--batch-size", type=int, default=1_000, help="Number of accessions to query per batch")
+
+
+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+        """),
+    )
+
+
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
+
+
 def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
     """Add retrieve pdbe subcommand parser."""
     parser = subparsers.add_parser(
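
Note: `_add_copy_method_arguments` and `_add_cacher_arguments` (moved here from further down in the file) follow the common argparse pattern of one helper registering the same flags on several subparsers. A minimal standalone sketch; the choices tuple here is illustrative, the real values come from `copy_methods` imported from protein-quest's utils module:

import argparse

def add_copy_method_arguments(parser: argparse.ArgumentParser) -> None:
    # shared flag, registered once per subcommand
    parser.add_argument("--copy-method", choices=("hardlink", "symlink", "copy"), default="hardlink")

root = argparse.ArgumentParser(prog="demo")
subparsers = root.add_subparsers(dest="command", required=True)
for name in ("convert", "retrieve"):
    add_copy_method_arguments(subparsers.add_parser(name))

args = root.parse_args(["convert", "--copy-method", "symlink"])
print(args.copy_method)  # symlink
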
@@ -345,6 +444,19 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
             Default is 'summary' and 'cif'."""),
     )
+    parser.add_argument(
+        "--gzip-files",
+        action="store_true",
+        help="Whether to gzip the downloaded files. Excludes summary files, they are always uncompressed.",
+    )
+    parser.add_argument(
+        "--all-isoforms",
+        action="store_true",
+        help=(
+            "Whether to return all isoforms of each uniprot entry. "
+            "If not given then only the Alphafold entry for the canonical sequence is returned."
+        ),
+    )
     parser.add_argument(
         "--max-parallel-downloads",
         type=int,
@@ -533,6 +645,7 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     _add_search_taxonomy_parser(subsubparsers)
     _add_search_interaction_partners_parser(subsubparsers)
     _add_search_complexes_parser(subsubparsers)
+    _add_search_uniprot_details_parser(subsubparsers)


 def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -561,6 +674,75 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
     _add_filter_ss_parser(subsubparsers)


+def _add_convert_uniprot_parser(subparsers: argparse._SubParsersAction):
+    """Add convert uniprot subcommand parser."""
+    parser = subparsers.add_parser(
+        "uniprot",
+        help="Convert structure files to list of UniProt accessions.",
+        description="Convert structure files to list of UniProt accessions. "
+        "Uniprot accessions are read from database reference of each structure.",
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "input_dir",
+        type=Path,
+        help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
+    )
+    parser.add_argument(
+        "output",
+        type=argparse.FileType("wt", encoding="UTF-8"),
+        help="Output text file with UniProt accessions (one per line). Use '-' for stdout.",
+    )
+    parser.add_argument(
+        "--grouped",
+        action="store_true",
+        help="Whether to group accessions by structure file. "
+        "If set output changes to `<structure_file1>,<acc1>\\n<structure_file1>,<acc2>` format.",
+    )
+
+
+def _add_convert_structures_parser(subparsers: argparse._SubParsersAction):
+    """Add convert structures subcommand parser."""
+    parser = subparsers.add_parser(
+        "structures",
+        help="Convert structure files between formats",
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "input_dir",
+        type=Path,
+        help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        help=dedent("""\
+            Directory to write converted structure files. If not given, files are written to `input_dir`.
+        """),
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        choices=("cif",),
+        default="cif",
+        help="Output format to convert to.",
+    )
+    _add_copy_method_arguments(parser)
+
+
+def _add_convert_subcommands(subparsers: argparse._SubParsersAction):
+    """Add convert command and its subcommands."""
+    parser = subparsers.add_parser(
+        "convert",
+        help="Convert files between formats",
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    subsubparsers = parser.add_subparsers(dest="convert_cmd", required=True)
+
+    _add_convert_structures_parser(subsubparsers)
+    _add_convert_uniprot_parser(subsubparsers)
+
+
 def _add_mcp_command(subparsers: argparse._SubParsersAction):
     """Add MCP command."""

@@ -580,38 +762,6 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")


-def _add_copy_method_arguments(parser):
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default="hardlink",
-        help=dedent("""\
-            How to make target file be same file as source file.
-            By default uses hardlinks to save disk space.
-            Note that hardlinks only work within the same filesystem and are harder to track.
-            If you want to track cached files easily then use 'symlink'.
-            On Windows you need developer mode or admin privileges to create symlinks.
-        """),
-    )
-
-
-def _add_cacher_arguments(parser: argparse.ArgumentParser):
-    """Add cacher arguments to parser."""
-    parser.add_argument(
-        "--no-cache",
-        action="store_true",
-        help="Disable caching of files to central location.",
-    )
-    parser.add_argument(
-        "--cache-dir",
-        type=Path,
-        default=user_cache_root_dir(),
-        help="Directory to use as cache for files.",
-    )
-    _add_copy_method_arguments(parser)
-
-
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -624,25 +774,18 @@ def make_parser() -> argparse.ArgumentParser:
     _add_search_subcommands(subparsers)
     _add_retrieve_subcommands(subparsers)
     _add_filter_subcommands(subparsers)
+    _add_convert_subcommands(subparsers)
     _add_mcp_command(subparsers)

     return parser


-def main():
-    """Main entry point for the CLI."""
-    parser = make_parser()
-    args = parser.parse_args()
-    logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
-
-    # Dispatch table to reduce complexity
-    cmd = args.command
-    sub = getattr(args, f"{cmd}_cmd", None)
-    handler = HANDLERS.get((cmd, sub))
-    if handler is None:
-        msg = f"Unknown command: {cmd} {sub}"
-        raise SystemExit(msg)
-    handler(args)
+def _name_of(file: TextIOWrapper | BytesIO) -> str:
+    try:
+        return file.name
+    except AttributeError:
+        # In pytest, a BytesIO is used as stdout, which has no 'name' attribute
+        return "<stdout>"


 def _handle_search_uniprot(args):
@@ -651,6 +794,8 @@ def _handle_search_uniprot(args):
     subcellular_location_uniprot = args.subcellular_location_uniprot
     subcellular_location_go = args.subcellular_location_go
     molecular_function_go = args.molecular_function_go
+    min_sequence_length = args.min_sequence_length
+    max_sequence_length = args.max_sequence_length
     limit = args.limit
     timeout = args.timeout
     output_file = args.output
@@ -662,54 +807,78 @@ def _handle_search_uniprot(args):
             "subcellular_location_uniprot": subcellular_location_uniprot,
             "subcellular_location_go": subcellular_location_go,
             "molecular_function_go": molecular_function_go,
+            "min_sequence_length": min_sequence_length,
+            "max_sequence_length": max_sequence_length,
         },
         Query,
     )
     rprint("Searching for UniProt accessions")
     accs = search4uniprot(query=query, limit=limit, timeout=timeout)
-    rprint(f"Found {len(accs)} UniProt accessions, written to {output_file.name}")
+    rprint(f"Found {len(accs)} UniProt accessions, written to {_name_of(output_file)}")
     _write_lines(output_file, sorted(accs))


 def _handle_search_pdbe(args):
-    uniprot_accs = args.uniprot_accs
+    uniprot_accessions = args.uniprot_accessions
     limit = args.limit
     timeout = args.timeout
     output_csv = args.output_csv
+    min_residues = converter.structure(args.min_residues, PositiveInt | None)  # pyright: ignore[reportArgumentType]
+    max_residues = converter.structure(args.max_residues, PositiveInt | None)  # pyright: ignore[reportArgumentType]

-    accs = set(_read_lines(uniprot_accs))
+    accs = set(_read_lines(uniprot_accessions))
     rprint(f"Finding PDB entries for {len(accs)} uniprot accessions")
     results = search4pdb(accs, limit=limit, timeout=timeout)
-    total_pdbs = sum([len(v) for v in results.values()])
-    rprint(f"Found {total_pdbs} PDB entries for {len(results)} uniprot accessions")
-    rprint(f"Written to {output_csv.name}")
+
+    raw_nr_results = len(results)
+    raw_total_pdbs = sum([len(v) for v in results.values()])
+    if min_residues or max_residues:
+        results = filter_pdb_results_on_chain_length(results, min_residues, max_residues)
+        total_pdbs = sum([len(v) for v in results.values()])
+        rprint(f"Before filtering found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions.")
+        rprint(
+            f"After filtering on chain length ({min_residues}, {max_residues}) "
+            f"remained {total_pdbs} PDB entries for {len(results)} uniprot accessions."
+        )
+    else:
+        rprint(f"Found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions")
+
     _write_pdbe_csv(output_csv, results)
+    rprint(f"Written to {_name_of(output_csv)}")


 def _handle_search_alphafold(args):
-    uniprot_accs = args.uniprot_accs
+    uniprot_accessions = args.uniprot_accessions
+    min_sequence_length = converter.structure(args.min_sequence_length, PositiveInt | None)  # pyright: ignore[reportArgumentType]
+    max_sequence_length = converter.structure(args.max_sequence_length, PositiveInt | None)  # pyright: ignore[reportArgumentType]
     limit = args.limit
     timeout = args.timeout
     output_csv = args.output_csv

-    accs = _read_lines(uniprot_accs)
+    accs = _read_lines(uniprot_accessions)
     rprint(f"Finding AlphaFold entries for {len(accs)} uniprot accessions")
-    results = search4af(accs, limit=limit, timeout=timeout)
-    rprint(f"Found {len(results)} AlphaFold entries, written to {output_csv.name}")
+    results = search4af(
+        accs,
+        min_sequence_length=min_sequence_length,
+        max_sequence_length=max_sequence_length,
+        limit=limit,
+        timeout=timeout,
+    )
+    rprint(f"Found {len(results)} AlphaFold entries, written to {_name_of(output_csv)}")
     _write_dict_of_sets2csv(output_csv, results, "af_id")


 def _handle_search_emdb(args):
-    uniprot_accs = args.uniprot_accs
+    uniprot_accessions = args.uniprot_accessions
     limit = args.limit
     timeout = args.timeout
     output_csv = args.output_csv

-    accs = _read_lines(uniprot_accs)
+    accs = _read_lines(uniprot_accessions)
     rprint(f"Finding EMDB entries for {len(accs)} uniprot accessions")
     results = search4emdb(accs, limit=limit, timeout=timeout)
     total_emdbs = sum([len(v) for v in results.values()])
-    rprint(f"Found {total_emdbs} EMDB entries, written to {output_csv.name}")
+    rprint(f"Found {total_emdbs} EMDB entries, written to {_name_of(output_csv)}")
     _write_dict_of_sets2csv(output_csv, results, "emdb_id")


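
Note: `filter_pdb_results_on_chain_length` comes from `protein_quest.uniprot` and its exact signature is not shown in this diff. An illustrative sketch of the same idea, filtering an accession-to-entries mapping on chain length with open-ended bounds; the simplified `Entry` type and function name are assumptions for the example only:

from dataclasses import dataclass

@dataclass(frozen=True)
class Entry:
    id: str
    chain_length: int

def filter_on_chain_length(results, min_residues=None, max_residues=None):
    lo = min_residues if min_residues is not None else 0
    hi = max_residues if max_residues is not None else float("inf")
    kept = {
        acc: {e for e in entries if lo <= e.chain_length <= hi}
        for acc, entries in results.items()
    }
    # drop accessions whose entries were all filtered out
    return {acc: entries for acc, entries in kept.items() if entries}

demo = {"P12345": {Entry("1abc", 120), Entry("2xyz", 40)}}
print(filter_on_chain_length(demo, min_residues=100))  # keeps only 1abc
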
@@ -724,7 +893,7 @@ def _handle_search_go(args):
     else:
         rprint(f"Searching for GO terms matching '{term}'")
         results = asyncio.run(search_gene_ontology_term(term, aspect=aspect, limit=limit))
-    rprint(f"Found {len(results)} GO terms, written to {output_csv.name}")
+    rprint(f"Found {len(results)} GO terms, written to {_name_of(output_csv)}")
     write_go_terms_to_csv(results, output_csv)


@@ -739,36 +908,49 @@ def _handle_search_taxonomy(args):
     else:
         rprint(f"Searching for taxon information matching '{query}'")
         results = asyncio.run(search_taxon(query=query, field=field, limit=limit))
-    rprint(f"Found {len(results)} taxons, written to {output_csv.name}")
+    rprint(f"Found {len(results)} taxons, written to {_name_of(output_csv)}")
     _write_taxonomy_csv(results, output_csv)


 def _handle_search_interaction_partners(args: argparse.Namespace):
-    uniprot_acc: str = args.uniprot_acc
+    uniprot_accession: str = args.uniprot_accession
     excludes: set[str] = set(args.exclude) if args.exclude else set()
     limit: int = args.limit
     timeout: int = args.timeout
     output_csv: TextIOWrapper = args.output_csv

-    rprint(f"Searching for interaction partners of '{uniprot_acc}'")
-    results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
-    rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+    rprint(f"Searching for interaction partners of '{uniprot_accession}'")
+    results = search4interaction_partners(uniprot_accession, excludes=excludes, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} interaction partners, written to {_name_of(output_csv)}")
     _write_lines(output_csv, results.keys())


 def _handle_search_complexes(args: argparse.Namespace):
-    uniprot_accs = args.uniprot_accs
+    uniprot_accessions = args.uniprot_accessions
     limit = args.limit
     timeout = args.timeout
     output_csv = args.output_csv

-    accs = _read_lines(uniprot_accs)
+    accs = _read_lines(uniprot_accessions)
     rprint(f"Finding complexes for {len(accs)} uniprot accessions")
     results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
-    rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+    rprint(f"Found {len(results)} complexes, written to {_name_of(output_csv)}")
     _write_complexes_csv(results, output_csv)


+def _handle_search_uniprot_details(args: argparse.Namespace):
+    uniprot_accessions = args.uniprot_accessions
+    timeout = args.timeout
+    batch_size = args.batch_size
+    output_csv: TextIOWrapper = args.output_csv
+
+    accs = _read_lines(uniprot_accessions)
+    rprint(f"Retrieving UniProt entry details for {len(accs)} uniprot accessions")
+    results = list(map_uniprot_accessions2uniprot_details(accs, timeout=timeout, batch_size=batch_size))
+    _write_uniprot_details_csv(output_csv, results)
+    rprint(f"Retrieved details for {len(results)} UniProt entries, written to {_name_of(output_csv)}")
+
+
 def _initialize_cacher(args: argparse.Namespace) -> Cacher:
     if args.no_cache:
         return PassthroughCacher()
@@ -798,17 +980,25 @@ def _handle_retrieve_alphafold(args):
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
     cacher = _initialize_cacher(args)
+    gzip_files = args.gzip_files
+    all_isoforms = args.all_isoforms

     if what_formats is None:
         what_formats = {"summary", "cif"}

-    # TODO besides `uniprot_acc,af_id\n` csv also allow headless single column format
+    # TODO besides `uniprot_accession,af_id\n` csv also allow headless single column format
     #
     af_ids = _read_column_from_csv(alphafold_csv, "af_id")
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
     afs = af_fetch(
-        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+        af_ids,
+        download_dir,
+        what=validated_what,
+        max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
+        gzip_files=gzip_files,
+        all_isoforms=all_isoforms,
     )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
@@ -863,11 +1053,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
         if r.filtered_file:
             passed_count += 1
         if stats_file:
-            writer.writerow([r.input_file, r.count, r.filtered_file is not None, r.filtered_file])
+            writer.writerow([r.input_file, r.count, r.filtered_file is not None, r.filtered_file])  # pyright: ignore[reportPossiblyUnboundVariable]

     rprint(f"Filtered {passed_count} mmcif/PDB files by confidence, written to {output_dir} directory")
     if stats_file:
-        rprint(f"Statistics written to {stats_file.name}")
+        rprint(f"Statistics written to {_name_of(stats_file)}")


 def _handle_filter_chain(args):
@@ -933,13 +1123,13 @@ def _handle_filter_residue(args):
         input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
     ):
         if stats_file:
-            writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
+            writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])  # pyright: ignore[reportPossiblyUnboundVariable]
         if r.passed:
             nr_passed += 1

     rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
     if stats_file:
-        rprint(f"Statistics written to {stats_file.name}")
+        rprint(f"Statistics written to {_name_of(stats_file)}")


 def _handle_filter_ss(args):
@@ -987,7 +1177,7 @@ def _handle_filter_ss(args):
             copyfile(input_file, output_file, copy_method)
             nr_passed += 1
         if stats_file:
-            writer.writerow(
+            writer.writerow(  # pyright: ignore[reportPossiblyUnboundVariable]
                 [
                     input_file,
                     result.stats.nr_residues,
@@ -1001,7 +1191,7 @@ def _handle_filter_ss(args):
             )
     rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
     if stats_file:
-        rprint(f"Statistics written to {stats_file.name}")
+        rprint(f"Statistics written to {_name_of(stats_file)}")


 def _handle_mcp(args):
@@ -1017,24 +1207,45 @@ def _handle_mcp(args):
     mcp.run(transport=args.transport, host=args.host, port=args.port)


-HANDLERS: dict[tuple[str, str | None], Callable] = {
-    ("search", "uniprot"): _handle_search_uniprot,
-    ("search", "pdbe"): _handle_search_pdbe,
-    ("search", "alphafold"): _handle_search_alphafold,
-    ("search", "emdb"): _handle_search_emdb,
-    ("search", "go"): _handle_search_go,
-    ("search", "taxonomy"): _handle_search_taxonomy,
-    ("search", "interaction-partners"): _handle_search_interaction_partners,
-    ("search", "complexes"): _handle_search_complexes,
-    ("retrieve", "pdbe"): _handle_retrieve_pdbe,
-    ("retrieve", "alphafold"): _handle_retrieve_alphafold,
-    ("retrieve", "emdb"): _handle_retrieve_emdb,
-    ("filter", "confidence"): _handle_filter_confidence,
-    ("filter", "chain"): _handle_filter_chain,
-    ("filter", "residue"): _handle_filter_residue,
-    ("filter", "secondary-structure"): _handle_filter_ss,
-    ("mcp", None): _handle_mcp,
-}
+def _handle_convert_uniprot(args):
+    input_dir = structure(args.input_dir, Path)
+    output_file: TextIOWrapper = args.output
+    grouped: bool = args.grouped
+    input_files = sorted(glob_structure_files(input_dir))
+    if grouped:
+        for input_file in tqdm(input_files, unit="file"):
+            s = read_structure(input_file)
+            uniprot_accessions = structure2uniprot_accessions(s)
+            _write_lines(
+                output_file, [f"{input_file},{uniprot_accession}" for uniprot_accession in sorted(uniprot_accessions)]
+            )
+    else:
+        uniprot_accessions: set[str] = set()
+        for input_file in tqdm(input_files, unit="file"):
+            s = read_structure(input_file)
+            uniprot_accessions.update(structure2uniprot_accessions(s))
+        _write_lines(output_file, sorted(uniprot_accessions))
+
+
+def _handle_convert_structures(args):
+    input_dir = structure(args.input_dir, Path)
+    output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+
+    input_files = sorted(glob_structure_files(input_dir))
+    rprint(f"Converting {len(input_files)} files in {input_dir} directory to cif format.")
+    for _ in tqdm(
+        convert_to_cif_files(
+            input_files,
+            output_dir,
+            copy_method=copy_method,
+        ),
+        total=len(input_files),
+        unit="file",
+    ):
+        pass
+    rprint(f"Converted {len(input_files)} files into {output_dir}.")


 def _read_lines(file: TextIOWrapper) -> list[str]:
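
Note: the `for _ in tqdm(...): pass` loop in `_handle_convert_structures` exists because `convert_to_cif_files` is presumably a lazy generator: iterating it is what performs the conversions, and tqdm renders progress as a side effect. A generic sketch of the idiom, assuming tqdm is installed:

from tqdm import tqdm

def convert_all(items):
    for item in items:
        yield item * 2  # stand-in for converting one file

# draining the generator drives both the work and the progress bar
for _ in tqdm(convert_all(range(3)), total=3, unit="file"):
    pass
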
@@ -1042,7 +1253,8 @@ def _read_lines(file: TextIOWrapper) -> list[str]:


 def _make_sure_parent_exists(file: TextIOWrapper):
-    if file.name != "<stdout>":
+    # Can not create dir for stdout
+    with suppress(AttributeError):
         Path(file.name).parent.mkdir(parents=True, exist_ok=True)


@@ -1051,34 +1263,35 @@ def _write_lines(file: TextIOWrapper, lines: Iterable[str]):
     file.writelines(line + os.linesep for line in lines)


-def _write_pdbe_csv(path: TextIOWrapper, data: dict[str, set[PdbResult]]):
+def _write_pdbe_csv(path: TextIOWrapper, data: PdbResults):
     _make_sure_parent_exists(path)
-    fieldnames = ["uniprot_acc", "pdb_id", "method", "resolution", "uniprot_chains", "chain"]
+    fieldnames = ["uniprot_accession", "pdb_id", "method", "resolution", "uniprot_chains", "chain", "chain_length"]
     writer = csv.DictWriter(path, fieldnames=fieldnames)
     writer.writeheader()
-    for uniprot_acc, entries in sorted(data.items()):
+    for uniprot_accession, entries in sorted(data.items()):
         for e in sorted(entries, key=lambda x: (x.id, x.method)):
             writer.writerow(
                 {
-                    "uniprot_acc": uniprot_acc,
+                    "uniprot_accession": uniprot_accession,
                     "pdb_id": e.id,
                     "method": e.method,
                     "resolution": e.resolution or "",
                     "uniprot_chains": e.uniprot_chains,
                     "chain": e.chain,
+                    "chain_length": e.chain_length,
                 }
             )


 def _write_dict_of_sets2csv(file: TextIOWrapper, data: dict[str, set[str]], ref_id_field: str):
     _make_sure_parent_exists(file)
-    fieldnames = ["uniprot_acc", ref_id_field]
+    fieldnames = ["uniprot_accession", ref_id_field]

     writer = csv.DictWriter(file, fieldnames=fieldnames)
     writer.writeheader()
-    for uniprot_acc, ref_ids in sorted(data.items()):
+    for uniprot_accession, ref_ids in sorted(data.items()):
         for ref_id in sorted(ref_ids):
-            writer.writerow({"uniprot_acc": uniprot_acc, ref_id_field: ref_id})
+            writer.writerow({"uniprot_accession": uniprot_accession, ref_id_field: ref_id})


 def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
@@ -1118,3 +1331,61 @@ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper):
                 members_str,
             ]
         )
+
+
+def _write_uniprot_details_csv(
+    output_csv: TextIOWrapper,
+    uniprot_details_list: Iterable[UniprotDetails],
+) -> None:
+    if not uniprot_details_list:
+        msg = "No UniProt entries found for given accessions"
+        raise ValueError(msg)
+    # As all props of UniprotDetails are scalar, we can directly unstructure to dicts
+    rows = converter.unstructure(uniprot_details_list)
+    fieldnames = rows[0].keys()
+    writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
+    writer.writeheader()
+    writer.writerows(rows)
+
+
+HANDLERS: dict[tuple[str, str | None], Callable] = {
+    ("search", "uniprot"): _handle_search_uniprot,
+    ("search", "pdbe"): _handle_search_pdbe,
+    ("search", "alphafold"): _handle_search_alphafold,
+    ("search", "emdb"): _handle_search_emdb,
+    ("search", "go"): _handle_search_go,
+    ("search", "taxonomy"): _handle_search_taxonomy,
+    ("search", "interaction-partners"): _handle_search_interaction_partners,
+    ("search", "complexes"): _handle_search_complexes,
+    ("search", "uniprot-details"): _handle_search_uniprot_details,
+    ("retrieve", "pdbe"): _handle_retrieve_pdbe,
+    ("retrieve", "alphafold"): _handle_retrieve_alphafold,
+    ("retrieve", "emdb"): _handle_retrieve_emdb,
+    ("filter", "confidence"): _handle_filter_confidence,
+    ("filter", "chain"): _handle_filter_chain,
+    ("filter", "residue"): _handle_filter_residue,
+    ("filter", "secondary-structure"): _handle_filter_ss,
+    ("mcp", None): _handle_mcp,
+    ("convert", "structures"): _handle_convert_structures,
+    ("convert", "uniprot"): _handle_convert_uniprot,
+}
+
+
+def main(argv: Sequence[str] | None = None):
+    """Main entry point for the CLI.
+
+    Args:
+        argv: List of command line arguments. If None, uses sys.argv.
+    """
+    parser = make_parser()
+    args = parser.parse_args(argv)
+    logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False, console=console)])
+
+    # Dispatch table to reduce complexity
+    cmd = args.command
+    sub = getattr(args, f"{cmd}_cmd", None)
+    handler = HANDLERS.get((cmd, sub))
+    if handler is None:
+        msg = f"Unknown command: {cmd} {sub}"
+        raise SystemExit(msg)
+    handler(args)
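
Note: `main()` now accepts an explicit `argv`, so the CLI can be driven in-process (handy for tests) instead of through a subprocess. A minimal sketch; the file names are hypothetical:

from protein_quest.cli import main

# equivalent to: protein-quest search uniprot-details accessions.txt details.csv
main(["search", "uniprot-details", "accessions.txt", "details.csv"])
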