protein-quest: 0.3.1-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

protein_quest/__version__.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = "0.3.1"
+ __version__ = "0.4.0"
  """The version of the package."""
protein_quest/alphafold/confidence.py CHANGED
@@ -7,7 +7,10 @@ from pathlib import Path

  import gemmi

+ from protein_quest.converter import Percentage, PositiveInt, converter
  from protein_quest.pdbe.io import write_structure
+ from protein_quest.ss import nr_of_residues_in_total
+ from protein_quest.utils import CopyMethod, copyfile

  """
  Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
      Parameters:
          confidence: The confidence threshold for filtering residues.
              Residues with a pLDDT (b-factor) above this value are considered high confidence.
-         min_threshold: The minimum number of high-confidence residues required to keep the structure.
-         max_threshold: The maximum number of high-confidence residues required to keep the structure.
+         min_residues: The minimum number of high-confidence residues required to keep the structure.
+         max_residues: The maximum number of high-confidence residues required to keep the structure.
      """

-     confidence: float
-     min_threshold: int
-     max_threshold: int
+     confidence: Percentage
+     min_residues: PositiveInt
+     max_residues: PositiveInt
+
+
+ base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+ @converter.register_structure_hook
+ def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+     result: ConfidenceFilterQuery = base_query_hook(val, _type)
+     if result.min_residues > result.max_residues:
+         msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+         raise ValueError(msg)
+     return result


  @dataclass
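
The new cattrs hook above rejects queries where min_residues exceeds max_residues, on top of the per-field Percentage/PositiveInt validation. A minimal sketch of structuring a query through the shared converter after this change; the threshold numbers are made-up examples:

    # Illustrative only: thresholds are made-up values.
    from protein_quest.alphafold.confidence import ConfidenceFilterQuery
    from protein_quest.converter import converter

    query = converter.structure(
        {"confidence": 70.0, "min_residues": 100, "max_residues": 10_000},
        ConfidenceFilterQuery,
    )

    # Swapping the bounds now fails fast instead of producing an invalid query:
    # converter.structure({"confidence": 70.0, "min_residues": 500, "max_residues": 100}, ConfidenceFilterQuery)
    # raises ValueError("min_residues 500 cannot be larger than max_residues 100")
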
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
      """

      input_file: str
-     count: int
+     count: PositiveInt
      filtered_file: Path | None = None


- def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
+ def filter_file_on_residues(
+     file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+ ) -> ConfidenceFilterResult:
      """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

      Args:
          file: The path to the PDB file to filter.
          query: The confidence filter query.
          filtered_dir: The directory to save the filtered PDB file.
+         copy_method: How to copy when no residues have to be removed.

      Returns:
          result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
      structure = gemmi.read_structure(str(file))
      residues = set(find_high_confidence_residues(structure, query.confidence))
      count = len(residues)
-     if count < query.min_threshold or count > query.max_threshold:
+     if count < query.min_residues or count > query.max_residues:
          # Skip structure that is outside the min and max threshold
          # just return number of high confidence residues
          return ConfidenceFilterResult(
              input_file=file.name,
              count=count,
          )
+     total_residues = nr_of_residues_in_total(structure)
      filtered_file = filtered_dir / file.name
-     new_structure = filter_out_low_confidence_residues(
-         structure,
-         residues,
-     )
-     write_structure(new_structure, filtered_file)
+     if count == total_residues:
+         # if no residues have to be removed then copy instead of slower gemmi writing
+         copyfile(file, filtered_file, copy_method)
+     else:
+         new_structure = filter_out_low_confidence_residues(
+             structure,
+             residues,
+         )
+         write_structure(new_structure, filtered_file)
      return ConfidenceFilterResult(
          input_file=file.name,
          count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d


  def filter_files_on_confidence(
-     alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
+     alphafold_pdb_files: list[Path],
+     query: ConfidenceFilterQuery,
+     filtered_dir: Path,
+     copy_method: CopyMethod = "copy",
  ) -> Generator[ConfidenceFilterResult]:
      """Filter AlphaFoldDB structures based on confidence.

@@ -141,6 +167,7 @@ def filter_files_on_confidence(
          alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
          query: The confidence filter query containing the confidence thresholds.
          filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+         copy_method: How to copy when a direct copy is possible.

      Yields:
          For each mmcif/PDB files yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
      # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
      # here we filter on file level and inside file remove low confidence residues
      for pdb_file in alphafold_pdb_files:
-         yield filter_file_on_residues(pdb_file, query, filtered_dir)
+         yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
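
Taken together, the confidence filter now accepts a copy_method and skips the slower gemmi rewrite when every residue in a structure is already high confidence. A hedged usage sketch; the directory names and thresholds are hypothetical:

    # Hedged sketch: directory names and thresholds are hypothetical.
    from pathlib import Path

    from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
    from protein_quest.converter import converter

    query = converter.structure(
        {"confidence": 70.0, "min_residues": 1, "max_residues": 100_000},
        ConfidenceFilterQuery,
    )
    filtered_dir = Path("filtered")
    filtered_dir.mkdir(exist_ok=True)
    files = sorted(Path("alphafold").glob("*.cif"))
    for result in filter_files_on_confidence(files, query, filtered_dir, copy_method="symlink"):
        print(result.input_file, result.count, result.filtered_file)
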
protein_quest/alphafold/fetch.py CHANGED
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args

  from aiohttp_retry import RetryClient
  from aiopath import AsyncPath
- from cattrs.preconf.orjson import make_converter
  from tqdm.asyncio import tqdm
  from yarl import URL

  from protein_quest.alphafold.entry_summary import EntrySummary
+ from protein_quest.converter import converter
  from protein_quest.utils import friendly_session, retrieve_files, run_async

  logger = logging.getLogger(__name__)
- converter = make_converter()
- """cattrs converter to read AlphaFold summary JSON document."""
- converter.register_structure_hook(URL, lambda v, _: URL(v))
+

  DownloadableFormat = Literal[
      "summary",
protein_quest/cli.py CHANGED
@@ -15,6 +15,7 @@ from textwrap import dedent
  from cattrs import structure
  from rich import print as rprint
  from rich.logging import RichHandler
+ from rich.markdown import Markdown
  from rich.panel import Panel
  from rich_argparse import ArgumentDefaultsRichHelpFormatter
  from tqdm.rich import tqdm
@@ -23,13 +24,26 @@ from protein_quest.__version__ import __version__
  from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
  from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
  from protein_quest.alphafold.fetch import fetch_many as af_fetch
+ from protein_quest.converter import converter
  from protein_quest.emdb import fetch as emdb_fetch
  from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
  from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
  from protein_quest.pdbe import fetch as pdbe_fetch
  from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
+ from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
  from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
- from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+ from protein_quest.uniprot import (
+     ComplexPortalEntry,
+     PdbResult,
+     Query,
+     search4af,
+     search4emdb,
+     search4interaction_partners,
+     search4macromolecular_complexes,
+     search4pdb,
+     search4uniprot,
+ )
+ from protein_quest.utils import CopyMethod, copy_methods, copyfile

  logger = logging.getLogger(__name__)

@@ -208,6 +222,73 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
      parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")


+ def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
+     """Add search interaction partners subcommand parser."""
+     parser = subparsers.add_parser(
+         "interaction-partners",
+         help="Search for interaction partners of given UniProt accession",
+         description=dedent("""\
+             Search for interaction partners of given UniProt accession
+             in the Uniprot SPARQL endpoint and Complex Portal.
+         """),
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument(
+         "uniprot_acc",
+         type=str,
+         help="UniProt accession (for example P12345).",
+     )
+     parser.add_argument(
+         "--exclude",
+         type=str,
+         action="append",
+         help="UniProt accessions to exclude from the results. For example already known interaction partners.",
+     )
+     parser.add_argument(
+         "output_csv",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
+     )
+     parser.add_argument(
+         "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
+     )
+     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
+ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
+     """Add search complexes subcommand parser."""
+     description = dedent("""\
+         Search for complexes in the Complex Portal.
+         https://www.ebi.ac.uk/complexportal/
+
+         The output CSV file has the following columns:
+
+         - query_protein: UniProt accession used as query
+         - complex_id: Complex Portal identifier
+         - complex_url: URL to the Complex Portal entry
+         - complex_title: Title of the complex
+         - members: Semicolon-separated list of UniProt accessions of complex members
+     """)
+     parser = subparsers.add_parser(
+         "complexes",
+         help="Search for complexes in the Complex Portal",
+         description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument(
+         "uniprot_accs",
+         type=argparse.FileType("r", encoding="UTF-8"),
+         help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
+     )
+     parser.add_argument(
+         "output_csv",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help="Output CSV file with complex results. Use `-` for stdout.",
+     )
+     parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
+     parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
  def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
      """Add retrieve pdbe subcommand parser."""
      parser = subparsers.add_parser(
@@ -282,6 +363,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
      parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")


+ def _add_copy_method_argument(parser: argparse.ArgumentParser):
+     """Add copy method argument to parser."""
+     default_copy_method = "symlink"
+     if os.name == "nt":
+         # On Windows you need developer mode or admin privileges to create symlinks
+         # so we default to copying files instead of symlinking
+         default_copy_method = "copy"
+     parser.add_argument(
+         "--copy-method",
+         type=str,
+         choices=copy_methods,
+         default=default_copy_method,
+         help="How to copy files when no changes are needed to output file.",
+     )
+
+
  def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
      """Add filter confidence subcommand parser."""
      parser = subparsers.add_parser(
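
The shared --copy-method option defaults to symlink, except on Windows where plain copies are used. For illustration only, a helper with this shape could back the option; this is an assumption about protein_quest.utils.copyfile and copy_methods, not their actual implementation:

    # Assumed shape of the copy helper behind --copy-method; the real
    # protein_quest.utils.copyfile and copy_methods may differ.
    import shutil
    from pathlib import Path
    from typing import Literal, get_args

    CopyMethod = Literal["copy", "symlink"]  # only these two values appear in this diff
    copy_methods = get_args(CopyMethod)

    def copyfile(source: Path, target: Path, copy_method: CopyMethod = "copy") -> None:
        """Copy or symlink source to target, replacing an existing target."""
        if target.exists() or target.is_symlink():
            target.unlink()
        if copy_method == "symlink":
            target.symlink_to(source.resolve())
        else:
            shutil.copyfile(source, target)
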
@@ -312,6 +409,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
              In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
              Use `-` for stdout."""),
      )
+     _add_copy_method_argument(parser)


  def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -347,8 +445,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument(
          "--scheduler-address",
-         help="Address of the Dask scheduler to connect to. If not provided, will create a local cluster.",
+         help=dedent("""Address of the Dask scheduler to connect to.
+             If not provided, will create a local cluster.
+             If set to `sequential` will run tasks sequentially."""),
      )
+     _add_copy_method_argument(parser)


  def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -371,6 +472,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
      parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
+     _add_copy_method_argument(parser)
      parser.add_argument(
          "--write-stats",
          type=argparse.FileType("w", encoding="UTF-8"),
@@ -381,6 +483,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )


+ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
+     """Add filter secondary structure subcommand parser."""
+     parser = subparsers.add_parser(
+         "secondary-structure",
+         help="Filter PDB/mmCIF files by secondary structure",
+         description="Filter PDB/mmCIF files by secondary structure",
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+     parser.add_argument(
+         "output_dir",
+         type=Path,
+         help=dedent("""\
+             Directory to write filtered PDB/mmCIF files. Files are copied without modification.
+         """),
+     )
+     parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
+     parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
+     parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
+     parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
+     parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
+     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
+     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
+     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
+     _add_copy_method_argument(parser)
+     parser.add_argument(
+         "--write-stats",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help=dedent("""
+             Write filter statistics to file. In CSV format with columns:
+             `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
+             <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
+             Use `-` for stdout.
+         """),
+     )
+
+
  def _add_search_subcommands(subparsers: argparse._SubParsersAction):
      """Add search command and its subcommands."""
      parser = subparsers.add_parser(
@@ -397,6 +536,8 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
      _add_search_emdb_parser(subsubparsers)
      _add_search_go_parser(subsubparsers)
      _add_search_taxonomy_parser(subsubparsers)
+     _add_search_interaction_partners_parser(subsubparsers)
+     _add_search_complexes_parser(subsubparsers)


  def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -422,6 +563,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
      _add_filter_confidence_parser(subsubparsers)
      _add_filter_chain_parser(subsubparsers)
      _add_filter_residue_parser(subsubparsers)
+     _add_filter_ss_parser(subsubparsers)


  def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -574,6 +716,32 @@ def _handle_search_taxonomy(args):
      _write_taxonomy_csv(results, output_csv)


+ def _handle_search_interaction_partners(args: argparse.Namespace):
+     uniprot_acc: str = args.uniprot_acc
+     excludes: set[str] = set(args.exclude) if args.exclude else set()
+     limit: int = args.limit
+     timeout: int = args.timeout
+     output_csv: TextIOWrapper = args.output_csv
+
+     rprint(f"Searching for interaction partners of '{uniprot_acc}'")
+     results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
+     rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+     _write_lines(output_csv, results.keys())
+
+
+ def _handle_search_complexes(args: argparse.Namespace):
+     uniprot_accs = args.uniprot_accs
+     limit = args.limit
+     timeout = args.timeout
+     output_csv = args.output_csv
+
+     accs = _read_lines(uniprot_accs)
+     rprint(f"Finding complexes for {len(accs)} uniprot accessions")
+     results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
+     rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+     _write_complexes_csv(results, output_csv)
+
+
  def _handle_retrieve_pdbe(args):
      pdbe_csv = args.pdbe_csv
      output_dir = args.output_dir
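
The new handlers are thin wrappers around the uniprot search helpers. A sketch of calling them directly from Python, using an example accession; the return shapes (a mapping keyed by partner accession and a list of ComplexPortalEntry) are inferred from the handlers above:

    # Example accession P12345; return shapes inferred from the handlers above.
    from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes

    partners = search4interaction_partners("P12345", excludes=set(), limit=10_000, timeout=1_800)
    print(sorted(partners.keys()))  # UniProt accessions of interaction partners

    complexes = search4macromolecular_complexes(["P12345"], limit=100, timeout=1_800)
    for entry in complexes:
        print(entry.complex_id, entry.complex_title, ";".join(sorted(entry.members)))
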
@@ -620,21 +788,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
      # to get rid of duplication
      input_dir = structure(args.input_dir, Path)
      output_dir = structure(args.output_dir, Path)
-     confidence_threshold = structure(args.confidence_threshold, float)
-     # TODO add min/max
-     min_residues = structure(args.min_residues, int)
-     max_residues = structure(args.max_residues, int)
+
+     confidence_threshold = args.confidence_threshold
+     min_residues = args.min_residues
+     max_residues = args.max_residues
      stats_file: TextIOWrapper | None = args.write_stats
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

      output_dir.mkdir(parents=True, exist_ok=True)
      input_files = sorted(glob_structure_files(input_dir))
      nr_input_files = len(input_files)
      rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
-     query = structure(
+     query = converter.structure(
          {
              "confidence": confidence_threshold,
-             "min_threshold": min_residues,
-             "max_threshold": max_residues,
+             "min_residues": min_residues,
+             "max_residues": max_residues,
          },
          ConfidenceFilterQuery,
      )
@@ -643,7 +812,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
          writer.writerow(["input_file", "residue_count", "passed", "output_file"])

      passed_count = 0
-     for r in tqdm(filter_files_on_confidence(input_files, query, output_dir), total=len(input_files), unit="file"):
+     for r in tqdm(
+         filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
+         total=len(input_files),
+         unit="file",
+     ):
          if r.filtered_file:
              passed_count += 1
          if stats_file:
@@ -656,9 +829,10 @@ def _handle_filter_confidence(args: argparse.Namespace):

  def _handle_filter_chain(args):
      input_dir = args.input_dir
-     output_dir = args.output_dir
+     output_dir = structure(args.output_dir, Path)
      pdb_id2chain_mapping_file = args.chains
-     scheduler_address = args.scheduler_address
+     scheduler_address = structure(args.scheduler_address, str | None)  # pyright: ignore[reportArgumentType]
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

      # make sure files in input dir with entries in mapping file are the same
      # complain when files from mapping file are missing on disk
@@ -683,18 +857,25 @@ def _handle_filter_chain(args):
          rprint("[red]No valid structure files found. Exiting.")
          sys.exit(1)

-     results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
+     results = filter_files_on_chain(
+         file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
+     )

      nr_written = len([r for r in results if r.passed])

      rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

+     for result in results:
+         if result.discard_reason:
+             rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
+

  def _handle_filter_residue(args):
      input_dir = structure(args.input_dir, Path)
      output_dir = structure(args.output_dir, Path)
      min_residues = structure(args.min_residues, int)
      max_residues = structure(args.max_residues, int)
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
      stats_file: TextIOWrapper | None = args.write_stats

      if stats_file:
@@ -705,7 +886,9 @@ def _handle_filter_residue(args):
      input_files = sorted(glob_structure_files(input_dir))
      nr_total = len(input_files)
      rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
-     for r in filter_files_on_residues(input_files, output_dir, min_residues=min_residues, max_residues=max_residues):
+     for r in filter_files_on_residues(
+         input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
+     ):
          if stats_file:
              writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
          if r.passed:
@@ -716,6 +899,68 @@ def _handle_filter_residue(args):
          rprint(f"Statistics written to {stats_file.name}")


+ def _handle_filter_ss(args):
+     input_dir = structure(args.input_dir, Path)
+     output_dir = structure(args.output_dir, Path)
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+     stats_file: TextIOWrapper | None = args.write_stats
+
+     raw_query = {
+         "abs_min_helix_residues": args.abs_min_helix_residues,
+         "abs_max_helix_residues": args.abs_max_helix_residues,
+         "abs_min_sheet_residues": args.abs_min_sheet_residues,
+         "abs_max_sheet_residues": args.abs_max_sheet_residues,
+         "ratio_min_helix_residues": args.ratio_min_helix_residues,
+         "ratio_max_helix_residues": args.ratio_max_helix_residues,
+         "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
+         "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
+     }
+     query = converter.structure(raw_query, SecondaryStructureFilterQuery)
+     input_files = sorted(glob_structure_files(input_dir))
+     nr_total = len(input_files)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     if stats_file:
+         writer = csv.writer(stats_file)
+         writer.writerow(
+             [
+                 "input_file",
+                 "nr_residues",
+                 "nr_helix_residues",
+                 "nr_sheet_residues",
+                 "helix_ratio",
+                 "sheet_ratio",
+                 "passed",
+                 "output_file",
+             ]
+         )
+
+     rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
+     nr_passed = 0
+     for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
+         output_file: Path | None = None
+         if result.passed:
+             output_file = output_dir / input_file.name
+             copyfile(input_file, output_file, copy_method)
+             nr_passed += 1
+         if stats_file:
+             writer.writerow(
+                 [
+                     input_file,
+                     result.stats.nr_residues,
+                     result.stats.nr_helix_residues,
+                     result.stats.nr_sheet_residues,
+                     round(result.stats.helix_ratio, 3),
+                     round(result.stats.sheet_ratio, 3),
+                     result.passed,
+                     output_file,
+                 ]
+             )
+     rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
+     if stats_file:
+         rprint(f"Statistics written to {stats_file.name}")
+
+
  def _handle_mcp(args):
      if find_spec("fastmcp") is None:
          msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
@@ -736,12 +981,15 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
      ("search", "emdb"): _handle_search_emdb,
      ("search", "go"): _handle_search_go,
      ("search", "taxonomy"): _handle_search_taxonomy,
+     ("search", "interaction-partners"): _handle_search_interaction_partners,
+     ("search", "complexes"): _handle_search_complexes,
      ("retrieve", "pdbe"): _handle_retrieve_pdbe,
      ("retrieve", "alphafold"): _handle_retrieve_alphafold,
      ("retrieve", "emdb"): _handle_retrieve_emdb,
      ("filter", "confidence"): _handle_filter_confidence,
      ("filter", "chain"): _handle_filter_chain,
      ("filter", "residue"): _handle_filter_residue,
+     ("filter", "secondary-structure"): _handle_filter_ss,
      ("mcp", None): _handle_mcp,
  }

@@ -797,3 +1045,33 @@ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:

  def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
      return {row[column] for row in _iter_csv_rows(file)}
+
+
+ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
+     """Write ComplexPortal information to a CSV file.
+
+     Args:
+         complexes: List of ComplexPortalEntry objects.
+         output_csv: TextIOWrapper to write the CSV data to.
+     """
+     writer = csv.writer(output_csv)
+     writer.writerow(
+         [
+             "query_protein",
+             "complex_id",
+             "complex_url",
+             "complex_title",
+             "members",
+         ]
+     )
+     for entry in complexes:
+         members_str = ";".join(sorted(entry.members))
+         writer.writerow(
+             [
+                 entry.query_protein,
+                 entry.complex_id,
+                 entry.complex_url,
+                 entry.complex_title,
+                 members_str,
+             ]
+         )
protein_quest/converter.py ADDED
@@ -0,0 +1,46 @@
+ """Convert json or dict to Python objects."""
+
+ from cattrs.preconf.orjson import make_converter
+ from yarl import URL
+
+ type Percentage = float
+ """Type alias for percentage values (0.0-100.0)."""
+ type Ratio = float
+ """Type alias for ratio values (0.0-1.0)."""
+ type PositiveInt = int
+ """Type alias for positive integer values (>= 0)."""
+
+ converter = make_converter()
+ """cattrs converter to read JSON document or dict to Python objects."""
+ converter.register_structure_hook(URL, lambda v, _: URL(v))
+ converter.register_unstructure_hook(URL, lambda u: str(u))
+
+
+ @converter.register_structure_hook
+ def percentage_hook(val, _) -> Percentage:
+     value = float(val)
+     """Cattrs hook to validate percentage values."""
+     if not 0.0 <= value <= 100.0:
+         msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+         raise ValueError(msg)
+     return value
+
+
+ @converter.register_structure_hook
+ def ratio_hook(val, _) -> Ratio:
+     """Cattrs hook to validate ratio values."""
+     value = float(val)
+     if not 0.0 <= value <= 1.0:
+         msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+         raise ValueError(msg)
+     return value
+
+
+ @converter.register_structure_hook
+ def positive_int_hook(val, _) -> PositiveInt:
+     """Cattrs hook to validate positive integer values."""
+     value = int(val)
+     if value < 0:
+         msg = f"Value {value} is not a valid positive integer (>= 0)"
+         raise ValueError(msg)
+     return value
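
The new protein_quest.converter module centralizes validation of percentages, ratios and positive integers behind one shared cattrs converter. A small demonstration, assuming the hooks dispatch on the registered type aliases as shown above; the values are examples:

    # Values are examples; assumes the hooks dispatch on the registered aliases.
    from protein_quest.converter import Percentage, PositiveInt, Ratio, converter

    print(converter.structure(85.5, Percentage))  # 85.5
    print(converter.structure(0.25, Ratio))       # 0.25
    print(converter.structure("3", PositiveInt))  # 3

    try:
        converter.structure(150, Percentage)
    except ValueError as err:
        print(err)  # Value 150.0 is not a valid percentage (0.0-100.0)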