protein-quest 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of protein-quest might be problematic.

protein_quest/__version__.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = "0.3.1"
+ __version__ = "0.3.2"
  """The version of the package."""
protein_quest/alphafold/confidence.py CHANGED
@@ -7,7 +7,10 @@ from pathlib import Path

  import gemmi

+ from protein_quest.converter import Percentage, PositiveInt, converter
  from protein_quest.pdbe.io import write_structure
+ from protein_quest.ss import nr_of_residues_in_total
+ from protein_quest.utils import CopyMethod, copyfile

  """
  Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
      Parameters:
          confidence: The confidence threshold for filtering residues.
              Residues with a pLDDT (b-factor) above this value are considered high confidence.
-         min_threshold: The minimum number of high-confidence residues required to keep the structure.
-         max_threshold: The maximum number of high-confidence residues required to keep the structure.
+         min_residues: The minimum number of high-confidence residues required to keep the structure.
+         max_residues: The maximum number of high-confidence residues required to keep the structure.
      """

-     confidence: float
-     min_threshold: int
-     max_threshold: int
+     confidence: Percentage
+     min_residues: PositiveInt
+     max_residues: PositiveInt
+
+
+ base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+ @converter.register_structure_hook
+ def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+     result: ConfidenceFilterQuery = base_query_hook(val, _type)
+     if result.min_residues > result.max_residues:
+         msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+         raise ValueError(msg)
+     return result


  @dataclass
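
The hook registered above layers a cross-field sanity check on top of the hook cattrs generates for the dataclass, so swapped bounds fail at structuring time rather than silently filtering every file out. A minimal sketch of building a validated query from a plain dict (values are illustrative; `converter` and `ConfidenceFilterQuery` are the names from this diff):

```python
# Illustrative sketch; assumes protein-quest 0.3.2 with the hooks registered above.
from protein_quest.alphafold.confidence import ConfidenceFilterQuery
from protein_quest.converter import converter

query = converter.structure(
    {"confidence": 70.0, "min_residues": 100, "max_residues": 10_000},
    ConfidenceFilterQuery,
)
print(query)  # ConfidenceFilterQuery(confidence=70.0, min_residues=100, max_residues=10000)

# Swapped bounds now raise ValueError via confidence_filter_query_hook:
# converter.structure(
#     {"confidence": 70.0, "min_residues": 10, "max_residues": 1},
#     ConfidenceFilterQuery,
# )
```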
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
      """

      input_file: str
-     count: int
+     count: PositiveInt
      filtered_file: Path | None = None


- def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
+ def filter_file_on_residues(
+     file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+ ) -> ConfidenceFilterResult:
      """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

      Args:
          file: The path to the PDB file to filter.
          query: The confidence filter query.
          filtered_dir: The directory to save the filtered PDB file.
+         copy_method: How to copy when no residues have to be removed.

      Returns:
          result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
      structure = gemmi.read_structure(str(file))
      residues = set(find_high_confidence_residues(structure, query.confidence))
      count = len(residues)
-     if count < query.min_threshold or count > query.max_threshold:
+     if count < query.min_residues or count > query.max_residues:
          # Skip structure that is outside the min and max threshold
          # just return number of high confidence residues
          return ConfidenceFilterResult(
              input_file=file.name,
              count=count,
          )
+     total_residues = nr_of_residues_in_total(structure)
      filtered_file = filtered_dir / file.name
-     new_structure = filter_out_low_confidence_residues(
-         structure,
-         residues,
-     )
-     write_structure(new_structure, filtered_file)
+     if count == total_residues:
+         # if no residues have to be removed then copy instead of slower gemmi writing
+         copyfile(file, filtered_file, copy_method)
+     else:
+         new_structure = filter_out_low_confidence_residues(
+             structure,
+             residues,
+         )
+         write_structure(new_structure, filtered_file)
      return ConfidenceFilterResult(
          input_file=file.name,
          count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d


  def filter_files_on_confidence(
-     alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
+     alphafold_pdb_files: list[Path],
+     query: ConfidenceFilterQuery,
+     filtered_dir: Path,
+     copy_method: CopyMethod = "copy",
  ) -> Generator[ConfidenceFilterResult]:
      """Filter AlphaFoldDB structures based on confidence.

@@ -141,6 +167,7 @@ def filter_files_on_confidence(
          alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
          query: The confidence filter query containing the confidence thresholds.
          filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+         copy_method: How to copy when a direct copy is possible.

      Yields:
          For each mmcif/PDB file yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
      # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
      # here we filter on file level and inside file remove low confidence residues
      for pdb_file in alphafold_pdb_files:
-         yield filter_file_on_residues(pdb_file, query, filtered_dir)
+         yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
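
With the `copy_method` plumbing in place, a structure whose high-confidence residue count equals its total residue count is copied or symlinked instead of being re-serialized by gemmi. A usage sketch with assumed directory names:

```python
# Sketch only; "downloads" and "filtered" are placeholder directories.
from pathlib import Path

from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence

query = ConfidenceFilterQuery(confidence=70.0, min_residues=100, max_residues=10_000)
filtered_dir = Path("filtered")
filtered_dir.mkdir(parents=True, exist_ok=True)
files = sorted(Path("downloads").glob("*.cif"))
for result in filter_files_on_confidence(files, query, filtered_dir, copy_method="symlink"):
    # filtered_file stays None when the count fell outside [min_residues, max_residues]
    print(result.input_file, result.count, result.filtered_file)
```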
protein_quest/alphafold/fetch.py CHANGED
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args

  from aiohttp_retry import RetryClient
  from aiopath import AsyncPath
- from cattrs.preconf.orjson import make_converter
  from tqdm.asyncio import tqdm
  from yarl import URL

  from protein_quest.alphafold.entry_summary import EntrySummary
+ from protein_quest.converter import converter
  from protein_quest.utils import friendly_session, retrieve_files, run_async

  logger = logging.getLogger(__name__)
- converter = make_converter()
- """cattrs converter to read AlphaFold summary JSON document."""
- converter.register_structure_hook(URL, lambda v, _: URL(v))
+

  DownloadableFormat = Literal[
      "summary",
protein_quest/cli.py CHANGED
@@ -23,13 +23,16 @@ from protein_quest.__version__ import __version__
  from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
  from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
  from protein_quest.alphafold.fetch import fetch_many as af_fetch
+ from protein_quest.converter import converter
  from protein_quest.emdb import fetch as emdb_fetch
  from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
  from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
  from protein_quest.pdbe import fetch as pdbe_fetch
  from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
+ from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
  from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
  from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+ from protein_quest.utils import CopyMethod, copy_methods, copyfile

  logger = logging.getLogger(__name__)

@@ -282,6 +285,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
      parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")


+ def _add_copy_method_argument(parser: argparse.ArgumentParser):
+     """Add copy method argument to parser."""
+     default_copy_method = "symlink"
+     if os.name == "nt":
+         # On Windows you need developer mode or admin privileges to create symlinks
+         # so we default to copying files instead of symlinking
+         default_copy_method = "copy"
+     parser.add_argument(
+         "--copy-method",
+         type=str,
+         choices=copy_methods,
+         default=default_copy_method,
+         help="How to copy files when no changes are needed to output file.",
+     )
+
+
  def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
      """Add filter confidence subcommand parser."""
      parser = subparsers.add_parser(
@@ -312,6 +331,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
          In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
          Use `-` for stdout."""),
      )
+     _add_copy_method_argument(parser)


  def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -347,8 +367,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument(
          "--scheduler-address",
-         help="Address of the Dask scheduler to connect to. If not provided, will create a local cluster.",
+         help=dedent("""Address of the Dask scheduler to connect to.
+             If not provided, will create a local cluster.
+             If set to `sequential` will run tasks sequentially."""),
      )
+     _add_copy_method_argument(parser)


  def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -371,6 +394,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )
      parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
      parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
+     _add_copy_method_argument(parser)
      parser.add_argument(
          "--write-stats",
          type=argparse.FileType("w", encoding="UTF-8"),
@@ -381,6 +405,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
      )


+ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
+     """Add filter secondary structure subcommand parser."""
+     parser = subparsers.add_parser(
+         "secondary-structure",
+         help="Filter PDB/mmCIF files by secondary structure",
+         description="Filter PDB/mmCIF files by secondary structure",
+         formatter_class=ArgumentDefaultsRichHelpFormatter,
+     )
+     parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+     parser.add_argument(
+         "output_dir",
+         type=Path,
+         help=dedent("""\
+             Directory to write filtered PDB/mmCIF files. Files are copied without modification.
+             """),
+     )
+     parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
+     parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
+     parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
+     parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
+     parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
+     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
+     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
+     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
+     _add_copy_method_argument(parser)
+     parser.add_argument(
+         "--write-stats",
+         type=argparse.FileType("w", encoding="UTF-8"),
+         help=dedent("""
+             Write filter statistics to file. In CSV format with columns:
+             `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
+             <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
+             Use `-` for stdout.
+             """),
+     )
+
+
  def _add_search_subcommands(subparsers: argparse._SubParsersAction):
      """Add search command and its subcommands."""
      parser = subparsers.add_parser(
@@ -422,6 +483,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
      _add_filter_confidence_parser(subsubparsers)
      _add_filter_chain_parser(subsubparsers)
      _add_filter_residue_parser(subsubparsers)
+     _add_filter_ss_parser(subsubparsers)


  def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -620,21 +682,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
      # to get rid of duplication
      input_dir = structure(args.input_dir, Path)
      output_dir = structure(args.output_dir, Path)
-     confidence_threshold = structure(args.confidence_threshold, float)
-     # TODO add min/max
-     min_residues = structure(args.min_residues, int)
-     max_residues = structure(args.max_residues, int)
+
+     confidence_threshold = args.confidence_threshold
+     min_residues = args.min_residues
+     max_residues = args.max_residues
      stats_file: TextIOWrapper | None = args.write_stats
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

      output_dir.mkdir(parents=True, exist_ok=True)
      input_files = sorted(glob_structure_files(input_dir))
      nr_input_files = len(input_files)
      rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
-     query = structure(
+     query = converter.structure(
          {
              "confidence": confidence_threshold,
-             "min_threshold": min_residues,
-             "max_threshold": max_residues,
+             "min_residues": min_residues,
+             "max_residues": max_residues,
          },
          ConfidenceFilterQuery,
      )
@@ -643,7 +706,11 @@
          writer.writerow(["input_file", "residue_count", "passed", "output_file"])

      passed_count = 0
-     for r in tqdm(filter_files_on_confidence(input_files, query, output_dir), total=len(input_files), unit="file"):
+     for r in tqdm(
+         filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
+         total=len(input_files),
+         unit="file",
+     ):
          if r.filtered_file:
              passed_count += 1
          if stats_file:
@@ -656,9 +723,10 @@

  def _handle_filter_chain(args):
      input_dir = args.input_dir
-     output_dir = args.output_dir
+     output_dir = structure(args.output_dir, Path)
      pdb_id2chain_mapping_file = args.chains
-     scheduler_address = args.scheduler_address
+     scheduler_address = structure(args.scheduler_address, str | None)  # pyright: ignore[reportArgumentType]
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

      # make sure files in input dir with entries in mapping file are the same
      # complain when files from mapping file are missing on disk
@@ -683,18 +751,25 @@
          rprint("[red]No valid structure files found. Exiting.")
          sys.exit(1)

-     results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
+     results = filter_files_on_chain(
+         file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
+     )

      nr_written = len([r for r in results if r.passed])

      rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

+     for result in results:
+         if result.discard_reason:
+             rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
+

  def _handle_filter_residue(args):
      input_dir = structure(args.input_dir, Path)
      output_dir = structure(args.output_dir, Path)
      min_residues = structure(args.min_residues, int)
      max_residues = structure(args.max_residues, int)
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
      stats_file: TextIOWrapper | None = args.write_stats

      if stats_file:
@@ -705,7 +780,9 @@ def _handle_filter_residue(args):
      input_files = sorted(glob_structure_files(input_dir))
      nr_total = len(input_files)
      rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
-     for r in filter_files_on_residues(input_files, output_dir, min_residues=min_residues, max_residues=max_residues):
+     for r in filter_files_on_residues(
+         input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
+     ):
          if stats_file:
              writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
          if r.passed:
@@ -716,6 +793,68 @@
          rprint(f"Statistics written to {stats_file.name}")


+ def _handle_filter_ss(args):
+     input_dir = structure(args.input_dir, Path)
+     output_dir = structure(args.output_dir, Path)
+     copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+     stats_file: TextIOWrapper | None = args.write_stats
+
+     raw_query = {
+         "abs_min_helix_residues": args.abs_min_helix_residues,
+         "abs_max_helix_residues": args.abs_max_helix_residues,
+         "abs_min_sheet_residues": args.abs_min_sheet_residues,
+         "abs_max_sheet_residues": args.abs_max_sheet_residues,
+         "ratio_min_helix_residues": args.ratio_min_helix_residues,
+         "ratio_max_helix_residues": args.ratio_max_helix_residues,
+         "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
+         "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
+     }
+     query = converter.structure(raw_query, SecondaryStructureFilterQuery)
+     input_files = sorted(glob_structure_files(input_dir))
+     nr_total = len(input_files)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     if stats_file:
+         writer = csv.writer(stats_file)
+         writer.writerow(
+             [
+                 "input_file",
+                 "nr_residues",
+                 "nr_helix_residues",
+                 "nr_sheet_residues",
+                 "helix_ratio",
+                 "sheet_ratio",
+                 "passed",
+                 "output_file",
+             ]
+         )
+
+     rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
+     nr_passed = 0
+     for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
+         output_file: Path | None = None
+         if result.passed:
+             output_file = output_dir / input_file.name
+             copyfile(input_file, output_file, copy_method)
+             nr_passed += 1
+         if stats_file:
+             writer.writerow(
+                 [
+                     input_file,
+                     result.stats.nr_residues,
+                     result.stats.nr_helix_residues,
+                     result.stats.nr_sheet_residues,
+                     round(result.stats.helix_ratio, 3),
+                     round(result.stats.sheet_ratio, 3),
+                     result.passed,
+                     output_file,
+                 ]
+             )
+     rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
+     if stats_file:
+         rprint(f"Statistics written to {stats_file.name}")
+
+
  def _handle_mcp(args):
      if find_spec("fastmcp") is None:
          msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
@@ -742,6 +881,7 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
      ("filter", "confidence"): _handle_filter_confidence,
      ("filter", "chain"): _handle_filter_chain,
      ("filter", "residue"): _handle_filter_residue,
+     ("filter", "secondary-structure"): _handle_filter_ss,
      ("mcp", None): _handle_mcp,
  }

protein_quest/converter.py ADDED
@@ -0,0 +1,45 @@
+ """Convert JSON or dict to Python objects."""
+
+ from cattrs.preconf.orjson import make_converter
+ from yarl import URL
+
+ type Percentage = float
+ """Type alias for percentage values (0.0-100.0)."""
+ type Ratio = float
+ """Type alias for ratio values (0.0-1.0)."""
+ type PositiveInt = int
+ """Type alias for positive integer values (>= 0)."""
+
+ converter = make_converter()
+ """cattrs converter to read JSON document or dict to Python objects."""
+ converter.register_structure_hook(URL, lambda v, _: URL(v))
+
+
+ @converter.register_structure_hook
+ def percentage_hook(val, _) -> Percentage:
+     """Cattrs hook to validate percentage values."""
+     value = float(val)
+     if not 0.0 <= value <= 100.0:
+         msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+         raise ValueError(msg)
+     return value
+
+
+ @converter.register_structure_hook
+ def ratio_hook(val, _) -> Ratio:
+     """Cattrs hook to validate ratio values."""
+     value = float(val)
+     if not 0.0 <= value <= 1.0:
+         msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+         raise ValueError(msg)
+     return value
+
+
+ @converter.register_structure_hook
+ def positive_int_hook(val, _) -> PositiveInt:
+     """Cattrs hook to validate positive integer values."""
+     value = int(val)
+     if value < 0:
+         msg = f"Value {value} is not a valid positive integer (>= 0)"
+         raise ValueError(msg)
+     return value
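
A short sketch of these hooks used directly; it assumes cattrs dispatches on the PEP 695 type aliases exactly as registered above:

```python
from protein_quest.converter import Percentage, PositiveInt, Ratio, converter

print(converter.structure(85.5, Percentage))  # 85.5
print(converter.structure("0.25", Ratio))     # 0.25, coerced via float()
print(converter.structure(3, PositiveInt))    # 3

# Out-of-range values raise ValueError:
# converter.structure(150, Percentage)  # "Value 150.0 is not a valid percentage (0.0-100.0)"
# converter.structure(-1, PositiveInt)  # "Value -1 is not a valid positive integer (>= 0)"
```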
protein_quest/filters.py CHANGED
@@ -4,7 +4,7 @@ import logging
  from collections.abc import Collection, Generator
  from dataclasses import dataclass
  from pathlib import Path
- from shutil import copyfile
+ from typing import Literal

  from dask.distributed import Client
  from distributed.deploy.cluster import Cluster
@@ -15,6 +15,7 @@ from protein_quest.pdbe.io import (
      nr_residues_in_chain,
      write_single_chain_pdb_file,
  )
+ from protein_quest.utils import CopyMethod, copyfile

  logger = logging.getLogger(__name__)

@@ -29,11 +30,17 @@ class ChainFilterStatistics:


  def filter_file_on_chain(
-     file_and_chain: tuple[Path, str], output_dir: Path, out_chain: str = "A"
+     file_and_chain: tuple[Path, str],
+     output_dir: Path,
+     out_chain: str = "A",
+     copy_method: CopyMethod = "copy",
  ) -> ChainFilterStatistics:
      input_file, chain_id = file_and_chain
+     logger.debug("Filtering %s on chain %s", input_file, chain_id)
      try:
-         output_file = write_single_chain_pdb_file(input_file, chain_id, output_dir, out_chain=out_chain)
+         output_file = write_single_chain_pdb_file(
+             input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
+         )
          return ChainFilterStatistics(
              input_file=input_file,
              chain_id=chain_id,
@@ -48,7 +55,8 @@ def filter_files_on_chain(
      file2chains: Collection[tuple[Path, str]],
      output_dir: Path,
      out_chain: str = "A",
-     scheduler_address: str | Cluster | None = None,
+     scheduler_address: str | Cluster | Literal["sequential"] | None = None,
+     copy_method: CopyMethod = "copy",
  ) -> list[ChainFilterStatistics]:
      """Filter mmcif/PDB files by chain.

@@ -58,19 +66,37 @@ def filter_files_on_chain(
          output_dir: The directory where the filtered files will be written.
          out_chain: Under what name to write the kept chain.
          scheduler_address: The address of the Dask scheduler.
+             If not provided, will create a local cluster.
+             If set to `sequential` will run tasks sequentially.
+         copy_method: How to copy when a direct copy is possible.

      Returns:
          Result of the filtering process.
      """
      output_dir.mkdir(parents=True, exist_ok=True)
+     if scheduler_address == "sequential":
+
+         def task(file_and_chain: tuple[Path, str]) -> ChainFilterStatistics:
+             return filter_file_on_chain(file_and_chain, output_dir, out_chain=out_chain, copy_method=copy_method)
+
+         return list(map(task, file2chains))
+
+     # TODO make logger.debug in filter_file_on_chain show to user when --log
+     # GPT-5 generated a fairly difficult setup with a WorkerPlugin, need to find a simpler approach
      scheduler_address = configure_dask_scheduler(
          scheduler_address,
          name="filter-chain",
      )

      with Client(scheduler_address) as client:
+         client.forward_logging()
          return dask_map_with_progress(
-             client, filter_file_on_chain, file2chains, output_dir=output_dir, out_chain=out_chain
+             client,
+             filter_file_on_chain,
+             file2chains,
+             output_dir=output_dir,
+             out_chain=out_chain,
+             copy_method=copy_method,
          )


@@ -92,7 +118,12 @@ class ResidueFilterStatistics:


  def filter_files_on_residues(
-     input_files: list[Path], output_dir: Path, min_residues: int, max_residues: int, chain: str = "A"
+     input_files: list[Path],
+     output_dir: Path,
+     min_residues: int,
+     max_residues: int,
+     chain: str = "A",
+     copy_method: CopyMethod = "copy",
  ) -> Generator[ResidueFilterStatistics]:
      """Filter PDB/mmCIF files by number of residues in given chain.

@@ -102,6 +133,7 @@ def filter_files_on_residues(
          min_residues: The minimum number of residues in chain.
          max_residues: The maximum number of residues in chain.
          chain: The chain to count residues of.
+         copy_method: How to copy passed files to output directory.

      Yields:
          Objects containing information about the filtering process for each input file.
@@ -112,7 +144,7 @@
          passed = min_residues <= residue_count <= max_residues
          if passed:
              output_file = output_dir / input_file.name
-             copyfile(input_file, output_file)
+             copyfile(input_file, output_file, copy_method)
              yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
          else:
              yield ResidueFilterStatistics(input_file, residue_count, False, None)
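
A sketch of the new `sequential` mode, which maps the chain filter over the inputs in-process instead of spinning up a Dask cluster; the file/chain pairs are illustrative:

```python
# Hypothetical inputs; filter_files_on_chain and its result fields come from this diff.
from pathlib import Path

from protein_quest.filters import filter_files_on_chain

file2chains = [(Path("chain-in/3JRS.cif"), "B")]
results = filter_files_on_chain(
    file2chains,
    Path("chain-out"),
    scheduler_address="sequential",  # run tasks one by one, no Dask client
    copy_method="symlink",
)
for r in results:
    print(r.input_file, r.chain_id, r.passed)
```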
protein_quest/go.py CHANGED
@@ -8,8 +8,8 @@ from io import TextIOWrapper
  from typing import Literal, get_args

  from cattrs.gen import make_dict_structure_fn, override
- from cattrs.preconf.orjson import make_converter

+ from protein_quest.converter import converter
  from protein_quest.utils import friendly_session

  logger = logging.getLogger(__name__)
@@ -52,9 +52,6 @@ class SearchResponse:
      page_info: PageInfo


- converter = make_converter()
-
-
  def flatten_definition(definition, _context) -> str:
      return definition["text"]

protein_quest/mcp_server.py CHANGED
@@ -46,6 +46,7 @@ from protein_quest.emdb import fetch as emdb_fetch
  from protein_quest.go import search_gene_ontology_term
  from protein_quest.pdbe.fetch import fetch as pdbe_fetch
  from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
+ from protein_quest.ss import filter_file_on_secondary_structure
  from protein_quest.taxonomy import search_taxon
  from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot

@@ -165,6 +166,9 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
      return filter_file_on_residues(file, query, filtered_dir)


+ mcp.tool(filter_file_on_secondary_structure)
+
+
  @mcp.prompt
  def candidate_structures(
      species: str = "Human",
protein_quest/pdbe/io.py CHANGED
@@ -2,12 +2,14 @@

  import gzip
  import logging
- from collections.abc import Generator
+ from collections.abc import Generator, Iterable
+ from datetime import UTC, datetime
  from pathlib import Path

  import gemmi

- from protein_quest import __version__
+ from protein_quest.__version__ import __version__
+ from protein_quest.utils import CopyMethod, copyfile

  logger = logging.getLogger(__name__)

@@ -28,14 +30,21 @@ def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
          The number of residues in the specified chain.
      """
      structure = gemmi.read_structure(str(file))
-     model = structure[0]
-     gchain = find_chain_in_model(model, chain)
+     gchain = find_chain_in_structure(structure, chain)
      if gchain is None:
          logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
          return 0
      return len(gchain)


+ def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
+     for model in structure:
+         chain = find_chain_in_model(model, wanted_chain)
+         if chain is not None:
+             return chain
+     return None
+
+
  def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
      chain = model.find_chain(wanted_chain)
      if chain is None:
@@ -68,10 +77,12 @@ def write_structure(structure: gemmi.Structure, path: Path):
          with gzip.open(path, "wt") as f:
              f.write(body)
      elif path.name.endswith(".cif"):
-         doc = structure.make_mmcif_document()
+         # do not write chem_comp so it is viewable by molstar
+         # see https://github.com/project-gemmi/gemmi/discussions/362
+         doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
          doc.write_file(str(path))
      elif path.name.endswith(".cif.gz"):
-         doc = structure.make_mmcif_document()
+         doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
          cif_str = doc.as_string()
          with gzip.open(path, "wt") as f:
              f.write(cif_str)
@@ -111,14 +122,17 @@ def locate_structure_file(root: Path, pdb_id: str) -> Path:
      Raises:
          FileNotFoundError: If no structure file is found for the given PDB ID.
      """
-     exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb"]
-     # files downloaded from https://www.ebi.ac.uk/pdbe/ website
-     # have file names like pdb6t5y.ent or pdb6t5y.ent.gz for a PDB formatted file.
-     # TODO support pdb6t5y.ent or pdb6t5y.ent.gz file names
+     exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb", ".ent", ".ent.gz"]
      for ext in exts:
-         candidate = root / f"{pdb_id.lower()}{ext}"
-         if candidate.exists():
-             return candidate
+         candidates = (
+             root / f"{pdb_id}{ext}",
+             root / f"{pdb_id.lower()}{ext}",
+             root / f"{pdb_id.upper()}{ext}",
+             root / f"pdb{pdb_id.lower()}{ext}",
+         )
+         for candidate in candidates:
+             if candidate.exists():
+                 return candidate
      msg = f"No structure file found for {pdb_id} in {root}"
      raise FileNotFoundError(msg)

@@ -139,20 +153,84 @@ def glob_structure_files(input_dir: Path) -> Generator[Path]:
  class ChainNotFoundError(IndexError):
      """Exception raised when a chain is not found in a structure."""

-     def __init__(self, chain: str, file: Path | str):
-         super().__init__(f"Chain {chain} not found in {file}")
+     def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
+         super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
          self.chain_id = chain
          self.file = file


- def write_single_chain_pdb_file(input_file: Path, chain2keep: str, output_dir: Path, out_chain: str = "A") -> Path:
+ def _dedup_helices(structure: gemmi.Structure):
+     helix_starts: set[str] = set()
+     duplicate_helix_indexes: list[int] = []
+     for hindex, helix in enumerate(structure.helices):
+         if str(helix.start) in helix_starts:
+             logger.debug(f"Duplicate start helix found: {hindex} {helix.start}, removing")
+             duplicate_helix_indexes.append(hindex)
+         else:
+             helix_starts.add(str(helix.start))
+     for helix_index in reversed(duplicate_helix_indexes):
+         structure.helices.pop(helix_index)
+
+
+ def _dedup_sheets(structure: gemmi.Structure, chain2keep: str):
+     duplicate_sheet_indexes: list[int] = []
+     for sindex, sheet in enumerate(structure.sheets):
+         if sheet.name != chain2keep:
+             duplicate_sheet_indexes.append(sindex)
+     for sheet_index in reversed(duplicate_sheet_indexes):
+         structure.sheets.pop(sheet_index)
+
+
+ def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain: str):
+     old_id = structure.name
+     new_id = structure.name + f"{chain2keep}2{out_chain}"
+     structure.name = new_id
+     structure.info["_entry.id"] = new_id
+     new_title = f"From {old_id} chain {chain2keep} to {out_chain}"
+     structure.info["_struct.title"] = new_title
+     structure.info["_struct_keywords.pdbx_keywords"] = new_title.upper()
+     new_si = gemmi.SoftwareItem()
+     new_si.classification = gemmi.SoftwareItem.Classification.DataExtraction
+     new_si.name = "protein-quest.pdbe.io.write_single_chain_pdb_file"
+     new_si.version = str(__version__)
+     new_si.date = str(datetime.now(tz=UTC).date())
+     structure.meta.software = [*structure.meta.software, new_si]
+
+
+ def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
+     """Get the set of chains in a structure."""
+     return {c for model in structure for c in model}
+
+
+ def write_single_chain_pdb_file(
+     input_file: Path,
+     chain2keep: str,
+     output_dir: Path,
+     out_chain: str = "A",
+     copy_method: CopyMethod = "copy",
+ ) -> Path:
      """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.

+     Also
+
+     - removes ligands and waters
+     - renumbers atom ids
+     - removes chem_comp section from cif files
+     - adds provenance information to the header like software and input file+chain
+
+     This function is equivalent to the following gemmi commands:
+
+     ```shell
+     gemmi convert --remove-lig-wat --select=B --to=cif chain-in/3JRS.cif - | \\
+         gemmi convert --from=cif --rename-chain=B:A - chain-out/3JRS_B2A.gemmi.cif
+     ```
+
      Args:
          input_file: Path to the input mmCIF/pdb file.
          chain2keep: The chain to keep.
          output_dir: Directory to save the output file.
          out_chain: The chain identifier for the output file.
+         copy_method: How to copy when no changes are needed to output file.

      Returns:
          Path to the output mmCIF/pdb file
@@ -162,39 +240,42 @@ def write_single_chain_pdb_file(input_file: Path, chain2keep: str, output_dir: P
          ChainNotFoundError: If the specified chain is not found in the input file.
      """

+     logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
      structure = gemmi.read_structure(str(input_file))
-     model = structure[0]
-
-     # Only count residues of polymer
-     model.remove_ligands_and_waters()
+     structure.setup_entities()

-     chain = find_chain_in_model(model, chain2keep)
+     chain = find_chain_in_structure(structure, chain2keep)
+     chainnames_in_structure = {c.name for c in chains_in_structure(structure)}
      if chain is None:
-         raise ChainNotFoundError(chain2keep, input_file)
+         raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
+     chain_name = chain.name
      name, extension = _split_name_and_extension(input_file.name)
-     output_file = output_dir / f"{name}_{chain.name}2{out_chain}{extension}"
+     output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"

      if output_file.exists():
          logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
          return output_file

-     new_structure = gemmi.Structure()
-     new_structure.resolution = structure.resolution
-     new_id = structure.name + f"{chain2keep}2{out_chain}"
-     new_structure.name = new_id
-     new_structure.info["_entry.id"] = new_id
-     new_title = f"From {structure.info['_entry.id']} chain {chain2keep} to {out_chain}"
-     new_structure.info["_struct.title"] = new_title
-     new_structure.info["_struct_keywords.pdbx_keywords"] = new_title.upper()
-     new_si = gemmi.SoftwareItem()
-     new_si.classification = gemmi.SoftwareItem.Classification.DataExtraction
-     new_si.name = "protein-quest"
-     new_si.version = str(__version__)
-     new_structure.meta.software.append(new_si)
-     new_model = gemmi.Model(1)
-     chain.name = out_chain
-     new_model.add_chain(chain)
-     new_structure.add_model(new_model)
-     write_structure(new_structure, output_file)
+     if chain_name == out_chain and len(chainnames_in_structure) == 1:
+         logger.info(
+             "%s only has chain %s and out_chain is also %s. Copying file to %s.",
+             input_file,
+             chain_name,
+             out_chain,
+             output_file,
+         )
+         copyfile(input_file, output_file, copy_method)
+         return output_file
+
+     gemmi.Selection(chain_name).remove_not_selected(structure)
+     for m in structure:
+         m.remove_ligands_and_waters()
+     structure.setup_entities()
+     structure.rename_chain(chain_name, out_chain)
+     _dedup_helices(structure)
+     _dedup_sheets(structure, out_chain)
+     _add_provenance_info(structure, chain_name, out_chain)
+
+     write_structure(structure, output_file)

      return output_file
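
A usage sketch of the reworked function; paths mirror the gemmi example in the docstring above, and the copy branch is only taken when the input already contains exactly the requested chain under the requested name:

```python
from pathlib import Path

from protein_quest.pdbe.io import write_single_chain_pdb_file

out_dir = Path("chain-out")
out_dir.mkdir(parents=True, exist_ok=True)
output = write_single_chain_pdb_file(
    Path("chain-in/3JRS.cif"),
    chain2keep="B",
    output_dir=out_dir,
    out_chain="A",
    copy_method="symlink",  # only used on the already-single-chain fast path
)
print(output)  # chain-out/3JRS_B2A.cif
```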
protein_quest/ss.py ADDED
@@ -0,0 +1,264 @@
+ """Module for dealing with secondary structure."""
+
+ import logging
+ from collections.abc import Generator, Iterable
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from gemmi import Structure, read_structure, set_leak_warnings
+
+ from protein_quest.converter import PositiveInt, Ratio, converter
+
+ logger = logging.getLogger(__name__)
+
+ # TODO remove once v0.7.4 of gemmi is released,
+ # as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
+ # Swallow gemmi leaked function warnings
+ set_leak_warnings(False)
+
+ # TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
+ # https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
+ # gemmi executable is in https://pypi.org/project/gemmi-program/
+ # `gemmi ss` only prints secondary structure to stdout with `-v` flag.
+
+
+ def nr_of_residues_in_total(structure: Structure) -> int:
+     """Count the total number of residues in the structure.
+
+     Args:
+         structure: The gemmi Structure object to analyze.
+
+     Returns:
+         The total number of residues in the structure.
+     """
+     count = 0
+     for model in structure:
+         for chain in model:
+             count += len(chain)
+     return count
+
+
+ def nr_of_residues_in_helix(structure: Structure) -> int:
+     """Count the number of residues in alpha helices.
+
+     Requires structure to have secondary structure information.
+
+     Args:
+         structure: The gemmi Structure object to analyze.
+
+     Returns:
+         The number of residues in alpha helices.
+     """
+     # For cif files from AlphaFold the helix.length is set to -1
+     # so use resid instead
+     count = 0
+     for helix in structure.helices:
+         end = helix.end.res_id.seqid.num
+         start = helix.start.res_id.seqid.num
+         if end is None or start is None:
+             logger.warning(f"Invalid helix coordinates: {helix.end} or {helix.start}")
+             continue
+         length = end - start + 1
+         count += length
+     return count
+
+
+ def nr_of_residues_in_sheet(structure: Structure) -> int:
+     """Count the number of residues in beta sheets.
+
+     Requires structure to have secondary structure information.
+
+     Args:
+         structure: The gemmi Structure object to analyze.
+
+     Returns:
+         The number of residues in beta sheets.
+     """
+     count = 0
+     for sheet in structure.sheets:
+         for strand in sheet.strands:
+             end = strand.end.res_id.seqid.num
+             start = strand.start.res_id.seqid.num
+             if end is None or start is None:
+                 logger.warning(f"Invalid strand coordinates: {strand.end} or {strand.start}")
+                 continue
+             length = end - start + 1
+             count += length
+     return count
+
+
+ @dataclass
+ class SecondaryStructureFilterQuery:
+     """Query object to filter on secondary structure.
+
+     Parameters:
+         abs_min_helix_residues: Minimum number of residues in helices (absolute).
+         abs_max_helix_residues: Maximum number of residues in helices (absolute).
+         abs_min_sheet_residues: Minimum number of residues in sheets (absolute).
+         abs_max_sheet_residues: Maximum number of residues in sheets (absolute).
+         ratio_min_helix_residues: Minimum number of residues in helices (relative).
+         ratio_max_helix_residues: Maximum number of residues in helices (relative).
+         ratio_min_sheet_residues: Minimum number of residues in sheets (relative).
+         ratio_max_sheet_residues: Maximum number of residues in sheets (relative).
+     """
+
+     abs_min_helix_residues: PositiveInt | None = None
+     abs_max_helix_residues: PositiveInt | None = None
+     abs_min_sheet_residues: PositiveInt | None = None
+     abs_max_sheet_residues: PositiveInt | None = None
+     ratio_min_helix_residues: Ratio | None = None
+     ratio_max_helix_residues: Ratio | None = None
+     ratio_min_sheet_residues: Ratio | None = None
+     ratio_max_sheet_residues: Ratio | None = None
+
+
+ def _check_range(min_val, max_val, label):
+     if min_val is not None and max_val is not None and min_val >= max_val:
+         msg = f"Invalid {label} range: min {min_val} must be smaller than max {max_val}"
+         raise ValueError(msg)
+
+
+ base_query_hook = converter.get_structure_hook(SecondaryStructureFilterQuery)
+
+
+ @converter.register_structure_hook
+ def secondary_structure_filter_query_hook(value, _type) -> SecondaryStructureFilterQuery:
+     result: SecondaryStructureFilterQuery = base_query_hook(value, _type)
+     _check_range(result.abs_min_helix_residues, result.abs_max_helix_residues, "absolute helix residue")
+     _check_range(result.abs_min_sheet_residues, result.abs_max_sheet_residues, "absolute sheet residue")
+     _check_range(result.ratio_min_helix_residues, result.ratio_max_helix_residues, "ratio helix residue")
+     _check_range(result.ratio_min_sheet_residues, result.ratio_max_sheet_residues, "ratio sheet residue")
+     return result
+
+
+ @dataclass
+ class SecondaryStructureStats:
+     """Statistics about the secondary structure of a protein.
+
+     Parameters:
+         nr_residues: Total number of residues in the structure.
+         nr_helix_residues: Number of residues in helices.
+         nr_sheet_residues: Number of residues in sheets.
+         helix_ratio: Ratio of residues in helices.
+         sheet_ratio: Ratio of residues in sheets.
+     """
+
+     nr_residues: PositiveInt
+     nr_helix_residues: PositiveInt
+     nr_sheet_residues: PositiveInt
+     helix_ratio: Ratio
+     sheet_ratio: Ratio
+
+
+ @dataclass
+ class SecondaryStructureFilterResult:
+     """Result of filtering on secondary structure.
+
+     Parameters:
+         stats: The secondary structure statistics.
+         passed: Whether the structure passed the filtering criteria.
+     """
+
+     stats: SecondaryStructureStats
+     passed: bool = False
+
+
+ def _gather_stats(structure: Structure) -> SecondaryStructureStats:
+     nr_total_residues = nr_of_residues_in_total(structure)
+     nr_helix_residues = nr_of_residues_in_helix(structure)
+     nr_sheet_residues = nr_of_residues_in_sheet(structure)
+     if nr_total_residues == 0:
+         msg = "Structure has zero residues; cannot compute secondary structure ratios."
+         raise ValueError(msg)
+     helix_ratio = nr_helix_residues / nr_total_residues
+     sheet_ratio = nr_sheet_residues / nr_total_residues
+     return SecondaryStructureStats(
+         nr_residues=nr_total_residues,
+         nr_helix_residues=nr_helix_residues,
+         nr_sheet_residues=nr_sheet_residues,
+         helix_ratio=helix_ratio,
+         sheet_ratio=sheet_ratio,
+     )
+
+
+ def filter_on_secondary_structure(
+     structure: Structure,
+     query: SecondaryStructureFilterQuery,
+ ) -> SecondaryStructureFilterResult:
+     """Filter a structure based on secondary structure criteria.
+
+     Args:
+         structure: The gemmi Structure object to analyze.
+         query: The filtering criteria to apply.
+
+     Returns:
+         Filtering statistics and whether structure passed.
+     """
+     stats = _gather_stats(structure)
+     conditions: list[bool] = []
+
+     # Helix absolute thresholds
+     if query.abs_min_helix_residues is not None:
+         conditions.append(stats.nr_helix_residues >= query.abs_min_helix_residues)
+     if query.abs_max_helix_residues is not None:
+         conditions.append(stats.nr_helix_residues <= query.abs_max_helix_residues)
+
+     # Helix ratio thresholds
+     if query.ratio_min_helix_residues is not None:
+         conditions.append(stats.helix_ratio >= query.ratio_min_helix_residues)
+     if query.ratio_max_helix_residues is not None:
+         conditions.append(stats.helix_ratio <= query.ratio_max_helix_residues)
+
+     # Sheet absolute thresholds
+     if query.abs_min_sheet_residues is not None:
+         conditions.append(stats.nr_sheet_residues >= query.abs_min_sheet_residues)
+     if query.abs_max_sheet_residues is not None:
+         conditions.append(stats.nr_sheet_residues <= query.abs_max_sheet_residues)
+
+     # Sheet ratio thresholds
+     if query.ratio_min_sheet_residues is not None:
+         conditions.append(stats.sheet_ratio >= query.ratio_min_sheet_residues)
+     if query.ratio_max_sheet_residues is not None:
+         conditions.append(stats.sheet_ratio <= query.ratio_max_sheet_residues)
+
+     if not conditions:
+         msg = "No filtering conditions provided. Please specify at least one condition."
+         raise ValueError(msg)
+     passed = all(conditions)
+     return SecondaryStructureFilterResult(stats=stats, passed=passed)
+
+
+ def filter_file_on_secondary_structure(
+     file_path: Path,
+     query: SecondaryStructureFilterQuery,
+ ) -> SecondaryStructureFilterResult:
+     """Filter a structure file based on secondary structure criteria.
+
+     Args:
+         file_path: The path to the structure file to analyze.
+         query: The filtering criteria to apply.
+
+     Returns:
+         Filtering statistics and whether file passed.
+     """
+     structure = read_structure(str(file_path))
+     return filter_on_secondary_structure(structure, query)
+
+
+ def filter_files_on_secondary_structure(
+     file_paths: Iterable[Path],
+     query: SecondaryStructureFilterQuery,
+ ) -> Generator[tuple[Path, SecondaryStructureFilterResult]]:
+     """Filter multiple structure files based on secondary structure criteria.
+
+     Args:
+         file_paths: A list of paths to the structure files to analyze.
+         query: The filtering criteria to apply.
+
+     Yields:
+         For each file, the filtering statistics and whether the structure passed.
+     """
+     # TODO check if quick enough in serial mode, if not switch to dask map
+     for file_path in file_paths:
+         result = filter_file_on_secondary_structure(file_path, query)
+         yield file_path, result
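
A sketch that drives the same filter from Python rather than the CLI; directory names are assumptions, and the query mirrors the README's mostly-helical example:

```python
from pathlib import Path

from protein_quest.converter import converter
from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure

# Mostly alpha helices, no beta sheets; unset fields default to None (no constraint).
query = converter.structure(
    {"ratio_min_helix_residues": 0.5, "ratio_max_sheet_residues": 0.0},
    SecondaryStructureFilterQuery,
)
files = sorted(Path("filtered-chains").glob("*.cif"))
for path, result in filter_files_on_secondary_structure(files, query):
    print(path.name, round(result.stats.helix_ratio, 3), result.passed)
```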
protein_quest/taxonomy.py CHANGED
@@ -9,9 +9,9 @@ from typing import Literal, get_args
  from aiohttp.client import ClientResponse
  from aiohttp_retry import RetryClient
  from cattrs.gen import make_dict_structure_fn, override
- from cattrs.preconf.orjson import make_converter
  from yarl import URL

+ from protein_quest.converter import converter
  from protein_quest.go import TextIOWrapper
  from protein_quest.utils import friendly_session

@@ -42,8 +42,6 @@ class SearchTaxonResponse:
      results: list[Taxon]


- converter = make_converter()
-
  converter.register_structure_hook(
      Taxon,
      make_dict_structure_fn(
protein_quest/utils.py CHANGED
@@ -2,11 +2,12 @@

  import asyncio
  import logging
+ import shutil
  from collections.abc import Coroutine, Iterable
  from contextlib import asynccontextmanager
  from pathlib import Path
  from textwrap import dedent
- from typing import Any
+ from typing import Any, Literal, get_args

  import aiofiles
  import aiohttp
@@ -138,3 +139,29 @@ def run_async[R](coroutine: Coroutine[Any, Any, R]) -> R:
          return asyncio.run(coroutine)
      except RuntimeError as e:
          raise NestedAsyncIOLoopError from e
+
+
+ CopyMethod = Literal["copy", "symlink"]
+ copy_methods = set(get_args(CopyMethod))
+
+
+ def copyfile(source: Path, target: Path, copy_method: CopyMethod = "copy"):
+     """Make target path be same file as source by either copying or symlinking.
+
+     Args:
+         source: The source file to copy or symlink.
+         target: The target file to create.
+         copy_method: The method to use for copying.
+
+     Raises:
+         FileNotFoundError: If the source file or parent of target does not exist.
+         ValueError: If the method is not "copy" or "symlink".
+     """
+     if copy_method == "copy":
+         shutil.copyfile(source, target)
+     elif copy_method == "symlink":
+         rel_source = source.relative_to(target.parent, walk_up=True)
+         target.symlink_to(rel_source)
+     else:
+         msg = f"Unknown method: {copy_method}"
+         raise ValueError(msg)
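
A minimal sketch of the helper; the symlink branch creates a relative link and relies on `Path.relative_to(..., walk_up=True)`, which exists only on Python 3.12+:

```python
from pathlib import Path

from protein_quest.utils import copyfile

src = Path("downloads/structure.cif")
dst = Path("filtered/structure.cif")
dst.parent.mkdir(parents=True, exist_ok=True)

copyfile(src, dst, "copy")       # byte-for-byte copy via shutil.copyfile
# copyfile(src, dst, "symlink")  # dst -> ../downloads/structure.cif
```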
protein_quest-0.3.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: protein_quest
- Version: 0.3.1
+ Version: 0.3.2
  Summary: Search/retrieve/filter proteins and protein structures
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -59,9 +59,11 @@ graph TB;
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
+ fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
+ fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
+ residuefilter --> |mmcif_files| ssfilter
  classDef dashedBorder stroke-dasharray: 5 5;
  goterm:::dashedBorder
  taxonomy:::dashedBorder
@@ -175,6 +177,18 @@ protein-quest filter residue \
      ./filtered-chains ./filtered
  ```

+ ### To filter on secondary structure
+
+ To filter on structures that are mostly alpha helices and have no beta sheets:
+
+ ```shell
+ protein-quest filter secondary-structure \
+     --ratio-min-helix-residues 0.5 \
+     --ratio-max-sheet-residues 0.0 \
+     --write-stats filtered-ss/stats.csv \
+     ./filtered-chains ./filtered-ss
+ ```
+
  ### Search Taxonomy

  ```shell
protein_quest-0.3.2.dist-info/RECORD ADDED
@@ -0,0 +1,26 @@
+ protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ protein_quest/__version__.py,sha256=tDIN8WjNdFKRoXsf6tArV0_n6nbcPEBWNv1zuhaRbKo,56
+ protein_quest/cli.py,sha256=k4HC282QkbAAIk614vIJgaKfkS3XD9hYj7E5hEuiDxA,37893
+ protein_quest/converter.py,sha256=tSDw7HOlC7UoWryr_G-sHGzGq8nwflzSq8o7Gv1hWuQ,1382
+ protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
+ protein_quest/filters.py,sha256=-gasSXR4g5SzYSYbkfcDwR-tm2KCAhCMdpIVJrUPR1w,5224
+ protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
+ protein_quest/mcp_server.py,sha256=auftrx4aBZp1P-pBcunkPiSmXLtOIZ6MTuhUuW7yrGY,7241
+ protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
+ protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ protein_quest/ss.py,sha256=MMHgqKPxjYpjyExiqslWjmyG7aeForeAeJorCYdh75g,9663
+ protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
+ protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
+ protein_quest/utils.py,sha256=z4PPPcog6nvPhA93DWVf7stv5uJ4h_2BP5owdhoO5mo,5626
+ protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
+ protein_quest/alphafold/confidence.py,sha256=pYIuwYdkuPuHLagcX1dSvSyZ_84xboRLfHUxkEoc4MY,6766
+ protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
+ protein_quest/alphafold/fetch.py,sha256=iFHORaO-2NvPwmpm33tfOFUcSJx8mBGwMXxwc4bRuk8,11336
+ protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
+ protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
+ protein_quest/pdbe/io.py,sha256=iGLvmsD-eEYnrgZDYfkGWIDCzwDRRD5dwqB480talCs,10037
+ protein_quest-0.3.2.dist-info/METADATA,sha256=wcURSjBlmkCt-ddhZX7xRYrL-7tT1VuBpJ36_mP0Iuk,7760
+ protein_quest-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ protein_quest-0.3.2.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+ protein_quest-0.3.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ protein_quest-0.3.2.dist-info/RECORD,,
protein_quest-0.3.1.dist-info/RECORD REMOVED
@@ -1,24 +0,0 @@
- protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- protein_quest/__version__.py,sha256=Bu2gp24I4eIxc1qgY2e0PnF8N-szjUpFQwVAe10IRAo,56
- protein_quest/cli.py,sha256=xjiWtRDqv-Ruv1fpvXq4dmDSuuyewxw81akDs1ktVbI,31772
- protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
- protein_quest/filters.py,sha256=3vqfFH87Lz7r9uYiSvwMxzShMfRNv1Zv_freJtDljrU,4051
- protein_quest/go.py,sha256=ycV3-grxuIKFt28bFgH6iRKmt5AEGi7txoTbaAnBxQE,5684
- protein_quest/mcp_server.py,sha256=1_CGC0peqoNUFBvgFWupKwIWjmHsKxN5Vxy1K7dt5Dw,7130
- protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
- protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- protein_quest/taxonomy.py,sha256=wPzLjum5n_SEkL2rHUKvyRnjL1pG7bhEnE2vMmXixEc,5105
- protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
- protein_quest/utils.py,sha256=YhlTJreIr1bExbh1M514l6sz4GmLVa3RN57mI1kjjuw,4730
- protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
- protein_quest/alphafold/confidence.py,sha256=GGd_vYsqVvs9InvFKtqHdGKB_61GHllPmDyIztvzG7E,5625
- protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
- protein_quest/alphafold/fetch.py,sha256=1mDbQNm01cxlwFNDsKHBWD7MEwzB3PaheskdaLN7XJs,11491
- protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
- protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
- protein_quest/pdbe/io.py,sha256=J6fHlRLHLALnpxDgSUUnFCNFV9Hr3u6eJDO6j81ftT4,6936
- protein_quest-0.3.1.dist-info/METADATA,sha256=fWvmMbm5aEMb3WbWgPAqwEOWeYJSY47iuZLaRIgBuuk,7305
- protein_quest-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- protein_quest-0.3.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
- protein_quest-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- protein_quest-0.3.1.dist-info/RECORD,,