protein-quest 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. {protein_quest-0.3.1 → protein_quest-0.3.2}/PKG-INFO +18 -4
  2. {protein_quest-0.3.1 → protein_quest-0.3.2}/README.md +17 -3
  3. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/alphafold.ipynb +3 -3
  4. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/__version__.py +1 -1
  5. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/confidence.py +42 -15
  6. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/fetch.py +2 -4
  7. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/cli.py +153 -13
  8. protein_quest-0.3.2/src/protein_quest/converter.py +45 -0
  9. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/filters.py +39 -7
  10. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/go.py +1 -4
  11. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/mcp_server.py +4 -0
  12. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/pdbe/io.py +122 -41
  13. protein_quest-0.3.2/src/protein_quest/ss.py +264 -0
  14. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/taxonomy.py +1 -3
  15. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/utils.py +28 -1
  16. protein_quest-0.3.2/tests/alphafold/test_confidence.py +155 -0
  17. protein_quest-0.3.2/tests/fixtures/3JRS_B2A.cif.gz +0 -0
  18. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/test_io.py +39 -4
  19. protein_quest-0.3.2/tests/test_ss.py +227 -0
  20. protein_quest-0.3.2/tests/test_utils.py +31 -0
  21. protein_quest-0.3.1/tests/alphafold/test_confidence.py +0 -63
  22. {protein_quest-0.3.1 → protein_quest-0.3.2}/.github/workflows/ci.yml +0 -0
  23. {protein_quest-0.3.1 → protein_quest-0.3.2}/.github/workflows/pages.yml +0 -0
  24. {protein_quest-0.3.1 → protein_quest-0.3.2}/.github/workflows/pypi-publish.yml +0 -0
  25. {protein_quest-0.3.1 → protein_quest-0.3.2}/.gitignore +0 -0
  26. {protein_quest-0.3.1 → protein_quest-0.3.2}/.vscode/extensions.json +0 -0
  27. {protein_quest-0.3.1 → protein_quest-0.3.2}/CITATION.cff +0 -0
  28. {protein_quest-0.3.1 → protein_quest-0.3.2}/CODE_OF_CONDUCT.md +0 -0
  29. {protein_quest-0.3.1 → protein_quest-0.3.2}/CONTRIBUTING.md +0 -0
  30. {protein_quest-0.3.1 → protein_quest-0.3.2}/LICENSE +0 -0
  31. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/CONTRIBUTING.md +0 -0
  32. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/cli_doc_hook.py +0 -0
  33. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/index.md +0 -0
  34. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/.gitignore +0 -0
  35. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/index.md +0 -0
  36. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/pdbe.ipynb +0 -0
  37. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/uniprot.ipynb +0 -0
  38. {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/protein-quest-mcp.png +0 -0
  39. {protein_quest-0.3.1 → protein_quest-0.3.2}/mkdocs.yml +0 -0
  40. {protein_quest-0.3.1 → protein_quest-0.3.2}/pyproject.toml +0 -0
  41. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/__init__.py +0 -0
  42. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/__init__.py +0 -0
  43. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/entry_summary.py +0 -0
  44. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/emdb.py +0 -0
  45. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/parallel.py +0 -0
  46. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/pdbe/__init__.py +0 -0
  47. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/pdbe/fetch.py +0 -0
  48. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/py.typed +0 -0
  49. {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/uniprot.py +0 -0
  50. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
  51. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
  52. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/test_entry_summary.py +0 -0
  53. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/test_fetch.py +0 -0
  54. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
  55. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
  56. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
  57. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
  58. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
  59. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
  60. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
  61. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
  62. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
  63. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/fixtures/2y29.cif +0 -0
  64. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/test_fetch.py +0 -0
  65. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_cli.py +0 -0
  66. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_emdb.py +0 -0
  67. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_go.py +0 -0
  68. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_mcp.py +0 -0
  69. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_taxonomy.py +0 -0
  70. {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_uniprot.py +0 -0
  71. {protein_quest-0.3.1 → protein_quest-0.3.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -59,9 +59,11 @@ graph TB;
59
59
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
60
60
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
61
61
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
62
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
63
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
64
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
62
+ fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
63
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
64
+ fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
65
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
66
+ residuefilter --> |mmcif_files| ssfilter
65
67
  classDef dashedBorder stroke-dasharray: 5 5;
66
68
  goterm:::dashedBorder
67
69
  taxonomy:::dashedBorder
@@ -175,6 +177,18 @@ protein-quest filter residue \
175
177
  ./filtered-chains ./filtered
176
178
  ```
177
179
 
180
+ ### To filter on secondary structure
181
+
182
+ To keep only structures that are mostly alpha helices and have no beta sheets:
183
+
184
+ ```shell
185
+ protein-quest filter secondary-structure \
186
+ --ratio-min-helix-residues 0.5 \
187
+ --ratio-max-sheet-residues 0.0 \
188
+ --write-stats filtered-ss/stats.csv \
189
+ ./filtered-chains ./filtered-ss
190
+ ```
191
+
178
192
  ### Search Taxonomy
179
193
 
180
194
  ```shell
@@ -29,9 +29,11 @@ graph TB;
29
29
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
30
30
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
31
31
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
32
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
33
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
34
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
32
+ fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
33
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
34
+ fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
35
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
36
+ residuefilter --> |mmcif_files| ssfilter
35
37
  classDef dashedBorder stroke-dasharray: 5 5;
36
38
  goterm:::dashedBorder
37
39
  taxonomy:::dashedBorder
@@ -145,6 +147,18 @@ protein-quest filter residue \
145
147
  ./filtered-chains ./filtered
146
148
  ```
147
149
 
150
+ ### To filter on secondary structure
151
+
152
+ To keep only structures that are mostly alpha helices and have no beta sheets:
153
+
154
+ ```shell
155
+ protein-quest filter secondary-structure \
156
+ --ratio-min-helix-residues 0.5 \
157
+ --ratio-max-sheet-residues 0.0 \
158
+ --write-stats filtered-ss/stats.csv \
159
+ ./filtered-chains ./filtered-ss
160
+ ```
161
+
148
162
  ### Search Taxonomy
149
163
 
150
164
  ```shell
@@ -301,7 +301,7 @@
301
301
  "metadata": {},
302
302
  "outputs": [],
303
303
  "source": [
304
- "query = ConfidenceFilterQuery(confidence=80, min_threshold=100, max_threshold=1000)"
304
+ "query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
305
305
  ]
306
306
  },
307
307
  {
@@ -318,7 +318,7 @@
318
318
  },
319
319
  {
320
320
  "cell_type": "code",
321
- "execution_count": 17,
321
+ "execution_count": null,
322
322
  "id": "6a6f8e3f",
323
323
  "metadata": {},
324
324
  "outputs": [
@@ -338,7 +338,7 @@
338
338
  "source": [
339
339
  "list(\n",
340
340
  " filter_files_on_confidence(\n",
341
- " input_files, ConfidenceFilterQuery(confidence=80, min_threshold=100, max_threshold=1000), output_dir\n",
341
+ " input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
342
342
  " )\n",
343
343
  ")"
344
344
  ]
@@ -1,2 +1,2 @@
1
- __version__ = "0.3.1"
1
+ __version__ = "0.3.2"
2
2
  """The version of the package."""
@@ -7,7 +7,10 @@ from pathlib import Path
7
7
 
8
8
  import gemmi
9
9
 
10
+ from protein_quest.converter import Percentage, PositiveInt, converter
10
11
  from protein_quest.pdbe.io import write_structure
12
+ from protein_quest.ss import nr_of_residues_in_total
13
+ from protein_quest.utils import CopyMethod, copyfile
11
14
 
12
15
  """
13
16
  Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
73
76
  Parameters:
74
77
  confidence: The confidence threshold for filtering residues.
75
78
  Residues with a pLDDT (b-factor) above this value are considered high confidence.
76
- min_threshold: The minimum number of high-confidence residues required to keep the structure.
77
- max_threshold: The maximum number of high-confidence residues required to keep the structure.
79
+ min_residues: The minimum number of high-confidence residues required to keep the structure.
80
+ max_residues: The maximum number of high-confidence residues allowed to keep the structure.
78
81
  """
79
82
 
80
- confidence: float
81
- min_threshold: int
82
- max_threshold: int
83
+ confidence: Percentage
84
+ min_residues: PositiveInt
85
+ max_residues: PositiveInt
86
+
87
+
88
+ base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
89
+
90
+
91
+ @converter.register_structure_hook
92
+ def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
93
+ result: ConfidenceFilterQuery = base_query_hook(val, _type)
94
+ if result.min_residues > result.max_residues:
95
+ msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
96
+ raise ValueError(msg)
97
+ return result
83
98
 
84
99
 
85
100
  @dataclass
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
93
108
  """
94
109
 
95
110
  input_file: str
96
- count: int
111
+ count: PositiveInt
97
112
  filtered_file: Path | None = None
98
113
 
99
114
 
100
- def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
115
+ def filter_file_on_residues(
116
+ file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
117
+ ) -> ConfidenceFilterResult:
101
118
  """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
102
119
 
103
120
  Args:
104
121
  file: The path to the PDB file to filter.
105
122
  query: The confidence filter query.
106
123
  filtered_dir: The directory to save the filtered PDB file.
124
+ copy_method: How to copy when no residues have to be removed.
107
125
 
108
126
  Returns:
109
127
  result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
112
130
  structure = gemmi.read_structure(str(file))
113
131
  residues = set(find_high_confidence_residues(structure, query.confidence))
114
132
  count = len(residues)
115
- if count < query.min_threshold or count > query.max_threshold:
133
+ if count < query.min_residues or count > query.max_residues:
116
134
  # Skip structure that is outside the min and max threshold
117
135
  # just return number of high confidence residues
118
136
  return ConfidenceFilterResult(
119
137
  input_file=file.name,
120
138
  count=count,
121
139
  )
140
+ total_residues = nr_of_residues_in_total(structure)
122
141
  filtered_file = filtered_dir / file.name
123
- new_structure = filter_out_low_confidence_residues(
124
- structure,
125
- residues,
126
- )
127
- write_structure(new_structure, filtered_file)
142
+ if count == total_residues:
143
+ # if no residues have to be removed then copy instead of slower gemmi writing
144
+ copyfile(file, filtered_file, copy_method)
145
+ else:
146
+ new_structure = filter_out_low_confidence_residues(
147
+ structure,
148
+ residues,
149
+ )
150
+ write_structure(new_structure, filtered_file)
128
151
  return ConfidenceFilterResult(
129
152
  input_file=file.name,
130
153
  count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
133
156
 
134
157
 
135
158
  def filter_files_on_confidence(
136
- alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
159
+ alphafold_pdb_files: list[Path],
160
+ query: ConfidenceFilterQuery,
161
+ filtered_dir: Path,
162
+ copy_method: CopyMethod = "copy",
137
163
  ) -> Generator[ConfidenceFilterResult]:
138
164
  """Filter AlphaFoldDB structures based on confidence.
139
165
 
@@ -141,6 +167,7 @@ def filter_files_on_confidence(
141
167
  alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
142
168
  query: The confidence filter query containing the confidence thresholds.
143
169
  filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
170
+ copy_method: How to copy when a direct copy is possible.
144
171
 
145
172
  Yields:
146
173
  For each mmcif/PDB files yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
150
177
  # In ../filters.py:filter_files_on_residues() we filter on number of residues on a file level
151
178
  # here we filter on file level and inside file remove low confidence residues
152
179
  for pdb_file in alphafold_pdb_files:
153
- yield filter_file_on_residues(pdb_file, query, filtered_dir)
180
+ yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args
9
9
 
10
10
  from aiohttp_retry import RetryClient
11
11
  from aiopath import AsyncPath
12
- from cattrs.preconf.orjson import make_converter
13
12
  from tqdm.asyncio import tqdm
14
13
  from yarl import URL
15
14
 
16
15
  from protein_quest.alphafold.entry_summary import EntrySummary
16
+ from protein_quest.converter import converter
17
17
  from protein_quest.utils import friendly_session, retrieve_files, run_async
18
18
 
19
19
  logger = logging.getLogger(__name__)
20
- converter = make_converter()
21
- """cattrs converter to read AlphaFold summary JSON document."""
22
- converter.register_structure_hook(URL, lambda v, _: URL(v))
20
+
23
21
 
24
22
  DownloadableFormat = Literal[
25
23
  "summary",
@@ -23,13 +23,16 @@ from protein_quest.__version__ import __version__
23
23
  from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
24
24
  from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
25
25
  from protein_quest.alphafold.fetch import fetch_many as af_fetch
26
+ from protein_quest.converter import converter
26
27
  from protein_quest.emdb import fetch as emdb_fetch
27
28
  from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
28
29
  from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
29
30
  from protein_quest.pdbe import fetch as pdbe_fetch
30
31
  from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
32
+ from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
31
33
  from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
32
34
  from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
35
+ from protein_quest.utils import CopyMethod, copy_methods, copyfile
33
36
 
34
37
  logger = logging.getLogger(__name__)
35
38
 
@@ -282,6 +285,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
282
285
  parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
283
286
 
284
287
 
288
+ def _add_copy_method_argument(parser: argparse.ArgumentParser):
289
+ """Add copy method argument to parser."""
290
+ default_copy_method = "symlink"
291
+ if os.name == "nt":
292
+ # On Windows you need developer mode or admin privileges to create symlinks
293
+ # so we default to copying files instead of symlinking
294
+ default_copy_method = "copy"
295
+ parser.add_argument(
296
+ "--copy-method",
297
+ type=str,
298
+ choices=copy_methods,
299
+ default=default_copy_method,
300
+ help="How to copy files when no changes are needed to output file.",
301
+ )
302
+
303
+
285
304
  def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
286
305
  """Add filter confidence subcommand parser."""
287
306
  parser = subparsers.add_parser(
@@ -312,6 +331,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
312
331
  In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
313
332
  Use `-` for stdout."""),
314
333
  )
334
+ _add_copy_method_argument(parser)
315
335
 
316
336
 
317
337
  def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -347,8 +367,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
347
367
  )
348
368
  parser.add_argument(
349
369
  "--scheduler-address",
350
- help="Address of the Dask scheduler to connect to. If not provided, will create a local cluster.",
370
+ help=dedent("""Address of the Dask scheduler to connect to.
371
+ If not provided, will create a local cluster.
372
+ If set to `sequential` will run tasks sequentially."""),
351
373
  )
374
+ _add_copy_method_argument(parser)
352
375
 
353
376
 
354
377
  def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -371,6 +394,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
371
394
  )
372
395
  parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
373
396
  parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
397
+ _add_copy_method_argument(parser)
374
398
  parser.add_argument(
375
399
  "--write-stats",
376
400
  type=argparse.FileType("w", encoding="UTF-8"),
@@ -381,6 +405,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
381
405
  )
382
406
 
383
407
 
408
+ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
409
+ """Add filter secondary structure subcommand parser."""
410
+ parser = subparsers.add_parser(
411
+ "secondary-structure",
412
+ help="Filter PDB/mmCIF files by secondary structure",
413
+ description="Filter PDB/mmCIF files by secondary structure",
414
+ formatter_class=ArgumentDefaultsRichHelpFormatter,
415
+ )
416
+ parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
417
+ parser.add_argument(
418
+ "output_dir",
419
+ type=Path,
420
+ help=dedent("""\
421
+ Directory to write filtered PDB/mmCIF files. Files are copied without modification.
422
+ """),
423
+ )
424
+ parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
425
+ parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
426
+ parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
427
+ parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
428
+ parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
429
+ parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
430
+ parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
431
+ parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
432
+ _add_copy_method_argument(parser)
433
+ parser.add_argument(
434
+ "--write-stats",
435
+ type=argparse.FileType("w", encoding="UTF-8"),
436
+ help=dedent("""
437
+ Write filter statistics to file. In CSV format with columns:
438
+ `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
439
+ <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
440
+ Use `-` for stdout.
441
+ """),
442
+ )
443
+
444
+
384
445
  def _add_search_subcommands(subparsers: argparse._SubParsersAction):
385
446
  """Add search command and its subcommands."""
386
447
  parser = subparsers.add_parser(
@@ -422,6 +483,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
422
483
  _add_filter_confidence_parser(subsubparsers)
423
484
  _add_filter_chain_parser(subsubparsers)
424
485
  _add_filter_residue_parser(subsubparsers)
486
+ _add_filter_ss_parser(subsubparsers)
425
487
 
426
488
 
427
489
  def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -620,21 +682,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
620
682
  # to get rid of duplication
621
683
  input_dir = structure(args.input_dir, Path)
622
684
  output_dir = structure(args.output_dir, Path)
623
- confidence_threshold = structure(args.confidence_threshold, float)
624
- # TODO add min/max
625
- min_residues = structure(args.min_residues, int)
626
- max_residues = structure(args.max_residues, int)
685
+
686
+ confidence_threshold = args.confidence_threshold
687
+ min_residues = args.min_residues
688
+ max_residues = args.max_residues
627
689
  stats_file: TextIOWrapper | None = args.write_stats
690
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
628
691
 
629
692
  output_dir.mkdir(parents=True, exist_ok=True)
630
693
  input_files = sorted(glob_structure_files(input_dir))
631
694
  nr_input_files = len(input_files)
632
695
  rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
633
- query = structure(
696
+ query = converter.structure(
634
697
  {
635
698
  "confidence": confidence_threshold,
636
- "min_threshold": min_residues,
637
- "max_threshold": max_residues,
699
+ "min_residues": min_residues,
700
+ "max_residues": max_residues,
638
701
  },
639
702
  ConfidenceFilterQuery,
640
703
  )
@@ -643,7 +706,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
643
706
  writer.writerow(["input_file", "residue_count", "passed", "output_file"])
644
707
 
645
708
  passed_count = 0
646
- for r in tqdm(filter_files_on_confidence(input_files, query, output_dir), total=len(input_files), unit="file"):
709
+ for r in tqdm(
710
+ filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
711
+ total=len(input_files),
712
+ unit="file",
713
+ ):
647
714
  if r.filtered_file:
648
715
  passed_count += 1
649
716
  if stats_file:
@@ -656,9 +723,10 @@ def _handle_filter_confidence(args: argparse.Namespace):
656
723
 
657
724
  def _handle_filter_chain(args):
658
725
  input_dir = args.input_dir
659
- output_dir = args.output_dir
726
+ output_dir = structure(args.output_dir, Path)
660
727
  pdb_id2chain_mapping_file = args.chains
661
- scheduler_address = args.scheduler_address
728
+ scheduler_address = structure(args.scheduler_address, str | None) # pyright: ignore[reportArgumentType]
729
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
662
730
 
663
731
  # make sure files in input dir with entries in mapping file are the same
664
732
  # complain when files from mapping file are missing on disk
@@ -683,18 +751,25 @@ def _handle_filter_chain(args):
683
751
  rprint("[red]No valid structure files found. Exiting.")
684
752
  sys.exit(1)
685
753
 
686
- results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
754
+ results = filter_files_on_chain(
755
+ file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
756
+ )
687
757
 
688
758
  nr_written = len([r for r in results if r.passed])
689
759
 
690
760
  rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")
691
761
 
762
+ for result in results:
763
+ if result.discard_reason:
764
+ rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
765
+
692
766
 
693
767
  def _handle_filter_residue(args):
694
768
  input_dir = structure(args.input_dir, Path)
695
769
  output_dir = structure(args.output_dir, Path)
696
770
  min_residues = structure(args.min_residues, int)
697
771
  max_residues = structure(args.max_residues, int)
772
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
698
773
  stats_file: TextIOWrapper | None = args.write_stats
699
774
 
700
775
  if stats_file:
@@ -705,7 +780,9 @@ def _handle_filter_residue(args):
705
780
  input_files = sorted(glob_structure_files(input_dir))
706
781
  nr_total = len(input_files)
707
782
  rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
708
- for r in filter_files_on_residues(input_files, output_dir, min_residues=min_residues, max_residues=max_residues):
783
+ for r in filter_files_on_residues(
784
+ input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
785
+ ):
709
786
  if stats_file:
710
787
  writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
711
788
  if r.passed:
@@ -716,6 +793,68 @@ def _handle_filter_residue(args):
716
793
  rprint(f"Statistics written to {stats_file.name}")
717
794
 
718
795
 
796
+ def _handle_filter_ss(args):
797
+ input_dir = structure(args.input_dir, Path)
798
+ output_dir = structure(args.output_dir, Path)
799
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
800
+ stats_file: TextIOWrapper | None = args.write_stats
801
+
802
+ raw_query = {
803
+ "abs_min_helix_residues": args.abs_min_helix_residues,
804
+ "abs_max_helix_residues": args.abs_max_helix_residues,
805
+ "abs_min_sheet_residues": args.abs_min_sheet_residues,
806
+ "abs_max_sheet_residues": args.abs_max_sheet_residues,
807
+ "ratio_min_helix_residues": args.ratio_min_helix_residues,
808
+ "ratio_max_helix_residues": args.ratio_max_helix_residues,
809
+ "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
810
+ "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
811
+ }
812
+ query = converter.structure(raw_query, SecondaryStructureFilterQuery)
813
+ input_files = sorted(glob_structure_files(input_dir))
814
+ nr_total = len(input_files)
815
+ output_dir.mkdir(parents=True, exist_ok=True)
816
+
817
+ if stats_file:
818
+ writer = csv.writer(stats_file)
819
+ writer.writerow(
820
+ [
821
+ "input_file",
822
+ "nr_residues",
823
+ "nr_helix_residues",
824
+ "nr_sheet_residues",
825
+ "helix_ratio",
826
+ "sheet_ratio",
827
+ "passed",
828
+ "output_file",
829
+ ]
830
+ )
831
+
832
+ rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
833
+ nr_passed = 0
834
+ for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
835
+ output_file: Path | None = None
836
+ if result.passed:
837
+ output_file = output_dir / input_file.name
838
+ copyfile(input_file, output_file, copy_method)
839
+ nr_passed += 1
840
+ if stats_file:
841
+ writer.writerow(
842
+ [
843
+ input_file,
844
+ result.stats.nr_residues,
845
+ result.stats.nr_helix_residues,
846
+ result.stats.nr_sheet_residues,
847
+ round(result.stats.helix_ratio, 3),
848
+ round(result.stats.sheet_ratio, 3),
849
+ result.passed,
850
+ output_file,
851
+ ]
852
+ )
853
+ rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
854
+ if stats_file:
855
+ rprint(f"Statistics written to {stats_file.name}")
856
+
857
+
719
858
  def _handle_mcp(args):
720
859
  if find_spec("fastmcp") is None:
721
860
  msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
@@ -742,6 +881,7 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
742
881
  ("filter", "confidence"): _handle_filter_confidence,
743
882
  ("filter", "chain"): _handle_filter_chain,
744
883
  ("filter", "residue"): _handle_filter_residue,
884
+ ("filter", "secondary-structure"): _handle_filter_ss,
745
885
  ("mcp", None): _handle_mcp,
746
886
  }
747
887
 
@@ -0,0 +1,45 @@
1
+ """Convert json or dict to Python objects."""
2
+
3
+ from cattrs.preconf.orjson import make_converter
4
+ from yarl import URL
5
+
6
+ type Percentage = float
7
+ """Type alias for percentage values (0.0-100.0)."""
8
+ type Ratio = float
9
+ """Type alias for ratio values (0.0-1.0)."""
10
+ type PositiveInt = int
11
+ """Type alias for non-negative integer values (>= 0)."""
12
+
13
+ converter = make_converter()
14
+ """cattrs converter to read JSON document or dict to Python objects."""
15
+ converter.register_structure_hook(URL, lambda v, _: URL(v))
16
+
17
+
18
+ @converter.register_structure_hook
19
+ def percentage_hook(val, _) -> Percentage:
20
+ value = float(val)
21
+ """Cattrs hook to validate percentage values."""
22
+ if not 0.0 <= value <= 100.0:
23
+ msg = f"Value {value} is not a valid percentage (0.0-100.0)"
24
+ raise ValueError(msg)
25
+ return value
26
+
27
+
28
+ @converter.register_structure_hook
29
+ def ratio_hook(val, _) -> Ratio:
30
+ """Cattrs hook to validate ratio values."""
31
+ value = float(val)
32
+ if not 0.0 <= value <= 1.0:
33
+ msg = f"Value {value} is not a valid ratio (0.0-1.0)"
34
+ raise ValueError(msg)
35
+ return value
36
+
37
+
38
+ @converter.register_structure_hook
39
+ def positive_int_hook(val, _) -> PositiveInt:
40
+ """Cattrs hook to validate non-negative integer values (>= 0)."""
41
+ value = int(val)
42
+ if value < 0:
43
+ msg = f"Value {value} is not a valid positive integer (>= 0)"
44
+ raise ValueError(msg)
45
+ return value