protein-quest 0.3.1__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

Files changed (74) hide show
  1. {protein_quest-0.3.1 → protein_quest-0.4.0}/PKG-INFO +48 -4
  2. {protein_quest-0.3.1 → protein_quest-0.4.0}/README.md +47 -3
  3. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/alphafold.ipynb +3 -3
  4. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/uniprot.ipynb +95 -2
  5. {protein_quest-0.3.1 → protein_quest-0.4.0}/pyproject.toml +1 -0
  6. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/__version__.py +1 -1
  7. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/confidence.py +42 -15
  8. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/fetch.py +2 -4
  9. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/cli.py +292 -14
  10. protein_quest-0.4.0/src/protein_quest/converter.py +46 -0
  11. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/filters.py +39 -7
  12. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/go.py +1 -4
  13. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/mcp_server.py +14 -1
  14. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/pdbe/io.py +122 -41
  15. protein_quest-0.4.0/src/protein_quest/ss.py +284 -0
  16. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/taxonomy.py +1 -3
  17. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/uniprot.py +157 -4
  18. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/utils.py +28 -1
  19. protein_quest-0.4.0/tests/alphafold/test_confidence.py +155 -0
  20. protein_quest-0.4.0/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +384 -0
  21. protein_quest-0.4.0/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +382 -0
  22. protein_quest-0.4.0/tests/fixtures/3JRS_B2A.cif.gz +0 -0
  23. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/test_io.py +39 -4
  24. protein_quest-0.4.0/tests/test_converter.py +23 -0
  25. protein_quest-0.4.0/tests/test_ss.py +233 -0
  26. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_uniprot.py +65 -0
  27. protein_quest-0.4.0/tests/test_utils.py +31 -0
  28. {protein_quest-0.3.1 → protein_quest-0.4.0}/uv.lock +36 -0
  29. protein_quest-0.3.1/tests/alphafold/test_confidence.py +0 -63
  30. {protein_quest-0.3.1 → protein_quest-0.4.0}/.github/workflows/ci.yml +0 -0
  31. {protein_quest-0.3.1 → protein_quest-0.4.0}/.github/workflows/pages.yml +0 -0
  32. {protein_quest-0.3.1 → protein_quest-0.4.0}/.github/workflows/pypi-publish.yml +0 -0
  33. {protein_quest-0.3.1 → protein_quest-0.4.0}/.gitignore +0 -0
  34. {protein_quest-0.3.1 → protein_quest-0.4.0}/.vscode/extensions.json +0 -0
  35. {protein_quest-0.3.1 → protein_quest-0.4.0}/CITATION.cff +0 -0
  36. {protein_quest-0.3.1 → protein_quest-0.4.0}/CODE_OF_CONDUCT.md +0 -0
  37. {protein_quest-0.3.1 → protein_quest-0.4.0}/CONTRIBUTING.md +0 -0
  38. {protein_quest-0.3.1 → protein_quest-0.4.0}/LICENSE +0 -0
  39. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/CONTRIBUTING.md +0 -0
  40. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/cli_doc_hook.py +0 -0
  41. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/index.md +0 -0
  42. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/.gitignore +0 -0
  43. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/index.md +0 -0
  44. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/pdbe.ipynb +0 -0
  45. {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/protein-quest-mcp.png +0 -0
  46. {protein_quest-0.3.1 → protein_quest-0.4.0}/mkdocs.yml +0 -0
  47. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/__init__.py +0 -0
  48. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/__init__.py +0 -0
  49. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/entry_summary.py +0 -0
  50. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/emdb.py +0 -0
  51. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/parallel.py +0 -0
  52. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/pdbe/__init__.py +0 -0
  53. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/pdbe/fetch.py +0 -0
  54. {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/py.typed +0 -0
  55. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
  56. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
  57. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/test_entry_summary.py +0 -0
  58. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/test_fetch.py +0 -0
  59. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
  60. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
  61. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
  62. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
  63. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
  64. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
  65. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
  66. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
  67. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
  68. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/fixtures/2y29.cif +0 -0
  69. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/test_fetch.py +0 -0
  70. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_cli.py +0 -0
  71. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_emdb.py +0 -0
  72. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_go.py +0 -0
  73. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_mcp.py +0 -0
  74. {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_taxonomy.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.3.1
3
+ Version: 0.4.0
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -56,17 +56,23 @@ graph TB;
56
56
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
57
57
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
58
58
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
59
+ searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
60
+ searchcomplexes[/Search complexes/]
59
61
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
60
62
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
61
63
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
62
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
63
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
64
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
64
+ fetchpdbe -->|mmcif_files| chainfilter{{Filter on chain of uniprot}}
65
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
66
+ fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
67
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
68
+ residuefilter --> |mmcif_files| ssfilter
65
69
  classDef dashedBorder stroke-dasharray: 5 5;
66
70
  goterm:::dashedBorder
67
71
  taxonomy:::dashedBorder
68
72
  searchemdb:::dashedBorder
69
73
  fetchemdb:::dashedBorder
74
+ searchintactionpartners:::dashedBorder
75
+ searchcomplexes:::dashedBorder
70
76
  ```
71
77
 
72
78
  (Dotted nodes and edges are side-quests.)
@@ -175,6 +181,18 @@ protein-quest filter residue \
175
181
  ./filtered-chains ./filtered
176
182
  ```
177
183
 
184
+ ### To filter on secondary structure
185
+
186
+ To keep only structures that are mostly alpha helices and have no beta sheets:
187
+
188
+ ```shell
189
+ protein-quest filter secondary-structure \
190
+ --ratio-min-helix-residues 0.5 \
191
+ --ratio-max-sheet-residues 0.0 \
192
+ --write-stats filtered-ss/stats.csv \
193
+ ./filtered-chains ./filtered-ss
194
+ ```
195
+
178
196
  ### Search Taxonomy
179
197
 
180
198
  ```shell
@@ -190,6 +208,32 @@ You can use following command to search for a Gene Ontology (GO) term.
190
208
  protein-quest search go --limit 5 --aspect cellular_component apoptosome -
191
209
  ```
192
210
 
211
+ ### Search for interaction partners
212
+
213
+ Use https://www.ebi.ac.uk/complexportal to find interaction partners of a given UniProt accession.
214
+
215
+ ```shell
216
+ protein-quest search interaction-partners Q05471 interaction-partners-of-Q05471.txt
217
+ ```
218
+
219
+ The `interaction-partners-of-Q05471.txt` file contains uniprot accessions (one per line).
220
+
221
+ ### Search for complexes
222
+
223
+ Given UniProt accessions, search for macromolecular complexes at https://www.ebi.ac.uk/complexportal
224
+ and return the complex entries and their members.
225
+
226
+ ```shell
227
+ echo Q05471 | protein-quest search complexes - complexes.csv
228
+ ```
229
+
230
+ The `complexes.csv` looks like
231
+
232
+ ```csv
233
+ query_protein,complex_id,complex_url,complex_title,members
234
+ Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
235
+ ```
236
+
193
237
  ## Model Context Protocol (MCP) server
194
238
 
195
239
  Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
@@ -26,17 +26,23 @@ graph TB;
26
26
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
27
27
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
28
28
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
29
+ searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
30
+ searchcomplexes[/Search complexes/]
29
31
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
30
32
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
31
33
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
32
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
33
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
34
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
34
+ fetchpdbe -->|mmcif_files| chainfilter{{Filter on chain of uniprot}}
35
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
36
+ fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
37
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
38
+ residuefilter --> |mmcif_files| ssfilter
35
39
  classDef dashedBorder stroke-dasharray: 5 5;
36
40
  goterm:::dashedBorder
37
41
  taxonomy:::dashedBorder
38
42
  searchemdb:::dashedBorder
39
43
  fetchemdb:::dashedBorder
44
+ searchintactionpartners:::dashedBorder
45
+ searchcomplexes:::dashedBorder
40
46
  ```
41
47
 
42
48
  (Dotted nodes and edges are side-quests.)
@@ -145,6 +151,18 @@ protein-quest filter residue \
145
151
  ./filtered-chains ./filtered
146
152
  ```
147
153
 
154
+ ### To filter on secondary structure
155
+
156
+ To keep only structures that are mostly alpha helices and have no beta sheets:
157
+
158
+ ```shell
159
+ protein-quest filter secondary-structure \
160
+ --ratio-min-helix-residues 0.5 \
161
+ --ratio-max-sheet-residues 0.0 \
162
+ --write-stats filtered-ss/stats.csv \
163
+ ./filtered-chains ./filtered-ss
164
+ ```
165
+
148
166
  ### Search Taxonomy
149
167
 
150
168
  ```shell
@@ -160,6 +178,32 @@ You can use following command to search for a Gene Ontology (GO) term.
160
178
  protein-quest search go --limit 5 --aspect cellular_component apoptosome -
161
179
  ```
162
180
 
181
+ ### Search for interaction partners
182
+
183
+ Use https://www.ebi.ac.uk/complexportal to find interaction partners of a given UniProt accession.
184
+
185
+ ```shell
186
+ protein-quest search interaction-partners Q05471 interaction-partners-of-Q05471.txt
187
+ ```
188
+
189
+ The `interaction-partners-of-Q05471.txt` file contains uniprot accessions (one per line).
190
+
191
+ ### Search for complexes
192
+
193
+ Given UniProt accessions, search for macromolecular complexes at https://www.ebi.ac.uk/complexportal
194
+ and return the complex entries and their members.
195
+
196
+ ```shell
197
+ echo Q05471 | protein-quest search complexes - complexes.csv
198
+ ```
199
+
200
+ The `complexes.csv` looks like
201
+
202
+ ```csv
203
+ query_protein,complex_id,complex_url,complex_title,members
204
+ Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
205
+ ```
206
+
163
207
  ## Model Context Protocol (MCP) server
164
208
 
165
209
  Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
@@ -301,7 +301,7 @@
301
301
  "metadata": {},
302
302
  "outputs": [],
303
303
  "source": [
304
- "query = ConfidenceFilterQuery(confidence=80, min_threshold=100, max_threshold=1000)"
304
+ "query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
305
305
  ]
306
306
  },
307
307
  {
@@ -318,7 +318,7 @@
318
318
  },
319
319
  {
320
320
  "cell_type": "code",
321
- "execution_count": 17,
321
+ "execution_count": null,
322
322
  "id": "6a6f8e3f",
323
323
  "metadata": {},
324
324
  "outputs": [
@@ -338,7 +338,7 @@
338
338
  "source": [
339
339
  "list(\n",
340
340
  " filter_files_on_confidence(\n",
341
- " input_files, ConfidenceFilterQuery(confidence=80, min_threshold=100, max_threshold=1000), output_dir\n",
341
+ " input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
342
342
  " )\n",
343
343
  ")"
344
344
  ]
@@ -12,7 +12,7 @@
12
12
  },
13
13
  {
14
14
  "cell_type": "code",
15
- "execution_count": 23,
15
+ "execution_count": 1,
16
16
  "id": "85674583",
17
17
  "metadata": {},
18
18
  "outputs": [],
@@ -282,6 +282,99 @@
282
282
  "first_uniprot = next(iter(uniprot_accessions.items()))\n",
283
283
  "pprint(first_uniprot)"
284
284
  ]
285
+ },
286
+ {
287
+ "cell_type": "markdown",
288
+ "id": "e32a95f8",
289
+ "metadata": {},
290
+ "source": [
291
+ "## Find interaction partners for uniprot entries"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 1,
297
+ "id": "d035c702",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": 2,
307
+ "id": "601c690a",
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "# Helicase SWR1 in yeast\n",
312
+ "uniprot_accession = \"Q05471\""
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 3,
318
+ "id": "173c764d",
319
+ "metadata": {},
320
+ "outputs": [
321
+ {
322
+ "data": {
323
+ "text/plain": [
324
+ "{'Q12464': {'CPX-2122'},\n",
325
+ " 'P35817': {'CPX-2122'},\n",
326
+ " 'P80428': {'CPX-2122'},\n",
327
+ " 'Q12509': {'CPX-2122'},\n",
328
+ " 'Q03388': {'CPX-2122'},\n",
329
+ " 'P53201': {'CPX-2122'},\n",
330
+ " 'P53930': {'CPX-2122'},\n",
331
+ " 'P60010': {'CPX-2122'},\n",
332
+ " 'Q03433': {'CPX-2122'},\n",
333
+ " 'Q06707': {'CPX-2122'},\n",
334
+ " 'P38326': {'CPX-2122'},\n",
335
+ " 'P31376': {'CPX-2122'},\n",
336
+ " 'Q03940': {'CPX-2122'}}"
337
+ ]
338
+ },
339
+ "execution_count": 3,
340
+ "metadata": {},
341
+ "output_type": "execute_result"
342
+ }
343
+ ],
344
+ "source": [
345
+ "partners = search4interaction_partners(uniprot_accession, limit=100)\n",
346
+ "partners"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "markdown",
351
+ "id": "a763b6f8",
352
+ "metadata": {},
353
+ "source": [
354
+ "To get more information about the complex you can search for the complexes themselves with:"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 4,
360
+ "id": "236050ea",
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "data": {
365
+ "text/plain": [
366
+ "[ComplexPortalEntry(query_protein='Q05471', complex_id='CPX-2122', complex_url='https://www.ebi.ac.uk/complexportal/complex/CPX-2122', complex_title='Swr1 chromatin remodelling complex', members={'P35817', 'Q05471', 'Q12464', 'Q12509', 'Q06707', 'Q03433', 'P38326', 'P53201', 'Q03388', 'P53930', 'P80428', 'Q03940', 'P60010', 'P31376'})]"
367
+ ]
368
+ },
369
+ "execution_count": 4,
370
+ "metadata": {},
371
+ "output_type": "execute_result"
372
+ }
373
+ ],
374
+ "source": [
375
+ "complexes = search4macromolecular_complexes([uniprot_accession])\n",
376
+ "complexes"
377
+ ]
285
378
  }
286
379
  ],
287
380
  "metadata": {
@@ -300,7 +393,7 @@
300
393
  "name": "python",
301
394
  "nbconvert_exporter": "python",
302
395
  "pygments_lexer": "ipython3",
303
- "version": "3.13.2"
396
+ "version": "3.13.5"
304
397
  }
305
398
  },
306
399
  "nbformat": 4,
@@ -52,6 +52,7 @@ dev = [
52
52
  ]
53
53
  docs = [
54
54
  "ipykernel>=6.29.5", # For notebook support in VS Code
55
+ "ipywidgets", # For tqdm support in notebooks
55
56
  "mkdocs>=1.6.1",
56
57
  "mkdocs-autoapi>=0.4.1",
57
58
  "mkdocs-jupyter>=0.25.1",
@@ -1,2 +1,2 @@
1
- __version__ = "0.3.1"
1
+ __version__ = "0.4.0"
2
2
  """The version of the package."""
@@ -7,7 +7,10 @@ from pathlib import Path
7
7
 
8
8
  import gemmi
9
9
 
10
+ from protein_quest.converter import Percentage, PositiveInt, converter
10
11
  from protein_quest.pdbe.io import write_structure
12
+ from protein_quest.ss import nr_of_residues_in_total
13
+ from protein_quest.utils import CopyMethod, copyfile
11
14
 
12
15
  """
13
16
  Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
73
76
  Parameters:
74
77
  confidence: The confidence threshold for filtering residues.
75
78
  Residues with a pLDDT (b-factor) above this value are considered high confidence.
76
- min_threshold: The minimum number of high-confidence residues required to keep the structure.
77
- max_threshold: The maximum number of high-confidence residues required to keep the structure.
79
+ min_residues: The minimum number of high-confidence residues required to keep the structure.
80
+ max_residues: The maximum number of high-confidence residues allowed to keep the structure.
78
81
  """
79
82
 
80
- confidence: float
81
- min_threshold: int
82
- max_threshold: int
83
+ confidence: Percentage
84
+ min_residues: PositiveInt
85
+ max_residues: PositiveInt
86
+
87
+
88
+ base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
89
+
90
+
91
+ @converter.register_structure_hook
92
+ def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
93
+ result: ConfidenceFilterQuery = base_query_hook(val, _type)
94
+ if result.min_residues > result.max_residues:
95
+ msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
96
+ raise ValueError(msg)
97
+ return result
83
98
 
84
99
 
85
100
  @dataclass
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
93
108
  """
94
109
 
95
110
  input_file: str
96
- count: int
111
+ count: PositiveInt
97
112
  filtered_file: Path | None = None
98
113
 
99
114
 
100
- def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
115
+ def filter_file_on_residues(
116
+ file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
117
+ ) -> ConfidenceFilterResult:
101
118
  """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
102
119
 
103
120
  Args:
104
121
  file: The path to the PDB file to filter.
105
122
  query: The confidence filter query.
106
123
  filtered_dir: The directory to save the filtered PDB file.
124
+ copy_method: How to copy when no residues have to be removed.
107
125
 
108
126
  Returns:
109
127
  result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
112
130
  structure = gemmi.read_structure(str(file))
113
131
  residues = set(find_high_confidence_residues(structure, query.confidence))
114
132
  count = len(residues)
115
- if count < query.min_threshold or count > query.max_threshold:
133
+ if count < query.min_residues or count > query.max_residues:
116
134
  # Skip structure that is outside the min and max threshold
117
135
  # just return number of high confidence residues
118
136
  return ConfidenceFilterResult(
119
137
  input_file=file.name,
120
138
  count=count,
121
139
  )
140
+ total_residues = nr_of_residues_in_total(structure)
122
141
  filtered_file = filtered_dir / file.name
123
- new_structure = filter_out_low_confidence_residues(
124
- structure,
125
- residues,
126
- )
127
- write_structure(new_structure, filtered_file)
142
+ if count == total_residues:
143
+ # if no residues have to be removed then copy instead of slower gemmi writing
144
+ copyfile(file, filtered_file, copy_method)
145
+ else:
146
+ new_structure = filter_out_low_confidence_residues(
147
+ structure,
148
+ residues,
149
+ )
150
+ write_structure(new_structure, filtered_file)
128
151
  return ConfidenceFilterResult(
129
152
  input_file=file.name,
130
153
  count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
133
156
 
134
157
 
135
158
  def filter_files_on_confidence(
136
- alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
159
+ alphafold_pdb_files: list[Path],
160
+ query: ConfidenceFilterQuery,
161
+ filtered_dir: Path,
162
+ copy_method: CopyMethod = "copy",
137
163
  ) -> Generator[ConfidenceFilterResult]:
138
164
  """Filter AlphaFoldDB structures based on confidence.
139
165
 
@@ -141,6 +167,7 @@ def filter_files_on_confidence(
141
167
  alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
142
168
  query: The confidence filter query containing the confidence thresholds.
143
169
  filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
170
+ copy_method: How to copy when a direct copy is possible.
144
171
 
145
172
  Yields:
146
173
  For each mmcif/PDB file, yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
150
177
  # In ../filters.py:filter_files_on_residues() we filter on number of residues on a file level
151
178
  # here we filter on file level and inside file remove low confidence residues
152
179
  for pdb_file in alphafold_pdb_files:
153
- yield filter_file_on_residues(pdb_file, query, filtered_dir)
180
+ yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args
9
9
 
10
10
  from aiohttp_retry import RetryClient
11
11
  from aiopath import AsyncPath
12
- from cattrs.preconf.orjson import make_converter
13
12
  from tqdm.asyncio import tqdm
14
13
  from yarl import URL
15
14
 
16
15
  from protein_quest.alphafold.entry_summary import EntrySummary
16
+ from protein_quest.converter import converter
17
17
  from protein_quest.utils import friendly_session, retrieve_files, run_async
18
18
 
19
19
  logger = logging.getLogger(__name__)
20
- converter = make_converter()
21
- """cattrs converter to read AlphaFold summary JSON document."""
22
- converter.register_structure_hook(URL, lambda v, _: URL(v))
20
+
23
21
 
24
22
  DownloadableFormat = Literal[
25
23
  "summary",