protein-quest 0.5.1__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

Files changed (95) hide show
  1. protein_quest-0.7.0/.python-version +1 -0
  2. {protein_quest-0.5.1 → protein_quest-0.7.0}/PKG-INFO +42 -5
  3. {protein_quest-0.5.1 → protein_quest-0.7.0}/README.md +39 -2
  4. protein_quest-0.7.0/docs/notebooks/alphafold.ipynb +463 -0
  5. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/pdbe.ipynb +12 -8
  6. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/uniprot.ipynb +1 -1
  7. {protein_quest-0.5.1 → protein_quest-0.7.0}/pyproject.toml +2 -2
  8. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/__version__.py +1 -1
  9. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/alphafold/confidence.py +2 -2
  10. protein_quest-0.7.0/src/protein_quest/alphafold/entry_summary.py +64 -0
  11. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/alphafold/fetch.py +76 -42
  12. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/cli.py +385 -114
  13. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/filters.py +2 -5
  14. protein_quest-0.7.0/src/protein_quest/io.py +350 -0
  15. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/mcp_server.py +21 -7
  16. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/ss.py +3 -7
  17. protein_quest-0.5.1/src/protein_quest/pdbe/io.py → protein_quest-0.7.0/src/protein_quest/structure.py +77 -126
  18. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/uniprot.py +287 -15
  19. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/utils.py +26 -2
  20. protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +55567 -0
  21. protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_all_isoforms.yaml +51 -0
  22. protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_gzipped.yaml +42326 -0
  23. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/alphafold/test_confidence.py +3 -2
  24. protein_quest-0.7.0/tests/alphafold/test_entry_summary.py +16 -0
  25. protein_quest-0.7.0/tests/alphafold/test_fetch.py +56 -0
  26. protein_quest-0.7.0/tests/cassettes/test_cli/test_search_pdbe.yaml +1023 -0
  27. protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot.yaml +64 -0
  28. protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot_details.yaml +87 -0
  29. protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_do_not_match_external_isoform.yaml +62 -0
  30. protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_match_canonical_isoform.yaml +66 -0
  31. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_map_uniprot_accessions2uniprot_details.yaml +145 -0
  32. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_ok_sequence_length.yaml +66 -0
  33. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_big_sequence_length.yaml +62 -0
  34. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_small_sequence_length.yaml +62 -0
  35. protein_quest-0.7.0/tests/conftest.py +18 -0
  36. protein_quest-0.7.0/tests/fixtures/2Y29.cif.gz +0 -0
  37. protein_quest-0.7.0/tests/test_cli.py +101 -0
  38. protein_quest-0.7.0/tests/test_io.py +230 -0
  39. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_mcp.py +3 -8
  40. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_ss.py +2 -10
  41. protein_quest-0.7.0/tests/test_structure.py +116 -0
  42. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_uniprot.py +193 -3
  43. {protein_quest-0.5.1 → protein_quest-0.7.0}/uv.lock +73 -65
  44. protein_quest-0.5.1/docs/notebooks/alphafold.ipynb +0 -384
  45. protein_quest-0.5.1/src/protein_quest/alphafold/entry_summary.py +0 -40
  46. protein_quest-0.5.1/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -6289
  47. protein_quest-0.5.1/tests/alphafold/test_entry_summary.py +0 -12
  48. protein_quest-0.5.1/tests/alphafold/test_fetch.py +0 -20
  49. protein_quest-0.5.1/tests/pdbe/fixtures/2y29.cif +0 -940
  50. protein_quest-0.5.1/tests/pdbe/test_io.py +0 -142
  51. protein_quest-0.5.1/tests/test_cli.py +0 -14
  52. {protein_quest-0.5.1 → protein_quest-0.7.0}/.github/workflows/ci.yml +0 -0
  53. {protein_quest-0.5.1 → protein_quest-0.7.0}/.github/workflows/pages.yml +0 -0
  54. {protein_quest-0.5.1 → protein_quest-0.7.0}/.github/workflows/pypi-publish.yml +0 -0
  55. {protein_quest-0.5.1 → protein_quest-0.7.0}/.gitignore +0 -0
  56. {protein_quest-0.5.1 → protein_quest-0.7.0}/.vscode/extensions.json +0 -0
  57. {protein_quest-0.5.1 → protein_quest-0.7.0}/CITATION.cff +0 -0
  58. {protein_quest-0.5.1 → protein_quest-0.7.0}/CODE_OF_CONDUCT.md +0 -0
  59. {protein_quest-0.5.1 → protein_quest-0.7.0}/CONTRIBUTING.md +0 -0
  60. {protein_quest-0.5.1 → protein_quest-0.7.0}/LICENSE +0 -0
  61. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/CONTRIBUTING.md +0 -0
  62. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/index.md +0 -0
  63. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/.gitignore +0 -0
  64. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/index.md +0 -0
  65. {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/protein-quest-mcp.png +0 -0
  66. {protein_quest-0.5.1 → protein_quest-0.7.0}/mkdocs.yml +0 -0
  67. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/__init__.py +0 -0
  68. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/alphafold/__init__.py +0 -0
  69. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/converter.py +0 -0
  70. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/emdb.py +0 -0
  71. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/go.py +0 -0
  72. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/parallel.py +0 -0
  73. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/pdbe/__init__.py +0 -0
  74. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/pdbe/fetch.py +0 -0
  75. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/py.typed +0 -0
  76. {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/taxonomy.py +0 -0
  77. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
  78. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
  79. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
  80. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
  81. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
  82. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
  83. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
  84. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +0 -0
  85. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +0 -0
  86. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
  87. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
  88. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/fixtures/3JRS_B2A.cif.gz +0 -0
  89. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
  90. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/pdbe/test_fetch.py +0 -0
  91. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_converter.py +0 -0
  92. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_emdb.py +0 -0
  93. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_go.py +0 -0
  94. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_taxonomy.py +0 -0
  95. {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_utils.py +0 -0
@@ -0,0 +1 @@
1
+ 3.13
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.5.1
3
+ Version: 0.7.0
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -11,12 +11,12 @@ Requires-Python: >=3.13
11
11
  Requires-Dist: aiofiles>=24.1.0
12
12
  Requires-Dist: aiohttp-retry>=2.9.1
13
13
  Requires-Dist: aiohttp[speedups]>=3.11.18
14
- Requires-Dist: aiopath>=0.7.7
15
14
  Requires-Dist: attrs>=25.3.0
16
15
  Requires-Dist: cattrs[orjson]>=24.1.3
17
16
  Requires-Dist: dask>=2025.5.1
18
17
  Requires-Dist: distributed>=2025.5.1
19
18
  Requires-Dist: gemmi>=0.7.3
19
+ Requires-Dist: mmcif>=0.92.0
20
20
  Requires-Dist: platformdirs>=4.3.8
21
21
  Requires-Dist: psutil>=7.0.0
22
22
  Requires-Dist: rich-argparse>=1.7.1
@@ -26,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
26
26
  Requires-Dist: yarl>=1.20.1
27
27
  Provides-Extra: mcp
28
28
  Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
29
- Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
29
+ Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
30
30
  Description-Content-Type: text/markdown
31
31
 
32
32
  # protein-quest
@@ -61,6 +61,7 @@ graph TB;
61
61
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
62
62
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
63
63
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
64
+ searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
64
65
  searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
65
66
  searchcomplexes[/Search complexes/]
66
67
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
@@ -71,6 +72,8 @@ graph TB;
71
72
  fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
72
73
  confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
73
74
  residuefilter --> |mmcif_files| ssfilter
75
+ ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
76
+ ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
74
77
  classDef dashedBorder stroke-dasharray: 5 5;
75
78
  goterm:::dashedBorder
76
79
  taxonomy:::dashedBorder
@@ -78,6 +81,9 @@ graph TB;
78
81
  fetchemdb:::dashedBorder
79
82
  searchintactionpartners:::dashedBorder
80
83
  searchcomplexes:::dashedBorder
84
+ searchuniprotdetails:::dashedBorder
85
+ convert2cif:::dashedBorder
86
+ convert2uniprot_accessions:::dashedBorder
81
87
  ```
82
88
 
83
89
  (Dotted nodes and edges are side-quests.)
@@ -108,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
108
114
  protein-quest search uniprot \
109
115
  --taxon-id 9606 \
110
116
  --reviewed \
111
- --subcellular-location-uniprot nucleus \
117
+ --subcellular-location-uniprot "nucleus" \
112
118
  --subcellular-location-go GO:0005634 \
113
119
  --molecular-function-go GO:0003677 \
114
120
  --limit 100 \
@@ -191,7 +197,7 @@ protein-quest filter residue \
191
197
 
192
198
  ### To filter on secondary structure
193
199
 
194
- To filter on structure being mostly alpha helices and have no beta sheets.
200
+ To filter on structure being mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
195
201
 
196
202
  ```shell
197
203
  protein-quest filter secondary-structure \
@@ -242,6 +248,37 @@ query_protein,complex_id,complex_url,complex_title,members
242
248
  Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
243
249
  ```
244
250
 
251
+ ### Search for UniProt details
252
+
253
+ To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
254
+
255
+ ```shell
256
+ protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
257
+ ```
258
+
259
+ The `uniprot_details.csv` looks like:
260
+
261
+ ```csv
262
+ uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
263
+ A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
264
+ ```
265
+
266
+ ### Convert structure files to .cif format
267
+
268
+ Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
269
+
270
+ ```shell
271
+ protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
272
+ ```
273
+
274
+ ### Convert structure files to UniProt accessions
275
+
276
+ After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
277
+
278
+ ```shell
279
+ protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
280
+ ```
281
+
245
282
  ## Model Context Protocol (MCP) server
246
283
 
247
284
  Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
@@ -30,6 +30,7 @@ graph TB;
30
30
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
31
31
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
32
32
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
33
+ searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
33
34
  searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
34
35
  searchcomplexes[/Search complexes/]
35
36
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
@@ -40,6 +41,8 @@ graph TB;
40
41
  fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
41
42
  confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
42
43
  residuefilter --> |mmcif_files| ssfilter
44
+ ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
45
+ ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
43
46
  classDef dashedBorder stroke-dasharray: 5 5;
44
47
  goterm:::dashedBorder
45
48
  taxonomy:::dashedBorder
@@ -47,6 +50,9 @@ graph TB;
47
50
  fetchemdb:::dashedBorder
48
51
  searchintactionpartners:::dashedBorder
49
52
  searchcomplexes:::dashedBorder
53
+ searchuniprotdetails:::dashedBorder
54
+ convert2cif:::dashedBorder
55
+ convert2uniprot_accessions:::dashedBorder
50
56
  ```
51
57
 
52
58
  (Dotted nodes and edges are side-quests.)
@@ -77,7 +83,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
77
83
  protein-quest search uniprot \
78
84
  --taxon-id 9606 \
79
85
  --reviewed \
80
- --subcellular-location-uniprot nucleus \
86
+ --subcellular-location-uniprot "nucleus" \
81
87
  --subcellular-location-go GO:0005634 \
82
88
  --molecular-function-go GO:0003677 \
83
89
  --limit 100 \
@@ -160,7 +166,7 @@ protein-quest filter residue \
160
166
 
161
167
  ### To filter on secondary structure
162
168
 
163
- To filter on structure being mostly alpha helices and have no beta sheets.
169
+ To filter on structure being mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
164
170
 
165
171
  ```shell
166
172
  protein-quest filter secondary-structure \
@@ -211,6 +217,37 @@ query_protein,complex_id,complex_url,complex_title,members
211
217
  Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
212
218
  ```
213
219
 
220
+ ### Search for UniProt details
221
+
222
+ To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
223
+
224
+ ```shell
225
+ protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
226
+ ```
227
+
228
+ The `uniprot_details.csv` looks like:
229
+
230
+ ```csv
231
+ uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
232
+ A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
233
+ ```
234
+
235
+ ### Convert structure files to .cif format
236
+
237
+ Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
238
+
239
+ ```shell
240
+ protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
241
+ ```
242
+
243
+ ### Convert structure files to UniProt accessions
244
+
245
+ After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
246
+
247
+ ```shell
248
+ protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
249
+ ```
250
+
214
251
  ## Model Context Protocol (MCP) server
215
252
 
216
253
  Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
@@ -0,0 +1,463 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "24b1926c",
6
+ "metadata": {},
7
+ "source": [
8
+ "# AlphaFold\n",
9
+ "\n",
10
+ "You can download and filter AlphaFold files on confidence."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "681ba946",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "# Generic imports\n",
21
+ "import logging\n",
22
+ "from pathlib import Path\n",
23
+ "from pprint import pprint\n",
24
+ "\n",
25
+ "logging.basicConfig(level=logging.WARNING)\n",
26
+ "# Set to WARNING to see only warnings\n",
27
+ "# Set to INFO to see sparql queries\n",
28
+ "# Set to DEBUG to see raw results"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "id": "4959258c",
34
+ "metadata": {},
35
+ "source": [
36
+ "\n",
37
+ "## Download AlphaFold files"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 2,
43
+ "id": "81e449db",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from protein_quest.alphafold.fetch import fetch_many_async"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "id": "5c2e6ee3",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "save_dir = Path(\"alphafold_files\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "id": "f38991cf",
63
+ "metadata": {},
64
+ "source": [
65
+ "To download the summary, the cif and the predicted aligned error document (paeDoc) file for 3 AlphaFold entries given their UniProt accessions.\n"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 4,
71
+ "id": "e32b474a",
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "name": "stderr",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 553.10it/s]\n",
79
+ "Downloading AlphaFold files: 100%|██████████| 6/6 [00:00<00:00, 38245.93it/s]"
80
+ ]
81
+ },
82
+ {
83
+ "name": "stdout",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "[AlphaFoldEntry(uniprot_accession='A1YPR0',\n",
87
+ " summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
88
+ " bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.bcif'),\n",
89
+ " cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.cif'),\n",
90
+ " entityType='protein',\n",
91
+ " fractionPlddtConfident=0.26,\n",
92
+ " fractionPlddtLow=0.099,\n",
93
+ " fractionPlddtVeryHigh=0.089,\n",
94
+ " fractionPlddtVeryLow=0.553,\n",
95
+ " globalMetricValue=56.03,\n",
96
+ " isUniProt=True,\n",
97
+ " latestVersion=6,\n",
98
+ " modelCreatedDate='2025-08-01T00:00:00Z',\n",
99
+ " modelEntityId='AF-A1YPR0-F1',\n",
100
+ " paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
101
+ " pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.pdb'),\n",
102
+ " providerId='GDM',\n",
103
+ " sequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',\n",
104
+ " sequenceChecksum='73D82A34502B55BF',\n",
105
+ " sequenceEnd=619,\n",
106
+ " sequenceStart=1,\n",
107
+ " sequenceVersionDate='2007-02-06T00:00:00Z',\n",
108
+ " toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
109
+ " alternativeNames=None,\n",
110
+ " amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv'),\n",
111
+ " amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv'),\n",
112
+ " amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv'),\n",
113
+ " catalyticActivities=None,\n",
114
+ " complexName=None,\n",
115
+ " functions=None,\n",
116
+ " gene='ZBTB7C',\n",
117
+ " geneSynonyms=None,\n",
118
+ " ipSAE=None,\n",
119
+ " ipTM=None,\n",
120
+ " isUniProtReferenceProteome=True,\n",
121
+ " isUniProtReviewed=True,\n",
122
+ " keywords=None,\n",
123
+ " msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-A1YPR0-F1-msa_v6.a3m'),\n",
124
+ " organismCommonNames=None,\n",
125
+ " organismScientificName='Homo sapiens',\n",
126
+ " organismSynonyms=None,\n",
127
+ " plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-confidence_v6.json'),\n",
128
+ " proteinFullNames=None,\n",
129
+ " proteinShortNames=None,\n",
130
+ " stoichiometry=None,\n",
131
+ " taxId=9606,\n",
132
+ " taxonomyLineage=None,\n",
133
+ " uniprotAccession='A1YPR0',\n",
134
+ " uniprotDescription='Zinc finger and BTB '\n",
135
+ " 'domain-containing '\n",
136
+ " 'protein 7C',\n",
137
+ " uniprotId='ZBT7C_HUMAN'),\n",
138
+ " summary_file=PosixPath('alphafold_files/A1YPR0.json'),\n",
139
+ " bcif_file=None,\n",
140
+ " cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v6.cif'),\n",
141
+ " pdb_file=None,\n",
142
+ " pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
143
+ " am_annotations_file=None,\n",
144
+ " am_annotations_hg19_file=None,\n",
145
+ " am_annotations_hg38_file=None,\n",
146
+ " msa_file=None,\n",
147
+ " plddt_doc_file=None),\n",
148
+ " AlphaFoldEntry(uniprot_accession='O60481',\n",
149
+ " summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
150
+ " bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.bcif'),\n",
151
+ " cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.cif'),\n",
152
+ " entityType='protein',\n",
153
+ " fractionPlddtConfident=0.289,\n",
154
+ " fractionPlddtLow=0.107,\n",
155
+ " fractionPlddtVeryHigh=0.0,\n",
156
+ " fractionPlddtVeryLow=0.604,\n",
157
+ " globalMetricValue=53.88,\n",
158
+ " isUniProt=True,\n",
159
+ " latestVersion=6,\n",
160
+ " modelCreatedDate='2025-08-01T00:00:00Z',\n",
161
+ " modelEntityId='AF-O60481-F1',\n",
162
+ " paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
163
+ " pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.pdb'),\n",
164
+ " providerId='GDM',\n",
165
+ " sequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',\n",
166
+ " sequenceChecksum='3150CF13C0679568',\n",
167
+ " sequenceEnd=467,\n",
168
+ " sequenceStart=1,\n",
169
+ " sequenceVersionDate='1998-08-01T00:00:00Z',\n",
170
+ " toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
171
+ " alternativeNames=None,\n",
172
+ " amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv'),\n",
173
+ " amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv'),\n",
174
+ " amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv'),\n",
175
+ " catalyticActivities=None,\n",
176
+ " complexName=None,\n",
177
+ " functions=None,\n",
178
+ " gene='ZIC3',\n",
179
+ " geneSynonyms=None,\n",
180
+ " ipSAE=None,\n",
181
+ " ipTM=None,\n",
182
+ " isUniProtReferenceProteome=True,\n",
183
+ " isUniProtReviewed=True,\n",
184
+ " keywords=None,\n",
185
+ " msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-O60481-F1-msa_v6.a3m'),\n",
186
+ " organismCommonNames=None,\n",
187
+ " organismScientificName='Homo sapiens',\n",
188
+ " organismSynonyms=None,\n",
189
+ " plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-confidence_v6.json'),\n",
190
+ " proteinFullNames=None,\n",
191
+ " proteinShortNames=None,\n",
192
+ " stoichiometry=None,\n",
193
+ " taxId=9606,\n",
194
+ " taxonomyLineage=None,\n",
195
+ " uniprotAccession='O60481',\n",
196
+ " uniprotDescription='Zinc finger protein '\n",
197
+ " 'ZIC 3',\n",
198
+ " uniprotId='ZIC3_HUMAN'),\n",
199
+ " summary_file=PosixPath('alphafold_files/O60481.json'),\n",
200
+ " bcif_file=None,\n",
201
+ " cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v6.cif'),\n",
202
+ " pdb_file=None,\n",
203
+ " pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
204
+ " am_annotations_file=None,\n",
205
+ " am_annotations_hg19_file=None,\n",
206
+ " am_annotations_hg38_file=None,\n",
207
+ " msa_file=None,\n",
208
+ " plddt_doc_file=None),\n",
209
+ " AlphaFoldEntry(uniprot_accession='P50613',\n",
210
+ " summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
211
+ " bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.bcif'),\n",
212
+ " cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.cif'),\n",
213
+ " entityType='protein',\n",
214
+ " fractionPlddtConfident=0.127,\n",
215
+ " fractionPlddtLow=0.092,\n",
216
+ " fractionPlddtVeryHigh=0.618,\n",
217
+ " fractionPlddtVeryLow=0.162,\n",
218
+ " globalMetricValue=82.0,\n",
219
+ " isUniProt=True,\n",
220
+ " latestVersion=6,\n",
221
+ " modelCreatedDate='2025-08-01T00:00:00Z',\n",
222
+ " modelEntityId='AF-P50613-F1',\n",
223
+ " paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
224
+ " pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.pdb'),\n",
225
+ " providerId='GDM',\n",
226
+ " sequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',\n",
227
+ " sequenceChecksum='0A94BFA7DD416CEB',\n",
228
+ " sequenceEnd=346,\n",
229
+ " sequenceStart=1,\n",
230
+ " sequenceVersionDate='1996-10-01T00:00:00Z',\n",
231
+ " toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
232
+ " alternativeNames=None,\n",
233
+ " amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv'),\n",
234
+ " amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv'),\n",
235
+ " amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv'),\n",
236
+ " catalyticActivities=None,\n",
237
+ " complexName=None,\n",
238
+ " functions=None,\n",
239
+ " gene='CDK7',\n",
240
+ " geneSynonyms=None,\n",
241
+ " ipSAE=None,\n",
242
+ " ipTM=None,\n",
243
+ " isUniProtReferenceProteome=True,\n",
244
+ " isUniProtReviewed=True,\n",
245
+ " keywords=None,\n",
246
+ " msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-P50613-F1-msa_v6.a3m'),\n",
247
+ " organismCommonNames=None,\n",
248
+ " organismScientificName='Homo sapiens',\n",
249
+ " organismSynonyms=None,\n",
250
+ " plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-confidence_v6.json'),\n",
251
+ " proteinFullNames=None,\n",
252
+ " proteinShortNames=None,\n",
253
+ " stoichiometry=None,\n",
254
+ " taxId=9606,\n",
255
+ " taxonomyLineage=None,\n",
256
+ " uniprotAccession='P50613',\n",
257
+ " uniprotDescription='Cyclin-dependent '\n",
258
+ " 'kinase 7',\n",
259
+ " uniprotId='CDK7_HUMAN'),\n",
260
+ " summary_file=PosixPath('alphafold_files/P50613.json'),\n",
261
+ " bcif_file=None,\n",
262
+ " cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v6.cif'),\n",
263
+ " pdb_file=None,\n",
264
+ " pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
265
+ " am_annotations_file=None,\n",
266
+ " am_annotations_hg19_file=None,\n",
267
+ " am_annotations_hg38_file=None,\n",
268
+ " msa_file=None,\n",
269
+ " plddt_doc_file=None)]\n"
270
+ ]
271
+ },
272
+ {
273
+ "name": "stderr",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "\n"
277
+ ]
278
+ }
279
+ ],
280
+ "source": [
281
+ "summaries = [\n",
282
+ " s async for s in fetch_many_async([\"A1YPR0\", \"O60481\", \"P50613\"], save_dir, what={\"summary\", \"cif\", \"paeDoc\"})\n",
283
+ "]\n",
284
+ "pprint(summaries)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 7,
290
+ "id": "2d3595e6",
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "name": "stdout",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "total 4.3M\n",
298
+ "4.0K A1YPR0.json\n",
299
+ "556K AF-A1YPR0-F1-model_v6.cif\n",
300
+ "1.1M AF-A1YPR0-F1-predicted_aligned_error_v6.json\n",
301
+ "412K AF-O60481-2-F1-model_v6.cif\n",
302
+ "600K AF-O60481-2-F1-predicted_aligned_error_v6.json\n",
303
+ "412K AF-O60481-F1-model_v6.cif\n",
304
+ "628K AF-O60481-F1-predicted_aligned_error_v6.json\n",
305
+ "324K AF-P50613-F1-model_v6.cif\n",
306
+ "276K AF-P50613-F1-predicted_aligned_error_v6.json\n",
307
+ "8.0K O60481.json\n",
308
+ "4.0K P50613.json\n"
309
+ ]
310
+ }
311
+ ],
312
+ "source": [
313
+ "!ls -sh {save_dir}"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "markdown",
318
+ "id": "a43edd87",
319
+ "metadata": {},
320
+ "source": [
321
+ "## Filter AlphaFold structure files on confidence\n",
322
+ "\n",
323
+ "Filter AlphaFold mmcif/PDB files by confidence (pLDDT). Passed files are written with residues below the threshold removed."
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": 10,
329
+ "id": "cc96c63a",
330
+ "metadata": {},
331
+ "outputs": [],
332
+ "source": [
333
+ "from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "markdown",
338
+ "id": "724141d4",
339
+ "metadata": {},
340
+ "source": [
341
+ "Take one of the downloaded files"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 12,
347
+ "id": "73a61cf6",
348
+ "metadata": {},
349
+ "outputs": [
350
+ {
351
+ "data": {
352
+ "text/plain": [
353
+ "[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
354
+ " PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
355
+ " PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]"
356
+ ]
357
+ },
358
+ "execution_count": 12,
359
+ "metadata": {},
360
+ "output_type": "execute_result"
361
+ }
362
+ ],
363
+ "source": [
364
+ "input_files = [entry.cif_file for entry in summaries if entry.cif_file is not None]\n",
365
+ "input_files"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "da8f2f67",
371
+ "metadata": {},
372
+ "source": [
373
+ "We only write a filtered cif file when in the input file there are between 100 and 1000 residues that have a pLDDT score above 80."
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": null,
379
+ "id": "fbfdf472",
380
+ "metadata": {},
381
+ "outputs": [],
382
+ "source": [
383
+ "query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 14,
389
+ "id": "152aec9a",
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "output_dir = Path(\"./filtered\")\n",
394
+ "output_dir.mkdir(exist_ok=True)\n",
395
+ "result = filter_files_on_confidence(input_files, query, output_dir)"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": null,
401
+ "id": "6a6f8e3f",
402
+ "metadata": {},
403
+ "outputs": [
404
+ {
405
+ "data": {
406
+ "text/plain": [
407
+ "[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),\n",
408
+ " ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),\n",
409
+ " ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]"
410
+ ]
411
+ },
412
+ "execution_count": 17,
413
+ "metadata": {},
414
+ "output_type": "execute_result"
415
+ }
416
+ ],
417
+ "source": [
418
+ "list(\n",
419
+ " filter_files_on_confidence(\n",
420
+ " input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
421
+ " )\n",
422
+ ")"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "id": "0fe1e388",
428
+ "metadata": {},
429
+ "source": [
430
+ "2 files have passed, but 1 file only has 76 high confidence residues so it is discarded."
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": null,
436
+ "id": "83ffc09b",
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": []
440
+ }
441
+ ],
442
+ "metadata": {
443
+ "kernelspec": {
444
+ "display_name": "protein-quest",
445
+ "language": "python",
446
+ "name": "python3"
447
+ },
448
+ "language_info": {
449
+ "codemirror_mode": {
450
+ "name": "ipython",
451
+ "version": 3
452
+ },
453
+ "file_extension": ".py",
454
+ "mimetype": "text/x-python",
455
+ "name": "python",
456
+ "nbconvert_exporter": "python",
457
+ "pygments_lexer": "ipython3",
458
+ "version": "3.13.5"
459
+ }
460
+ },
461
+ "nbformat": 4,
462
+ "nbformat_minor": 5
463
+ }