protein-quest 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

Files changed (93) hide show
  1. protein_quest-0.7.0/.python-version +1 -0
  2. {protein_quest-0.6.0 → protein_quest-0.7.0}/PKG-INFO +32 -6
  3. {protein_quest-0.6.0 → protein_quest-0.7.0}/README.md +30 -3
  4. protein_quest-0.7.0/docs/notebooks/alphafold.ipynb +463 -0
  5. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/uniprot.ipynb +1 -1
  6. {protein_quest-0.6.0 → protein_quest-0.7.0}/pyproject.toml +1 -2
  7. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/__version__.py +1 -1
  8. protein_quest-0.7.0/src/protein_quest/alphafold/entry_summary.py +64 -0
  9. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/alphafold/fetch.py +53 -28
  10. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/cli.py +263 -57
  11. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/mcp_server.py +15 -4
  12. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/structure.py +24 -0
  13. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/uniprot.py +287 -15
  14. protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +55567 -0
  15. protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_all_isoforms.yaml +51 -0
  16. protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_gzipped.yaml +42326 -0
  17. protein_quest-0.7.0/tests/alphafold/test_entry_summary.py +16 -0
  18. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/alphafold/test_fetch.py +19 -2
  19. protein_quest-0.7.0/tests/cassettes/test_cli/test_search_pdbe.yaml +1023 -0
  20. protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot.yaml +64 -0
  21. protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot_details.yaml +87 -0
  22. protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_do_not_match_external_isoform.yaml +62 -0
  23. protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_match_canonical_isoform.yaml +66 -0
  24. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_map_uniprot_accessions2uniprot_details.yaml +145 -0
  25. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_ok_sequence_length.yaml +66 -0
  26. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_big_sequence_length.yaml +62 -0
  27. protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_small_sequence_length.yaml +62 -0
  28. protein_quest-0.7.0/tests/test_cli.py +101 -0
  29. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_structure.py +28 -1
  30. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_uniprot.py +193 -3
  31. {protein_quest-0.6.0 → protein_quest-0.7.0}/uv.lock +44 -65
  32. protein_quest-0.6.0/docs/notebooks/alphafold.ipynb +0 -384
  33. protein_quest-0.6.0/src/protein_quest/alphafold/entry_summary.py +0 -40
  34. protein_quest-0.6.0/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -6289
  35. protein_quest-0.6.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_gzipped.yaml +0 -4789
  36. protein_quest-0.6.0/tests/alphafold/test_entry_summary.py +0 -12
  37. protein_quest-0.6.0/tests/test_cli.py +0 -14
  38. {protein_quest-0.6.0 → protein_quest-0.7.0}/.github/workflows/ci.yml +0 -0
  39. {protein_quest-0.6.0 → protein_quest-0.7.0}/.github/workflows/pages.yml +0 -0
  40. {protein_quest-0.6.0 → protein_quest-0.7.0}/.github/workflows/pypi-publish.yml +0 -0
  41. {protein_quest-0.6.0 → protein_quest-0.7.0}/.gitignore +0 -0
  42. {protein_quest-0.6.0 → protein_quest-0.7.0}/.vscode/extensions.json +0 -0
  43. {protein_quest-0.6.0 → protein_quest-0.7.0}/CITATION.cff +0 -0
  44. {protein_quest-0.6.0 → protein_quest-0.7.0}/CODE_OF_CONDUCT.md +0 -0
  45. {protein_quest-0.6.0 → protein_quest-0.7.0}/CONTRIBUTING.md +0 -0
  46. {protein_quest-0.6.0 → protein_quest-0.7.0}/LICENSE +0 -0
  47. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/CONTRIBUTING.md +0 -0
  48. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/index.md +0 -0
  49. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/.gitignore +0 -0
  50. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/index.md +0 -0
  51. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/pdbe.ipynb +0 -0
  52. {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/protein-quest-mcp.png +0 -0
  53. {protein_quest-0.6.0 → protein_quest-0.7.0}/mkdocs.yml +0 -0
  54. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/__init__.py +0 -0
  55. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/alphafold/__init__.py +0 -0
  56. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/alphafold/confidence.py +0 -0
  57. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/converter.py +0 -0
  58. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/emdb.py +0 -0
  59. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/filters.py +0 -0
  60. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/go.py +0 -0
  61. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/io.py +0 -0
  62. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/parallel.py +0 -0
  63. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/pdbe/__init__.py +0 -0
  64. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/pdbe/fetch.py +0 -0
  65. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/py.typed +0 -0
  66. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/ss.py +0 -0
  67. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/taxonomy.py +0 -0
  68. {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/utils.py +0 -0
  69. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
  70. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/alphafold/test_confidence.py +0 -0
  71. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
  72. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
  73. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
  74. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
  75. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
  76. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
  77. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +0 -0
  78. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +0 -0
  79. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
  80. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
  81. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/conftest.py +0 -0
  82. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/fixtures/2Y29.cif.gz +0 -0
  83. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/fixtures/3JRS_B2A.cif.gz +0 -0
  84. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
  85. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/pdbe/test_fetch.py +0 -0
  86. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_converter.py +0 -0
  87. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_emdb.py +0 -0
  88. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_go.py +0 -0
  89. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_io.py +0 -0
  90. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_mcp.py +0 -0
  91. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_ss.py +0 -0
  92. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_taxonomy.py +0 -0
  93. {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_utils.py +0 -0
@@ -0,0 +1 @@
1
+ 3.13
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -11,7 +11,6 @@ Requires-Python: >=3.13
11
11
  Requires-Dist: aiofiles>=24.1.0
12
12
  Requires-Dist: aiohttp-retry>=2.9.1
13
13
  Requires-Dist: aiohttp[speedups]>=3.11.18
14
- Requires-Dist: aiopath>=0.7.7
15
14
  Requires-Dist: attrs>=25.3.0
16
15
  Requires-Dist: cattrs[orjson]>=24.1.3
17
16
  Requires-Dist: dask>=2025.5.1
@@ -27,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
27
26
  Requires-Dist: yarl>=1.20.1
28
27
  Provides-Extra: mcp
29
28
  Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
30
- Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
29
+ Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
31
30
  Description-Content-Type: text/markdown
32
31
 
33
32
  # protein-quest
@@ -62,6 +61,7 @@ graph TB;
62
61
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
63
62
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
64
63
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
64
+ searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
65
65
  searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
66
66
  searchcomplexes[/Search complexes/]
67
67
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
@@ -73,6 +73,7 @@ graph TB;
73
73
  confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
74
74
  residuefilter --> |mmcif_files| ssfilter
75
75
  ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
76
+ ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
76
77
  classDef dashedBorder stroke-dasharray: 5 5;
77
78
  goterm:::dashedBorder
78
79
  taxonomy:::dashedBorder
@@ -80,7 +81,9 @@ graph TB;
80
81
  fetchemdb:::dashedBorder
81
82
  searchintactionpartners:::dashedBorder
82
83
  searchcomplexes:::dashedBorder
84
+ searchuniprotdetails:::dashedBorder
83
85
  convert2cif:::dashedBorder
86
+ convert2uniprot_accessions:::dashedBorder
84
87
  ```
85
88
 
86
89
  (Dotted nodes and edges are side-quests.)
@@ -111,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
111
114
  protein-quest search uniprot \
112
115
  --taxon-id 9606 \
113
116
  --reviewed \
114
- --subcellular-location-uniprot nucleus \
117
+ --subcellular-location-uniprot "nucleus" \
115
118
  --subcellular-location-go GO:0005634 \
116
119
  --molecular-function-go GO:0003677 \
117
120
  --limit 100 \
@@ -194,7 +197,7 @@ protein-quest filter residue \
194
197
 
195
198
  ### To filter on secondary structure
196
199
 
197
- To filter on structure being mostly alpha helices and have no beta sheets.
200
+ To keep only structures that are mostly alpha helices and have no beta sheets, use the command below. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
198
201
 
199
202
  ```shell
200
203
  protein-quest filter secondary-structure \
@@ -245,12 +248,35 @@ query_protein,complex_id,complex_url,complex_title,members
245
248
  Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
246
249
  ```
247
250
 
251
+ ### Search for UniProt details
252
+
253
+ To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
254
+
255
+ ```shell
256
+ protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
257
+ ```
258
+
259
+ The `uniprot_details.csv` looks like:
260
+
261
+ ```csv
262
+ uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
263
+ A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
264
+ ```
265
+
248
266
  ### Convert structure files to .cif format
249
267
 
250
268
  Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
251
269
 
252
270
  ```shell
253
- protein-quest convert --output-dir ./filtered-cif ./filtered-ss
271
+ protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
272
+ ```
273
+
274
+ ### Convert structure files to UniProt accessions
275
+
276
+ After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
277
+
278
+ ```shell
279
+ protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
254
280
  ```
255
281
 
256
282
  ## Model Context Protocol (MCP) server
@@ -30,6 +30,7 @@ graph TB;
30
30
  searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
31
31
  searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
32
32
  searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
33
+ searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
33
34
  searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
34
35
  searchcomplexes[/Search complexes/]
35
36
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
@@ -41,6 +42,7 @@ graph TB;
41
42
  confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
42
43
  residuefilter --> |mmcif_files| ssfilter
43
44
  ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
45
+ ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
44
46
  classDef dashedBorder stroke-dasharray: 5 5;
45
47
  goterm:::dashedBorder
46
48
  taxonomy:::dashedBorder
@@ -48,7 +50,9 @@ graph TB;
48
50
  fetchemdb:::dashedBorder
49
51
  searchintactionpartners:::dashedBorder
50
52
  searchcomplexes:::dashedBorder
53
+ searchuniprotdetails:::dashedBorder
51
54
  convert2cif:::dashedBorder
55
+ convert2uniprot_accessions:::dashedBorder
52
56
  ```
53
57
 
54
58
  (Dotted nodes and edges are side-quests.)
@@ -79,7 +83,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
79
83
  protein-quest search uniprot \
80
84
  --taxon-id 9606 \
81
85
  --reviewed \
82
- --subcellular-location-uniprot nucleus \
86
+ --subcellular-location-uniprot "nucleus" \
83
87
  --subcellular-location-go GO:0005634 \
84
88
  --molecular-function-go GO:0003677 \
85
89
  --limit 100 \
@@ -162,7 +166,7 @@ protein-quest filter residue \
162
166
 
163
167
  ### To filter on secondary structure
164
168
 
165
- To filter on structure being mostly alpha helices and have no beta sheets.
169
+ To keep only structures that are mostly alpha helices and have no beta sheets, use the command below. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
166
170
 
167
171
  ```shell
168
172
  protein-quest filter secondary-structure \
@@ -213,12 +217,35 @@ query_protein,complex_id,complex_url,complex_title,members
213
217
  Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
214
218
  ```
215
219
 
220
+ ### Search for UniProt details
221
+
222
+ To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
223
+
224
+ ```shell
225
+ protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
226
+ ```
227
+
228
+ The `uniprot_details.csv` looks like:
229
+
230
+ ```csv
231
+ uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
232
+ A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
233
+ ```
234
+
216
235
  ### Convert structure files to .cif format
217
236
 
218
237
  Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
219
238
 
220
239
  ```shell
221
- protein-quest convert --output-dir ./filtered-cif ./filtered-ss
240
+ protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
241
+ ```
242
+
243
+ ### Convert structure files to UniProt accessions
244
+
245
+ After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
246
+
247
+ ```shell
248
+ protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
222
249
  ```
223
250
 
224
251
  ## Model Context Protocol (MCP) server
@@ -0,0 +1,463 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "24b1926c",
6
+ "metadata": {},
7
+ "source": [
8
+ "# AlphaFold\n",
9
+ "\n",
10
+ "You can download and filter AlphaFold files on confidence."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "681ba946",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "# Generic imports\n",
21
+ "import logging\n",
22
+ "from pathlib import Path\n",
23
+ "from pprint import pprint\n",
24
+ "\n",
25
+ "logging.basicConfig(level=logging.WARNING)\n",
26
+ "# Set to WARNING to see only warnings\n",
27
+ "# Set to INFO to see sparql queries\n",
28
+ "# Set to DEBUG to see raw results"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "id": "4959258c",
34
+ "metadata": {},
35
+ "source": [
36
+ "\n",
37
+ "## Download AlphaFold files"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 2,
43
+ "id": "81e449db",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from protein_quest.alphafold.fetch import fetch_many_async"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "id": "5c2e6ee3",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "save_dir = Path(\"alphafold_files\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "id": "f38991cf",
63
+ "metadata": {},
64
+ "source": [
65
+ "To download the summary, the cif and the Predicted Aligned Error document (paeDoc) files for 3 AlphaFold entries given their UniProt accessions.\n"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 4,
71
+ "id": "e32b474a",
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "name": "stderr",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 553.10it/s]\n",
79
+ "Downloading AlphaFold files: 100%|██████████| 6/6 [00:00<00:00, 38245.93it/s]"
80
+ ]
81
+ },
82
+ {
83
+ "name": "stdout",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "[AlphaFoldEntry(uniprot_accession='A1YPR0',\n",
87
+ " summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
88
+ " bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.bcif'),\n",
89
+ " cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.cif'),\n",
90
+ " entityType='protein',\n",
91
+ " fractionPlddtConfident=0.26,\n",
92
+ " fractionPlddtLow=0.099,\n",
93
+ " fractionPlddtVeryHigh=0.089,\n",
94
+ " fractionPlddtVeryLow=0.553,\n",
95
+ " globalMetricValue=56.03,\n",
96
+ " isUniProt=True,\n",
97
+ " latestVersion=6,\n",
98
+ " modelCreatedDate='2025-08-01T00:00:00Z',\n",
99
+ " modelEntityId='AF-A1YPR0-F1',\n",
100
+ " paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
101
+ " pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.pdb'),\n",
102
+ " providerId='GDM',\n",
103
+ " sequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',\n",
104
+ " sequenceChecksum='73D82A34502B55BF',\n",
105
+ " sequenceEnd=619,\n",
106
+ " sequenceStart=1,\n",
107
+ " sequenceVersionDate='2007-02-06T00:00:00Z',\n",
108
+ " toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
109
+ " alternativeNames=None,\n",
110
+ " amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv'),\n",
111
+ " amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv'),\n",
112
+ " amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv'),\n",
113
+ " catalyticActivities=None,\n",
114
+ " complexName=None,\n",
115
+ " functions=None,\n",
116
+ " gene='ZBTB7C',\n",
117
+ " geneSynonyms=None,\n",
118
+ " ipSAE=None,\n",
119
+ " ipTM=None,\n",
120
+ " isUniProtReferenceProteome=True,\n",
121
+ " isUniProtReviewed=True,\n",
122
+ " keywords=None,\n",
123
+ " msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-A1YPR0-F1-msa_v6.a3m'),\n",
124
+ " organismCommonNames=None,\n",
125
+ " organismScientificName='Homo sapiens',\n",
126
+ " organismSynonyms=None,\n",
127
+ " plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-confidence_v6.json'),\n",
128
+ " proteinFullNames=None,\n",
129
+ " proteinShortNames=None,\n",
130
+ " stoichiometry=None,\n",
131
+ " taxId=9606,\n",
132
+ " taxonomyLineage=None,\n",
133
+ " uniprotAccession='A1YPR0',\n",
134
+ " uniprotDescription='Zinc finger and BTB '\n",
135
+ " 'domain-containing '\n",
136
+ " 'protein 7C',\n",
137
+ " uniprotId='ZBT7C_HUMAN'),\n",
138
+ " summary_file=PosixPath('alphafold_files/A1YPR0.json'),\n",
139
+ " bcif_file=None,\n",
140
+ " cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v6.cif'),\n",
141
+ " pdb_file=None,\n",
142
+ " pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
143
+ " am_annotations_file=None,\n",
144
+ " am_annotations_hg19_file=None,\n",
145
+ " am_annotations_hg38_file=None,\n",
146
+ " msa_file=None,\n",
147
+ " plddt_doc_file=None),\n",
148
+ " AlphaFoldEntry(uniprot_accession='O60481',\n",
149
+ " summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
150
+ " bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.bcif'),\n",
151
+ " cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.cif'),\n",
152
+ " entityType='protein',\n",
153
+ " fractionPlddtConfident=0.289,\n",
154
+ " fractionPlddtLow=0.107,\n",
155
+ " fractionPlddtVeryHigh=0.0,\n",
156
+ " fractionPlddtVeryLow=0.604,\n",
157
+ " globalMetricValue=53.88,\n",
158
+ " isUniProt=True,\n",
159
+ " latestVersion=6,\n",
160
+ " modelCreatedDate='2025-08-01T00:00:00Z',\n",
161
+ " modelEntityId='AF-O60481-F1',\n",
162
+ " paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
163
+ " pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.pdb'),\n",
164
+ " providerId='GDM',\n",
165
+ " sequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',\n",
166
+ " sequenceChecksum='3150CF13C0679568',\n",
167
+ " sequenceEnd=467,\n",
168
+ " sequenceStart=1,\n",
169
+ " sequenceVersionDate='1998-08-01T00:00:00Z',\n",
170
+ " toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
171
+ " alternativeNames=None,\n",
172
+ " amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv'),\n",
173
+ " amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv'),\n",
174
+ " amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv'),\n",
175
+ " catalyticActivities=None,\n",
176
+ " complexName=None,\n",
177
+ " functions=None,\n",
178
+ " gene='ZIC3',\n",
179
+ " geneSynonyms=None,\n",
180
+ " ipSAE=None,\n",
181
+ " ipTM=None,\n",
182
+ " isUniProtReferenceProteome=True,\n",
183
+ " isUniProtReviewed=True,\n",
184
+ " keywords=None,\n",
185
+ " msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-O60481-F1-msa_v6.a3m'),\n",
186
+ " organismCommonNames=None,\n",
187
+ " organismScientificName='Homo sapiens',\n",
188
+ " organismSynonyms=None,\n",
189
+ " plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-confidence_v6.json'),\n",
190
+ " proteinFullNames=None,\n",
191
+ " proteinShortNames=None,\n",
192
+ " stoichiometry=None,\n",
193
+ " taxId=9606,\n",
194
+ " taxonomyLineage=None,\n",
195
+ " uniprotAccession='O60481',\n",
196
+ " uniprotDescription='Zinc finger protein '\n",
197
+ " 'ZIC 3',\n",
198
+ " uniprotId='ZIC3_HUMAN'),\n",
199
+ " summary_file=PosixPath('alphafold_files/O60481.json'),\n",
200
+ " bcif_file=None,\n",
201
+ " cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v6.cif'),\n",
202
+ " pdb_file=None,\n",
203
+ " pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
204
+ " am_annotations_file=None,\n",
205
+ " am_annotations_hg19_file=None,\n",
206
+ " am_annotations_hg38_file=None,\n",
207
+ " msa_file=None,\n",
208
+ " plddt_doc_file=None),\n",
209
+ " AlphaFoldEntry(uniprot_accession='P50613',\n",
210
+ " summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
211
+ " bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.bcif'),\n",
212
+ " cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.cif'),\n",
213
+ " entityType='protein',\n",
214
+ " fractionPlddtConfident=0.127,\n",
215
+ " fractionPlddtLow=0.092,\n",
216
+ " fractionPlddtVeryHigh=0.618,\n",
217
+ " fractionPlddtVeryLow=0.162,\n",
218
+ " globalMetricValue=82.0,\n",
219
+ " isUniProt=True,\n",
220
+ " latestVersion=6,\n",
221
+ " modelCreatedDate='2025-08-01T00:00:00Z',\n",
222
+ " modelEntityId='AF-P50613-F1',\n",
223
+ " paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
224
+ " pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.pdb'),\n",
225
+ " providerId='GDM',\n",
226
+ " sequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',\n",
227
+ " sequenceChecksum='0A94BFA7DD416CEB',\n",
228
+ " sequenceEnd=346,\n",
229
+ " sequenceStart=1,\n",
230
+ " sequenceVersionDate='1996-10-01T00:00:00Z',\n",
231
+ " toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
232
+ " alternativeNames=None,\n",
233
+ " amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv'),\n",
234
+ " amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv'),\n",
235
+ " amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv'),\n",
236
+ " catalyticActivities=None,\n",
237
+ " complexName=None,\n",
238
+ " functions=None,\n",
239
+ " gene='CDK7',\n",
240
+ " geneSynonyms=None,\n",
241
+ " ipSAE=None,\n",
242
+ " ipTM=None,\n",
243
+ " isUniProtReferenceProteome=True,\n",
244
+ " isUniProtReviewed=True,\n",
245
+ " keywords=None,\n",
246
+ " msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-P50613-F1-msa_v6.a3m'),\n",
247
+ " organismCommonNames=None,\n",
248
+ " organismScientificName='Homo sapiens',\n",
249
+ " organismSynonyms=None,\n",
250
+ " plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-confidence_v6.json'),\n",
251
+ " proteinFullNames=None,\n",
252
+ " proteinShortNames=None,\n",
253
+ " stoichiometry=None,\n",
254
+ " taxId=9606,\n",
255
+ " taxonomyLineage=None,\n",
256
+ " uniprotAccession='P50613',\n",
257
+ " uniprotDescription='Cyclin-dependent '\n",
258
+ " 'kinase 7',\n",
259
+ " uniprotId='CDK7_HUMAN'),\n",
260
+ " summary_file=PosixPath('alphafold_files/P50613.json'),\n",
261
+ " bcif_file=None,\n",
262
+ " cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v6.cif'),\n",
263
+ " pdb_file=None,\n",
264
+ " pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
265
+ " am_annotations_file=None,\n",
266
+ " am_annotations_hg19_file=None,\n",
267
+ " am_annotations_hg38_file=None,\n",
268
+ " msa_file=None,\n",
269
+ " plddt_doc_file=None)]\n"
270
+ ]
271
+ },
272
+ {
273
+ "name": "stderr",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "\n"
277
+ ]
278
+ }
279
+ ],
280
+ "source": [
281
+ "summaries = [\n",
282
+ " s async for s in fetch_many_async([\"A1YPR0\", \"O60481\", \"P50613\"], save_dir, what={\"summary\", \"cif\", \"paeDoc\"})\n",
283
+ "]\n",
284
+ "pprint(summaries)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 7,
290
+ "id": "2d3595e6",
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "name": "stdout",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "total 4.3M\n",
298
+ "4.0K A1YPR0.json\n",
299
+ "556K AF-A1YPR0-F1-model_v6.cif\n",
300
+ "1.1M AF-A1YPR0-F1-predicted_aligned_error_v6.json\n",
301
+ "412K AF-O60481-2-F1-model_v6.cif\n",
302
+ "600K AF-O60481-2-F1-predicted_aligned_error_v6.json\n",
303
+ "412K AF-O60481-F1-model_v6.cif\n",
304
+ "628K AF-O60481-F1-predicted_aligned_error_v6.json\n",
305
+ "324K AF-P50613-F1-model_v6.cif\n",
306
+ "276K AF-P50613-F1-predicted_aligned_error_v6.json\n",
307
+ "8.0K O60481.json\n",
308
+ "4.0K P50613.json\n"
309
+ ]
310
+ }
311
+ ],
312
+ "source": [
313
+ "!ls -sh {save_dir}"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "markdown",
318
+ "id": "a43edd87",
319
+ "metadata": {},
320
+ "source": [
321
+ "## Filter AlphaFold structure files on confidence\n",
322
+ "\n",
323
+ "Filter AlphaFold mmCIF/PDB files by confidence (pLDDT). Passed files are written with residues below threshold removed."
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": 10,
329
+ "id": "cc96c63a",
330
+ "metadata": {},
331
+ "outputs": [],
332
+ "source": [
333
+ "from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "markdown",
338
+ "id": "724141d4",
339
+ "metadata": {},
340
+ "source": [
341
+ "Take one of the downloaded files"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 12,
347
+ "id": "73a61cf6",
348
+ "metadata": {},
349
+ "outputs": [
350
+ {
351
+ "data": {
352
+ "text/plain": [
353
+ "[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
354
+ " PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
355
+ " PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]"
356
+ ]
357
+ },
358
+ "execution_count": 12,
359
+ "metadata": {},
360
+ "output_type": "execute_result"
361
+ }
362
+ ],
363
+ "source": [
364
+ "input_files = [entry.cif_file for entry in summaries if entry.cif_file is not None]\n",
365
+ "input_files"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "da8f2f67",
371
+ "metadata": {},
372
+ "source": [
373
+ "We only write a filtered cif file when the input file has between 100 and 1000 residues with a pLDDT score above 80."
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": null,
379
+ "id": "fbfdf472",
380
+ "metadata": {},
381
+ "outputs": [],
382
+ "source": [
383
+ "query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 14,
389
+ "id": "152aec9a",
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "output_dir = Path(\"./filtered\")\n",
394
+ "output_dir.mkdir(exist_ok=True)\n",
395
+ "result = filter_files_on_confidence(input_files, query, output_dir)"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": null,
401
+ "id": "6a6f8e3f",
402
+ "metadata": {},
403
+ "outputs": [
404
+ {
405
+ "data": {
406
+ "text/plain": [
407
+ "[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),\n",
408
+ " ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),\n",
409
+ " ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]"
410
+ ]
411
+ },
412
+ "execution_count": 17,
413
+ "metadata": {},
414
+ "output_type": "execute_result"
415
+ }
416
+ ],
417
+ "source": [
418
+ "list(\n",
419
+ " filter_files_on_confidence(\n",
420
+ " input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
421
+ " )\n",
422
+ ")"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "id": "0fe1e388",
428
+ "metadata": {},
429
+ "source": [
430
+ "2 files have passed, but 1 file has only 76 high-confidence residues, so it is discarded."
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": null,
436
+ "id": "83ffc09b",
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": []
440
+ }
441
+ ],
442
+ "metadata": {
443
+ "kernelspec": {
444
+ "display_name": "protein-quest",
445
+ "language": "python",
446
+ "name": "python3"
447
+ },
448
+ "language_info": {
449
+ "codemirror_mode": {
450
+ "name": "ipython",
451
+ "version": 3
452
+ },
453
+ "file_extension": ".py",
454
+ "mimetype": "text/x-python",
455
+ "name": "python",
456
+ "nbconvert_exporter": "python",
457
+ "pygments_lexer": "ipython3",
458
+ "version": "3.13.5"
459
+ }
460
+ },
461
+ "nbformat": 4,
462
+ "nbformat_minor": 5
463
+ }
@@ -32,7 +32,7 @@
32
32
  "id": "5378e1d5",
33
33
  "metadata": {},
34
34
  "source": [
35
- "## Search for uniprot acessions based on a query"
35
+ "## Search for uniprot accessions based on a query"
36
36
  ]
37
37
  },
38
38
  {