protein-quest 0.5.1__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest-0.7.0/.python-version +1 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/PKG-INFO +42 -5
- {protein_quest-0.5.1 → protein_quest-0.7.0}/README.md +39 -2
- protein_quest-0.7.0/docs/notebooks/alphafold.ipynb +463 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/pdbe.ipynb +12 -8
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/uniprot.ipynb +1 -1
- {protein_quest-0.5.1 → protein_quest-0.7.0}/pyproject.toml +2 -2
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/__version__.py +1 -1
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/alphafold/confidence.py +2 -2
- protein_quest-0.7.0/src/protein_quest/alphafold/entry_summary.py +64 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/alphafold/fetch.py +76 -42
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/cli.py +385 -114
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/filters.py +2 -5
- protein_quest-0.7.0/src/protein_quest/io.py +350 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/mcp_server.py +21 -7
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/ss.py +3 -7
- protein_quest-0.5.1/src/protein_quest/pdbe/io.py → protein_quest-0.7.0/src/protein_quest/structure.py +77 -126
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/uniprot.py +287 -15
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/utils.py +26 -2
- protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +55567 -0
- protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_all_isoforms.yaml +51 -0
- protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_gzipped.yaml +42326 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/alphafold/test_confidence.py +3 -2
- protein_quest-0.7.0/tests/alphafold/test_entry_summary.py +16 -0
- protein_quest-0.7.0/tests/alphafold/test_fetch.py +56 -0
- protein_quest-0.7.0/tests/cassettes/test_cli/test_search_pdbe.yaml +1023 -0
- protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot.yaml +64 -0
- protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot_details.yaml +87 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_do_not_match_external_isoform.yaml +62 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_match_canonical_isoform.yaml +66 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_map_uniprot_accessions2uniprot_details.yaml +145 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_ok_sequence_length.yaml +66 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_big_sequence_length.yaml +62 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_small_sequence_length.yaml +62 -0
- protein_quest-0.7.0/tests/conftest.py +18 -0
- protein_quest-0.7.0/tests/fixtures/2Y29.cif.gz +0 -0
- protein_quest-0.7.0/tests/test_cli.py +101 -0
- protein_quest-0.7.0/tests/test_io.py +230 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_mcp.py +3 -8
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_ss.py +2 -10
- protein_quest-0.7.0/tests/test_structure.py +116 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_uniprot.py +193 -3
- {protein_quest-0.5.1 → protein_quest-0.7.0}/uv.lock +73 -65
- protein_quest-0.5.1/docs/notebooks/alphafold.ipynb +0 -384
- protein_quest-0.5.1/src/protein_quest/alphafold/entry_summary.py +0 -40
- protein_quest-0.5.1/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -6289
- protein_quest-0.5.1/tests/alphafold/test_entry_summary.py +0 -12
- protein_quest-0.5.1/tests/alphafold/test_fetch.py +0 -20
- protein_quest-0.5.1/tests/pdbe/fixtures/2y29.cif +0 -940
- protein_quest-0.5.1/tests/pdbe/test_io.py +0 -142
- protein_quest-0.5.1/tests/test_cli.py +0 -14
- {protein_quest-0.5.1 → protein_quest-0.7.0}/.github/workflows/ci.yml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/.github/workflows/pages.yml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/.github/workflows/pypi-publish.yml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/.gitignore +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/.vscode/extensions.json +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/CITATION.cff +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/CODE_OF_CONDUCT.md +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/CONTRIBUTING.md +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/LICENSE +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/CONTRIBUTING.md +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/index.md +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/.gitignore +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/notebooks/index.md +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/docs/protein-quest-mcp.png +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/mkdocs.yml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/__init__.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/alphafold/__init__.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/converter.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/emdb.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/go.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/parallel.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/pdbe/__init__.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/pdbe/fetch.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/py.typed +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/src/protein_quest/taxonomy.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/fixtures/3JRS_B2A.cif.gz +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/pdbe/test_fetch.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_converter.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_emdb.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_go.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_taxonomy.py +0 -0
- {protein_quest-0.5.1 → protein_quest-0.7.0}/tests/test_utils.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -11,12 +11,12 @@ Requires-Python: >=3.13
|
|
|
11
11
|
Requires-Dist: aiofiles>=24.1.0
|
|
12
12
|
Requires-Dist: aiohttp-retry>=2.9.1
|
|
13
13
|
Requires-Dist: aiohttp[speedups]>=3.11.18
|
|
14
|
-
Requires-Dist: aiopath>=0.7.7
|
|
15
14
|
Requires-Dist: attrs>=25.3.0
|
|
16
15
|
Requires-Dist: cattrs[orjson]>=24.1.3
|
|
17
16
|
Requires-Dist: dask>=2025.5.1
|
|
18
17
|
Requires-Dist: distributed>=2025.5.1
|
|
19
18
|
Requires-Dist: gemmi>=0.7.3
|
|
19
|
+
Requires-Dist: mmcif>=0.92.0
|
|
20
20
|
Requires-Dist: platformdirs>=4.3.8
|
|
21
21
|
Requires-Dist: psutil>=7.0.0
|
|
22
22
|
Requires-Dist: rich-argparse>=1.7.1
|
|
@@ -26,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
|
|
|
26
26
|
Requires-Dist: yarl>=1.20.1
|
|
27
27
|
Provides-Extra: mcp
|
|
28
28
|
Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
|
|
29
|
-
Requires-Dist: pydantic>=2.
|
|
29
|
+
Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
|
|
32
32
|
# protein-quest
|
|
@@ -61,6 +61,7 @@ graph TB;
|
|
|
61
61
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
62
62
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
63
63
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
64
|
+
searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
|
|
64
65
|
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
65
66
|
searchcomplexes[/Search complexes/]
|
|
66
67
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
@@ -71,6 +72,8 @@ graph TB;
|
|
|
71
72
|
fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
|
|
72
73
|
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
73
74
|
residuefilter --> |mmcif_files| ssfilter
|
|
75
|
+
ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
|
|
76
|
+
ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
|
|
74
77
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
75
78
|
goterm:::dashedBorder
|
|
76
79
|
taxonomy:::dashedBorder
|
|
@@ -78,6 +81,9 @@ graph TB;
|
|
|
78
81
|
fetchemdb:::dashedBorder
|
|
79
82
|
searchintactionpartners:::dashedBorder
|
|
80
83
|
searchcomplexes:::dashedBorder
|
|
84
|
+
searchuniprotdetails:::dashedBorder
|
|
85
|
+
convert2cif:::dashedBorder
|
|
86
|
+
convert2uniprot_accessions:::dashedBorder
|
|
81
87
|
```
|
|
82
88
|
|
|
83
89
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -108,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
|
|
|
108
114
|
protein-quest search uniprot \
|
|
109
115
|
--taxon-id 9606 \
|
|
110
116
|
--reviewed \
|
|
111
|
-
--subcellular-location-uniprot nucleus \
|
|
117
|
+
--subcellular-location-uniprot "nucleus" \
|
|
112
118
|
--subcellular-location-go GO:0005634 \
|
|
113
119
|
--molecular-function-go GO:0003677 \
|
|
114
120
|
--limit 100 \
|
|
@@ -191,7 +197,7 @@ protein-quest filter residue \
|
|
|
191
197
|
|
|
192
198
|
### To filter on secondary structure
|
|
193
199
|
|
|
194
|
-
To filter on structure being mostly alpha helices and have no beta sheets.
|
|
200
|
+
To filter on structures that are mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
|
|
195
201
|
|
|
196
202
|
```shell
|
|
197
203
|
protein-quest filter secondary-structure \
|
|
@@ -242,6 +248,37 @@ query_protein,complex_id,complex_url,complex_title,members
|
|
|
242
248
|
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
243
249
|
```
|
|
244
250
|
|
|
251
|
+
### Search for UniProt details
|
|
252
|
+
|
|
253
|
+
To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
|
|
254
|
+
|
|
255
|
+
```shell
|
|
256
|
+
protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
The `uniprot_details.csv` looks like:
|
|
260
|
+
|
|
261
|
+
```csv
|
|
262
|
+
uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
|
|
263
|
+
A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Convert structure files to .cif format
|
|
267
|
+
|
|
268
|
+
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
269
|
+
|
|
270
|
+
```shell
|
|
271
|
+
protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Convert structure files to UniProt accessions
|
|
275
|
+
|
|
276
|
+
After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
|
|
277
|
+
|
|
278
|
+
```shell
|
|
279
|
+
protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
|
|
280
|
+
```
|
|
281
|
+
|
|
245
282
|
## Model Context Protocol (MCP) server
|
|
246
283
|
|
|
247
284
|
Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
|
|
@@ -30,6 +30,7 @@ graph TB;
|
|
|
30
30
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
31
31
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
32
32
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
33
|
+
searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
|
|
33
34
|
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
34
35
|
searchcomplexes[/Search complexes/]
|
|
35
36
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
@@ -40,6 +41,8 @@ graph TB;
|
|
|
40
41
|
fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
|
|
41
42
|
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
42
43
|
residuefilter --> |mmcif_files| ssfilter
|
|
44
|
+
ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
|
|
45
|
+
ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
|
|
43
46
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
44
47
|
goterm:::dashedBorder
|
|
45
48
|
taxonomy:::dashedBorder
|
|
@@ -47,6 +50,9 @@ graph TB;
|
|
|
47
50
|
fetchemdb:::dashedBorder
|
|
48
51
|
searchintactionpartners:::dashedBorder
|
|
49
52
|
searchcomplexes:::dashedBorder
|
|
53
|
+
searchuniprotdetails:::dashedBorder
|
|
54
|
+
convert2cif:::dashedBorder
|
|
55
|
+
convert2uniprot_accessions:::dashedBorder
|
|
50
56
|
```
|
|
51
57
|
|
|
52
58
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -77,7 +83,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
|
|
|
77
83
|
protein-quest search uniprot \
|
|
78
84
|
--taxon-id 9606 \
|
|
79
85
|
--reviewed \
|
|
80
|
-
--subcellular-location-uniprot nucleus \
|
|
86
|
+
--subcellular-location-uniprot "nucleus" \
|
|
81
87
|
--subcellular-location-go GO:0005634 \
|
|
82
88
|
--molecular-function-go GO:0003677 \
|
|
83
89
|
--limit 100 \
|
|
@@ -160,7 +166,7 @@ protein-quest filter residue \
|
|
|
160
166
|
|
|
161
167
|
### To filter on secondary structure
|
|
162
168
|
|
|
163
|
-
To filter on structure being mostly alpha helices and have no beta sheets.
|
|
169
|
+
To filter on structures that are mostly alpha helices and have no beta sheets. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
|
|
164
170
|
|
|
165
171
|
```shell
|
|
166
172
|
protein-quest filter secondary-structure \
|
|
@@ -211,6 +217,37 @@ query_protein,complex_id,complex_url,complex_title,members
|
|
|
211
217
|
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
212
218
|
```
|
|
213
219
|
|
|
220
|
+
### Search for UniProt details
|
|
221
|
+
|
|
222
|
+
To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
|
|
223
|
+
|
|
224
|
+
```shell
|
|
225
|
+
protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
The `uniprot_details.csv` looks like:
|
|
229
|
+
|
|
230
|
+
```csv
|
|
231
|
+
uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
|
|
232
|
+
A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Convert structure files to .cif format
|
|
236
|
+
|
|
237
|
+
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
238
|
+
|
|
239
|
+
```shell
|
|
240
|
+
protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Convert structure files to UniProt accessions
|
|
244
|
+
|
|
245
|
+
After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
|
|
246
|
+
|
|
247
|
+
```shell
|
|
248
|
+
protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
|
|
249
|
+
```
|
|
250
|
+
|
|
214
251
|
## Model Context Protocol (MCP) server
|
|
215
252
|
|
|
216
253
|
Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
|
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "24b1926c",
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"source": [
|
|
8
|
+
"# AlphaFold\n",
|
|
9
|
+
"\n",
|
|
10
|
+
"You can download and filter AlphaFold files on confidence."
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"cell_type": "code",
|
|
15
|
+
"execution_count": 1,
|
|
16
|
+
"id": "681ba946",
|
|
17
|
+
"metadata": {},
|
|
18
|
+
"outputs": [],
|
|
19
|
+
"source": [
|
|
20
|
+
"# Generic imports\n",
|
|
21
|
+
"import logging\n",
|
|
22
|
+
"from pathlib import Path\n",
|
|
23
|
+
"from pprint import pprint\n",
|
|
24
|
+
"\n",
|
|
25
|
+
"logging.basicConfig(level=logging.WARNING)\n",
|
|
26
|
+
"# Set to WARNING to see only warnings\n",
|
|
27
|
+
"# Set to INFO to see sparql queries\n",
|
|
28
|
+
"# Set to DEBUG to see raw results"
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"cell_type": "markdown",
|
|
33
|
+
"id": "4959258c",
|
|
34
|
+
"metadata": {},
|
|
35
|
+
"source": [
|
|
36
|
+
"\n",
|
|
37
|
+
"## Download AlphaFold files"
|
|
38
|
+
]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"cell_type": "code",
|
|
42
|
+
"execution_count": 2,
|
|
43
|
+
"id": "81e449db",
|
|
44
|
+
"metadata": {},
|
|
45
|
+
"outputs": [],
|
|
46
|
+
"source": [
|
|
47
|
+
"from protein_quest.alphafold.fetch import fetch_many_async"
|
|
48
|
+
]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"cell_type": "code",
|
|
52
|
+
"execution_count": 3,
|
|
53
|
+
"id": "5c2e6ee3",
|
|
54
|
+
"metadata": {},
|
|
55
|
+
"outputs": [],
|
|
56
|
+
"source": [
|
|
57
|
+
"save_dir = Path(\"alphafold_files\")"
|
|
58
|
+
]
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"cell_type": "markdown",
|
|
62
|
+
"id": "f38991cf",
|
|
63
|
+
"metadata": {},
|
|
64
|
+
"source": [
|
|
65
|
+
"To download the summary, the cif and the predicted aligned error document (paeDoc) files for 3 AlphaFold entries given their UniProt accessions.\n"
|
|
66
|
+
]
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"cell_type": "code",
|
|
70
|
+
"execution_count": 4,
|
|
71
|
+
"id": "e32b474a",
|
|
72
|
+
"metadata": {},
|
|
73
|
+
"outputs": [
|
|
74
|
+
{
|
|
75
|
+
"name": "stderr",
|
|
76
|
+
"output_type": "stream",
|
|
77
|
+
"text": [
|
|
78
|
+
"Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 553.10it/s]\n",
|
|
79
|
+
"Downloading AlphaFold files: 100%|██████████| 6/6 [00:00<00:00, 38245.93it/s]"
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
"name": "stdout",
|
|
84
|
+
"output_type": "stream",
|
|
85
|
+
"text": [
|
|
86
|
+
"[AlphaFoldEntry(uniprot_accession='A1YPR0',\n",
|
|
87
|
+
" summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
|
|
88
|
+
" bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.bcif'),\n",
|
|
89
|
+
" cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.cif'),\n",
|
|
90
|
+
" entityType='protein',\n",
|
|
91
|
+
" fractionPlddtConfident=0.26,\n",
|
|
92
|
+
" fractionPlddtLow=0.099,\n",
|
|
93
|
+
" fractionPlddtVeryHigh=0.089,\n",
|
|
94
|
+
" fractionPlddtVeryLow=0.553,\n",
|
|
95
|
+
" globalMetricValue=56.03,\n",
|
|
96
|
+
" isUniProt=True,\n",
|
|
97
|
+
" latestVersion=6,\n",
|
|
98
|
+
" modelCreatedDate='2025-08-01T00:00:00Z',\n",
|
|
99
|
+
" modelEntityId='AF-A1YPR0-F1',\n",
|
|
100
|
+
" paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
|
|
101
|
+
" pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.pdb'),\n",
|
|
102
|
+
" providerId='GDM',\n",
|
|
103
|
+
" sequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',\n",
|
|
104
|
+
" sequenceChecksum='73D82A34502B55BF',\n",
|
|
105
|
+
" sequenceEnd=619,\n",
|
|
106
|
+
" sequenceStart=1,\n",
|
|
107
|
+
" sequenceVersionDate='2007-02-06T00:00:00Z',\n",
|
|
108
|
+
" toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
|
|
109
|
+
" alternativeNames=None,\n",
|
|
110
|
+
" amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv'),\n",
|
|
111
|
+
" amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv'),\n",
|
|
112
|
+
" amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv'),\n",
|
|
113
|
+
" catalyticActivities=None,\n",
|
|
114
|
+
" complexName=None,\n",
|
|
115
|
+
" functions=None,\n",
|
|
116
|
+
" gene='ZBTB7C',\n",
|
|
117
|
+
" geneSynonyms=None,\n",
|
|
118
|
+
" ipSAE=None,\n",
|
|
119
|
+
" ipTM=None,\n",
|
|
120
|
+
" isUniProtReferenceProteome=True,\n",
|
|
121
|
+
" isUniProtReviewed=True,\n",
|
|
122
|
+
" keywords=None,\n",
|
|
123
|
+
" msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-A1YPR0-F1-msa_v6.a3m'),\n",
|
|
124
|
+
" organismCommonNames=None,\n",
|
|
125
|
+
" organismScientificName='Homo sapiens',\n",
|
|
126
|
+
" organismSynonyms=None,\n",
|
|
127
|
+
" plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-confidence_v6.json'),\n",
|
|
128
|
+
" proteinFullNames=None,\n",
|
|
129
|
+
" proteinShortNames=None,\n",
|
|
130
|
+
" stoichiometry=None,\n",
|
|
131
|
+
" taxId=9606,\n",
|
|
132
|
+
" taxonomyLineage=None,\n",
|
|
133
|
+
" uniprotAccession='A1YPR0',\n",
|
|
134
|
+
" uniprotDescription='Zinc finger and BTB '\n",
|
|
135
|
+
" 'domain-containing '\n",
|
|
136
|
+
" 'protein 7C',\n",
|
|
137
|
+
" uniprotId='ZBT7C_HUMAN'),\n",
|
|
138
|
+
" summary_file=PosixPath('alphafold_files/A1YPR0.json'),\n",
|
|
139
|
+
" bcif_file=None,\n",
|
|
140
|
+
" cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v6.cif'),\n",
|
|
141
|
+
" pdb_file=None,\n",
|
|
142
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
|
|
143
|
+
" am_annotations_file=None,\n",
|
|
144
|
+
" am_annotations_hg19_file=None,\n",
|
|
145
|
+
" am_annotations_hg38_file=None,\n",
|
|
146
|
+
" msa_file=None,\n",
|
|
147
|
+
" plddt_doc_file=None),\n",
|
|
148
|
+
" AlphaFoldEntry(uniprot_accession='O60481',\n",
|
|
149
|
+
" summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
|
|
150
|
+
" bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.bcif'),\n",
|
|
151
|
+
" cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.cif'),\n",
|
|
152
|
+
" entityType='protein',\n",
|
|
153
|
+
" fractionPlddtConfident=0.289,\n",
|
|
154
|
+
" fractionPlddtLow=0.107,\n",
|
|
155
|
+
" fractionPlddtVeryHigh=0.0,\n",
|
|
156
|
+
" fractionPlddtVeryLow=0.604,\n",
|
|
157
|
+
" globalMetricValue=53.88,\n",
|
|
158
|
+
" isUniProt=True,\n",
|
|
159
|
+
" latestVersion=6,\n",
|
|
160
|
+
" modelCreatedDate='2025-08-01T00:00:00Z',\n",
|
|
161
|
+
" modelEntityId='AF-O60481-F1',\n",
|
|
162
|
+
" paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
|
|
163
|
+
" pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.pdb'),\n",
|
|
164
|
+
" providerId='GDM',\n",
|
|
165
|
+
" sequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',\n",
|
|
166
|
+
" sequenceChecksum='3150CF13C0679568',\n",
|
|
167
|
+
" sequenceEnd=467,\n",
|
|
168
|
+
" sequenceStart=1,\n",
|
|
169
|
+
" sequenceVersionDate='1998-08-01T00:00:00Z',\n",
|
|
170
|
+
" toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
|
|
171
|
+
" alternativeNames=None,\n",
|
|
172
|
+
" amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv'),\n",
|
|
173
|
+
" amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv'),\n",
|
|
174
|
+
" amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv'),\n",
|
|
175
|
+
" catalyticActivities=None,\n",
|
|
176
|
+
" complexName=None,\n",
|
|
177
|
+
" functions=None,\n",
|
|
178
|
+
" gene='ZIC3',\n",
|
|
179
|
+
" geneSynonyms=None,\n",
|
|
180
|
+
" ipSAE=None,\n",
|
|
181
|
+
" ipTM=None,\n",
|
|
182
|
+
" isUniProtReferenceProteome=True,\n",
|
|
183
|
+
" isUniProtReviewed=True,\n",
|
|
184
|
+
" keywords=None,\n",
|
|
185
|
+
" msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-O60481-F1-msa_v6.a3m'),\n",
|
|
186
|
+
" organismCommonNames=None,\n",
|
|
187
|
+
" organismScientificName='Homo sapiens',\n",
|
|
188
|
+
" organismSynonyms=None,\n",
|
|
189
|
+
" plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-confidence_v6.json'),\n",
|
|
190
|
+
" proteinFullNames=None,\n",
|
|
191
|
+
" proteinShortNames=None,\n",
|
|
192
|
+
" stoichiometry=None,\n",
|
|
193
|
+
" taxId=9606,\n",
|
|
194
|
+
" taxonomyLineage=None,\n",
|
|
195
|
+
" uniprotAccession='O60481',\n",
|
|
196
|
+
" uniprotDescription='Zinc finger protein '\n",
|
|
197
|
+
" 'ZIC 3',\n",
|
|
198
|
+
" uniprotId='ZIC3_HUMAN'),\n",
|
|
199
|
+
" summary_file=PosixPath('alphafold_files/O60481.json'),\n",
|
|
200
|
+
" bcif_file=None,\n",
|
|
201
|
+
" cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v6.cif'),\n",
|
|
202
|
+
" pdb_file=None,\n",
|
|
203
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
|
|
204
|
+
" am_annotations_file=None,\n",
|
|
205
|
+
" am_annotations_hg19_file=None,\n",
|
|
206
|
+
" am_annotations_hg38_file=None,\n",
|
|
207
|
+
" msa_file=None,\n",
|
|
208
|
+
" plddt_doc_file=None),\n",
|
|
209
|
+
" AlphaFoldEntry(uniprot_accession='P50613',\n",
|
|
210
|
+
" summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
|
|
211
|
+
" bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.bcif'),\n",
|
|
212
|
+
" cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.cif'),\n",
|
|
213
|
+
" entityType='protein',\n",
|
|
214
|
+
" fractionPlddtConfident=0.127,\n",
|
|
215
|
+
" fractionPlddtLow=0.092,\n",
|
|
216
|
+
" fractionPlddtVeryHigh=0.618,\n",
|
|
217
|
+
" fractionPlddtVeryLow=0.162,\n",
|
|
218
|
+
" globalMetricValue=82.0,\n",
|
|
219
|
+
" isUniProt=True,\n",
|
|
220
|
+
" latestVersion=6,\n",
|
|
221
|
+
" modelCreatedDate='2025-08-01T00:00:00Z',\n",
|
|
222
|
+
" modelEntityId='AF-P50613-F1',\n",
|
|
223
|
+
" paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
|
|
224
|
+
" pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.pdb'),\n",
|
|
225
|
+
" providerId='GDM',\n",
|
|
226
|
+
" sequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',\n",
|
|
227
|
+
" sequenceChecksum='0A94BFA7DD416CEB',\n",
|
|
228
|
+
" sequenceEnd=346,\n",
|
|
229
|
+
" sequenceStart=1,\n",
|
|
230
|
+
" sequenceVersionDate='1996-10-01T00:00:00Z',\n",
|
|
231
|
+
" toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
|
|
232
|
+
" alternativeNames=None,\n",
|
|
233
|
+
" amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv'),\n",
|
|
234
|
+
" amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv'),\n",
|
|
235
|
+
" amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv'),\n",
|
|
236
|
+
" catalyticActivities=None,\n",
|
|
237
|
+
" complexName=None,\n",
|
|
238
|
+
" functions=None,\n",
|
|
239
|
+
" gene='CDK7',\n",
|
|
240
|
+
" geneSynonyms=None,\n",
|
|
241
|
+
" ipSAE=None,\n",
|
|
242
|
+
" ipTM=None,\n",
|
|
243
|
+
" isUniProtReferenceProteome=True,\n",
|
|
244
|
+
" isUniProtReviewed=True,\n",
|
|
245
|
+
" keywords=None,\n",
|
|
246
|
+
" msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-P50613-F1-msa_v6.a3m'),\n",
|
|
247
|
+
" organismCommonNames=None,\n",
|
|
248
|
+
" organismScientificName='Homo sapiens',\n",
|
|
249
|
+
" organismSynonyms=None,\n",
|
|
250
|
+
" plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-confidence_v6.json'),\n",
|
|
251
|
+
" proteinFullNames=None,\n",
|
|
252
|
+
" proteinShortNames=None,\n",
|
|
253
|
+
" stoichiometry=None,\n",
|
|
254
|
+
" taxId=9606,\n",
|
|
255
|
+
" taxonomyLineage=None,\n",
|
|
256
|
+
" uniprotAccession='P50613',\n",
|
|
257
|
+
" uniprotDescription='Cyclin-dependent '\n",
|
|
258
|
+
" 'kinase 7',\n",
|
|
259
|
+
" uniprotId='CDK7_HUMAN'),\n",
|
|
260
|
+
" summary_file=PosixPath('alphafold_files/P50613.json'),\n",
|
|
261
|
+
" bcif_file=None,\n",
|
|
262
|
+
" cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v6.cif'),\n",
|
|
263
|
+
" pdb_file=None,\n",
|
|
264
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
|
|
265
|
+
" am_annotations_file=None,\n",
|
|
266
|
+
" am_annotations_hg19_file=None,\n",
|
|
267
|
+
" am_annotations_hg38_file=None,\n",
|
|
268
|
+
" msa_file=None,\n",
|
|
269
|
+
" plddt_doc_file=None)]\n"
|
|
270
|
+
]
|
|
271
|
+
},
|
|
272
|
+
{
|
|
273
|
+
"name": "stderr",
|
|
274
|
+
"output_type": "stream",
|
|
275
|
+
"text": [
|
|
276
|
+
"\n"
|
|
277
|
+
]
|
|
278
|
+
}
|
|
279
|
+
],
|
|
280
|
+
"source": [
|
|
281
|
+
"summaries = [\n",
|
|
282
|
+
" s async for s in fetch_many_async([\"A1YPR0\", \"O60481\", \"P50613\"], save_dir, what={\"summary\", \"cif\", \"paeDoc\"})\n",
|
|
283
|
+
"]\n",
|
|
284
|
+
"pprint(summaries)"
|
|
285
|
+
]
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
"cell_type": "code",
|
|
289
|
+
"execution_count": 7,
|
|
290
|
+
"id": "2d3595e6",
|
|
291
|
+
"metadata": {},
|
|
292
|
+
"outputs": [
|
|
293
|
+
{
|
|
294
|
+
"name": "stdout",
|
|
295
|
+
"output_type": "stream",
|
|
296
|
+
"text": [
|
|
297
|
+
"total 4.3M\n",
|
|
298
|
+
"4.0K A1YPR0.json\n",
|
|
299
|
+
"556K AF-A1YPR0-F1-model_v6.cif\n",
|
|
300
|
+
"1.1M AF-A1YPR0-F1-predicted_aligned_error_v6.json\n",
|
|
301
|
+
"412K AF-O60481-2-F1-model_v6.cif\n",
|
|
302
|
+
"600K AF-O60481-2-F1-predicted_aligned_error_v6.json\n",
|
|
303
|
+
"412K AF-O60481-F1-model_v6.cif\n",
|
|
304
|
+
"628K AF-O60481-F1-predicted_aligned_error_v6.json\n",
|
|
305
|
+
"324K AF-P50613-F1-model_v6.cif\n",
|
|
306
|
+
"276K AF-P50613-F1-predicted_aligned_error_v6.json\n",
|
|
307
|
+
"8.0K O60481.json\n",
|
|
308
|
+
"4.0K P50613.json\n"
|
|
309
|
+
]
|
|
310
|
+
}
|
|
311
|
+
],
|
|
312
|
+
"source": [
|
|
313
|
+
"!ls -sh {save_dir}"
|
|
314
|
+
]
|
|
315
|
+
},
|
|
316
|
+
{
|
|
317
|
+
"cell_type": "markdown",
|
|
318
|
+
"id": "a43edd87",
|
|
319
|
+
"metadata": {},
|
|
320
|
+
"source": [
|
|
321
|
+
"## Filter AlphaFold structure files on confidence\n",
|
|
322
|
+
"\n",
|
|
323
|
+
"Filter AlphaFold mmCIF/PDB files by confidence (pLDDT). Files that pass are written with the residues below the threshold removed."
|
|
324
|
+
]
|
|
325
|
+
},
|
|
326
|
+
{
|
|
327
|
+
"cell_type": "code",
|
|
328
|
+
"execution_count": 10,
|
|
329
|
+
"id": "cc96c63a",
|
|
330
|
+
"metadata": {},
|
|
331
|
+
"outputs": [],
|
|
332
|
+
"source": [
|
|
333
|
+
"from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence"
|
|
334
|
+
]
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
"cell_type": "markdown",
|
|
338
|
+
"id": "724141d4",
|
|
339
|
+
"metadata": {},
|
|
340
|
+
"source": [
|
|
341
|
+
"Collect the mmCIF files that were downloaded"
|
|
342
|
+
]
|
|
343
|
+
},
|
|
344
|
+
{
|
|
345
|
+
"cell_type": "code",
|
|
346
|
+
"execution_count": 12,
|
|
347
|
+
"id": "73a61cf6",
|
|
348
|
+
"metadata": {},
|
|
349
|
+
"outputs": [
|
|
350
|
+
{
|
|
351
|
+
"data": {
|
|
352
|
+
"text/plain": [
|
|
353
|
+
"[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
|
|
354
|
+
" PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
|
|
355
|
+
" PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]"
|
|
356
|
+
]
|
|
357
|
+
},
|
|
358
|
+
"execution_count": 12,
|
|
359
|
+
"metadata": {},
|
|
360
|
+
"output_type": "execute_result"
|
|
361
|
+
}
|
|
362
|
+
],
|
|
363
|
+
"source": [
|
|
364
|
+
"input_files = [entry.cif_file for entry in summaries if entry.cif_file is not None]\n",
|
|
365
|
+
"input_files"
|
|
366
|
+
]
|
|
367
|
+
},
|
|
368
|
+
{
|
|
369
|
+
"cell_type": "markdown",
|
|
370
|
+
"id": "da8f2f67",
|
|
371
|
+
"metadata": {},
|
|
372
|
+
"source": [
|
|
373
|
+
"We only write a filtered cif file when the input file has between 100 and 1000 residues with a pLDDT score above 80."
|
|
374
|
+
]
|
|
375
|
+
},
|
|
376
|
+
{
|
|
377
|
+
"cell_type": "code",
|
|
378
|
+
"execution_count": null,
|
|
379
|
+
"id": "fbfdf472",
|
|
380
|
+
"metadata": {},
|
|
381
|
+
"outputs": [],
|
|
382
|
+
"source": [
|
|
383
|
+
"query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
|
|
384
|
+
]
|
|
385
|
+
},
|
|
386
|
+
{
|
|
387
|
+
"cell_type": "code",
|
|
388
|
+
"execution_count": 14,
|
|
389
|
+
"id": "152aec9a",
|
|
390
|
+
"metadata": {},
|
|
391
|
+
"outputs": [],
|
|
392
|
+
"source": [
|
|
393
|
+
"output_dir = Path(\"./filtered\")\n",
|
|
394
|
+
"output_dir.mkdir(exist_ok=True)\n",
|
|
395
|
+
"result = filter_files_on_confidence(input_files, query, output_dir)"
|
|
396
|
+
]
|
|
397
|
+
},
|
|
398
|
+
{
|
|
399
|
+
"cell_type": "code",
|
|
400
|
+
"execution_count": null,
|
|
401
|
+
"id": "6a6f8e3f",
|
|
402
|
+
"metadata": {},
|
|
403
|
+
"outputs": [
|
|
404
|
+
{
|
|
405
|
+
"data": {
|
|
406
|
+
"text/plain": [
|
|
407
|
+
"[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),\n",
|
|
408
|
+
" ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),\n",
|
|
409
|
+
" ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]"
|
|
410
|
+
]
|
|
411
|
+
},
|
|
412
|
+
"execution_count": 17,
|
|
413
|
+
"metadata": {},
|
|
414
|
+
"output_type": "execute_result"
|
|
415
|
+
}
|
|
416
|
+
],
|
|
417
|
+
"source": [
|
|
418
|
+
"list(\n",
|
|
419
|
+
" filter_files_on_confidence(\n",
|
|
420
|
+
" input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
|
|
421
|
+
" )\n",
|
|
422
|
+
")"
|
|
423
|
+
]
|
|
424
|
+
},
|
|
425
|
+
{
|
|
426
|
+
"cell_type": "markdown",
|
|
427
|
+
"id": "0fe1e388",
|
|
428
|
+
"metadata": {},
|
|
429
|
+
"source": [
|
|
430
|
+
"2 files have passed, but 1 file has only 76 high confidence residues, which is below the minimum of 100, so it is discarded."
|
|
431
|
+
]
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
"cell_type": "code",
|
|
435
|
+
"execution_count": null,
|
|
436
|
+
"id": "83ffc09b",
|
|
437
|
+
"metadata": {},
|
|
438
|
+
"outputs": [],
|
|
439
|
+
"source": []
|
|
440
|
+
}
|
|
441
|
+
],
|
|
442
|
+
"metadata": {
|
|
443
|
+
"kernelspec": {
|
|
444
|
+
"display_name": "protein-quest",
|
|
445
|
+
"language": "python",
|
|
446
|
+
"name": "python3"
|
|
447
|
+
},
|
|
448
|
+
"language_info": {
|
|
449
|
+
"codemirror_mode": {
|
|
450
|
+
"name": "ipython",
|
|
451
|
+
"version": 3
|
|
452
|
+
},
|
|
453
|
+
"file_extension": ".py",
|
|
454
|
+
"mimetype": "text/x-python",
|
|
455
|
+
"name": "python",
|
|
456
|
+
"nbconvert_exporter": "python",
|
|
457
|
+
"pygments_lexer": "ipython3",
|
|
458
|
+
"version": "3.13.5"
|
|
459
|
+
}
|
|
460
|
+
},
|
|
461
|
+
"nbformat": 4,
|
|
462
|
+
"nbformat_minor": 5
|
|
463
|
+
}
|