protein-quest 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest-0.7.0/.python-version +1 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/PKG-INFO +32 -6
- {protein_quest-0.6.0 → protein_quest-0.7.0}/README.md +30 -3
- protein_quest-0.7.0/docs/notebooks/alphafold.ipynb +463 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/uniprot.ipynb +1 -1
- {protein_quest-0.6.0 → protein_quest-0.7.0}/pyproject.toml +1 -2
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/__version__.py +1 -1
- protein_quest-0.7.0/src/protein_quest/alphafold/entry_summary.py +64 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/alphafold/fetch.py +53 -28
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/cli.py +263 -57
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/mcp_server.py +15 -4
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/structure.py +24 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/uniprot.py +287 -15
- protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +55567 -0
- protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_all_isoforms.yaml +51 -0
- protein_quest-0.7.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_gzipped.yaml +42326 -0
- protein_quest-0.7.0/tests/alphafold/test_entry_summary.py +16 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/alphafold/test_fetch.py +19 -2
- protein_quest-0.7.0/tests/cassettes/test_cli/test_search_pdbe.yaml +1023 -0
- protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot.yaml +64 -0
- protein_quest-0.7.0/tests/cassettes/test_cli/test_search_uniprot_details.yaml +87 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_do_not_match_external_isoform.yaml +62 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/TestSearch4AfExternalIsoforms.test_match_canonical_isoform.yaml +66 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_map_uniprot_accessions2uniprot_details.yaml +145 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_ok_sequence_length.yaml +66 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_big_sequence_length.yaml +62 -0
- protein_quest-0.7.0/tests/cassettes/test_uniprot/test_search4af_too_small_sequence_length.yaml +62 -0
- protein_quest-0.7.0/tests/test_cli.py +101 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_structure.py +28 -1
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_uniprot.py +193 -3
- {protein_quest-0.6.0 → protein_quest-0.7.0}/uv.lock +44 -65
- protein_quest-0.6.0/docs/notebooks/alphafold.ipynb +0 -384
- protein_quest-0.6.0/src/protein_quest/alphafold/entry_summary.py +0 -40
- protein_quest-0.6.0/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -6289
- protein_quest-0.6.0/tests/alphafold/cassettes/test_fetch/test_fetch_many_gzipped.yaml +0 -4789
- protein_quest-0.6.0/tests/alphafold/test_entry_summary.py +0 -12
- protein_quest-0.6.0/tests/test_cli.py +0 -14
- {protein_quest-0.6.0 → protein_quest-0.7.0}/.github/workflows/ci.yml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/.github/workflows/pages.yml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/.github/workflows/pypi-publish.yml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/.gitignore +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/.vscode/extensions.json +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/CITATION.cff +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/CODE_OF_CONDUCT.md +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/CONTRIBUTING.md +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/LICENSE +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/CONTRIBUTING.md +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/index.md +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/.gitignore +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/index.md +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/notebooks/pdbe.ipynb +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/docs/protein-quest-mcp.png +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/mkdocs.yml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/__init__.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/alphafold/__init__.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/alphafold/confidence.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/converter.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/emdb.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/filters.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/go.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/io.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/parallel.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/pdbe/__init__.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/pdbe/fetch.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/py.typed +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/ss.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/taxonomy.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/src/protein_quest/utils.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/alphafold/test_confidence.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/conftest.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/fixtures/2Y29.cif.gz +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/fixtures/3JRS_B2A.cif.gz +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/pdbe/test_fetch.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_converter.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_emdb.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_go.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_io.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_mcp.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_ss.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_taxonomy.py +0 -0
- {protein_quest-0.6.0 → protein_quest-0.7.0}/tests/test_utils.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -11,7 +11,6 @@ Requires-Python: >=3.13
|
|
|
11
11
|
Requires-Dist: aiofiles>=24.1.0
|
|
12
12
|
Requires-Dist: aiohttp-retry>=2.9.1
|
|
13
13
|
Requires-Dist: aiohttp[speedups]>=3.11.18
|
|
14
|
-
Requires-Dist: aiopath>=0.7.7
|
|
15
14
|
Requires-Dist: attrs>=25.3.0
|
|
16
15
|
Requires-Dist: cattrs[orjson]>=24.1.3
|
|
17
16
|
Requires-Dist: dask>=2025.5.1
|
|
@@ -27,7 +26,7 @@ Requires-Dist: tqdm>=4.67.1
|
|
|
27
26
|
Requires-Dist: yarl>=1.20.1
|
|
28
27
|
Provides-Extra: mcp
|
|
29
28
|
Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
|
|
30
|
-
Requires-Dist: pydantic>=2.
|
|
29
|
+
Requires-Dist: pydantic>=2.12.0; extra == 'mcp'
|
|
31
30
|
Description-Content-Type: text/markdown
|
|
32
31
|
|
|
33
32
|
# protein-quest
|
|
@@ -62,6 +61,7 @@ graph TB;
|
|
|
62
61
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
63
62
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
64
63
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
64
|
+
searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
|
|
65
65
|
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
66
66
|
searchcomplexes[/Search complexes/]
|
|
67
67
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
@@ -73,6 +73,7 @@ graph TB;
|
|
|
73
73
|
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
74
74
|
residuefilter --> |mmcif_files| ssfilter
|
|
75
75
|
ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
|
|
76
|
+
ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
|
|
76
77
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
77
78
|
goterm:::dashedBorder
|
|
78
79
|
taxonomy:::dashedBorder
|
|
@@ -80,7 +81,9 @@ graph TB;
|
|
|
80
81
|
fetchemdb:::dashedBorder
|
|
81
82
|
searchintactionpartners:::dashedBorder
|
|
82
83
|
searchcomplexes:::dashedBorder
|
|
84
|
+
searchuniprotdetails:::dashedBorder
|
|
83
85
|
convert2cif:::dashedBorder
|
|
86
|
+
convert2uniprot_accessions:::dashedBorder
|
|
84
87
|
```
|
|
85
88
|
|
|
86
89
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -111,7 +114,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
|
|
|
111
114
|
protein-quest search uniprot \
|
|
112
115
|
--taxon-id 9606 \
|
|
113
116
|
--reviewed \
|
|
114
|
-
--subcellular-location-uniprot nucleus \
|
|
117
|
+
--subcellular-location-uniprot "nucleus" \
|
|
115
118
|
--subcellular-location-go GO:0005634 \
|
|
116
119
|
--molecular-function-go GO:0003677 \
|
|
117
120
|
--limit 100 \
|
|
@@ -194,7 +197,7 @@ protein-quest filter residue \
|
|
|
194
197
|
|
|
195
198
|
### To filter on secondary structure
|
|
196
199
|
|
|
197
|
-
To filter on structure being mostly alpha helices and have no beta sheets.
|
|
200
|
+
To filter for structures that are mostly alpha helices and have no beta sheets, use the command below. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
|
|
198
201
|
|
|
199
202
|
```shell
|
|
200
203
|
protein-quest filter secondary-structure \
|
|
@@ -245,12 +248,35 @@ query_protein,complex_id,complex_url,complex_title,members
|
|
|
245
248
|
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
246
249
|
```
|
|
247
250
|
|
|
251
|
+
### Search for UniProt details
|
|
252
|
+
|
|
253
|
+
To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
|
|
254
|
+
|
|
255
|
+
```shell
|
|
256
|
+
protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
The `uniprot_details.csv` looks like:
|
|
260
|
+
|
|
261
|
+
```csv
|
|
262
|
+
uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
|
|
263
|
+
A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
|
|
264
|
+
```
|
|
265
|
+
|
|
248
266
|
### Convert structure files to .cif format
|
|
249
267
|
|
|
250
268
|
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
251
269
|
|
|
252
270
|
```shell
|
|
253
|
-
protein-quest convert --output-dir ./filtered-cif ./filtered-ss
|
|
271
|
+
protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Convert structure files to UniProt accessions
|
|
275
|
+
|
|
276
|
+
After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
|
|
277
|
+
|
|
278
|
+
```shell
|
|
279
|
+
protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
|
|
254
280
|
```
|
|
255
281
|
|
|
256
282
|
## Model Context Protocol (MCP) server
|
|
@@ -30,6 +30,7 @@ graph TB;
|
|
|
30
30
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
31
31
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
32
32
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
33
|
+
searchuniprot -. uniprot_accessions .-> searchuniprotdetails[/Search UniProt details/]
|
|
33
34
|
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
34
35
|
searchcomplexes[/Search complexes/]
|
|
35
36
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
@@ -41,6 +42,7 @@ graph TB;
|
|
|
41
42
|
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
42
43
|
residuefilter --> |mmcif_files| ssfilter
|
|
43
44
|
ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
|
|
45
|
+
ssfilter -. mmcif_files .-> convert2uniprot_accessions([Convert to UniProt accessions])
|
|
44
46
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
45
47
|
goterm:::dashedBorder
|
|
46
48
|
taxonomy:::dashedBorder
|
|
@@ -48,7 +50,9 @@ graph TB;
|
|
|
48
50
|
fetchemdb:::dashedBorder
|
|
49
51
|
searchintactionpartners:::dashedBorder
|
|
50
52
|
searchcomplexes:::dashedBorder
|
|
53
|
+
searchuniprotdetails:::dashedBorder
|
|
51
54
|
convert2cif:::dashedBorder
|
|
55
|
+
convert2uniprot_accessions:::dashedBorder
|
|
52
56
|
```
|
|
53
57
|
|
|
54
58
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -79,7 +83,7 @@ This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--cop
|
|
|
79
83
|
protein-quest search uniprot \
|
|
80
84
|
--taxon-id 9606 \
|
|
81
85
|
--reviewed \
|
|
82
|
-
--subcellular-location-uniprot nucleus \
|
|
86
|
+
--subcellular-location-uniprot "nucleus" \
|
|
83
87
|
--subcellular-location-go GO:0005634 \
|
|
84
88
|
--molecular-function-go GO:0003677 \
|
|
85
89
|
--limit 100 \
|
|
@@ -162,7 +166,7 @@ protein-quest filter residue \
|
|
|
162
166
|
|
|
163
167
|
### To filter on secondary structure
|
|
164
168
|
|
|
165
|
-
To filter on structure being mostly alpha helices and have no beta sheets.
|
|
169
|
+
To filter for structures that are mostly alpha helices and have no beta sheets, use the command below. See the following [notebook](https://www.bonvinlab.org/protein-detective/SSE_elements.html) to determine the ratio of secondary structure elements.
|
|
166
170
|
|
|
167
171
|
```shell
|
|
168
172
|
protein-quest filter secondary-structure \
|
|
@@ -213,12 +217,35 @@ query_protein,complex_id,complex_url,complex_title,members
|
|
|
213
217
|
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
214
218
|
```
|
|
215
219
|
|
|
220
|
+
### Search for UniProt details
|
|
221
|
+
|
|
222
|
+
To get details (like protein name, sequence length, organism) for a list of UniProt accessions.
|
|
223
|
+
|
|
224
|
+
```shell
|
|
225
|
+
protein-quest search uniprot-details uniprot_accs.txt uniprot_details.csv
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
The `uniprot_details.csv` looks like:
|
|
229
|
+
|
|
230
|
+
```csv
|
|
231
|
+
uniprot_accession,uniprot_id,sequence_length,reviewed,protein_name,taxon_id,taxon_name
|
|
232
|
+
A0A087WUV0,ZN892_HUMAN,522,True,Zinc finger protein 892,9606,Homo sapiens
|
|
233
|
+
```
|
|
234
|
+
|
|
216
235
|
### Convert structure files to .cif format
|
|
217
236
|
|
|
218
237
|
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
219
238
|
|
|
220
239
|
```shell
|
|
221
|
-
protein-quest convert --output-dir ./filtered-cif ./filtered-ss
|
|
240
|
+
protein-quest convert structures --format cif --output-dir ./filtered-cif ./filtered-ss
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Convert structure files to UniProt accessions
|
|
244
|
+
|
|
245
|
+
After running some filters you might want to know which UniProt accessions are still present in the filtered structures.
|
|
246
|
+
|
|
247
|
+
```shell
|
|
248
|
+
protein-quest convert uniprot ./filtered-ss uniprot_accs.filtered.txt
|
|
222
249
|
```
|
|
223
250
|
|
|
224
251
|
## Model Context Protocol (MCP) server
|
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "24b1926c",
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"source": [
|
|
8
|
+
"# AlphaFold\n",
|
|
9
|
+
"\n",
|
|
10
|
+
"You can download and filter AlphaFold files on confidence."
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"cell_type": "code",
|
|
15
|
+
"execution_count": 1,
|
|
16
|
+
"id": "681ba946",
|
|
17
|
+
"metadata": {},
|
|
18
|
+
"outputs": [],
|
|
19
|
+
"source": [
|
|
20
|
+
"# Generic imports\n",
|
|
21
|
+
"import logging\n",
|
|
22
|
+
"from pathlib import Path\n",
|
|
23
|
+
"from pprint import pprint\n",
|
|
24
|
+
"\n",
|
|
25
|
+
"logging.basicConfig(level=logging.WARNING)\n",
|
|
26
|
+
"# Set to WARNING to see only warnings\n",
|
|
27
|
+
"# Set to INFO to see sparql queries\n",
|
|
28
|
+
"# Set to DEBUG to see raw results"
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"cell_type": "markdown",
|
|
33
|
+
"id": "4959258c",
|
|
34
|
+
"metadata": {},
|
|
35
|
+
"source": [
|
|
36
|
+
"\n",
|
|
37
|
+
"## Download Alphafold files"
|
|
38
|
+
]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"cell_type": "code",
|
|
42
|
+
"execution_count": 2,
|
|
43
|
+
"id": "81e449db",
|
|
44
|
+
"metadata": {},
|
|
45
|
+
"outputs": [],
|
|
46
|
+
"source": [
|
|
47
|
+
"from protein_quest.alphafold.fetch import fetch_many_async"
|
|
48
|
+
]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"cell_type": "code",
|
|
52
|
+
"execution_count": 3,
|
|
53
|
+
"id": "5c2e6ee3",
|
|
54
|
+
"metadata": {},
|
|
55
|
+
"outputs": [],
|
|
56
|
+
"source": [
|
|
57
|
+
"save_dir = Path(\"alphafold_files\")"
|
|
58
|
+
]
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"cell_type": "markdown",
|
|
62
|
+
"id": "f38991cf",
|
|
63
|
+
"metadata": {},
|
|
64
|
+
"source": [
|
|
65
|
+
"To download the summary, the cif and predicted Aligned error document (peaDoc) file for 3 AlphaFold entries given their uniprot accessions.\n"
|
|
66
|
+
]
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"cell_type": "code",
|
|
70
|
+
"execution_count": 4,
|
|
71
|
+
"id": "e32b474a",
|
|
72
|
+
"metadata": {},
|
|
73
|
+
"outputs": [
|
|
74
|
+
{
|
|
75
|
+
"name": "stderr",
|
|
76
|
+
"output_type": "stream",
|
|
77
|
+
"text": [
|
|
78
|
+
"Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 553.10it/s]\n",
|
|
79
|
+
"Downloading AlphaFold files: 100%|██████████| 6/6 [00:00<00:00, 38245.93it/s]"
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
"name": "stdout",
|
|
84
|
+
"output_type": "stream",
|
|
85
|
+
"text": [
|
|
86
|
+
"[AlphaFoldEntry(uniprot_accession='A1YPR0',\n",
|
|
87
|
+
" summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
|
|
88
|
+
" bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.bcif'),\n",
|
|
89
|
+
" cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.cif'),\n",
|
|
90
|
+
" entityType='protein',\n",
|
|
91
|
+
" fractionPlddtConfident=0.26,\n",
|
|
92
|
+
" fractionPlddtLow=0.099,\n",
|
|
93
|
+
" fractionPlddtVeryHigh=0.089,\n",
|
|
94
|
+
" fractionPlddtVeryLow=0.553,\n",
|
|
95
|
+
" globalMetricValue=56.03,\n",
|
|
96
|
+
" isUniProt=True,\n",
|
|
97
|
+
" latestVersion=6,\n",
|
|
98
|
+
" modelCreatedDate='2025-08-01T00:00:00Z',\n",
|
|
99
|
+
" modelEntityId='AF-A1YPR0-F1',\n",
|
|
100
|
+
" paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
|
|
101
|
+
" pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.pdb'),\n",
|
|
102
|
+
" providerId='GDM',\n",
|
|
103
|
+
" sequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',\n",
|
|
104
|
+
" sequenceChecksum='73D82A34502B55BF',\n",
|
|
105
|
+
" sequenceEnd=619,\n",
|
|
106
|
+
" sequenceStart=1,\n",
|
|
107
|
+
" sequenceVersionDate='2007-02-06T00:00:00Z',\n",
|
|
108
|
+
" toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
|
|
109
|
+
" alternativeNames=None,\n",
|
|
110
|
+
" amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv'),\n",
|
|
111
|
+
" amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv'),\n",
|
|
112
|
+
" amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv'),\n",
|
|
113
|
+
" catalyticActivities=None,\n",
|
|
114
|
+
" complexName=None,\n",
|
|
115
|
+
" functions=None,\n",
|
|
116
|
+
" gene='ZBTB7C',\n",
|
|
117
|
+
" geneSynonyms=None,\n",
|
|
118
|
+
" ipSAE=None,\n",
|
|
119
|
+
" ipTM=None,\n",
|
|
120
|
+
" isUniProtReferenceProteome=True,\n",
|
|
121
|
+
" isUniProtReviewed=True,\n",
|
|
122
|
+
" keywords=None,\n",
|
|
123
|
+
" msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-A1YPR0-F1-msa_v6.a3m'),\n",
|
|
124
|
+
" organismCommonNames=None,\n",
|
|
125
|
+
" organismScientificName='Homo sapiens',\n",
|
|
126
|
+
" organismSynonyms=None,\n",
|
|
127
|
+
" plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-confidence_v6.json'),\n",
|
|
128
|
+
" proteinFullNames=None,\n",
|
|
129
|
+
" proteinShortNames=None,\n",
|
|
130
|
+
" stoichiometry=None,\n",
|
|
131
|
+
" taxId=9606,\n",
|
|
132
|
+
" taxonomyLineage=None,\n",
|
|
133
|
+
" uniprotAccession='A1YPR0',\n",
|
|
134
|
+
" uniprotDescription='Zinc finger and BTB '\n",
|
|
135
|
+
" 'domain-containing '\n",
|
|
136
|
+
" 'protein 7C',\n",
|
|
137
|
+
" uniprotId='ZBT7C_HUMAN'),\n",
|
|
138
|
+
" summary_file=PosixPath('alphafold_files/A1YPR0.json'),\n",
|
|
139
|
+
" bcif_file=None,\n",
|
|
140
|
+
" cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v6.cif'),\n",
|
|
141
|
+
" pdb_file=None,\n",
|
|
142
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),\n",
|
|
143
|
+
" am_annotations_file=None,\n",
|
|
144
|
+
" am_annotations_hg19_file=None,\n",
|
|
145
|
+
" am_annotations_hg38_file=None,\n",
|
|
146
|
+
" msa_file=None,\n",
|
|
147
|
+
" plddt_doc_file=None),\n",
|
|
148
|
+
" AlphaFoldEntry(uniprot_accession='O60481',\n",
|
|
149
|
+
" summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
|
|
150
|
+
" bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.bcif'),\n",
|
|
151
|
+
" cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.cif'),\n",
|
|
152
|
+
" entityType='protein',\n",
|
|
153
|
+
" fractionPlddtConfident=0.289,\n",
|
|
154
|
+
" fractionPlddtLow=0.107,\n",
|
|
155
|
+
" fractionPlddtVeryHigh=0.0,\n",
|
|
156
|
+
" fractionPlddtVeryLow=0.604,\n",
|
|
157
|
+
" globalMetricValue=53.88,\n",
|
|
158
|
+
" isUniProt=True,\n",
|
|
159
|
+
" latestVersion=6,\n",
|
|
160
|
+
" modelCreatedDate='2025-08-01T00:00:00Z',\n",
|
|
161
|
+
" modelEntityId='AF-O60481-F1',\n",
|
|
162
|
+
" paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
|
|
163
|
+
" pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.pdb'),\n",
|
|
164
|
+
" providerId='GDM',\n",
|
|
165
|
+
" sequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',\n",
|
|
166
|
+
" sequenceChecksum='3150CF13C0679568',\n",
|
|
167
|
+
" sequenceEnd=467,\n",
|
|
168
|
+
" sequenceStart=1,\n",
|
|
169
|
+
" sequenceVersionDate='1998-08-01T00:00:00Z',\n",
|
|
170
|
+
" toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
|
|
171
|
+
" alternativeNames=None,\n",
|
|
172
|
+
" amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv'),\n",
|
|
173
|
+
" amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv'),\n",
|
|
174
|
+
" amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv'),\n",
|
|
175
|
+
" catalyticActivities=None,\n",
|
|
176
|
+
" complexName=None,\n",
|
|
177
|
+
" functions=None,\n",
|
|
178
|
+
" gene='ZIC3',\n",
|
|
179
|
+
" geneSynonyms=None,\n",
|
|
180
|
+
" ipSAE=None,\n",
|
|
181
|
+
" ipTM=None,\n",
|
|
182
|
+
" isUniProtReferenceProteome=True,\n",
|
|
183
|
+
" isUniProtReviewed=True,\n",
|
|
184
|
+
" keywords=None,\n",
|
|
185
|
+
" msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-O60481-F1-msa_v6.a3m'),\n",
|
|
186
|
+
" organismCommonNames=None,\n",
|
|
187
|
+
" organismScientificName='Homo sapiens',\n",
|
|
188
|
+
" organismSynonyms=None,\n",
|
|
189
|
+
" plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-confidence_v6.json'),\n",
|
|
190
|
+
" proteinFullNames=None,\n",
|
|
191
|
+
" proteinShortNames=None,\n",
|
|
192
|
+
" stoichiometry=None,\n",
|
|
193
|
+
" taxId=9606,\n",
|
|
194
|
+
" taxonomyLineage=None,\n",
|
|
195
|
+
" uniprotAccession='O60481',\n",
|
|
196
|
+
" uniprotDescription='Zinc finger protein '\n",
|
|
197
|
+
" 'ZIC 3',\n",
|
|
198
|
+
" uniprotId='ZIC3_HUMAN'),\n",
|
|
199
|
+
" summary_file=PosixPath('alphafold_files/O60481.json'),\n",
|
|
200
|
+
" bcif_file=None,\n",
|
|
201
|
+
" cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v6.cif'),\n",
|
|
202
|
+
" pdb_file=None,\n",
|
|
203
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v6.json'),\n",
|
|
204
|
+
" am_annotations_file=None,\n",
|
|
205
|
+
" am_annotations_hg19_file=None,\n",
|
|
206
|
+
" am_annotations_hg38_file=None,\n",
|
|
207
|
+
" msa_file=None,\n",
|
|
208
|
+
" plddt_doc_file=None),\n",
|
|
209
|
+
" AlphaFoldEntry(uniprot_accession='P50613',\n",
|
|
210
|
+
" summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],\n",
|
|
211
|
+
" bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.bcif'),\n",
|
|
212
|
+
" cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.cif'),\n",
|
|
213
|
+
" entityType='protein',\n",
|
|
214
|
+
" fractionPlddtConfident=0.127,\n",
|
|
215
|
+
" fractionPlddtLow=0.092,\n",
|
|
216
|
+
" fractionPlddtVeryHigh=0.618,\n",
|
|
217
|
+
" fractionPlddtVeryLow=0.162,\n",
|
|
218
|
+
" globalMetricValue=82.0,\n",
|
|
219
|
+
" isUniProt=True,\n",
|
|
220
|
+
" latestVersion=6,\n",
|
|
221
|
+
" modelCreatedDate='2025-08-01T00:00:00Z',\n",
|
|
222
|
+
" modelEntityId='AF-P50613-F1',\n",
|
|
223
|
+
" paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
|
|
224
|
+
" pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.pdb'),\n",
|
|
225
|
+
" providerId='GDM',\n",
|
|
226
|
+
" sequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',\n",
|
|
227
|
+
" sequenceChecksum='0A94BFA7DD416CEB',\n",
|
|
228
|
+
" sequenceEnd=346,\n",
|
|
229
|
+
" sequenceStart=1,\n",
|
|
230
|
+
" sequenceVersionDate='1996-10-01T00:00:00Z',\n",
|
|
231
|
+
" toolUsed='AlphaFold Monomer v2.0 pipeline',\n",
|
|
232
|
+
" alternativeNames=None,\n",
|
|
233
|
+
" amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv'),\n",
|
|
234
|
+
" amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv'),\n",
|
|
235
|
+
" amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv'),\n",
|
|
236
|
+
" catalyticActivities=None,\n",
|
|
237
|
+
" complexName=None,\n",
|
|
238
|
+
" functions=None,\n",
|
|
239
|
+
" gene='CDK7',\n",
|
|
240
|
+
" geneSynonyms=None,\n",
|
|
241
|
+
" ipSAE=None,\n",
|
|
242
|
+
" ipTM=None,\n",
|
|
243
|
+
" isUniProtReferenceProteome=True,\n",
|
|
244
|
+
" isUniProtReviewed=True,\n",
|
|
245
|
+
" keywords=None,\n",
|
|
246
|
+
" msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-P50613-F1-msa_v6.a3m'),\n",
|
|
247
|
+
" organismCommonNames=None,\n",
|
|
248
|
+
" organismScientificName='Homo sapiens',\n",
|
|
249
|
+
" organismSynonyms=None,\n",
|
|
250
|
+
" plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-confidence_v6.json'),\n",
|
|
251
|
+
" proteinFullNames=None,\n",
|
|
252
|
+
" proteinShortNames=None,\n",
|
|
253
|
+
" stoichiometry=None,\n",
|
|
254
|
+
" taxId=9606,\n",
|
|
255
|
+
" taxonomyLineage=None,\n",
|
|
256
|
+
" uniprotAccession='P50613',\n",
|
|
257
|
+
" uniprotDescription='Cyclin-dependent '\n",
|
|
258
|
+
" 'kinase 7',\n",
|
|
259
|
+
" uniprotId='CDK7_HUMAN'),\n",
|
|
260
|
+
" summary_file=PosixPath('alphafold_files/P50613.json'),\n",
|
|
261
|
+
" bcif_file=None,\n",
|
|
262
|
+
" cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v6.cif'),\n",
|
|
263
|
+
" pdb_file=None,\n",
|
|
264
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v6.json'),\n",
|
|
265
|
+
" am_annotations_file=None,\n",
|
|
266
|
+
" am_annotations_hg19_file=None,\n",
|
|
267
|
+
" am_annotations_hg38_file=None,\n",
|
|
268
|
+
" msa_file=None,\n",
|
|
269
|
+
" plddt_doc_file=None)]\n"
|
|
270
|
+
]
|
|
271
|
+
},
|
|
272
|
+
{
|
|
273
|
+
"name": "stderr",
|
|
274
|
+
"output_type": "stream",
|
|
275
|
+
"text": [
|
|
276
|
+
"\n"
|
|
277
|
+
]
|
|
278
|
+
}
|
|
279
|
+
],
|
|
280
|
+
"source": [
|
|
281
|
+
"summaries = [\n",
|
|
282
|
+
" s async for s in fetch_many_async([\"A1YPR0\", \"O60481\", \"P50613\"], save_dir, what={\"summary\", \"cif\", \"paeDoc\"})\n",
|
|
283
|
+
"]\n",
|
|
284
|
+
"pprint(summaries)"
|
|
285
|
+
]
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
"cell_type": "code",
|
|
289
|
+
"execution_count": 7,
|
|
290
|
+
"id": "2d3595e6",
|
|
291
|
+
"metadata": {},
|
|
292
|
+
"outputs": [
|
|
293
|
+
{
|
|
294
|
+
"name": "stdout",
|
|
295
|
+
"output_type": "stream",
|
|
296
|
+
"text": [
|
|
297
|
+
"total 4.3M\n",
|
|
298
|
+
"4.0K A1YPR0.json\n",
|
|
299
|
+
"556K AF-A1YPR0-F1-model_v6.cif\n",
|
|
300
|
+
"1.1M AF-A1YPR0-F1-predicted_aligned_error_v6.json\n",
|
|
301
|
+
"412K AF-O60481-2-F1-model_v6.cif\n",
|
|
302
|
+
"600K AF-O60481-2-F1-predicted_aligned_error_v6.json\n",
|
|
303
|
+
"412K AF-O60481-F1-model_v6.cif\n",
|
|
304
|
+
"628K AF-O60481-F1-predicted_aligned_error_v6.json\n",
|
|
305
|
+
"324K AF-P50613-F1-model_v6.cif\n",
|
|
306
|
+
"276K AF-P50613-F1-predicted_aligned_error_v6.json\n",
|
|
307
|
+
"8.0K O60481.json\n",
|
|
308
|
+
"4.0K P50613.json\n"
|
|
309
|
+
]
|
|
310
|
+
}
|
|
311
|
+
],
|
|
312
|
+
"source": [
|
|
313
|
+
"!ls -sh {save_dir}"
|
|
314
|
+
]
|
|
315
|
+
},
|
|
316
|
+
{
|
|
317
|
+
"cell_type": "markdown",
|
|
318
|
+
"id": "a43edd87",
|
|
319
|
+
"metadata": {},
|
|
320
|
+
"source": [
|
|
321
|
+
"## Filter AlphaFold structure files on confidence\n",
|
|
322
|
+
"\n",
|
|
323
|
+
"Filter AlphaFold mmCIF/PDB files by confidence (pLDDT). Files that pass the filter are written with residues below the threshold removed."
|
|
324
|
+
]
|
|
325
|
+
},
|
|
326
|
+
{
|
|
327
|
+
"cell_type": "code",
|
|
328
|
+
"execution_count": 10,
|
|
329
|
+
"id": "cc96c63a",
|
|
330
|
+
"metadata": {},
|
|
331
|
+
"outputs": [],
|
|
332
|
+
"source": [
|
|
333
|
+
"from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence"
|
|
334
|
+
]
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
"cell_type": "markdown",
|
|
338
|
+
"id": "724141d4",
|
|
339
|
+
"metadata": {},
|
|
340
|
+
"source": [
|
|
341
|
+
"Collect the downloaded cif files"
|
|
342
|
+
]
|
|
343
|
+
},
|
|
344
|
+
{
|
|
345
|
+
"cell_type": "code",
|
|
346
|
+
"execution_count": 12,
|
|
347
|
+
"id": "73a61cf6",
|
|
348
|
+
"metadata": {},
|
|
349
|
+
"outputs": [
|
|
350
|
+
{
|
|
351
|
+
"data": {
|
|
352
|
+
"text/plain": [
|
|
353
|
+
"[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
|
|
354
|
+
" PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
|
|
355
|
+
" PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]"
|
|
356
|
+
]
|
|
357
|
+
},
|
|
358
|
+
"execution_count": 12,
|
|
359
|
+
"metadata": {},
|
|
360
|
+
"output_type": "execute_result"
|
|
361
|
+
}
|
|
362
|
+
],
|
|
363
|
+
"source": [
|
|
364
|
+
"input_files = [entry.cif_file for entry in summaries if entry.cif_file is not None]\n",
|
|
365
|
+
"input_files"
|
|
366
|
+
]
|
|
367
|
+
},
|
|
368
|
+
{
|
|
369
|
+
"cell_type": "markdown",
|
|
370
|
+
"id": "da8f2f67",
|
|
371
|
+
"metadata": {},
|
|
372
|
+
"source": [
|
|
373
|
+
"We only write a filtered cif file when the input file has between 100 and 1000 residues with a pLDDT score above 80."
|
|
374
|
+
]
|
|
375
|
+
},
|
|
376
|
+
{
|
|
377
|
+
"cell_type": "code",
|
|
378
|
+
"execution_count": null,
|
|
379
|
+
"id": "fbfdf472",
|
|
380
|
+
"metadata": {},
|
|
381
|
+
"outputs": [],
|
|
382
|
+
"source": [
|
|
383
|
+
"query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
|
|
384
|
+
]
|
|
385
|
+
},
|
|
386
|
+
{
|
|
387
|
+
"cell_type": "code",
|
|
388
|
+
"execution_count": 14,
|
|
389
|
+
"id": "152aec9a",
|
|
390
|
+
"metadata": {},
|
|
391
|
+
"outputs": [],
|
|
392
|
+
"source": [
|
|
393
|
+
"output_dir = Path(\"./filtered\")\n",
|
|
394
|
+
"output_dir.mkdir(exist_ok=True)\n",
|
|
395
|
+
"result = filter_files_on_confidence(input_files, query, output_dir)"
|
|
396
|
+
]
|
|
397
|
+
},
|
|
398
|
+
{
|
|
399
|
+
"cell_type": "code",
|
|
400
|
+
"execution_count": null,
|
|
401
|
+
"id": "6a6f8e3f",
|
|
402
|
+
"metadata": {},
|
|
403
|
+
"outputs": [
|
|
404
|
+
{
|
|
405
|
+
"data": {
|
|
406
|
+
"text/plain": [
|
|
407
|
+
"[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),\n",
|
|
408
|
+
" ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),\n",
|
|
409
|
+
" ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]"
|
|
410
|
+
]
|
|
411
|
+
},
|
|
412
|
+
"execution_count": 17,
|
|
413
|
+
"metadata": {},
|
|
414
|
+
"output_type": "execute_result"
|
|
415
|
+
}
|
|
416
|
+
],
|
|
417
|
+
"source": [
|
|
418
|
+
"list(\n",
|
|
419
|
+
" filter_files_on_confidence(\n",
|
|
420
|
+
" input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
|
|
421
|
+
" )\n",
|
|
422
|
+
")"
|
|
423
|
+
]
|
|
424
|
+
},
|
|
425
|
+
{
|
|
426
|
+
"cell_type": "markdown",
|
|
427
|
+
"id": "0fe1e388",
|
|
428
|
+
"metadata": {},
|
|
429
|
+
"source": [
|
|
430
|
+
"2 files have passed, but 1 file has only 76 high-confidence residues, so it is discarded."
|
|
431
|
+
]
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
"cell_type": "code",
|
|
435
|
+
"execution_count": null,
|
|
436
|
+
"id": "83ffc09b",
|
|
437
|
+
"metadata": {},
|
|
438
|
+
"outputs": [],
|
|
439
|
+
"source": []
|
|
440
|
+
}
|
|
441
|
+
],
|
|
442
|
+
"metadata": {
|
|
443
|
+
"kernelspec": {
|
|
444
|
+
"display_name": "protein-quest",
|
|
445
|
+
"language": "python",
|
|
446
|
+
"name": "python3"
|
|
447
|
+
},
|
|
448
|
+
"language_info": {
|
|
449
|
+
"codemirror_mode": {
|
|
450
|
+
"name": "ipython",
|
|
451
|
+
"version": 3
|
|
452
|
+
},
|
|
453
|
+
"file_extension": ".py",
|
|
454
|
+
"mimetype": "text/x-python",
|
|
455
|
+
"name": "python",
|
|
456
|
+
"nbconvert_exporter": "python",
|
|
457
|
+
"pygments_lexer": "ipython3",
|
|
458
|
+
"version": "3.13.5"
|
|
459
|
+
}
|
|
460
|
+
},
|
|
461
|
+
"nbformat": 4,
|
|
462
|
+
"nbformat_minor": 5
|
|
463
|
+
}
|