protein-quest 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {protein_quest-0.3.0 → protein_quest-0.3.2}/.github/workflows/ci.yml +16 -1
- {protein_quest-0.3.0 → protein_quest-0.3.2}/CITATION.cff +1 -2
- {protein_quest-0.3.0 → protein_quest-0.3.2}/CONTRIBUTING.md +15 -1
- {protein_quest-0.3.0 → protein_quest-0.3.2}/PKG-INFO +21 -11
- {protein_quest-0.3.0 → protein_quest-0.3.2}/README.md +19 -6
- protein_quest-0.3.2/docs/notebooks/.gitignore +4 -0
- protein_quest-0.3.2/docs/notebooks/alphafold.ipynb +384 -0
- protein_quest-0.3.2/docs/notebooks/index.md +3 -0
- protein_quest-0.3.2/docs/notebooks/pdbe.ipynb +278 -0
- protein_quest-0.3.2/docs/notebooks/uniprot.ipynb +308 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/mkdocs.yml +13 -4
- {protein_quest-0.3.0 → protein_quest-0.3.2}/pyproject.toml +12 -16
- protein_quest-0.3.2/src/protein_quest/__version__.py +2 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/confidence.py +44 -17
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/entry_summary.py +11 -9
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/fetch.py +37 -63
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/cli.py +187 -30
- protein_quest-0.3.2/src/protein_quest/converter.py +45 -0
- protein_quest-0.3.2/src/protein_quest/filters.py +150 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/go.py +1 -4
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/mcp_server.py +8 -5
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/parallel.py +37 -1
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/pdbe/fetch.py +15 -1
- protein_quest-0.3.2/src/protein_quest/pdbe/io.py +281 -0
- protein_quest-0.3.2/src/protein_quest/ss.py +264 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/taxonomy.py +13 -3
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/utils.py +65 -3
- protein_quest-0.3.2/tests/alphafold/test_confidence.py +155 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/test_entry_summary.py +1 -4
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/test_fetch.py +1 -1
- protein_quest-0.3.2/tests/fixtures/3JRS_B2A.cif.gz +0 -0
- protein_quest-0.3.2/tests/pdbe/test_fetch.py +29 -0
- protein_quest-0.3.2/tests/pdbe/test_io.py +142 -0
- protein_quest-0.3.2/tests/test_ss.py +227 -0
- protein_quest-0.3.2/tests/test_utils.py +31 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/uv.lock +76 -695
- protein_quest-0.3.0/src/protein_quest/__version__.py +0 -1
- protein_quest-0.3.0/src/protein_quest/filters.py +0 -107
- protein_quest-0.3.0/src/protein_quest/pdbe/io.py +0 -185
- protein_quest-0.3.0/tests/alphafold/test_confidence.py +0 -63
- protein_quest-0.3.0/tests/pdbe/test_fetch.py +0 -17
- protein_quest-0.3.0/tests/pdbe/test_io.py +0 -81
- {protein_quest-0.3.0 → protein_quest-0.3.2}/.github/workflows/pages.yml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/.github/workflows/pypi-publish.yml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/.gitignore +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/.vscode/extensions.json +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/CODE_OF_CONDUCT.md +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/LICENSE +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/CONTRIBUTING.md +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/cli_doc_hook.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/index.md +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/protein-quest-mcp.png +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/__init__.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/__init__.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/emdb.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/pdbe/__init__.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/py.typed +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/uniprot.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/pdbe/fixtures/2y29.cif +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_cli.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_emdb.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_go.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_mcp.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_taxonomy.py +0 -0
- {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_uniprot.py +0 -0
|
@@ -3,7 +3,7 @@ name: CI
|
|
|
3
3
|
on:
|
|
4
4
|
push:
|
|
5
5
|
branches:
|
|
6
|
-
|
|
6
|
+
- main
|
|
7
7
|
pull_request:
|
|
8
8
|
|
|
9
9
|
concurrency:
|
|
@@ -70,3 +70,18 @@ jobs:
|
|
|
70
70
|
run: uv sync --locked --dev --extra mcp
|
|
71
71
|
- name: Run type checkers
|
|
72
72
|
run: uv run pyrefly check src tests
|
|
73
|
+
typing-docs:
|
|
74
|
+
name: typing-docs
|
|
75
|
+
runs-on: ubuntu-latest
|
|
76
|
+
steps:
|
|
77
|
+
- uses: actions/checkout@v4
|
|
78
|
+
- name: Install uv
|
|
79
|
+
uses: astral-sh/setup-uv@v6
|
|
80
|
+
- name: Install the project
|
|
81
|
+
run: uv sync --group docs-type
|
|
82
|
+
- name: Convert notebooks to Python scripts
|
|
83
|
+
run: |
|
|
84
|
+
find docs/ -name "*.ipynb" -exec uv run --group docs-type marimo convert {} -o {}.py \;
|
|
85
|
+
- name: Run type checkers on docs
|
|
86
|
+
run: uv run --group docs-type pyrefly check docs/notebooks/*.ipynb.py
|
|
87
|
+
|
|
@@ -82,9 +82,23 @@ uv run mkdocs build
|
|
|
82
82
|
python3 -m http.server -d site
|
|
83
83
|
```
|
|
84
84
|
|
|
85
|
+
<details>
|
|
86
|
+
<summary>Type checking notebooks</summary>
|
|
87
|
+
|
|
88
|
+
[Pyrefly](https://pyrefly.org/) does not support notebooks yet, so we need to convert them to python scripts and then run pyrefly on them.
|
|
89
|
+
|
|
90
|
+
```shell
|
|
91
|
+
find docs/ -name "*.ipynb" -exec uv run --group docs-type marimo convert {} -o {}.py \;
|
|
92
|
+
uv run --group docs-type pyrefly check docs/notebooks/*.ipynb.py
|
|
93
|
+
rm docs/notebooks/*.ipynb.py
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
</details>
|
|
97
|
+
|
|
98
|
+
|
|
85
99
|
## Contributing to tests
|
|
86
100
|
|
|
87
|
-
The code coverage
|
|
101
|
+
The code coverage is stored at [https://app.codacy.com/gh/haddocking/protein-quest/coverage](https://app.codacy.com/gh/haddocking/protein-quest/coverage) .
|
|
88
102
|
|
|
89
103
|
The search functions of the protein-quest package talk to web services on the Internet.
|
|
90
104
|
To have fast tests we use [pytest-recording](https://github.com/kiwicom/pytest-recording) to record and replay HTTP interactions.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -13,19 +13,16 @@ Requires-Dist: aiohttp-retry>=2.9.1
|
|
|
13
13
|
Requires-Dist: aiohttp[speedups]>=3.11.18
|
|
14
14
|
Requires-Dist: aiopath>=0.7.7
|
|
15
15
|
Requires-Dist: attrs>=25.3.0
|
|
16
|
-
Requires-Dist: bokeh>=3.7.3
|
|
17
16
|
Requires-Dist: cattrs[orjson]>=24.1.3
|
|
18
17
|
Requires-Dist: dask>=2025.5.1
|
|
19
18
|
Requires-Dist: distributed>=2025.5.1
|
|
20
19
|
Requires-Dist: gemmi>=0.7.3
|
|
21
|
-
Requires-Dist: molviewspec>=1.6.0
|
|
22
|
-
Requires-Dist: pandas>=2.3.0
|
|
23
|
-
Requires-Dist: platformdirs>=4.3.8
|
|
24
20
|
Requires-Dist: psutil>=7.0.0
|
|
25
21
|
Requires-Dist: rich-argparse>=1.7.1
|
|
26
22
|
Requires-Dist: rich>=14.0.0
|
|
27
23
|
Requires-Dist: sparqlwrapper>=2.0.0
|
|
28
24
|
Requires-Dist: tqdm>=4.67.1
|
|
25
|
+
Requires-Dist: yarl>=1.20.1
|
|
29
26
|
Provides-Extra: mcp
|
|
30
27
|
Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
|
|
31
28
|
Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
|
|
@@ -37,8 +34,7 @@ Description-Content-Type: text/markdown
|
|
|
37
34
|
[](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml)
|
|
38
35
|
[](https://www.research-software.nl/software/protein-quest)
|
|
39
36
|
[](https://pypi.org/project/protein-quest/)
|
|
40
|
-
|
|
41
|
-
[](https://doi.org/10.5281/zenodo.15632658)
|
|
37
|
+
[](https://doi.org/10.5281/zenodo.16941288)
|
|
42
38
|
[](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)
|
|
43
39
|
|
|
44
40
|
Python package to search/retrieve/filter proteins and protein structures.
|
|
@@ -63,9 +59,11 @@ graph TB;
|
|
|
63
59
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
64
60
|
searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
|
|
65
61
|
searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
|
|
66
|
-
fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
|
|
67
|
-
chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
|
|
68
|
-
fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
|
|
62
|
+
fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
|
|
63
|
+
chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
|
|
64
|
+
fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
|
|
65
|
+
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
66
|
+
residuefilter --> |mmcif_files| ssfilter
|
|
69
67
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
70
68
|
goterm:::dashedBorder
|
|
71
69
|
taxonomy:::dashedBorder
|
|
@@ -90,7 +88,7 @@ pip install git+https://github.com/haddocking/protein-quest.git
|
|
|
90
88
|
|
|
91
89
|
The main entry point is the `protein-quest` command line tool which has multiple subcommands to perform actions.
|
|
92
90
|
|
|
93
|
-
To use programmaticly, see [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
|
|
91
|
+
To use programmatically, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
|
|
94
92
|
|
|
95
93
|
### Search Uniprot accessions
|
|
96
94
|
|
|
@@ -179,6 +177,18 @@ protein-quest filter residue \
|
|
|
179
177
|
./filtered-chains ./filtered
|
|
180
178
|
```
|
|
181
179
|
|
|
180
|
+
### To filter on secondary structure
|
|
181
|
+
|
|
182
|
+
To filter on structures that are mostly alpha helices and have no beta sheets.
|
|
183
|
+
|
|
184
|
+
```shell
|
|
185
|
+
protein-quest filter secondary-structure \
|
|
186
|
+
--ratio-min-helix-residues 0.5 \
|
|
187
|
+
--ratio-max-sheet-residues 0.0 \
|
|
188
|
+
--write-stats filtered-ss/stats.csv \
|
|
189
|
+
./filtered-chains ./filtered-ss
|
|
190
|
+
```
|
|
191
|
+
|
|
182
192
|
### Search Taxonomy
|
|
183
193
|
|
|
184
194
|
```shell
|
|
@@ -4,8 +4,7 @@
|
|
|
4
4
|
[](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml)
|
|
5
5
|
[](https://www.research-software.nl/software/protein-quest)
|
|
6
6
|
[](https://pypi.org/project/protein-quest/)
|
|
7
|
-
|
|
8
|
-
[](https://doi.org/10.5281/zenodo.15632658)
|
|
7
|
+
[](https://doi.org/10.5281/zenodo.16941288)
|
|
9
8
|
[](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)
|
|
10
9
|
|
|
11
10
|
Python package to search/retrieve/filter proteins and protein structures.
|
|
@@ -30,9 +29,11 @@ graph TB;
|
|
|
30
29
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
31
30
|
searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
|
|
32
31
|
searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
|
|
33
|
-
fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
|
|
34
|
-
chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
|
|
35
|
-
fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
|
|
32
|
+
fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
|
|
33
|
+
chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
|
|
34
|
+
fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
|
|
35
|
+
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
36
|
+
residuefilter --> |mmcif_files| ssfilter
|
|
36
37
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
37
38
|
goterm:::dashedBorder
|
|
38
39
|
taxonomy:::dashedBorder
|
|
@@ -57,7 +58,7 @@ pip install git+https://github.com/haddocking/protein-quest.git
|
|
|
57
58
|
|
|
58
59
|
The main entry point is the `protein-quest` command line tool which has multiple subcommands to perform actions.
|
|
59
60
|
|
|
60
|
-
To use programmaticly, see [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
|
|
61
|
+
To use programmatically, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
|
|
61
62
|
|
|
62
63
|
### Search Uniprot accessions
|
|
63
64
|
|
|
@@ -146,6 +147,18 @@ protein-quest filter residue \
|
|
|
146
147
|
./filtered-chains ./filtered
|
|
147
148
|
```
|
|
148
149
|
|
|
150
|
+
### To filter on secondary structure
|
|
151
|
+
|
|
152
|
+
To filter on structures that are mostly alpha helices and have no beta sheets.
|
|
153
|
+
|
|
154
|
+
```shell
|
|
155
|
+
protein-quest filter secondary-structure \
|
|
156
|
+
--ratio-min-helix-residues 0.5 \
|
|
157
|
+
--ratio-max-sheet-residues 0.0 \
|
|
158
|
+
--write-stats filtered-ss/stats.csv \
|
|
159
|
+
./filtered-chains ./filtered-ss
|
|
160
|
+
```
|
|
161
|
+
|
|
149
162
|
### Search Taxonomy
|
|
150
163
|
|
|
151
164
|
```shell
|
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "24b1926c",
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"source": [
|
|
8
|
+
"# AlphaFold\n",
|
|
9
|
+
"\n",
|
|
10
|
+
"You can download and filter AlphaFold files on confidence."
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"cell_type": "code",
|
|
15
|
+
"execution_count": 1,
|
|
16
|
+
"id": "681ba946",
|
|
17
|
+
"metadata": {},
|
|
18
|
+
"outputs": [],
|
|
19
|
+
"source": [
|
|
20
|
+
"# Generic imports\n",
|
|
21
|
+
"import logging\n",
|
|
22
|
+
"from pathlib import Path\n",
|
|
23
|
+
"from pprint import pprint\n",
|
|
24
|
+
"\n",
|
|
25
|
+
"logging.basicConfig(level=logging.WARNING)\n",
|
|
26
|
+
"# Set to WARNING to see only warnings\n",
|
|
27
|
+
"# Set to INFO to see sparql queries\n",
|
|
28
|
+
"# Set to DEBUG to see raw results"
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"cell_type": "markdown",
|
|
33
|
+
"id": "4959258c",
|
|
34
|
+
"metadata": {},
|
|
35
|
+
"source": [
|
|
36
|
+
"\n",
|
|
37
|
+
"## Download AlphaFold files"
|
|
38
|
+
]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"cell_type": "code",
|
|
42
|
+
"execution_count": 2,
|
|
43
|
+
"id": "81e449db",
|
|
44
|
+
"metadata": {},
|
|
45
|
+
"outputs": [],
|
|
46
|
+
"source": [
|
|
47
|
+
"from protein_quest.alphafold.fetch import fetch_many_async"
|
|
48
|
+
]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"cell_type": "code",
|
|
52
|
+
"execution_count": 3,
|
|
53
|
+
"id": "5c2e6ee3",
|
|
54
|
+
"metadata": {},
|
|
55
|
+
"outputs": [],
|
|
56
|
+
"source": [
|
|
57
|
+
"save_dir = Path(\"alphafold_files\")"
|
|
58
|
+
]
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"cell_type": "markdown",
|
|
62
|
+
"id": "f38991cf",
|
|
63
|
+
"metadata": {},
|
|
64
|
+
"source": [
|
|
65
|
+
"Download the summary, the cif file, the predicted aligned error document (paeDoc) and the pdb file for 3 AlphaFold entries given their uniprot accessions.\n"
|
|
66
|
+
]
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"cell_type": "code",
|
|
70
|
+
"execution_count": 8,
|
|
71
|
+
"id": "e32b474a",
|
|
72
|
+
"metadata": {},
|
|
73
|
+
"outputs": [
|
|
74
|
+
{
|
|
75
|
+
"name": "stderr",
|
|
76
|
+
"output_type": "stream",
|
|
77
|
+
"text": [
|
|
78
|
+
"Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 8.07it/s]\n",
|
|
79
|
+
"Downloading AlphaFold files: 100%|██████████| 9/9 [00:00<00:00, 55.82it/s]"
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
"name": "stdout",
|
|
84
|
+
"output_type": "stream",
|
|
85
|
+
"text": [
|
|
86
|
+
"[AlphaFoldEntry(uniprot_acc='A1YPR0',\n",
|
|
87
|
+
" summary=EntrySummary(entryId='AF-A1YPR0-F1',\n",
|
|
88
|
+
" uniprotAccession='A1YPR0',\n",
|
|
89
|
+
" uniprotId='ZBT7C_HUMAN',\n",
|
|
90
|
+
" uniprotDescription='Zinc finger and BTB '\n",
|
|
91
|
+
" 'domain-containing '\n",
|
|
92
|
+
" 'protein 7C',\n",
|
|
93
|
+
" taxId=9606,\n",
|
|
94
|
+
" organismScientificName='Homo sapiens',\n",
|
|
95
|
+
" uniprotStart=1,\n",
|
|
96
|
+
" uniprotEnd=619,\n",
|
|
97
|
+
" uniprotSequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',\n",
|
|
98
|
+
" modelCreatedDate='2022-06-01T00:00:00Z',\n",
|
|
99
|
+
" latestVersion=4,\n",
|
|
100
|
+
" allVersions=[1, 2, 3, 4],\n",
|
|
101
|
+
" bcifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.bcif',\n",
|
|
102
|
+
" cifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.cif',\n",
|
|
103
|
+
" pdbUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.pdb',\n",
|
|
104
|
+
" paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.png',\n",
|
|
105
|
+
" paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.json',\n",
|
|
106
|
+
" gene='ZBTB7C',\n",
|
|
107
|
+
" sequenceChecksum='73D82A34502B55BF',\n",
|
|
108
|
+
" sequenceVersionDate='2007-02-06T00:00:00Z',\n",
|
|
109
|
+
" amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv',\n",
|
|
110
|
+
" amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv',\n",
|
|
111
|
+
" amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv',\n",
|
|
112
|
+
" isReviewed=True,\n",
|
|
113
|
+
" isReferenceProteome=True),\n",
|
|
114
|
+
" bcif_file=None,\n",
|
|
115
|
+
" cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
|
|
116
|
+
" pdb_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.pdb'),\n",
|
|
117
|
+
" pae_image_file=None,\n",
|
|
118
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v4.json'),\n",
|
|
119
|
+
" am_annotations_file=None,\n",
|
|
120
|
+
" am_annotations_hg19_file=None,\n",
|
|
121
|
+
" am_annotations_hg38_file=None),\n",
|
|
122
|
+
" AlphaFoldEntry(uniprot_acc='O60481',\n",
|
|
123
|
+
" summary=EntrySummary(entryId='AF-O60481-F1',\n",
|
|
124
|
+
" uniprotAccession='O60481',\n",
|
|
125
|
+
" uniprotId='ZIC3_HUMAN',\n",
|
|
126
|
+
" uniprotDescription='Zinc finger protein '\n",
|
|
127
|
+
" 'ZIC 3',\n",
|
|
128
|
+
" taxId=9606,\n",
|
|
129
|
+
" organismScientificName='Homo sapiens',\n",
|
|
130
|
+
" uniprotStart=1,\n",
|
|
131
|
+
" uniprotEnd=467,\n",
|
|
132
|
+
" uniprotSequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',\n",
|
|
133
|
+
" modelCreatedDate='2022-06-01T00:00:00Z',\n",
|
|
134
|
+
" latestVersion=4,\n",
|
|
135
|
+
" allVersions=[1, 2, 3, 4],\n",
|
|
136
|
+
" bcifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.bcif',\n",
|
|
137
|
+
" cifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.cif',\n",
|
|
138
|
+
" pdbUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.pdb',\n",
|
|
139
|
+
" paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.png',\n",
|
|
140
|
+
" paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.json',\n",
|
|
141
|
+
" gene='ZIC3',\n",
|
|
142
|
+
" sequenceChecksum='3150CF13C0679568',\n",
|
|
143
|
+
" sequenceVersionDate='1998-08-01T00:00:00Z',\n",
|
|
144
|
+
" amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv',\n",
|
|
145
|
+
" amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv',\n",
|
|
146
|
+
" amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv',\n",
|
|
147
|
+
" isReviewed=True,\n",
|
|
148
|
+
" isReferenceProteome=True),\n",
|
|
149
|
+
" bcif_file=None,\n",
|
|
150
|
+
" cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
|
|
151
|
+
" pdb_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.pdb'),\n",
|
|
152
|
+
" pae_image_file=None,\n",
|
|
153
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v4.json'),\n",
|
|
154
|
+
" am_annotations_file=None,\n",
|
|
155
|
+
" am_annotations_hg19_file=None,\n",
|
|
156
|
+
" am_annotations_hg38_file=None),\n",
|
|
157
|
+
" AlphaFoldEntry(uniprot_acc='P50613',\n",
|
|
158
|
+
" summary=EntrySummary(entryId='AF-P50613-F1',\n",
|
|
159
|
+
" uniprotAccession='P50613',\n",
|
|
160
|
+
" uniprotId='CDK7_HUMAN',\n",
|
|
161
|
+
" uniprotDescription='Cyclin-dependent '\n",
|
|
162
|
+
" 'kinase 7',\n",
|
|
163
|
+
" taxId=9606,\n",
|
|
164
|
+
" organismScientificName='Homo sapiens',\n",
|
|
165
|
+
" uniprotStart=1,\n",
|
|
166
|
+
" uniprotEnd=346,\n",
|
|
167
|
+
" uniprotSequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',\n",
|
|
168
|
+
" modelCreatedDate='2022-06-01T00:00:00Z',\n",
|
|
169
|
+
" latestVersion=4,\n",
|
|
170
|
+
" allVersions=[1, 2, 3, 4],\n",
|
|
171
|
+
" bcifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.bcif',\n",
|
|
172
|
+
" cifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.cif',\n",
|
|
173
|
+
" pdbUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.pdb',\n",
|
|
174
|
+
" paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.png',\n",
|
|
175
|
+
" paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.json',\n",
|
|
176
|
+
" gene='CDK7',\n",
|
|
177
|
+
" sequenceChecksum='0A94BFA7DD416CEB',\n",
|
|
178
|
+
" sequenceVersionDate='1996-10-01T00:00:00Z',\n",
|
|
179
|
+
" amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv',\n",
|
|
180
|
+
" amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv',\n",
|
|
181
|
+
" amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv',\n",
|
|
182
|
+
" isReviewed=True,\n",
|
|
183
|
+
" isReferenceProteome=True),\n",
|
|
184
|
+
" bcif_file=None,\n",
|
|
185
|
+
" cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif'),\n",
|
|
186
|
+
" pdb_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.pdb'),\n",
|
|
187
|
+
" pae_image_file=None,\n",
|
|
188
|
+
" pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v4.json'),\n",
|
|
189
|
+
" am_annotations_file=None,\n",
|
|
190
|
+
" am_annotations_hg19_file=None,\n",
|
|
191
|
+
" am_annotations_hg38_file=None)]\n"
|
|
192
|
+
]
|
|
193
|
+
},
|
|
194
|
+
{
|
|
195
|
+
"name": "stderr",
|
|
196
|
+
"output_type": "stream",
|
|
197
|
+
"text": [
|
|
198
|
+
"\n"
|
|
199
|
+
]
|
|
200
|
+
}
|
|
201
|
+
],
|
|
202
|
+
"source": [
|
|
203
|
+
"summaries = [s async for s in fetch_many_async([\"A1YPR0\", \"O60481\", \"P50613\"], save_dir, what={\"pdb\", \"cif\", \"paeDoc\"})]\n",
|
|
204
|
+
"pprint(summaries)"
|
|
205
|
+
]
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
"cell_type": "code",
|
|
209
|
+
"execution_count": 9,
|
|
210
|
+
"id": "2d3595e6",
|
|
211
|
+
"metadata": {},
|
|
212
|
+
"outputs": [
|
|
213
|
+
{
|
|
214
|
+
"name": "stdout",
|
|
215
|
+
"output_type": "stream",
|
|
216
|
+
"text": [
|
|
217
|
+
"total 4.2M\n",
|
|
218
|
+
"4.0K A1YPR0.json\n",
|
|
219
|
+
"548K AF-A1YPR0-F1-model_v4.cif\n",
|
|
220
|
+
"392K AF-A1YPR0-F1-model_v4.pdb\n",
|
|
221
|
+
"1.1M AF-A1YPR0-F1-predicted_aligned_error_v4.json\n",
|
|
222
|
+
"408K AF-O60481-F1-model_v4.cif\n",
|
|
223
|
+
"292K AF-O60481-F1-model_v4.pdb\n",
|
|
224
|
+
"632K AF-O60481-F1-predicted_aligned_error_v4.json\n",
|
|
225
|
+
"320K AF-P50613-F1-model_v4.cif\n",
|
|
226
|
+
"224K AF-P50613-F1-model_v4.pdb\n",
|
|
227
|
+
"280K AF-P50613-F1-predicted_aligned_error_v4.json\n",
|
|
228
|
+
"4.0K O60481.json\n",
|
|
229
|
+
"4.0K P50613.json\n"
|
|
230
|
+
]
|
|
231
|
+
}
|
|
232
|
+
],
|
|
233
|
+
"source": [
|
|
234
|
+
"!ls -sh {save_dir}"
|
|
235
|
+
]
|
|
236
|
+
},
|
|
237
|
+
{
|
|
238
|
+
"cell_type": "markdown",
|
|
239
|
+
"id": "a43edd87",
|
|
240
|
+
"metadata": {},
|
|
241
|
+
"source": [
|
|
242
|
+
"## Filter AlphaFold structure files on confidence\n",
|
|
243
|
+
"\n",
|
|
244
|
+
"Filter AlphaFold mmcif/PDB files by confidence (pLDDT). Passed files are written with residues below the threshold removed."
|
|
245
|
+
]
|
|
246
|
+
},
|
|
247
|
+
{
|
|
248
|
+
"cell_type": "code",
|
|
249
|
+
"execution_count": 10,
|
|
250
|
+
"id": "cc96c63a",
|
|
251
|
+
"metadata": {},
|
|
252
|
+
"outputs": [],
|
|
253
|
+
"source": [
|
|
254
|
+
"from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence"
|
|
255
|
+
]
|
|
256
|
+
},
|
|
257
|
+
{
|
|
258
|
+
"cell_type": "markdown",
|
|
259
|
+
"id": "724141d4",
|
|
260
|
+
"metadata": {},
|
|
261
|
+
"source": [
|
|
262
|
+
"Take one of the downloaded files"
|
|
263
|
+
]
|
|
264
|
+
},
|
|
265
|
+
{
|
|
266
|
+
"cell_type": "code",
|
|
267
|
+
"execution_count": 12,
|
|
268
|
+
"id": "73a61cf6",
|
|
269
|
+
"metadata": {},
|
|
270
|
+
"outputs": [
|
|
271
|
+
{
|
|
272
|
+
"data": {
|
|
273
|
+
"text/plain": [
|
|
274
|
+
"[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
|
|
275
|
+
" PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
|
|
276
|
+
" PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]"
|
|
277
|
+
]
|
|
278
|
+
},
|
|
279
|
+
"execution_count": 12,
|
|
280
|
+
"metadata": {},
|
|
281
|
+
"output_type": "execute_result"
|
|
282
|
+
}
|
|
283
|
+
],
|
|
284
|
+
"source": [
|
|
285
|
+
"input_files = [entry.cif_file for entry in summaries if entry.cif_file is not None]\n",
|
|
286
|
+
"input_files"
|
|
287
|
+
]
|
|
288
|
+
},
|
|
289
|
+
{
|
|
290
|
+
"cell_type": "markdown",
|
|
291
|
+
"id": "da8f2f67",
|
|
292
|
+
"metadata": {},
|
|
293
|
+
"source": [
|
|
294
|
+
"We only write a filtered cif file when the input file has between 100 and 1000 residues with a pLDDT score above 80."
|
|
295
|
+
]
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"cell_type": "code",
|
|
299
|
+
"execution_count": null,
|
|
300
|
+
"id": "fbfdf472",
|
|
301
|
+
"metadata": {},
|
|
302
|
+
"outputs": [],
|
|
303
|
+
"source": [
|
|
304
|
+
"query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
|
|
305
|
+
]
|
|
306
|
+
},
|
|
307
|
+
{
|
|
308
|
+
"cell_type": "code",
|
|
309
|
+
"execution_count": 14,
|
|
310
|
+
"id": "152aec9a",
|
|
311
|
+
"metadata": {},
|
|
312
|
+
"outputs": [],
|
|
313
|
+
"source": [
|
|
314
|
+
"output_dir = Path(\"./filtered\")\n",
|
|
315
|
+
"output_dir.mkdir(exist_ok=True)\n",
|
|
316
|
+
"result = filter_files_on_confidence(input_files, query, output_dir)"
|
|
317
|
+
]
|
|
318
|
+
},
|
|
319
|
+
{
|
|
320
|
+
"cell_type": "code",
|
|
321
|
+
"execution_count": null,
|
|
322
|
+
"id": "6a6f8e3f",
|
|
323
|
+
"metadata": {},
|
|
324
|
+
"outputs": [
|
|
325
|
+
{
|
|
326
|
+
"data": {
|
|
327
|
+
"text/plain": [
|
|
328
|
+
"[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),\n",
|
|
329
|
+
" ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),\n",
|
|
330
|
+
" ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]"
|
|
331
|
+
]
|
|
332
|
+
},
|
|
333
|
+
"execution_count": 17,
|
|
334
|
+
"metadata": {},
|
|
335
|
+
"output_type": "execute_result"
|
|
336
|
+
}
|
|
337
|
+
],
|
|
338
|
+
"source": [
|
|
339
|
+
"list(\n",
|
|
340
|
+
" filter_files_on_confidence(\n",
|
|
341
|
+
" input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
|
|
342
|
+
" )\n",
|
|
343
|
+
")"
|
|
344
|
+
]
|
|
345
|
+
},
|
|
346
|
+
{
|
|
347
|
+
"cell_type": "markdown",
|
|
348
|
+
"id": "0fe1e388",
|
|
349
|
+
"metadata": {},
|
|
350
|
+
"source": [
|
|
351
|
+
"2 files have passed, but 1 file only has 76 high confidence residues so it is discarded."
|
|
352
|
+
]
|
|
353
|
+
},
|
|
354
|
+
{
|
|
355
|
+
"cell_type": "code",
|
|
356
|
+
"execution_count": null,
|
|
357
|
+
"id": "83ffc09b",
|
|
358
|
+
"metadata": {},
|
|
359
|
+
"outputs": [],
|
|
360
|
+
"source": []
|
|
361
|
+
}
|
|
362
|
+
],
|
|
363
|
+
"metadata": {
|
|
364
|
+
"kernelspec": {
|
|
365
|
+
"display_name": "protein-quest",
|
|
366
|
+
"language": "python",
|
|
367
|
+
"name": "python3"
|
|
368
|
+
},
|
|
369
|
+
"language_info": {
|
|
370
|
+
"codemirror_mode": {
|
|
371
|
+
"name": "ipython",
|
|
372
|
+
"version": 3
|
|
373
|
+
},
|
|
374
|
+
"file_extension": ".py",
|
|
375
|
+
"mimetype": "text/x-python",
|
|
376
|
+
"name": "python",
|
|
377
|
+
"nbconvert_exporter": "python",
|
|
378
|
+
"pygments_lexer": "ipython3",
|
|
379
|
+
"version": "3.13.2"
|
|
380
|
+
}
|
|
381
|
+
},
|
|
382
|
+
"nbformat": 4,
|
|
383
|
+
"nbformat_minor": 5
|
|
384
|
+
}
|