protein-quest 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. {protein_quest-0.3.0 → protein_quest-0.3.2}/.github/workflows/ci.yml +16 -1
  2. {protein_quest-0.3.0 → protein_quest-0.3.2}/CITATION.cff +1 -2
  3. {protein_quest-0.3.0 → protein_quest-0.3.2}/CONTRIBUTING.md +15 -1
  4. {protein_quest-0.3.0 → protein_quest-0.3.2}/PKG-INFO +21 -11
  5. {protein_quest-0.3.0 → protein_quest-0.3.2}/README.md +19 -6
  6. protein_quest-0.3.2/docs/notebooks/.gitignore +4 -0
  7. protein_quest-0.3.2/docs/notebooks/alphafold.ipynb +384 -0
  8. protein_quest-0.3.2/docs/notebooks/index.md +3 -0
  9. protein_quest-0.3.2/docs/notebooks/pdbe.ipynb +278 -0
  10. protein_quest-0.3.2/docs/notebooks/uniprot.ipynb +308 -0
  11. {protein_quest-0.3.0 → protein_quest-0.3.2}/mkdocs.yml +13 -4
  12. {protein_quest-0.3.0 → protein_quest-0.3.2}/pyproject.toml +12 -16
  13. protein_quest-0.3.2/src/protein_quest/__version__.py +2 -0
  14. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/confidence.py +44 -17
  15. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/entry_summary.py +11 -9
  16. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/fetch.py +37 -63
  17. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/cli.py +187 -30
  18. protein_quest-0.3.2/src/protein_quest/converter.py +45 -0
  19. protein_quest-0.3.2/src/protein_quest/filters.py +150 -0
  20. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/go.py +1 -4
  21. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/mcp_server.py +8 -5
  22. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/parallel.py +37 -1
  23. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/pdbe/fetch.py +15 -1
  24. protein_quest-0.3.2/src/protein_quest/pdbe/io.py +281 -0
  25. protein_quest-0.3.2/src/protein_quest/ss.py +264 -0
  26. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/taxonomy.py +13 -3
  27. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/utils.py +65 -3
  28. protein_quest-0.3.2/tests/alphafold/test_confidence.py +155 -0
  29. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/test_entry_summary.py +1 -4
  30. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/test_fetch.py +1 -1
  31. protein_quest-0.3.2/tests/fixtures/3JRS_B2A.cif.gz +0 -0
  32. protein_quest-0.3.2/tests/pdbe/test_fetch.py +29 -0
  33. protein_quest-0.3.2/tests/pdbe/test_io.py +142 -0
  34. protein_quest-0.3.2/tests/test_ss.py +227 -0
  35. protein_quest-0.3.2/tests/test_utils.py +31 -0
  36. {protein_quest-0.3.0 → protein_quest-0.3.2}/uv.lock +76 -695
  37. protein_quest-0.3.0/src/protein_quest/__version__.py +0 -1
  38. protein_quest-0.3.0/src/protein_quest/filters.py +0 -107
  39. protein_quest-0.3.0/src/protein_quest/pdbe/io.py +0 -185
  40. protein_quest-0.3.0/tests/alphafold/test_confidence.py +0 -63
  41. protein_quest-0.3.0/tests/pdbe/test_fetch.py +0 -17
  42. protein_quest-0.3.0/tests/pdbe/test_io.py +0 -81
  43. {protein_quest-0.3.0 → protein_quest-0.3.2}/.github/workflows/pages.yml +0 -0
  44. {protein_quest-0.3.0 → protein_quest-0.3.2}/.github/workflows/pypi-publish.yml +0 -0
  45. {protein_quest-0.3.0 → protein_quest-0.3.2}/.gitignore +0 -0
  46. {protein_quest-0.3.0 → protein_quest-0.3.2}/.vscode/extensions.json +0 -0
  47. {protein_quest-0.3.0 → protein_quest-0.3.2}/CODE_OF_CONDUCT.md +0 -0
  48. {protein_quest-0.3.0 → protein_quest-0.3.2}/LICENSE +0 -0
  49. {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/CONTRIBUTING.md +0 -0
  50. {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/cli_doc_hook.py +0 -0
  51. {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/index.md +0 -0
  52. {protein_quest-0.3.0 → protein_quest-0.3.2}/docs/protein-quest-mcp.png +0 -0
  53. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/__init__.py +0 -0
  54. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/alphafold/__init__.py +0 -0
  55. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/emdb.py +0 -0
  56. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/pdbe/__init__.py +0 -0
  57. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/py.typed +0 -0
  58. {protein_quest-0.3.0 → protein_quest-0.3.2}/src/protein_quest/uniprot.py +0 -0
  59. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
  60. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
  61. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
  62. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
  63. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
  64. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
  65. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
  66. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
  67. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
  68. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
  69. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
  70. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/pdbe/fixtures/2y29.cif +0 -0
  71. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_cli.py +0 -0
  72. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_emdb.py +0 -0
  73. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_go.py +0 -0
  74. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_mcp.py +0 -0
  75. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_taxonomy.py +0 -0
  76. {protein_quest-0.3.0 → protein_quest-0.3.2}/tests/test_uniprot.py +0 -0
@@ -3,7 +3,7 @@ name: CI
3
3
  on:
4
4
  push:
5
5
  branches:
6
- - main
6
+ - main
7
7
  pull_request:
8
8
 
9
9
  concurrency:
@@ -70,3 +70,18 @@ jobs:
70
70
  run: uv sync --locked --dev --extra mcp
71
71
  - name: Run type checkers
72
72
  run: uv run pyrefly check src tests
73
+ typing-docs:
74
+ name: typing-docs
75
+ runs-on: ubuntu-latest
76
+ steps:
77
+ - uses: actions/checkout@v4
78
+ - name: Install uv
79
+ uses: astral-sh/setup-uv@v6
80
+ - name: Install the project
81
+ run: uv sync --group docs-type
82
+ - name: Convert notebooks to Python scripts
83
+ run: |
84
+ find docs/ -name "*.ipynb" -exec uv run --group docs-type marimo convert {} -o {}.py \;
85
+ - name: Run type checkers on docs
86
+ run: uv run --group docs-type pyrefly check docs/notebooks/*.ipynb.py
87
+
@@ -23,5 +23,4 @@ repository-code: https://github.com/haddocking/protein-quest
23
23
  identifiers:
24
24
  - description: Latest version of software
25
25
  type: doi
26
- # TODO update once release has been made
27
- value: 10.5281/zenodo.15632658
26
+ value: 10.5281/zenodo.16941288
@@ -82,9 +82,23 @@ uv run mkdocs build
82
82
  python3 -m http.server -d site
83
83
  ```
84
84
 
85
+ <details>
86
+ <summary>Type checking notebooks</summary>
87
+
88
+ [Pyrefly](https://pyrefly.org/) does not support notebooks yet, so we need to convert them to python scripts and then run pyrefly on them.
89
+
90
+ ```shell
91
+ find docs/ -name "*.ipynb" -exec uv run --group docs-type marimo convert {} -o {}.py \;
92
+ uv run --group docs-type pyrefly check docs/notebooks/*.ipynb.py
93
+ rm docs/notebooks/*.ipynb.py
94
+ ```
95
+
96
+ </details>
97
+
98
+
85
99
  ## Contributing to tests
86
100
 
87
- The code coverage are stored at https://app.codacy.com/gh/haddocking/protein-quest/coverage .
101
+ The code coverage is stored at [https://app.codacy.com/gh/haddocking/protein-quest/coverage](https://app.codacy.com/gh/haddocking/protein-quest/coverage) .
88
102
 
89
103
  The search functions of the protein-quest package talk to web services on the Internet.
90
104
  To have fast tests we use [pytest-recording](https://github.com/kiwicom/pytest-recording) to record and replay HTTP interactions.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -13,19 +13,16 @@ Requires-Dist: aiohttp-retry>=2.9.1
13
13
  Requires-Dist: aiohttp[speedups]>=3.11.18
14
14
  Requires-Dist: aiopath>=0.7.7
15
15
  Requires-Dist: attrs>=25.3.0
16
- Requires-Dist: bokeh>=3.7.3
17
16
  Requires-Dist: cattrs[orjson]>=24.1.3
18
17
  Requires-Dist: dask>=2025.5.1
19
18
  Requires-Dist: distributed>=2025.5.1
20
19
  Requires-Dist: gemmi>=0.7.3
21
- Requires-Dist: molviewspec>=1.6.0
22
- Requires-Dist: pandas>=2.3.0
23
- Requires-Dist: platformdirs>=4.3.8
24
20
  Requires-Dist: psutil>=7.0.0
25
21
  Requires-Dist: rich-argparse>=1.7.1
26
22
  Requires-Dist: rich>=14.0.0
27
23
  Requires-Dist: sparqlwrapper>=2.0.0
28
24
  Requires-Dist: tqdm>=4.67.1
25
+ Requires-Dist: yarl>=1.20.1
29
26
  Provides-Extra: mcp
30
27
  Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
31
28
  Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
@@ -37,8 +34,7 @@ Description-Content-Type: text/markdown
37
34
  [![CI](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml/badge.svg)](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml)
38
35
  [![Research Software Directory Badge](https://img.shields.io/badge/rsd-00a3e3.svg)](https://www.research-software.nl/software/protein-quest)
39
36
  [![PyPI](https://img.shields.io/pypi/v/protein-quest)](https://pypi.org/project/protein-quest/)
40
- <!-- TODO replace with correct zenodo id -->
41
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15632658.svg)](https://doi.org/10.5281/zenodo.15632658)
37
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.16941288.svg)](https://doi.org/10.5281/zenodo.16941288)
42
38
  [![Codacy Badge](https://app.codacy.com/project/badge/Coverage/7a3f3f1fe64640d583a5e50fe7ba828e)](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)
43
39
 
44
40
  Python package to search/retrieve/filter proteins and protein structures.
@@ -63,9 +59,11 @@ graph TB;
63
59
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
64
60
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
65
61
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
66
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
67
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
68
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
62
+ fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
63
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
64
+ fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
65
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
66
+ residuefilter --> |mmcif_files| ssfilter
69
67
  classDef dashedBorder stroke-dasharray: 5 5;
70
68
  goterm:::dashedBorder
71
69
  taxonomy:::dashedBorder
@@ -90,7 +88,7 @@ pip install git+https://github.com/haddocking/protein-quest.git
90
88
 
91
89
  The main entry point is the `protein-quest` command line tool which has multiple subcommands to perform actions.
92
90
 
93
- To use programmaticly, see [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
91
+ To use programmatically, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
94
92
 
95
93
  ### Search Uniprot accessions
96
94
 
@@ -179,6 +177,18 @@ protein-quest filter residue \
179
177
  ./filtered-chains ./filtered
180
178
  ```
181
179
 
180
+ ### To filter on secondary structure
181
+
182
+ To keep only structures that are mostly alpha helices and have no beta sheets.
183
+
184
+ ```shell
185
+ protein-quest filter secondary-structure \
186
+ --ratio-min-helix-residues 0.5 \
187
+ --ratio-max-sheet-residues 0.0 \
188
+ --write-stats filtered-ss/stats.csv \
189
+ ./filtered-chains ./filtered-ss
190
+ ```
191
+
182
192
  ### Search Taxonomy
183
193
 
184
194
  ```shell
@@ -4,8 +4,7 @@
4
4
  [![CI](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml/badge.svg)](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml)
5
5
  [![Research Software Directory Badge](https://img.shields.io/badge/rsd-00a3e3.svg)](https://www.research-software.nl/software/protein-quest)
6
6
  [![PyPI](https://img.shields.io/pypi/v/protein-quest)](https://pypi.org/project/protein-quest/)
7
- <!-- TODO replace with correct zenodo id -->
8
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15632658.svg)](https://doi.org/10.5281/zenodo.15632658)
7
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.16941288.svg)](https://doi.org/10.5281/zenodo.16941288)
9
8
  [![Codacy Badge](https://app.codacy.com/project/badge/Coverage/7a3f3f1fe64640d583a5e50fe7ba828e)](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)
10
9
 
11
10
  Python package to search/retrieve/filter proteins and protein structures.
@@ -30,9 +29,11 @@ graph TB;
30
29
  searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
31
30
  searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
32
31
  searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
33
- fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
34
- chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
35
- fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
32
+ fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
33
+ chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
34
+ fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
35
+ confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
36
+ residuefilter --> |mmcif_files| ssfilter
36
37
  classDef dashedBorder stroke-dasharray: 5 5;
37
38
  goterm:::dashedBorder
38
39
  taxonomy:::dashedBorder
@@ -57,7 +58,7 @@ pip install git+https://github.com/haddocking/protein-quest.git
57
58
 
58
59
  The main entry point is the `protein-quest` command line tool which has multiple subcommands to perform actions.
59
60
 
60
- To use programmaticly, see [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
61
+ To use programmatically, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
61
62
 
62
63
  ### Search Uniprot accessions
63
64
 
@@ -146,6 +147,18 @@ protein-quest filter residue \
146
147
  ./filtered-chains ./filtered
147
148
  ```
148
149
 
150
+ ### To filter on secondary structure
151
+
152
+ To keep only structures that are mostly alpha helices and have no beta sheets.
153
+
154
+ ```shell
155
+ protein-quest filter secondary-structure \
156
+ --ratio-min-helix-residues 0.5 \
157
+ --ratio-max-sheet-residues 0.0 \
158
+ --write-stats filtered-ss/stats.csv \
159
+ ./filtered-chains ./filtered-ss
160
+ ```
161
+
149
162
  ### Search Taxonomy
150
163
 
151
164
  ```shell
@@ -0,0 +1,4 @@
1
+ pdb_files/
2
+ alphafold_files/
3
+ filtered/
4
+ session/
@@ -0,0 +1,384 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "24b1926c",
6
+ "metadata": {},
7
+ "source": [
8
+ "# AlphaFold\n",
9
+ "\n",
10
+ "You can download and filter AlphaFold files on confidence."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "681ba946",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "# Generic imports\n",
21
+ "import logging\n",
22
+ "from pathlib import Path\n",
23
+ "from pprint import pprint\n",
24
+ "\n",
25
+ "logging.basicConfig(level=logging.WARNING)\n",
26
+ "# Set to WARNING to see only warnings\n",
27
+ "# Set to INFO to see sparql queries\n",
28
+ "# Set to DEBUG to see raw results"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "id": "4959258c",
34
+ "metadata": {},
35
+ "source": [
36
+ "\n",
37
+ "## Download AlphaFold files"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 2,
43
+ "id": "81e449db",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from protein_quest.alphafold.fetch import fetch_many_async"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "id": "5c2e6ee3",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "save_dir = Path(\"alphafold_files\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "id": "f38991cf",
63
+ "metadata": {},
64
+ "source": [
65
+ "To download the summary, the cif, the predicted aligned error document (paeDoc) and the pdb file for 3 AlphaFold entries given their UniProt accessions.\n"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 8,
71
+ "id": "e32b474a",
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "name": "stderr",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 8.07it/s]\n",
79
+ "Downloading AlphaFold files: 100%|██████████| 9/9 [00:00<00:00, 55.82it/s]"
80
+ ]
81
+ },
82
+ {
83
+ "name": "stdout",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "[AlphaFoldEntry(uniprot_acc='A1YPR0',\n",
87
+ " summary=EntrySummary(entryId='AF-A1YPR0-F1',\n",
88
+ " uniprotAccession='A1YPR0',\n",
89
+ " uniprotId='ZBT7C_HUMAN',\n",
90
+ " uniprotDescription='Zinc finger and BTB '\n",
91
+ " 'domain-containing '\n",
92
+ " 'protein 7C',\n",
93
+ " taxId=9606,\n",
94
+ " organismScientificName='Homo sapiens',\n",
95
+ " uniprotStart=1,\n",
96
+ " uniprotEnd=619,\n",
97
+ " uniprotSequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',\n",
98
+ " modelCreatedDate='2022-06-01T00:00:00Z',\n",
99
+ " latestVersion=4,\n",
100
+ " allVersions=[1, 2, 3, 4],\n",
101
+ " bcifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.bcif',\n",
102
+ " cifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.cif',\n",
103
+ " pdbUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.pdb',\n",
104
+ " paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.png',\n",
105
+ " paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.json',\n",
106
+ " gene='ZBTB7C',\n",
107
+ " sequenceChecksum='73D82A34502B55BF',\n",
108
+ " sequenceVersionDate='2007-02-06T00:00:00Z',\n",
109
+ " amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv',\n",
110
+ " amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv',\n",
111
+ " amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv',\n",
112
+ " isReviewed=True,\n",
113
+ " isReferenceProteome=True),\n",
114
+ " bcif_file=None,\n",
115
+ " cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
116
+ " pdb_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.pdb'),\n",
117
+ " pae_image_file=None,\n",
118
+ " pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v4.json'),\n",
119
+ " am_annotations_file=None,\n",
120
+ " am_annotations_hg19_file=None,\n",
121
+ " am_annotations_hg38_file=None),\n",
122
+ " AlphaFoldEntry(uniprot_acc='O60481',\n",
123
+ " summary=EntrySummary(entryId='AF-O60481-F1',\n",
124
+ " uniprotAccession='O60481',\n",
125
+ " uniprotId='ZIC3_HUMAN',\n",
126
+ " uniprotDescription='Zinc finger protein '\n",
127
+ " 'ZIC 3',\n",
128
+ " taxId=9606,\n",
129
+ " organismScientificName='Homo sapiens',\n",
130
+ " uniprotStart=1,\n",
131
+ " uniprotEnd=467,\n",
132
+ " uniprotSequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',\n",
133
+ " modelCreatedDate='2022-06-01T00:00:00Z',\n",
134
+ " latestVersion=4,\n",
135
+ " allVersions=[1, 2, 3, 4],\n",
136
+ " bcifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.bcif',\n",
137
+ " cifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.cif',\n",
138
+ " pdbUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.pdb',\n",
139
+ " paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.png',\n",
140
+ " paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.json',\n",
141
+ " gene='ZIC3',\n",
142
+ " sequenceChecksum='3150CF13C0679568',\n",
143
+ " sequenceVersionDate='1998-08-01T00:00:00Z',\n",
144
+ " amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv',\n",
145
+ " amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv',\n",
146
+ " amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv',\n",
147
+ " isReviewed=True,\n",
148
+ " isReferenceProteome=True),\n",
149
+ " bcif_file=None,\n",
150
+ " cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
151
+ " pdb_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.pdb'),\n",
152
+ " pae_image_file=None,\n",
153
+ " pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v4.json'),\n",
154
+ " am_annotations_file=None,\n",
155
+ " am_annotations_hg19_file=None,\n",
156
+ " am_annotations_hg38_file=None),\n",
157
+ " AlphaFoldEntry(uniprot_acc='P50613',\n",
158
+ " summary=EntrySummary(entryId='AF-P50613-F1',\n",
159
+ " uniprotAccession='P50613',\n",
160
+ " uniprotId='CDK7_HUMAN',\n",
161
+ " uniprotDescription='Cyclin-dependent '\n",
162
+ " 'kinase 7',\n",
163
+ " taxId=9606,\n",
164
+ " organismScientificName='Homo sapiens',\n",
165
+ " uniprotStart=1,\n",
166
+ " uniprotEnd=346,\n",
167
+ " uniprotSequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',\n",
168
+ " modelCreatedDate='2022-06-01T00:00:00Z',\n",
169
+ " latestVersion=4,\n",
170
+ " allVersions=[1, 2, 3, 4],\n",
171
+ " bcifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.bcif',\n",
172
+ " cifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.cif',\n",
173
+ " pdbUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.pdb',\n",
174
+ " paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.png',\n",
175
+ " paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.json',\n",
176
+ " gene='CDK7',\n",
177
+ " sequenceChecksum='0A94BFA7DD416CEB',\n",
178
+ " sequenceVersionDate='1996-10-01T00:00:00Z',\n",
179
+ " amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv',\n",
180
+ " amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv',\n",
181
+ " amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv',\n",
182
+ " isReviewed=True,\n",
183
+ " isReferenceProteome=True),\n",
184
+ " bcif_file=None,\n",
185
+ " cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif'),\n",
186
+ " pdb_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.pdb'),\n",
187
+ " pae_image_file=None,\n",
188
+ " pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v4.json'),\n",
189
+ " am_annotations_file=None,\n",
190
+ " am_annotations_hg19_file=None,\n",
191
+ " am_annotations_hg38_file=None)]\n"
192
+ ]
193
+ },
194
+ {
195
+ "name": "stderr",
196
+ "output_type": "stream",
197
+ "text": [
198
+ "\n"
199
+ ]
200
+ }
201
+ ],
202
+ "source": [
203
+ "summaries = [s async for s in fetch_many_async([\"A1YPR0\", \"O60481\", \"P50613\"], save_dir, what={\"pdb\", \"cif\", \"paeDoc\"})]\n",
204
+ "pprint(summaries)"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 9,
210
+ "id": "2d3595e6",
211
+ "metadata": {},
212
+ "outputs": [
213
+ {
214
+ "name": "stdout",
215
+ "output_type": "stream",
216
+ "text": [
217
+ "total 4.2M\n",
218
+ "4.0K A1YPR0.json\n",
219
+ "548K AF-A1YPR0-F1-model_v4.cif\n",
220
+ "392K AF-A1YPR0-F1-model_v4.pdb\n",
221
+ "1.1M AF-A1YPR0-F1-predicted_aligned_error_v4.json\n",
222
+ "408K AF-O60481-F1-model_v4.cif\n",
223
+ "292K AF-O60481-F1-model_v4.pdb\n",
224
+ "632K AF-O60481-F1-predicted_aligned_error_v4.json\n",
225
+ "320K AF-P50613-F1-model_v4.cif\n",
226
+ "224K AF-P50613-F1-model_v4.pdb\n",
227
+ "280K AF-P50613-F1-predicted_aligned_error_v4.json\n",
228
+ "4.0K O60481.json\n",
229
+ "4.0K P50613.json\n"
230
+ ]
231
+ }
232
+ ],
233
+ "source": [
234
+ "!ls -sh {save_dir}"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "markdown",
239
+ "id": "a43edd87",
240
+ "metadata": {},
241
+ "source": [
242
+ "## Filter AlphaFold structure files on confidence\n",
243
+ "\n",
244
+ "Filter AlphaFold mmcif/PDB files by confidence (pLDDT). Files that pass are written with the residues below the threshold removed."
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 10,
250
+ "id": "cc96c63a",
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "markdown",
259
+ "id": "724141d4",
260
+ "metadata": {},
261
+ "source": [
262
+ "Take one of the downloaded files"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 12,
268
+ "id": "73a61cf6",
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "data": {
273
+ "text/plain": [
274
+ "[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),\n",
275
+ " PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),\n",
276
+ " PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]"
277
+ ]
278
+ },
279
+ "execution_count": 12,
280
+ "metadata": {},
281
+ "output_type": "execute_result"
282
+ }
283
+ ],
284
+ "source": [
285
+ "input_files = [entry.cif_file for entry in summaries if entry.cif_file is not None]\n",
286
+ "input_files"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "id": "da8f2f67",
292
+ "metadata": {},
293
+ "source": [
294
+ "We only write a filtered cif file when the input file has between 100 and 1000 residues with a pLDDT score above 80."
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "id": "fbfdf472",
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": [
304
+ "query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 14,
310
+ "id": "152aec9a",
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "output_dir = Path(\"./filtered\")\n",
315
+ "output_dir.mkdir(exist_ok=True)\n",
316
+ "result = filter_files_on_confidence(input_files, query, output_dir)"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "id": "6a6f8e3f",
323
+ "metadata": {},
324
+ "outputs": [
325
+ {
326
+ "data": {
327
+ "text/plain": [
328
+ "[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),\n",
329
+ " ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),\n",
330
+ " ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]"
331
+ ]
332
+ },
333
+ "execution_count": 17,
334
+ "metadata": {},
335
+ "output_type": "execute_result"
336
+ }
337
+ ],
338
+ "source": [
339
+ "list(\n",
340
+ " filter_files_on_confidence(\n",
341
+ " input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
342
+ " )\n",
343
+ ")"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "markdown",
348
+ "id": "0fe1e388",
349
+ "metadata": {},
350
+ "source": [
351
+ "2 files have passed, but 1 file has only 76 high-confidence residues, so it is discarded."
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": null,
357
+ "id": "83ffc09b",
358
+ "metadata": {},
359
+ "outputs": [],
360
+ "source": []
361
+ }
362
+ ],
363
+ "metadata": {
364
+ "kernelspec": {
365
+ "display_name": "protein-quest",
366
+ "language": "python",
367
+ "name": "python3"
368
+ },
369
+ "language_info": {
370
+ "codemirror_mode": {
371
+ "name": "ipython",
372
+ "version": 3
373
+ },
374
+ "file_extension": ".py",
375
+ "mimetype": "text/x-python",
376
+ "name": "python",
377
+ "nbconvert_exporter": "python",
378
+ "pygments_lexer": "ipython3",
379
+ "version": "3.13.2"
380
+ }
381
+ },
382
+ "nbformat": 4,
383
+ "nbformat_minor": 5
384
+ }
@@ -0,0 +1,3 @@
1
+ # Example notebooks
2
+
3
+ The Jupyter notebooks show how to use the protein-quest package via its API.