protein-quest 0.3.1__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- {protein_quest-0.3.1 → protein_quest-0.4.0}/PKG-INFO +48 -4
- {protein_quest-0.3.1 → protein_quest-0.4.0}/README.md +47 -3
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/alphafold.ipynb +3 -3
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/uniprot.ipynb +95 -2
- {protein_quest-0.3.1 → protein_quest-0.4.0}/pyproject.toml +1 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/__version__.py +1 -1
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/confidence.py +42 -15
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/fetch.py +2 -4
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/cli.py +292 -14
- protein_quest-0.4.0/src/protein_quest/converter.py +46 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/filters.py +39 -7
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/go.py +1 -4
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/mcp_server.py +14 -1
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/pdbe/io.py +122 -41
- protein_quest-0.4.0/src/protein_quest/ss.py +284 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/taxonomy.py +1 -3
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/uniprot.py +157 -4
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/utils.py +28 -1
- protein_quest-0.4.0/tests/alphafold/test_confidence.py +155 -0
- protein_quest-0.4.0/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +384 -0
- protein_quest-0.4.0/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +382 -0
- protein_quest-0.4.0/tests/fixtures/3JRS_B2A.cif.gz +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/test_io.py +39 -4
- protein_quest-0.4.0/tests/test_converter.py +23 -0
- protein_quest-0.4.0/tests/test_ss.py +233 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_uniprot.py +65 -0
- protein_quest-0.4.0/tests/test_utils.py +31 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/uv.lock +36 -0
- protein_quest-0.3.1/tests/alphafold/test_confidence.py +0 -63
- {protein_quest-0.3.1 → protein_quest-0.4.0}/.github/workflows/ci.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/.github/workflows/pages.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/.github/workflows/pypi-publish.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/.gitignore +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/.vscode/extensions.json +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/CITATION.cff +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/CODE_OF_CONDUCT.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/CONTRIBUTING.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/LICENSE +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/CONTRIBUTING.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/cli_doc_hook.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/index.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/.gitignore +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/index.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/notebooks/pdbe.ipynb +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/docs/protein-quest-mcp.png +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/mkdocs.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/__init__.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/__init__.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/alphafold/entry_summary.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/emdb.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/parallel.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/pdbe/__init__.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/pdbe/fetch.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/src/protein_quest/py.typed +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/test_entry_summary.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/alphafold/test_fetch.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/fixtures/2y29.cif +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/pdbe/test_fetch.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_cli.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_emdb.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_go.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_mcp.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.4.0}/tests/test_taxonomy.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -56,17 +56,23 @@ graph TB;
|
|
|
56
56
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
57
57
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
58
58
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
59
|
+
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
60
|
+
searchcomplexes[/Search complexes/]
|
|
59
61
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
60
62
|
searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
|
|
61
63
|
searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
|
|
62
|
-
fetchpdbe -->|
|
|
63
|
-
chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
|
|
64
|
-
fetchad -->|
|
|
64
|
+
fetchpdbe -->|mmcif_files| chainfilter{{Filter on chain of uniprot}}
|
|
65
|
+
chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
|
|
66
|
+
fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
|
|
67
|
+
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
68
|
+
residuefilter --> |mmcif_files| ssfilter
|
|
65
69
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
66
70
|
goterm:::dashedBorder
|
|
67
71
|
taxonomy:::dashedBorder
|
|
68
72
|
searchemdb:::dashedBorder
|
|
69
73
|
fetchemdb:::dashedBorder
|
|
74
|
+
searchintactionpartners:::dashedBorder
|
|
75
|
+
searchcomplexes:::dashedBorder
|
|
70
76
|
```
|
|
71
77
|
|
|
72
78
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -175,6 +181,18 @@ protein-quest filter residue \
|
|
|
175
181
|
./filtered-chains ./filtered
|
|
176
182
|
```
|
|
177
183
|
|
|
184
|
+
### To filter on secondary structure
|
|
185
|
+
|
|
186
|
+
To keep only structures that are mostly alpha helices and have no beta sheets:
|
|
187
|
+
|
|
188
|
+
```shell
|
|
189
|
+
protein-quest filter secondary-structure \
|
|
190
|
+
--ratio-min-helix-residues 0.5 \
|
|
191
|
+
--ratio-max-sheet-residues 0.0 \
|
|
192
|
+
--write-stats filtered-ss/stats.csv \
|
|
193
|
+
./filtered-chains ./filtered-ss
|
|
194
|
+
```
|
|
195
|
+
|
|
178
196
|
### Search Taxonomy
|
|
179
197
|
|
|
180
198
|
```shell
|
|
@@ -190,6 +208,32 @@ You can use following command to search for a Gene Ontology (GO) term.
|
|
|
190
208
|
protein-quest search go --limit 5 --aspect cellular_component apoptosome -
|
|
191
209
|
```
|
|
192
210
|
|
|
211
|
+
### Search for interaction partners
|
|
212
|
+
|
|
213
|
+
Use https://www.ebi.ac.uk/complexportal to find interaction partners of a given UniProt accession.
|
|
214
|
+
|
|
215
|
+
```shell
|
|
216
|
+
protein-quest search interaction-partners Q05471 interaction-partners-of-Q05471.txt
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
The `interaction-partners-of-Q05471.txt` file contains uniprot accessions (one per line).
|
|
220
|
+
|
|
221
|
+
### Search for complexes
|
|
222
|
+
|
|
223
|
+
Given UniProt accessions, search for macromolecular complexes at https://www.ebi.ac.uk/complexportal
|
|
224
|
+
and return the complex entries and their members.
|
|
225
|
+
|
|
226
|
+
```shell
|
|
227
|
+
echo Q05471 | protein-quest search complexes - complexes.csv
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
The `complexes.csv` file looks like:
|
|
231
|
+
|
|
232
|
+
```csv
|
|
233
|
+
query_protein,complex_id,complex_url,complex_title,members
|
|
234
|
+
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
235
|
+
```
|
|
236
|
+
|
|
193
237
|
## Model Context Protocol (MCP) server
|
|
194
238
|
|
|
195
239
|
Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
|
|
@@ -26,17 +26,23 @@ graph TB;
|
|
|
26
26
|
searchuniprot --> |uniprot_accessions|searchpdbe[/Search PDBe/]
|
|
27
27
|
searchuniprot --> |uniprot_accessions|searchaf[/Search Alphafold/]
|
|
28
28
|
searchuniprot -. uniprot_accessions .-> searchemdb[/Search EMDB/]
|
|
29
|
+
searchintactionpartners[/Search interaction partners/] -.-x |uniprot_accessions|searchuniprot
|
|
30
|
+
searchcomplexes[/Search complexes/]
|
|
29
31
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
30
32
|
searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
|
|
31
33
|
searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
|
|
32
|
-
fetchpdbe -->|
|
|
33
|
-
chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
|
|
34
|
-
fetchad -->|
|
|
34
|
+
fetchpdbe -->|mmcif_files| chainfilter{{Filter on chain of uniprot}}
|
|
35
|
+
chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
|
|
36
|
+
fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
|
|
37
|
+
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
38
|
+
residuefilter --> |mmcif_files| ssfilter
|
|
35
39
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
36
40
|
goterm:::dashedBorder
|
|
37
41
|
taxonomy:::dashedBorder
|
|
38
42
|
searchemdb:::dashedBorder
|
|
39
43
|
fetchemdb:::dashedBorder
|
|
44
|
+
searchintactionpartners:::dashedBorder
|
|
45
|
+
searchcomplexes:::dashedBorder
|
|
40
46
|
```
|
|
41
47
|
|
|
42
48
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -145,6 +151,18 @@ protein-quest filter residue \
|
|
|
145
151
|
./filtered-chains ./filtered
|
|
146
152
|
```
|
|
147
153
|
|
|
154
|
+
### To filter on secondary structure
|
|
155
|
+
|
|
156
|
+
To keep only structures that are mostly alpha helices and have no beta sheets:
|
|
157
|
+
|
|
158
|
+
```shell
|
|
159
|
+
protein-quest filter secondary-structure \
|
|
160
|
+
--ratio-min-helix-residues 0.5 \
|
|
161
|
+
--ratio-max-sheet-residues 0.0 \
|
|
162
|
+
--write-stats filtered-ss/stats.csv \
|
|
163
|
+
./filtered-chains ./filtered-ss
|
|
164
|
+
```
|
|
165
|
+
|
|
148
166
|
### Search Taxonomy
|
|
149
167
|
|
|
150
168
|
```shell
|
|
@@ -160,6 +178,32 @@ You can use following command to search for a Gene Ontology (GO) term.
|
|
|
160
178
|
protein-quest search go --limit 5 --aspect cellular_component apoptosome -
|
|
161
179
|
```
|
|
162
180
|
|
|
181
|
+
### Search for interaction partners
|
|
182
|
+
|
|
183
|
+
Use https://www.ebi.ac.uk/complexportal to find interaction partners of a given UniProt accession.
|
|
184
|
+
|
|
185
|
+
```shell
|
|
186
|
+
protein-quest search interaction-partners Q05471 interaction-partners-of-Q05471.txt
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
The `interaction-partners-of-Q05471.txt` file contains uniprot accessions (one per line).
|
|
190
|
+
|
|
191
|
+
### Search for complexes
|
|
192
|
+
|
|
193
|
+
Given UniProt accessions, search for macromolecular complexes at https://www.ebi.ac.uk/complexportal
|
|
194
|
+
and return the complex entries and their members.
|
|
195
|
+
|
|
196
|
+
```shell
|
|
197
|
+
echo Q05471 | protein-quest search complexes - complexes.csv
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
The `complexes.csv` file looks like:
|
|
201
|
+
|
|
202
|
+
```csv
|
|
203
|
+
query_protein,complex_id,complex_url,complex_title,members
|
|
204
|
+
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
205
|
+
```
|
|
206
|
+
|
|
163
207
|
## Model Context Protocol (MCP) server
|
|
164
208
|
|
|
165
209
|
Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
|
|
@@ -301,7 +301,7 @@
|
|
|
301
301
|
"metadata": {},
|
|
302
302
|
"outputs": [],
|
|
303
303
|
"source": [
|
|
304
|
-
"query = ConfidenceFilterQuery(confidence=80,
|
|
304
|
+
"query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
|
|
305
305
|
]
|
|
306
306
|
},
|
|
307
307
|
{
|
|
@@ -318,7 +318,7 @@
|
|
|
318
318
|
},
|
|
319
319
|
{
|
|
320
320
|
"cell_type": "code",
|
|
321
|
-
"execution_count":
|
|
321
|
+
"execution_count": null,
|
|
322
322
|
"id": "6a6f8e3f",
|
|
323
323
|
"metadata": {},
|
|
324
324
|
"outputs": [
|
|
@@ -338,7 +338,7 @@
|
|
|
338
338
|
"source": [
|
|
339
339
|
"list(\n",
|
|
340
340
|
" filter_files_on_confidence(\n",
|
|
341
|
-
" input_files, ConfidenceFilterQuery(confidence=80,
|
|
341
|
+
" input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
|
|
342
342
|
" )\n",
|
|
343
343
|
")"
|
|
344
344
|
]
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
},
|
|
13
13
|
{
|
|
14
14
|
"cell_type": "code",
|
|
15
|
-
"execution_count":
|
|
15
|
+
"execution_count": 1,
|
|
16
16
|
"id": "85674583",
|
|
17
17
|
"metadata": {},
|
|
18
18
|
"outputs": [],
|
|
@@ -282,6 +282,99 @@
|
|
|
282
282
|
"first_uniprot = next(iter(uniprot_accessions.items()))\n",
|
|
283
283
|
"pprint(first_uniprot)"
|
|
284
284
|
]
|
|
285
|
+
},
|
|
286
|
+
{
|
|
287
|
+
"cell_type": "markdown",
|
|
288
|
+
"id": "e32a95f8",
|
|
289
|
+
"metadata": {},
|
|
290
|
+
"source": [
|
|
291
|
+
"## Find interaction partners for uniprot entries"
|
|
292
|
+
]
|
|
293
|
+
},
|
|
294
|
+
{
|
|
295
|
+
"cell_type": "code",
|
|
296
|
+
"execution_count": 1,
|
|
297
|
+
"id": "d035c702",
|
|
298
|
+
"metadata": {},
|
|
299
|
+
"outputs": [],
|
|
300
|
+
"source": [
|
|
301
|
+
"from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes"
|
|
302
|
+
]
|
|
303
|
+
},
|
|
304
|
+
{
|
|
305
|
+
"cell_type": "code",
|
|
306
|
+
"execution_count": 2,
|
|
307
|
+
"id": "601c690a",
|
|
308
|
+
"metadata": {},
|
|
309
|
+
"outputs": [],
|
|
310
|
+
"source": [
|
|
311
|
+
"# Helicase SWR1 in yeast\n",
|
|
312
|
+
"uniprot_accession = \"Q05471\""
|
|
313
|
+
]
|
|
314
|
+
},
|
|
315
|
+
{
|
|
316
|
+
"cell_type": "code",
|
|
317
|
+
"execution_count": 3,
|
|
318
|
+
"id": "173c764d",
|
|
319
|
+
"metadata": {},
|
|
320
|
+
"outputs": [
|
|
321
|
+
{
|
|
322
|
+
"data": {
|
|
323
|
+
"text/plain": [
|
|
324
|
+
"{'Q12464': {'CPX-2122'},\n",
|
|
325
|
+
" 'P35817': {'CPX-2122'},\n",
|
|
326
|
+
" 'P80428': {'CPX-2122'},\n",
|
|
327
|
+
" 'Q12509': {'CPX-2122'},\n",
|
|
328
|
+
" 'Q03388': {'CPX-2122'},\n",
|
|
329
|
+
" 'P53201': {'CPX-2122'},\n",
|
|
330
|
+
" 'P53930': {'CPX-2122'},\n",
|
|
331
|
+
" 'P60010': {'CPX-2122'},\n",
|
|
332
|
+
" 'Q03433': {'CPX-2122'},\n",
|
|
333
|
+
" 'Q06707': {'CPX-2122'},\n",
|
|
334
|
+
" 'P38326': {'CPX-2122'},\n",
|
|
335
|
+
" 'P31376': {'CPX-2122'},\n",
|
|
336
|
+
" 'Q03940': {'CPX-2122'}}"
|
|
337
|
+
]
|
|
338
|
+
},
|
|
339
|
+
"execution_count": 3,
|
|
340
|
+
"metadata": {},
|
|
341
|
+
"output_type": "execute_result"
|
|
342
|
+
}
|
|
343
|
+
],
|
|
344
|
+
"source": [
|
|
345
|
+
"partners = search4interaction_partners(uniprot_accession, limit=100)\n",
|
|
346
|
+
"partners"
|
|
347
|
+
]
|
|
348
|
+
},
|
|
349
|
+
{
|
|
350
|
+
"cell_type": "markdown",
|
|
351
|
+
"id": "a763b6f8",
|
|
352
|
+
"metadata": {},
|
|
353
|
+
"source": [
|
|
354
|
+
"To get more information about the complex you can search for the complexes themselves with:"
|
|
355
|
+
]
|
|
356
|
+
},
|
|
357
|
+
{
|
|
358
|
+
"cell_type": "code",
|
|
359
|
+
"execution_count": 4,
|
|
360
|
+
"id": "236050ea",
|
|
361
|
+
"metadata": {},
|
|
362
|
+
"outputs": [
|
|
363
|
+
{
|
|
364
|
+
"data": {
|
|
365
|
+
"text/plain": [
|
|
366
|
+
"[ComplexPortalEntry(query_protein='Q05471', complex_id='CPX-2122', complex_url='https://www.ebi.ac.uk/complexportal/complex/CPX-2122', complex_title='Swr1 chromatin remodelling complex', members={'P35817', 'Q05471', 'Q12464', 'Q12509', 'Q06707', 'Q03433', 'P38326', 'P53201', 'Q03388', 'P53930', 'P80428', 'Q03940', 'P60010', 'P31376'})]"
|
|
367
|
+
]
|
|
368
|
+
},
|
|
369
|
+
"execution_count": 4,
|
|
370
|
+
"metadata": {},
|
|
371
|
+
"output_type": "execute_result"
|
|
372
|
+
}
|
|
373
|
+
],
|
|
374
|
+
"source": [
|
|
375
|
+
"complexes = search4macromolecular_complexes([uniprot_accession])\n",
|
|
376
|
+
"complexes"
|
|
377
|
+
]
|
|
285
378
|
}
|
|
286
379
|
],
|
|
287
380
|
"metadata": {
|
|
@@ -300,7 +393,7 @@
|
|
|
300
393
|
"name": "python",
|
|
301
394
|
"nbconvert_exporter": "python",
|
|
302
395
|
"pygments_lexer": "ipython3",
|
|
303
|
-
"version": "3.13.
|
|
396
|
+
"version": "3.13.5"
|
|
304
397
|
}
|
|
305
398
|
},
|
|
306
399
|
"nbformat": 4,
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
__version__ = "0.3.1"
|
|
1
|
+
__version__ = "0.4.0"
|
|
2
2
|
"""The version of the package."""
|
|
@@ -7,7 +7,10 @@ from pathlib import Path
|
|
|
7
7
|
|
|
8
8
|
import gemmi
|
|
9
9
|
|
|
10
|
+
from protein_quest.converter import Percentage, PositiveInt, converter
|
|
10
11
|
from protein_quest.pdbe.io import write_structure
|
|
12
|
+
from protein_quest.ss import nr_of_residues_in_total
|
|
13
|
+
from protein_quest.utils import CopyMethod, copyfile
|
|
11
14
|
|
|
12
15
|
"""
|
|
13
16
|
Methods to filter AlphaFoldDB structures on confidence scores.
|
|
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
|
|
|
73
76
|
Parameters:
|
|
74
77
|
confidence: The confidence threshold for filtering residues.
|
|
75
78
|
Residues with a pLDDT (b-factor) above this value are considered high confidence.
|
|
76
|
-
|
|
77
|
-
|
|
79
|
+
min_residues: The minimum number of high-confidence residues required to keep the structure.
|
|
80
|
+
max_residues: The maximum number of high-confidence residues allowed to keep the structure.
|
|
78
81
|
"""
|
|
79
82
|
|
|
80
|
-
confidence:
|
|
81
|
-
|
|
82
|
-
|
|
83
|
+
confidence: Percentage
|
|
84
|
+
min_residues: PositiveInt
|
|
85
|
+
max_residues: PositiveInt
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@converter.register_structure_hook
|
|
92
|
+
def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
|
|
93
|
+
result: ConfidenceFilterQuery = base_query_hook(val, _type)
|
|
94
|
+
if result.min_residues > result.max_residues:
|
|
95
|
+
msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
|
|
96
|
+
raise ValueError(msg)
|
|
97
|
+
return result
|
|
83
98
|
|
|
84
99
|
|
|
85
100
|
@dataclass
|
|
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
|
|
|
93
108
|
"""
|
|
94
109
|
|
|
95
110
|
input_file: str
|
|
96
|
-
count:
|
|
111
|
+
count: PositiveInt
|
|
97
112
|
filtered_file: Path | None = None
|
|
98
113
|
|
|
99
114
|
|
|
100
|
-
def filter_file_on_residues(
|
|
115
|
+
def filter_file_on_residues(
|
|
116
|
+
file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
|
|
117
|
+
) -> ConfidenceFilterResult:
|
|
101
118
|
"""Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
|
|
102
119
|
|
|
103
120
|
Args:
|
|
104
121
|
file: The path to the PDB file to filter.
|
|
105
122
|
query: The confidence filter query.
|
|
106
123
|
filtered_dir: The directory to save the filtered PDB file.
|
|
124
|
+
copy_method: How to copy when no residues have to be removed.
|
|
107
125
|
|
|
108
126
|
Returns:
|
|
109
127
|
result with filtered_file property set to Path where filtered PDB file is saved.
|
|
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
|
|
|
112
130
|
structure = gemmi.read_structure(str(file))
|
|
113
131
|
residues = set(find_high_confidence_residues(structure, query.confidence))
|
|
114
132
|
count = len(residues)
|
|
115
|
-
if count < query.
|
|
133
|
+
if count < query.min_residues or count > query.max_residues:
|
|
116
134
|
# Skip structure that is outside the min and max threshold
|
|
117
135
|
# just return number of high confidence residues
|
|
118
136
|
return ConfidenceFilterResult(
|
|
119
137
|
input_file=file.name,
|
|
120
138
|
count=count,
|
|
121
139
|
)
|
|
140
|
+
total_residues = nr_of_residues_in_total(structure)
|
|
122
141
|
filtered_file = filtered_dir / file.name
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
142
|
+
if count == total_residues:
|
|
143
|
+
# if no residues have to be removed then copy instead of slower gemmi writing
|
|
144
|
+
copyfile(file, filtered_file, copy_method)
|
|
145
|
+
else:
|
|
146
|
+
new_structure = filter_out_low_confidence_residues(
|
|
147
|
+
structure,
|
|
148
|
+
residues,
|
|
149
|
+
)
|
|
150
|
+
write_structure(new_structure, filtered_file)
|
|
128
151
|
return ConfidenceFilterResult(
|
|
129
152
|
input_file=file.name,
|
|
130
153
|
count=count,
|
|
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
|
|
|
133
156
|
|
|
134
157
|
|
|
135
158
|
def filter_files_on_confidence(
|
|
136
|
-
alphafold_pdb_files: list[Path],
|
|
159
|
+
alphafold_pdb_files: list[Path],
|
|
160
|
+
query: ConfidenceFilterQuery,
|
|
161
|
+
filtered_dir: Path,
|
|
162
|
+
copy_method: CopyMethod = "copy",
|
|
137
163
|
) -> Generator[ConfidenceFilterResult]:
|
|
138
164
|
"""Filter AlphaFoldDB structures based on confidence.
|
|
139
165
|
|
|
@@ -141,6 +167,7 @@ def filter_files_on_confidence(
|
|
|
141
167
|
alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
|
|
142
168
|
query: The confidence filter query containing the confidence thresholds.
|
|
143
169
|
filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
|
|
170
|
+
copy_method: How to copy when a direct copy is possible.
|
|
144
171
|
|
|
145
172
|
Yields:
|
|
146
173
|
For each mmcif/PDB file, yields whether it was filtered or not,
|
|
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
|
|
|
150
177
|
# In ../filters.py:filter_files_on_residues() we filter on number of residues on a file level
|
|
151
178
|
# here we filter on file level and inside file remove low confidence residues
|
|
152
179
|
for pdb_file in alphafold_pdb_files:
|
|
153
|
-
yield filter_file_on_residues(pdb_file, query, filtered_dir)
|
|
180
|
+
yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
|
|
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args
|
|
|
9
9
|
|
|
10
10
|
from aiohttp_retry import RetryClient
|
|
11
11
|
from aiopath import AsyncPath
|
|
12
|
-
from cattrs.preconf.orjson import make_converter
|
|
13
12
|
from tqdm.asyncio import tqdm
|
|
14
13
|
from yarl import URL
|
|
15
14
|
|
|
16
15
|
from protein_quest.alphafold.entry_summary import EntrySummary
|
|
16
|
+
from protein_quest.converter import converter
|
|
17
17
|
from protein_quest.utils import friendly_session, retrieve_files, run_async
|
|
18
18
|
|
|
19
19
|
logger = logging.getLogger(__name__)
|
|
20
|
-
|
|
21
|
-
"""cattrs converter to read AlphaFold summary JSON document."""
|
|
22
|
-
converter.register_structure_hook(URL, lambda v, _: URL(v))
|
|
20
|
+
|
|
23
21
|
|
|
24
22
|
DownloadableFormat = Literal[
|
|
25
23
|
"summary",
|