protein-quest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__version__.py +2 -1
- protein_quest/alphafold/confidence.py +44 -17
- protein_quest/alphafold/entry_summary.py +11 -9
- protein_quest/alphafold/fetch.py +37 -63
- protein_quest/cli.py +187 -30
- protein_quest/converter.py +45 -0
- protein_quest/filters.py +78 -35
- protein_quest/go.py +1 -4
- protein_quest/mcp_server.py +8 -5
- protein_quest/parallel.py +37 -1
- protein_quest/pdbe/fetch.py +15 -1
- protein_quest/pdbe/io.py +142 -46
- protein_quest/ss.py +264 -0
- protein_quest/taxonomy.py +13 -3
- protein_quest/utils.py +65 -3
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/METADATA +21 -11
- protein_quest-0.3.2.dist-info/RECORD +26 -0
- protein_quest-0.3.0.dist-info/RECORD +0 -24
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.2.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED

@@ -1 +1,2 @@
-__version__ = "0.3.0"
+__version__ = "0.3.2"
+"""The version of the package."""
protein_quest/alphafold/confidence.py
CHANGED

@@ -7,7 +7,10 @@ from pathlib import Path
 
 import gemmi
 
+from protein_quest.converter import Percentage, PositiveInt, converter
 from protein_quest.pdbe.io import write_structure
+from protein_quest.ss import nr_of_residues_in_total
+from protein_quest.utils import CopyMethod, copyfile
 
 """
 Methods to filter AlphaFoldDB structures on confidence scores.
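The `Percentage`, `PositiveInt`, and `converter` names come from the new `protein_quest/converter.py` module (+45 lines in the file list above, not included in this diff). A hypothetical sketch of what that module may contain, inferred only from how the names are used here:

```python
# Hypothetical reconstruction; protein_quest/converter.py is not part of this
# diff excerpt. Percentage and PositiveInt are assumed to be Annotated aliases
# and `converter` a shared cattrs orjson converter.
from typing import Annotated

from annotated_types import Ge, Le
from cattrs.preconf.orjson import make_converter

Percentage = Annotated[float, Ge(0), Le(100)]  # assumed bounds
PositiveInt = Annotated[int, Ge(0)]  # assumed definition

converter = make_converter()
```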
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
     Parameters:
         confidence: The confidence threshold for filtering residues.
             Residues with a pLDDT (b-factor) above this value are considered high confidence.
-
-
+        min_residues: The minimum number of high-confidence residues required to keep the structure.
+        max_residues: The maximum number of high-confidence residues required to keep the structure.
     """
 
-    confidence:
-
-
+    confidence: Percentage
+    min_residues: PositiveInt
+    max_residues: PositiveInt
+
+
+base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+@converter.register_structure_hook
+def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+    result: ConfidenceFilterQuery = base_query_hook(val, _type)
+    if result.min_residues > result.max_residues:
+        msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+        raise ValueError(msg)
+    return result
 
 
 @dataclass
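The new structure hook folds cross-field validation into cattrs structuring: the default hook builds the dataclass, then inconsistent bounds are rejected. A small usage sketch (threshold values are illustrative):

```python
from protein_quest.alphafold.confidence import ConfidenceFilterQuery
from protein_quest.converter import converter

# Valid bounds structure as usual.
query = converter.structure(
    {"confidence": 70, "min_residues": 100, "max_residues": 1000},
    ConfidenceFilterQuery,
)

# min_residues > max_residues is now rejected during structuring.
try:
    converter.structure(
        {"confidence": 70, "min_residues": 1000, "max_residues": 100},
        ConfidenceFilterQuery,
    )
except ValueError as e:
    print(e)  # min_residues 1000 cannot be larger than max_residues 100
```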
@@ -93,38 +108,46 @@ class ConfidenceFilterResult:
     """
 
     input_file: str
-    count:
+    count: PositiveInt
     filtered_file: Path | None = None
 
 
-def filter_file_on_residues(
-
+def filter_file_on_residues(
+    file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+) -> ConfidenceFilterResult:
+    """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
 
     Args:
         file: The path to the PDB file to filter.
         query: The confidence filter query.
         filtered_dir: The directory to save the filtered PDB file.
+        copy_method: How to copy when no residues have to be removed.
 
     Returns:
         result with filtered_file property set to Path where filtered PDB file is saved.
-
+        or None if structure was filtered out.
     """
     structure = gemmi.read_structure(str(file))
     residues = set(find_high_confidence_residues(structure, query.confidence))
     count = len(residues)
-    if count < query.
+    if count < query.min_residues or count > query.max_residues:
         # Skip structure that is outside the min and max threshold
         # just return number of high confidence residues
         return ConfidenceFilterResult(
             input_file=file.name,
             count=count,
         )
+    total_residues = nr_of_residues_in_total(structure)
     filtered_file = filtered_dir / file.name
-
-
-
-
-
+    if count == total_residues:
+        # if no residues have to be removed then copy instead of slower gemmi writing
+        copyfile(file, filtered_file, copy_method)
+    else:
+        new_structure = filter_out_low_confidence_residues(
+            structure,
+            residues,
+        )
+        write_structure(new_structure, filtered_file)
     return ConfidenceFilterResult(
         input_file=file.name,
         count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
 
 
 def filter_files_on_confidence(
-    alphafold_pdb_files: list[Path],
+    alphafold_pdb_files: list[Path],
+    query: ConfidenceFilterQuery,
+    filtered_dir: Path,
+    copy_method: CopyMethod = "copy",
 ) -> Generator[ConfidenceFilterResult]:
     """Filter AlphaFoldDB structures based on confidence.
 

@@ -141,6 +167,7 @@ def filter_files_on_confidence(
         alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
         query: The confidence filter query containing the confidence thresholds.
         filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+        copy_method: How to copy when a direct copy is possible.
 
     Yields:
         For each mmcif/PDB files yields whether it was filtered or not,

@@ -150,4 +177,4 @@ def filter_files_on_confidence(
     # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
     # here we filter on file level and inside file remove low confidence residues
     for pdb_file in alphafold_pdb_files:
-        yield filter_file_on_residues(pdb_file, query, filtered_dir)
+        yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
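Taken together, 0.3.2 threads a `copy_method` argument through the filter API and copies files untouched when every residue passes the confidence threshold. A hypothetical call (paths and thresholds are illustrative):

```python
from pathlib import Path

from protein_quest.alphafold.confidence import (
    ConfidenceFilterQuery,
    filter_files_on_confidence,
)

query = ConfidenceFilterQuery(confidence=70, min_residues=100, max_residues=1000)
for result in filter_files_on_confidence(
    sorted(Path("afdb").glob("*.cif")), query, Path("filtered"), copy_method="copy"
):
    # filtered_file stays None when the structure fell outside the residue bounds
    print(result.input_file, result.count, result.filtered_file)
```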
protein_quest/alphafold/entry_summary.py
CHANGED

@@ -1,12 +1,14 @@
 # ruff: noqa: N815 allow camelCase follow what api returns
 from dataclasses import dataclass
 
+from yarl import URL
+
 
 @dataclass
 class EntrySummary:
     """Dataclass representing a summary of an AlphaFold entry.
 
-    Modelled after EntrySummary in https://alphafold.ebi.ac.uk/api/openapi.json
+    Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
     """
 
     entryId: str

@@ -21,17 +23,17 @@ class EntrySummary:
     modelCreatedDate: str
     latestVersion: int
     allVersions: list[int]
-    bcifUrl:
-    cifUrl:
-    pdbUrl:
-    paeImageUrl:
-    paeDocUrl:
+    bcifUrl: URL
+    cifUrl: URL
+    pdbUrl: URL
+    paeImageUrl: URL
+    paeDocUrl: URL
     gene: str | None = None
     sequenceChecksum: str | None = None
     sequenceVersionDate: str | None = None
-    amAnnotationsUrl:
-    amAnnotationsHg19Url:
-    amAnnotationsHg38Url:
+    amAnnotationsUrl: URL | None = None
+    amAnnotationsHg19Url: URL | None = None
+    amAnnotationsHg38Url: URL | None = None
     isReviewed: bool | None = None
     isReferenceProteome: bool | None = None
     # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
protein_quest/alphafold/fetch.py
CHANGED
@@ -1,26 +1,26 @@
 """Module for fetch Alphafold data."""
 
-import asyncio
 import logging
 from asyncio import Semaphore
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from textwrap import dedent
-from typing import Literal
+from typing import Literal, cast, get_args
 
 from aiohttp_retry import RetryClient
 from aiopath import AsyncPath
-from cattrs.preconf.orjson import make_converter
 from tqdm.asyncio import tqdm
+from yarl import URL
 
 from protein_quest.alphafold.entry_summary import EntrySummary
-from protein_quest.
+from protein_quest.converter import converter
+from protein_quest.utils import friendly_session, retrieve_files, run_async
 
 logger = logging.getLogger(__name__)
-
+
 
 DownloadableFormat = Literal[
+    "summary",
     "bcif",
     "cif",
     "pdb",
@@ -32,16 +32,7 @@ DownloadableFormat = Literal[
 ]
 """Types of formats that can be downloaded from the AlphaFold web service."""
 
-downloadable_formats: set[DownloadableFormat] = {
-    "bcif",
-    "cif",
-    "pdb",
-    "paeImage",
-    "paeDoc",
-    "amAnnotations",
-    "amAnnotationsHg19",
-    "amAnnotationsHg38",
-}
+downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
 """Set of formats that can be downloaded from the AlphaFold web service."""
 
 
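Deriving the runtime set with `typing.get_args` keeps it in sync with the `Literal` definition instead of maintaining the members twice. A minimal illustration of the mechanism:

```python
from typing import Literal, get_args

# Shortened stand-in for DownloadableFormat
Format = Literal["summary", "bcif", "cif", "pdb"]

# get_args returns the Literal's members as a tuple
formats: set[Format] = set(get_args(Format))
print(formats)  # {'summary', 'bcif', 'cif', 'pdb'} (set order varies)
```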
@@ -59,6 +50,7 @@ class AlphaFoldEntry:
 
     uniprot_acc: str
     summary: EntrySummary | None
+    summary_file: Path | None = None
     bcif_file: Path | None = None
     cif_file: Path | None = None
     pdb_file: Path | None = None

@@ -127,10 +119,6 @@ async def fetch_summary(
 
     Returns:
         A list of EntrySummary objects representing the fetched summary.
-
-    Raises:
-        HTTPError: If the HTTP request returns an error status code.
-        Exception: If there is an error during file reading/writing or data conversion.
     """
     url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
     fn: AsyncPath | None = None

@@ -144,6 +132,7 @@ async def fetch_summary(
             response.raise_for_status()
             raw_data = await response.content.read()
             if fn is not None:
+                # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
                 await fn.write_bytes(raw_data)
             return converter.loads(raw_data, list[EntrySummary])
 
@@ -164,19 +153,14 @@
             yield summary
 
 
-def url2name(url: str) -> str:
-    """Given a URL, return the final path component as the name of the file."""
-    return url.split("/")[-1]
-
-
 async def fetch_many_async(
-
+    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> AsyncGenerator[AlphaFoldEntry]:
-    """Asynchronously fetches summaries and
+    """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
 
     Args:
-
+        uniprot_accessions: A set of Uniprot acessions to fetch.
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
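The removed `url2name` helper becomes redundant once the summary URLs are typed as `yarl.URL`, which exposes the final path segment as `.name` (the example URL below is illustrative):

```python
from yarl import URL

url = URL("https://alphafold.ebi.ac.uk/files/AF-P12345-F1-model_v4.cif")
print(url.name)  # AF-P12345-F1-model_v4.cif
print(url.name == url.path.split("/")[-1])  # True, same as the old url2name()
```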
@@ -184,7 +168,13 @@ async def fetch_many_async(
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
-
+    save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+    summaries = [
+        s
+        async for s in fetch_summaries(
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+        )
+    ]
 
     files = files_to_download(what, summaries)
 

@@ -198,30 +188,31 @@ async def fetch_many_async(
         yield AlphaFoldEntry(
             uniprot_acc=summary.uniprotAccession,
             summary=summary,
-
-
-
-
-
+            summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
+            bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
+            cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
+            pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
+            pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
+            pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
             am_annotations_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsUrl.name
                 if "amAnnotations" in what and summary.amAnnotationsUrl
                 else None
             ),
             am_annotations_hg19_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg19Url.name
                 if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
                 else None
             ),
             am_annotations_hg38_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg38Url.name
                 if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
                 else None
             ),
         )
 
 
-def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[
+def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
     if not (set(what) <= downloadable_formats):
         msg = (
             f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -229,24 +220,21 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
         )
         raise ValueError(msg)
 
-    files: set[tuple[
+    files: set[tuple[URL, str]] = set()
    for summary in summaries:
         for fmt in what:
-
+            if fmt == "summary":
+                # summary is handled already in fetch_summary
+                continue
+            url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
             if url is None:
                 logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
                 continue
-            file = (url,
+            file = (url, url.name)
             files.add(file)
     return files
 
 
-class NestedAsyncIOLoopError(RuntimeError):
-    """Custom error for nested async I/O loops."""
-
-    pass
-
-
 def fetch_many(
     ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> list[AlphaFoldEntry]:

@@ -260,9 +248,6 @@ def fetch_many(
 
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
-
-    Raises:
-        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
     """
 
     async def gather_entries():
@@ -271,19 +256,7 @@
             async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
         ]
 
-    try:
-        return asyncio.run(gather_entries())
-    except RuntimeError as e:
-        msg = dedent("""\
-            Can not run async method from an environment where the asyncio event loop is already running.
-            Like a Jupyter notebook.
-
-            Please use the `fetch_many_async` function directly or before call
-
-            import nest_asyncio
-            nest_asyncio.apply()
-            """)
-        raise NestedAsyncIOLoopError(msg) from e
+    return run_async(gather_entries())
 
 
 def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
@@ -299,6 +272,7 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
     return AlphaFoldEntry(
         uniprot_acc=entry.uniprot_acc,
         summary=entry.summary,
+        summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
         bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
         cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
         pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
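With `NestedAsyncIOLoopError` and the nest_asyncio advice gone, event-loop handling is delegated to `run_async` from `protein_quest/utils.py` (its body is not shown in this diff). A hypothetical synchronous call (accession and paths are illustrative):

```python
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

# Fetches the summary JSON and PDB file for each accession into save_dir.
entries = fetch_many(["P12345"], Path("downloads"), what={"summary", "pdb"})
for entry in entries:
    print(entry.uniprot_acc, entry.summary_file, entry.pdb_file)
```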