protein-quest 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/entry_summary.py +46 -22
- protein_quest/alphafold/fetch.py +302 -90
- protein_quest/cli.py +337 -98
- protein_quest/mcp_server.py +18 -7
- protein_quest/structure.py +24 -0
- protein_quest/uniprot.py +322 -15
- protein_quest/utils.py +15 -3
- {protein_quest-0.6.0.dist-info → protein_quest-0.8.0.dist-info}/METADATA +54 -7
- {protein_quest-0.6.0.dist-info → protein_quest-0.8.0.dist-info}/RECORD +13 -13
- {protein_quest-0.6.0.dist-info → protein_quest-0.8.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.6.0.dist-info → protein_quest-0.8.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.6.0.dist-info → protein_quest-0.8.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
```diff
@@ -1,2 +1,2 @@
-__version__ = "0.6.0"
+__version__ = "0.8.0"
 """The version of the package."""
```
protein_quest/alphafold/entry_summary.py
CHANGED
```diff
@@ -8,33 +8,57 @@ from yarl import URL
 class EntrySummary:
     """Dataclass representing a summary of an AlphaFold entry.
 
-    Modelled after
+    Modelled after NewEntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
+    with URL types and without deprecated fields.
     """
 
-    entryId: str
-    uniprotAccession: str
-    uniprotId: str
-    uniprotDescription: str
-    taxId: int
-    organismScientificName: str
-    uniprotStart: int
-    uniprotEnd: int
-    uniprotSequence: str
-    modelCreatedDate: str
-    latestVersion: int
     allVersions: list[int]
     bcifUrl: URL
     cifUrl: URL
-
-
+    entityType: str
+    fractionPlddtConfident: float
+    fractionPlddtLow: float
+    fractionPlddtVeryHigh: float
+    fractionPlddtVeryLow: float
+    globalMetricValue: float
+    isUniProt: bool
+    latestVersion: int
+    modelCreatedDate: str
+    modelEntityId: str
     paeDocUrl: URL
-
-
-
-
+    pdbUrl: URL
+    providerId: str
+    sequence: str
+    sequenceChecksum: str
+    sequenceEnd: int
+    sequenceStart: int
+    sequenceVersionDate: str
+    toolUsed: str
+    alternativeNames: list[str] | None = None
     amAnnotationsHg19Url: URL | None = None
     amAnnotationsHg38Url: URL | None = None
-
-
-
-
+    amAnnotationsUrl: URL | None = None
+    catalyticActivities: list[str] | None = None
+    complexName: str | None = None
+    functions: list[str] | None = None
+    gene: str | None = None
+    geneSynonyms: list[str] | None = None
+    ipSAE: float | None = None
+    ipTM: float | None = None
+    isUniProtReferenceProteome: bool | None = None
+    isUniProtReviewed: bool | None = None
+    keywords: list[str] | None = None
+    msaUrl: URL | None = None
+    organismCommonNames: list[str] | None = None
+    organismScientificName: str | None = None
+    organismSynonyms: list[str] | None = None
+    plddtDocUrl: URL | None = None
+    proteinFullNames: list[str] | None = None
+    proteinShortNames: list[str] | None = None
+    stoichiometry: int | None = None
+    taxId: int | None = None
+    taxonomyLineage: list[str] | None = None
+    # uniprotAccession is isoform id (<uniprot_accession>-<isoform number>) when entry has multiple isoforms.
+    uniprotAccession: str | None = None
+    uniprotDescription: str | None = None
+    uniprotId: str | None = None
```
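The 0.8.0 `EntrySummary` mirrors the AlphaFold prediction API payload much more closely, adding pLDDT fractions, sequence metadata and many optional annotation fields, while `uniprotAccession` becomes optional and may hold an isoform id. A minimal sketch (not part of protein-quest, and assuming the API returns the camelCase keys the dataclass is modelled after) of peeking at that payload with only the standard library:

```python
# Minimal sketch: inspect the raw prediction API payload that EntrySummary is modelled after.
import json
from urllib.request import urlopen

qualifier = "O60481"  # example UniProt accession
url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
with urlopen(url) as response:
    entries = json.load(response)  # a list of dicts, one per model/isoform

for raw in entries:
    # Fields that are new or now optional in the 0.8.0 EntrySummary
    print(raw.get("uniprotAccession"), raw.get("fractionPlddtVeryHigh"), raw.get("isUniProt"))
```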
protein_quest/alphafold/fetch.py
CHANGED
```diff
@@ -7,8 +7,9 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, cast, get_args
 
+import aiofiles
+from aiofiles.ospath import exists
 from aiohttp_retry import RetryClient
-from aiopath import AsyncPath
 from tqdm.asyncio import tqdm
 from yarl import URL
 
@@ -24,17 +25,23 @@ DownloadableFormat = Literal[
     "bcif",
     "cif",
     "pdb",
-    "paeImage",
     "paeDoc",
     "amAnnotations",
     "amAnnotationsHg19",
     "amAnnotationsHg38",
+    "msa",
+    "plddtDoc",
 ]
 """Types of formats that can be downloaded from the AlphaFold web service."""
 
 downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
 """Set of formats that can be downloaded from the AlphaFold web service."""
 
+UrlFileNamePair = tuple[URL, str]
+"""A tuple of a URL and a filename."""
+UrlFileNamePairsOfFormats = dict[DownloadableFormat, UrlFileNamePair]
+"""A mapping of DownloadableFormat to UrlFileNamePair."""
+
 
 def _camel_to_snake_case(name: str) -> str:
     """Convert a camelCase string to snake_case."""
```
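With these additions, `msa` and `plddtDoc` join the downloadable formats while `paeImage` is gone. A minimal sketch (assuming the names are importable from `protein_quest.alphafold.fetch` as shown in the diff) of the same subset check the module itself performs before downloading:

```python
# Minimal sketch: validate requested formats against the 0.8.0 format set.
from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats

requested: set[DownloadableFormat] = {"cif", "msa", "plddtDoc"}

if not requested <= downloadable_formats:
    raise ValueError(f"Invalid format(s): {requested - downloadable_formats}")
print(sorted(downloadable_formats))
```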
```diff
@@ -43,22 +50,23 @@ def _camel_to_snake_case(name: str) -> str:
 
 @dataclass
 class AlphaFoldEntry:
-    """
+    """AlphaFold entry with summary object and optionally local files.
 
-    See https://alphafold.ebi.ac.uk/api-docs for more details on the
+    See https://alphafold.ebi.ac.uk/api-docs for more details on the summary data structure.
     """
 
-
-    summary: EntrySummary | None
+    uniprot_accession: str
+    summary: EntrySummary | None = None
     summary_file: Path | None = None
     bcif_file: Path | None = None
     cif_file: Path | None = None
     pdb_file: Path | None = None
-    pae_image_file: Path | None = None
     pae_doc_file: Path | None = None
     am_annotations_file: Path | None = None
     am_annotations_hg19_file: Path | None = None
     am_annotations_hg38_file: Path | None = None
+    msa_file: Path | None = None
+    plddt_doc_file: Path | None = None
 
     @classmethod
     def format2attr(cls, dl_format: DownloadableFormat) -> str:
@@ -102,6 +110,35 @@ class AlphaFoldEntry:
         """
         return sum(1 for attr in vars(self) if attr.endswith("_file") and getattr(self, attr) is not None)
 
+    def relative_to(self, session_dir: Path) -> "AlphaFoldEntry":
+        """Convert paths in an AlphaFoldEntry to be relative to the session directory.
+
+        Args:
+            entry: An AlphaFoldEntry instance with absolute paths.
+            session_dir: The session directory to which the paths should be made relative.
+
+        Returns:
+            An AlphaFoldEntry instance with paths relative to the session directory.
+        """
+        return AlphaFoldEntry(
+            uniprot_accession=self.uniprot_accession,
+            summary=self.summary,
+            summary_file=self.summary_file.relative_to(session_dir) if self.summary_file else None,
+            bcif_file=self.bcif_file.relative_to(session_dir) if self.bcif_file else None,
+            cif_file=self.cif_file.relative_to(session_dir) if self.cif_file else None,
+            pdb_file=self.pdb_file.relative_to(session_dir) if self.pdb_file else None,
+            pae_doc_file=self.pae_doc_file.relative_to(session_dir) if self.pae_doc_file else None,
+            am_annotations_file=self.am_annotations_file.relative_to(session_dir) if self.am_annotations_file else None,
+            am_annotations_hg19_file=(
+                self.am_annotations_hg19_file.relative_to(session_dir) if self.am_annotations_hg19_file else None
+            ),
+            am_annotations_hg38_file=(
+                self.am_annotations_hg38_file.relative_to(session_dir) if self.am_annotations_hg38_file else None
+            ),
+            msa_file=self.msa_file.relative_to(session_dir) if self.msa_file else None,
+            plddt_doc_file=self.plddt_doc_file.relative_to(session_dir) if self.plddt_doc_file else None,
+        )
+
 
 async def fetch_summary(
     qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
```
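The module-level `relative_to` helper has moved onto `AlphaFoldEntry` as a method (its removal appears at the end of this file's diff). A minimal sketch of how the method might be used, with a hypothetical session directory and file name following the AlphaFold naming pattern:

```python
# Minimal sketch (assumes the 0.8.0 AlphaFoldEntry shown above): store paths relative
# to a session directory, e.g. before serialising an entry.
from pathlib import Path

from protein_quest.alphafold.fetch import AlphaFoldEntry

session_dir = Path("/data/session1")
entry = AlphaFoldEntry(
    uniprot_accession="P12345",
    cif_file=session_dir / "AF-P12345-F1-model_v6.cif",
)
rel = entry.relative_to(session_dir)
print(rel.cif_file)  # AF-P12345-F1-model_v6.cif
```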
```diff
@@ -120,25 +157,28 @@ async def fetch_summary(
 
     Returns:
         A list of EntrySummary objects representing the fetched summary.
+        When qualifier has multiple isoforms then multiple summaries are returned,
+        otherwise a list of a single summary is returned.
     """
     url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
-    fn:
+    fn: Path | None = None
     if save_dir is not None:
-        fn =
-        if await
+        fn = save_dir / f"{qualifier}.json"
+        if await exists(fn):
             logger.debug(f"File {fn} already exists. Skipping download from {url}.")
-
+            async with aiofiles.open(fn, "rb") as f:
+                raw_data = await f.read()
             return converter.loads(raw_data, list[EntrySummary])
         cached_file = await cacher.copy_from_cache(Path(fn))
         if cached_file is not None:
             logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
-
+            async with aiofiles.open(cached_file, "rb") as f:
+                raw_data = await f.read()
             return converter.loads(raw_data, list[EntrySummary])
     async with semaphore, session.get(url) as response:
         response.raise_for_status()
         raw_data = await response.content.read()
         if fn is not None:
-            # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
             await cacher.write_bytes(Path(fn), raw_data)
         return converter.loads(raw_data, list[EntrySummary])
@@ -148,7 +188,7 @@ async def fetch_summaries(
     save_dir: Path | None = None,
     max_parallel_downloads: int = 5,
     cacher: Cacher | None = None,
-) -> AsyncGenerator[EntrySummary]:
+) -> AsyncGenerator[tuple[str, EntrySummary]]:
     semaphore = Semaphore(max_parallel_downloads)
     if save_dir is not None:
         save_dir.mkdir(parents=True, exist_ok=True)
```
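`fetch_summaries` now yields `(qualifier, EntrySummary)` pairs so callers can tell which requested accession a summary (possibly an isoform) belongs to. A minimal consumer sketch, assuming the defaults shown above are sufficient for a bare call:

```python
# Minimal sketch: iterate the (qualifier, summary) pairs yielded by fetch_summaries in 0.8.0.
import asyncio

from protein_quest.alphafold.fetch import fetch_summaries

async def main() -> None:
    async for qualifier, summary in fetch_summaries(["O60481", "P05067"]):
        # qualifier is the accession that was asked for; summary.uniprotAccession
        # may be an isoform id such as "O60481-2".
        print(qualifier, summary.uniprotAccession, summary.latestVersion)

asyncio.run(main())
```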
```diff
@@ -159,43 +199,32 @@ async def fetch_summaries(
     summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
         *tasks, desc="Fetching Alphafold summaries"
     )
-    for summaries in summaries_per_qualifier:
+    for qualifier, summaries in zip(qualifiers, summaries_per_qualifier, strict=True):
         for summary in summaries:
-            yield summary
+            yield qualifier, summary
 
 
-async def fetch_many_async(
+async def _fetch_many_async_with_summary(
     uniprot_accessions: Iterable[str],
     save_dir: Path,
-
+    formats: set[DownloadableFormat],
     max_parallel_downloads: int = 5,
     cacher: Cacher | None = None,
     gzip_files: bool = False,
+    all_isoforms: bool = False,
 ) -> AsyncGenerator[AlphaFoldEntry]:
-    ""
-    [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
-
-    Args:
-        uniprot_accessions: A set of Uniprot acessions to fetch.
-        save_dir: The directory to save the fetched files to.
-        what: A set of formats to download.
-        max_parallel_downloads: The maximum number of parallel downloads.
-        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
-        gzip_files: Whether to gzip the downloaded files.
-
-    Yields:
-        A dataclass containing the summary, pdb file, and pae file.
-    """
-    save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+    save_dir_for_summaries = save_dir if "summary" in formats else None
 
     summaries = [
         s
         async for s in fetch_summaries(
             uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
         )
+        # Filter out isoforms if all_isoforms is False
+        # O60481 is canonical and O60481-2 is isoform, so we skip the isoform
+        if all_isoforms or s[0] == s[1].uniprotAccession
     ]
-
-    files = files_to_download(what, summaries, gzip_files)
+    files = files_to_download(formats, summaries, gzip_files)
 
     await retrieve_files(
         files,
@@ -205,54 +234,58 @@ async def fetch_many_async(
         cacher=cacher,
         gzip_files=gzip_files,
     )
+
     gzext = ".gz" if gzip_files else ""
-    for summary in summaries:
+    for uniprot_accession, summary in summaries:
         yield AlphaFoldEntry(
-
+            uniprot_accession=uniprot_accession,
             summary=summary,
-            summary_file=save_dir / f"{
-            bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in
-            cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in
-            pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in
-
-            pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
+            summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
+            bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in formats else None,
+            cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in formats else None,
+            pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in formats else None,
+            pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in formats else None,
             am_annotations_file=(
                 save_dir / (summary.amAnnotationsUrl.name + gzext)
-                if "amAnnotations" in
+                if "amAnnotations" in formats and summary.amAnnotationsUrl
                 else None
             ),
             am_annotations_hg19_file=(
                 save_dir / (summary.amAnnotationsHg19Url.name + gzext)
-                if "amAnnotationsHg19" in
+                if "amAnnotationsHg19" in formats and summary.amAnnotationsHg19Url
                 else None
             ),
             am_annotations_hg38_file=(
                 save_dir / (summary.amAnnotationsHg38Url.name + gzext)
-                if "amAnnotationsHg38" in
+                if "amAnnotationsHg38" in formats and summary.amAnnotationsHg38Url
                 else None
             ),
+            msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msa" in formats and summary.msaUrl else None),
+            plddt_doc_file=(
+                save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDoc" in formats and summary.plddtDocUrl else None
+            ),
         )
 
 
 def files_to_download(
-
-) -> set[
-    if not (set(
+    formats: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
+) -> set[UrlFileNamePair]:
+    if not (set(formats) <= downloadable_formats):
         msg = (
-            f"Invalid format(s) specified: {set(
+            f"Invalid format(s) specified: {set(formats) - downloadable_formats}. "
            f"Valid formats are: {downloadable_formats}"
         )
         raise ValueError(msg)
 
-    url_filename_pairs: set[
-    for summary in summaries:
-        for fmt in
+    url_filename_pairs: set[UrlFileNamePair] = set()
+    for _, summary in summaries:
+        for fmt in formats:
             if fmt == "summary":
                 # summary is handled already in fetch_summary
                 continue
             url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
             if url is None:
-                logger.warning(f"Summary {summary.
+                logger.warning(f"Summary {summary.modelEntityId} does not have a URL for format '{fmt}'. Skipping.")
                 continue
             fn = url.name + (".gz" if gzip_files else "")
             url_filename_pair = (url, fn)
```
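The isoform filter above keeps a summary only when the accession it was fetched for matches the summary's own `uniprotAccession`. A minimal, self-contained sketch of that rule:

```python
# Minimal sketch of the canonical-vs-isoform check used in the comprehension above:
# a summary is canonical when its uniprotAccession equals the qualifier it was fetched
# for (e.g. "O60481"); isoforms carry a suffix such as "O60481-2".
def is_canonical(qualifier: str, summary_accession: str | None) -> bool:
    return qualifier == summary_accession

pairs = [("O60481", "O60481"), ("O60481", "O60481-2")]
kept = [p for p in pairs if is_canonical(*p)]
print(kept)  # [('O60481', 'O60481')]
```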
```diff
@@ -260,23 +293,224 @@ def files_to_download(
     return url_filename_pairs
 
 
+async def fetch_alphafold_db_version() -> str:
+    """Fetch the current version of the AlphaFold database.
+
+    Returns:
+        The current version of the AlphaFold database as a string. For example: "6".
+    """
+    url = "https://ftp.ebi.ac.uk/pub/databases/alphafold/accession_ids.csv"
+    headers = {"Range": "bytes=0-200"}
+    logger.debug(f"Detecting AlphaFold DB version from head of {url}")
+    async with friendly_session() as session, session.get(url, headers=headers) as response:
+        response.raise_for_status()
+        raw = await response.content.read(200)
+        text = raw.decode("utf-8")
+        first_line = text.splitlines()[1]
+        version = first_line.split(",")[-1]
+        logger.debug(f"Found current AlphaFold DB version is '{version}'")
+        return version
+
+
+def _files_for_alphafold_entry(
+    uniprot_accession: str,
+    formats: set[DownloadableFormat],
+    db_version: str,
+    gzip_files: bool,
+) -> UrlFileNamePairsOfFormats:
+    templates: dict[DownloadableFormat, URL] = {
+        "bcif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.bcif"),
+        "cif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.cif"),
+        "pdb": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.pdb"),
+        "paeDoc": URL(
+            f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-predicted_aligned_error_v{db_version}.json"
+        ),
+        "amAnnotations": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-aa-substitutions.csv"),
+        "amAnnotationsHg19": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg19.csv"),
+        "amAnnotationsHg38": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg38.csv"),
+        "msa": URL(f"https://alphafold.ebi.ac.uk/files/msa/AF-{uniprot_accession}-F1-msa_v{db_version}.a3m"),
+        "plddtDoc": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-confidence_v{db_version}.json"),
+    }
+    url_filename_pairs = {}
+    for fmt in formats:
+        if fmt == "summary":
+            # Summaries are downloaded separately as its using API instead of static files
+            continue
+        if fmt not in templates:
+            logger.warning(f"No URL template found for format '{fmt}'. Skipping.")
+            continue
+        url = templates[cast("DownloadableFormat", fmt)]
+        fn = url.name
+        if gzip_files:
+            fn += ".gz"
+        url_filename_pair = (url, fn)
+        url_filename_pairs[fmt] = url_filename_pair
+    return url_filename_pairs
+
+
+def files_for_alphafold_entries(
+    uniprot_accessions: Iterable[str],
+    formats: set[DownloadableFormat],
+    db_version: str,
+    gzip_files: bool,
+) -> dict[str, UrlFileNamePairsOfFormats]:
+    """Get the files to download for multiple AlphaFold entries.
+
+    Args:
+        uniprot_accessions: A set of Uniprot accessions.
+        formats: A set of formats to download.
+        db_version: The version of the AlphaFold database to use.
+        gzip_files: Whether to download gzipped files. Otherwise downloads uncompressed files.
+
+    Returns:
+        A mapping of Uniprot accession to a mapping of DownloadableFormat to UrlFileNamePair.
+    """
+    return {
+        uniprot_accession: _files_for_alphafold_entry(
+            uniprot_accession, formats=formats, db_version=db_version, gzip_files=gzip_files
+        )
+        for uniprot_accession in uniprot_accessions
+    }
+
+
+async def _fetch_many_async_without_summary(
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    formats: set[DownloadableFormat],
+    db_version: str | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
+    gzip_files: bool = False,
+) -> AsyncGenerator[AlphaFoldEntry]:
+    if db_version is None:
+        db_version = await fetch_alphafold_db_version()
+    nested_files = files_for_alphafold_entries(
+        uniprot_accessions, formats=formats, db_version=db_version, gzip_files=gzip_files
+    )
+    files: set[UrlFileNamePair] = set()
+    for uniprot_accession in uniprot_accessions:
+        files.update(nested_files[uniprot_accession].values())
+
+    retrieved_files = await retrieve_files(
+        files,
+        save_dir,
+        desc="Downloading AlphaFold files",
+        max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
+        gzip_files=gzip_files,
+        raise_for_not_found=False,
+    )
+
+    retrieved_files_set = set(retrieved_files)
+    for uniprot_accession in uniprot_accessions:
+        entry = AlphaFoldEntry(
+            uniprot_accession=uniprot_accession,
+        )
+
+        for af_format, url_filename_pair in nested_files[uniprot_accession].items():
+            _, filename = url_filename_pair
+            filepath = save_dir / filename
+            if filepath in retrieved_files_set:
+                attr = AlphaFoldEntry.format2attr(af_format)
+                setattr(entry, attr, filepath)
+            # else: File was not found (404) during download, so we leave the attribute as None
+
+        yield entry
+
+
+def fetch_many_async(
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    formats: set[DownloadableFormat],
+    db_version: str | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
+    gzip_files: bool = False,
+    all_isoforms: bool = False,
+) -> AsyncGenerator[AlphaFoldEntry]:
+    """Asynchronously fetches summaries and/or files from
+    [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
+
+    Args:
+        uniprot_accessions: A set of Uniprot accessions to fetch.
+        save_dir: The directory to save the fetched files to.
+        formats: A set of formats to download.
+            If `summary` is in the set then summaries will be fetched using the API endpoint.
+            and later the other files will be downloaded using static file URLs.
+            If `summary` is not in the set then all files will be downloaded using static file
+            URLs only.
+        db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
+        max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files.
+        gzip_files: Whether to gzip the downloaded files.
+            Summaries are never gzipped.
+        all_isoforms: Whether to yield all isoforms of each uniprot entry.
+            When False then yields only the canonical sequence per uniprot entry.
+
+    Yields:
+        A dataclass containing the summary, pdb file, and pae file.
+
+    Raises:
+        ValueError: If 'formats' set is empty.
+        ValueError: If all_isoforms is True and 'summary' is not in 'formats' set.
+    """
+    if len(formats) == 0:
+        msg = "At least one format must be specified. The 'formats' argument is empty."
+        raise ValueError(msg)
+    if "summary" in formats:
+        if db_version is not None:
+            logger.warning("db_version is ignored when 'summary' is in 'formats' set. Always uses latest version.")
+        return _fetch_many_async_with_summary(
+            uniprot_accessions,
+            save_dir,
+            formats,
+            max_parallel_downloads=max_parallel_downloads,
+            cacher=cacher,
+            gzip_files=gzip_files,
+            all_isoforms=all_isoforms,
+        )
+    if all_isoforms:
+        msg = "Cannot fetch all isoforms when 'summary' is not in 'formats' set."
+        raise ValueError(msg)
+    return _fetch_many_async_without_summary(
+        uniprot_accessions,
+        save_dir,
+        formats,
+        db_version=db_version,
+        max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
+        gzip_files=gzip_files,
+    )
+
+
 def fetch_many(
-
+    uniprot_accessions: Iterable[str],
     save_dir: Path,
-
+    formats: set[DownloadableFormat],
+    db_version: str | None = None,
     max_parallel_downloads: int = 5,
     cacher: Cacher | None = None,
     gzip_files: bool = False,
+    all_isoforms: bool = False,
 ) -> list[AlphaFoldEntry]:
-    """Synchronously fetches summaries and
+    """Synchronously fetches summaries and/or files like cif from AlphaFold Protein Structure Database.
 
     Args:
-
+        uniprot_accessions: A set of Uniprot accessions to fetch.
         save_dir: The directory to save the fetched files to.
-
+        formats: A set of formats to download.
+            If `summary` is in the set then summaries will be fetched using the API endpoint.
+            and later the other files will be downloaded using static file URLs.
+            If `summary` is not in the set then all files will be downloaded using static file
+            URLs only.
+            Excluding 'summary' is much faster as it avoids slow API calls.
+        db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
         max_parallel_downloads: The maximum number of parallel downloads.
-        cacher: A cacher to use for caching the fetched files.
+        cacher: A cacher to use for caching the fetched files.
         gzip_files: Whether to gzip the downloaded files.
+            Summaries are never gzipped.
+        all_isoforms: Whether to yield all isoforms of each uniprot entry.
+            When False then yields only the canonical sequence per uniprot entry.
 
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -286,37 +520,15 @@ def fetch_many(
         return [
             entry
            async for entry in fetch_many_async(
-
+                uniprot_accessions,
+                save_dir,
+                formats,
+                db_version=db_version,
+                max_parallel_downloads=max_parallel_downloads,
+                cacher=cacher,
+                gzip_files=gzip_files,
+                all_isoforms=all_isoforms,
            )
        ]
 
    return run_async(gather_entries())
-
-
-def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
-    """Convert paths in an AlphaFoldEntry to be relative to the session directory.
-
-    Args:
-        entry: An AlphaFoldEntry instance with absolute paths.
-        session_dir: The session directory to which the paths should be made relative.
-
-    Returns:
-        An AlphaFoldEntry instance with paths relative to the session directory.
-    """
-    return AlphaFoldEntry(
-        uniprot_acc=entry.uniprot_acc,
-        summary=entry.summary,
-        summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
-        bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
-        cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
-        pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
-        pae_image_file=entry.pae_image_file.relative_to(session_dir) if entry.pae_image_file else None,
-        pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
-        am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
-        am_annotations_hg19_file=(
-            entry.am_annotations_hg19_file.relative_to(session_dir) if entry.am_annotations_hg19_file else None
-        ),
-        am_annotations_hg38_file=(
-            entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
-        ),
-    )
```
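Putting the new surface together, `fetch_many` now takes a `formats` set plus `db_version` and `all_isoforms` options; leaving `summary` out of `formats` makes it build static file URLs directly, detecting the current AlphaFold DB version from `accession_ids.csv` when `db_version` is None. A minimal usage sketch, assuming the 0.8.0 signature shown above:

```python
# Minimal sketch: download cif and pLDDT confidence files via static URLs,
# skipping the slower summary API (no "summary" in formats).
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

entries = fetch_many(
    ["O60481", "P05067"],
    Path("downloads"),
    formats={"cif", "plddtDoc"},
    db_version=None,  # let the library detect the current AlphaFold DB version
    gzip_files=True,
)
for entry in entries:
    print(entry.uniprot_accession, entry.cif_file, entry.plddt_doc_file)
```

Files that are missing upstream (404) simply leave the corresponding attribute as None, per the `raise_for_not_found=False` path above.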