protein-quest 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/entry_summary.py +46 -22
- protein_quest/alphafold/fetch.py +53 -28
- protein_quest/cli.py +263 -57
- protein_quest/mcp_server.py +15 -4
- protein_quest/structure.py +24 -0
- protein_quest/uniprot.py +287 -15
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/METADATA +32 -6
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/RECORD +12 -12
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.6.0.dist-info → protein_quest-0.7.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
__version__ = "0.6.0"
|
|
1
|
+
__version__ = "0.7.0"
|
|
2
2
|
"""The version of the package."""
|
|
protein_quest/alphafold/entry_summary.py
CHANGED
|
@@ -8,33 +8,57 @@ from yarl import URL
|
|
|
8
8
|
class EntrySummary:
|
|
9
9
|
"""Dataclass representing a summary of an AlphaFold entry.
|
|
10
10
|
|
|
11
|
-
Modelled after
|
|
11
|
+
Modelled after NewEntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
|
|
12
|
+
with URL types and without deprecated fields.
|
|
12
13
|
"""
|
|
13
14
|
|
|
14
|
-
entryId: str
|
|
15
|
-
uniprotAccession: str
|
|
16
|
-
uniprotId: str
|
|
17
|
-
uniprotDescription: str
|
|
18
|
-
taxId: int
|
|
19
|
-
organismScientificName: str
|
|
20
|
-
uniprotStart: int
|
|
21
|
-
uniprotEnd: int
|
|
22
|
-
uniprotSequence: str
|
|
23
|
-
modelCreatedDate: str
|
|
24
|
-
latestVersion: int
|
|
25
15
|
allVersions: list[int]
|
|
26
16
|
bcifUrl: URL
|
|
27
17
|
cifUrl: URL
|
|
28
|
-
|
|
29
|
-
|
|
18
|
+
entityType: str
|
|
19
|
+
fractionPlddtConfident: float
|
|
20
|
+
fractionPlddtLow: float
|
|
21
|
+
fractionPlddtVeryHigh: float
|
|
22
|
+
fractionPlddtVeryLow: float
|
|
23
|
+
globalMetricValue: float
|
|
24
|
+
isUniProt: bool
|
|
25
|
+
latestVersion: int
|
|
26
|
+
modelCreatedDate: str
|
|
27
|
+
modelEntityId: str
|
|
30
28
|
paeDocUrl: URL
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
29
|
+
pdbUrl: URL
|
|
30
|
+
providerId: str
|
|
31
|
+
sequence: str
|
|
32
|
+
sequenceChecksum: str
|
|
33
|
+
sequenceEnd: int
|
|
34
|
+
sequenceStart: int
|
|
35
|
+
sequenceVersionDate: str
|
|
36
|
+
toolUsed: str
|
|
37
|
+
alternativeNames: list[str] | None = None
|
|
35
38
|
amAnnotationsHg19Url: URL | None = None
|
|
36
39
|
amAnnotationsHg38Url: URL | None = None
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
amAnnotationsUrl: URL | None = None
|
|
41
|
+
catalyticActivities: list[str] | None = None
|
|
42
|
+
complexName: str | None = None
|
|
43
|
+
functions: list[str] | None = None
|
|
44
|
+
gene: str | None = None
|
|
45
|
+
geneSynonyms: list[str] | None = None
|
|
46
|
+
ipSAE: float | None = None
|
|
47
|
+
ipTM: float | None = None
|
|
48
|
+
isUniProtReferenceProteome: bool | None = None
|
|
49
|
+
isUniProtReviewed: bool | None = None
|
|
50
|
+
keywords: list[str] | None = None
|
|
51
|
+
msaUrl: URL | None = None
|
|
52
|
+
organismCommonNames: list[str] | None = None
|
|
53
|
+
organismScientificName: str | None = None
|
|
54
|
+
organismSynonyms: list[str] | None = None
|
|
55
|
+
plddtDocUrl: URL | None = None
|
|
56
|
+
proteinFullNames: list[str] | None = None
|
|
57
|
+
proteinShortNames: list[str] | None = None
|
|
58
|
+
stoichiometry: int | None = None
|
|
59
|
+
taxId: int | None = None
|
|
60
|
+
taxonomyLineage: list[str] | None = None
|
|
61
|
+
# uniprotAccession is isoform id (<uniprot_accession>-<isoform number>) when entry has multiple isoforms.
|
|
62
|
+
uniprotAccession: str | None = None
|
|
63
|
+
uniprotDescription: str | None = None
|
|
64
|
+
uniprotId: str | None = None
|
protein_quest/alphafold/fetch.py
CHANGED
|
@@ -7,8 +7,9 @@ from dataclasses import dataclass
|
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from typing import Literal, cast, get_args
|
|
9
9
|
|
|
10
|
+
import aiofiles
|
|
11
|
+
from aiofiles.ospath import exists
|
|
10
12
|
from aiohttp_retry import RetryClient
|
|
11
|
-
from aiopath import AsyncPath
|
|
12
13
|
from tqdm.asyncio import tqdm
|
|
13
14
|
from yarl import URL
|
|
14
15
|
|
|
@@ -24,11 +25,12 @@ DownloadableFormat = Literal[
|
|
|
24
25
|
"bcif",
|
|
25
26
|
"cif",
|
|
26
27
|
"pdb",
|
|
27
|
-
"paeImage",
|
|
28
28
|
"paeDoc",
|
|
29
29
|
"amAnnotations",
|
|
30
30
|
"amAnnotationsHg19",
|
|
31
31
|
"amAnnotationsHg38",
|
|
32
|
+
"msaUrl",
|
|
33
|
+
"plddtDocUrl",
|
|
32
34
|
]
|
|
33
35
|
"""Types of formats that can be downloaded from the AlphaFold web service."""
|
|
34
36
|
|
|
@@ -43,22 +45,23 @@ def _camel_to_snake_case(name: str) -> str:
|
|
|
43
45
|
|
|
44
46
|
@dataclass
|
|
45
47
|
class AlphaFoldEntry:
|
|
46
|
-
"""
|
|
48
|
+
"""AlphaFold entry with summary object and optionally local files.
|
|
47
49
|
|
|
48
|
-
See https://alphafold.ebi.ac.uk/api-docs for more details on the
|
|
50
|
+
See https://alphafold.ebi.ac.uk/api-docs for more details on the summary data structure.
|
|
49
51
|
"""
|
|
50
52
|
|
|
51
|
-
|
|
52
|
-
summary: EntrySummary
|
|
53
|
+
uniprot_accession: str
|
|
54
|
+
summary: EntrySummary
|
|
53
55
|
summary_file: Path | None = None
|
|
54
56
|
bcif_file: Path | None = None
|
|
55
57
|
cif_file: Path | None = None
|
|
56
58
|
pdb_file: Path | None = None
|
|
57
|
-
pae_image_file: Path | None = None
|
|
58
59
|
pae_doc_file: Path | None = None
|
|
59
60
|
am_annotations_file: Path | None = None
|
|
60
61
|
am_annotations_hg19_file: Path | None = None
|
|
61
62
|
am_annotations_hg38_file: Path | None = None
|
|
63
|
+
msa_file: Path | None = None
|
|
64
|
+
plddt_doc_file: Path | None = None
|
|
62
65
|
|
|
63
66
|
@classmethod
|
|
64
67
|
def format2attr(cls, dl_format: DownloadableFormat) -> str:
|
|
@@ -120,25 +123,28 @@ async def fetch_summary(
|
|
|
120
123
|
|
|
121
124
|
Returns:
|
|
122
125
|
A list of EntrySummary objects representing the fetched summary.
|
|
126
|
+
When qualifier has multiple isoforms then multiple summaries are returned,
|
|
127
|
+
otherwise a list of a single summary is returned.
|
|
123
128
|
"""
|
|
124
129
|
url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
|
|
125
|
-
fn:
|
|
130
|
+
fn: Path | None = None
|
|
126
131
|
if save_dir is not None:
|
|
127
|
-
fn =
|
|
128
|
-
if await
|
|
132
|
+
fn = save_dir / f"{qualifier}.json"
|
|
133
|
+
if await exists(fn):
|
|
129
134
|
logger.debug(f"File {fn} already exists. Skipping download from {url}.")
|
|
130
|
-
|
|
135
|
+
async with aiofiles.open(fn, "rb") as f:
|
|
136
|
+
raw_data = await f.read()
|
|
131
137
|
return converter.loads(raw_data, list[EntrySummary])
|
|
132
138
|
cached_file = await cacher.copy_from_cache(Path(fn))
|
|
133
139
|
if cached_file is not None:
|
|
134
140
|
logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
|
|
135
|
-
|
|
141
|
+
async with aiofiles.open(cached_file, "rb") as f:
|
|
142
|
+
raw_data = await f.read()
|
|
136
143
|
return converter.loads(raw_data, list[EntrySummary])
|
|
137
144
|
async with semaphore, session.get(url) as response:
|
|
138
145
|
response.raise_for_status()
|
|
139
146
|
raw_data = await response.content.read()
|
|
140
147
|
if fn is not None:
|
|
141
|
-
# TODO return fn and make it part of AlphaFoldEntry as summary_file prop
|
|
142
148
|
await cacher.write_bytes(Path(fn), raw_data)
|
|
143
149
|
return converter.loads(raw_data, list[EntrySummary])
|
|
144
150
|
|
|
@@ -148,7 +154,7 @@ async def fetch_summaries(
|
|
|
148
154
|
save_dir: Path | None = None,
|
|
149
155
|
max_parallel_downloads: int = 5,
|
|
150
156
|
cacher: Cacher | None = None,
|
|
151
|
-
) -> AsyncGenerator[EntrySummary]:
|
|
157
|
+
) -> AsyncGenerator[tuple[str, EntrySummary]]:
|
|
152
158
|
semaphore = Semaphore(max_parallel_downloads)
|
|
153
159
|
if save_dir is not None:
|
|
154
160
|
save_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -159,9 +165,9 @@ async def fetch_summaries(
|
|
|
159
165
|
summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
|
|
160
166
|
*tasks, desc="Fetching Alphafold summaries"
|
|
161
167
|
)
|
|
162
|
-
for summaries in summaries_per_qualifier:
|
|
168
|
+
for qualifier, summaries in zip(qualifiers, summaries_per_qualifier, strict=True):
|
|
163
169
|
for summary in summaries:
|
|
164
|
-
yield summary
|
|
170
|
+
yield qualifier, summary
|
|
165
171
|
|
|
166
172
|
|
|
167
173
|
async def fetch_many_async(
|
|
@@ -171,17 +177,20 @@ async def fetch_many_async(
|
|
|
171
177
|
max_parallel_downloads: int = 5,
|
|
172
178
|
cacher: Cacher | None = None,
|
|
173
179
|
gzip_files: bool = False,
|
|
180
|
+
all_isoforms: bool = False,
|
|
174
181
|
) -> AsyncGenerator[AlphaFoldEntry]:
|
|
175
182
|
"""Asynchronously fetches summaries and files from
|
|
176
183
|
[AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
|
|
177
184
|
|
|
178
185
|
Args:
|
|
179
|
-
uniprot_accessions: A set of Uniprot
|
|
186
|
+
uniprot_accessions: A set of Uniprot accessions to fetch.
|
|
180
187
|
save_dir: The directory to save the fetched files to.
|
|
181
188
|
what: A set of formats to download.
|
|
182
189
|
max_parallel_downloads: The maximum number of parallel downloads.
|
|
183
190
|
cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
|
|
184
191
|
gzip_files: Whether to gzip the downloaded files.
|
|
192
|
+
all_isoforms: Whether to yield all isoforms of each uniprot entry.
|
|
193
|
+
When False then yields only the canonical sequence of uniprot entry.
|
|
185
194
|
|
|
186
195
|
Yields:
|
|
187
196
|
A dataclass containing the summary, pdb file, and pae file.
|
|
@@ -193,8 +202,10 @@ async def fetch_many_async(
|
|
|
193
202
|
async for s in fetch_summaries(
|
|
194
203
|
uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
|
|
195
204
|
)
|
|
205
|
+
# Filter out isoforms if all_isoforms is False
|
|
206
|
+
# O60481 is canonical and O60481-2 is isoform, so we skip the isoform
|
|
207
|
+
if all_isoforms or s[0] == s[1].uniprotAccession
|
|
196
208
|
]
|
|
197
|
-
|
|
198
209
|
files = files_to_download(what, summaries, gzip_files)
|
|
199
210
|
|
|
200
211
|
await retrieve_files(
|
|
@@ -205,16 +216,16 @@ async def fetch_many_async(
|
|
|
205
216
|
cacher=cacher,
|
|
206
217
|
gzip_files=gzip_files,
|
|
207
218
|
)
|
|
219
|
+
|
|
208
220
|
gzext = ".gz" if gzip_files else ""
|
|
209
|
-
for summary in summaries:
|
|
221
|
+
for uniprot_accession, summary in summaries:
|
|
210
222
|
yield AlphaFoldEntry(
|
|
211
|
-
|
|
223
|
+
uniprot_accession=uniprot_accession,
|
|
212
224
|
summary=summary,
|
|
213
|
-
summary_file=save_dir / f"{
|
|
225
|
+
summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
|
|
214
226
|
bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
|
|
215
227
|
cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
|
|
216
228
|
pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
|
|
217
|
-
pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
|
|
218
229
|
pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
|
|
219
230
|
am_annotations_file=(
|
|
220
231
|
save_dir / (summary.amAnnotationsUrl.name + gzext)
|
|
@@ -231,11 +242,15 @@ async def fetch_many_async(
|
|
|
231
242
|
if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
|
|
232
243
|
else None
|
|
233
244
|
),
|
|
245
|
+
msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msaUrl" in what and summary.msaUrl else None),
|
|
246
|
+
plddt_doc_file=(
|
|
247
|
+
save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDocUrl" in what and summary.plddtDocUrl else None
|
|
248
|
+
),
|
|
234
249
|
)
|
|
235
250
|
|
|
236
251
|
|
|
237
252
|
def files_to_download(
|
|
238
|
-
what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
|
|
253
|
+
what: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
|
|
239
254
|
) -> set[tuple[URL, str]]:
|
|
240
255
|
if not (set(what) <= downloadable_formats):
|
|
241
256
|
msg = (
|
|
@@ -245,14 +260,14 @@ def files_to_download(
|
|
|
245
260
|
raise ValueError(msg)
|
|
246
261
|
|
|
247
262
|
url_filename_pairs: set[tuple[URL, str]] = set()
|
|
248
|
-
for summary in summaries:
|
|
263
|
+
for _, summary in summaries:
|
|
249
264
|
for fmt in what:
|
|
250
265
|
if fmt == "summary":
|
|
251
266
|
# summary is handled already in fetch_summary
|
|
252
267
|
continue
|
|
253
268
|
url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
|
|
254
269
|
if url is None:
|
|
255
|
-
logger.warning(f"Summary {summary.
|
|
270
|
+
logger.warning(f"Summary {summary.modelEntityId} does not have a URL for format '{fmt}'. Skipping.")
|
|
256
271
|
continue
|
|
257
272
|
fn = url.name + (".gz" if gzip_files else "")
|
|
258
273
|
url_filename_pair = (url, fn)
|
|
@@ -267,6 +282,7 @@ def fetch_many(
|
|
|
267
282
|
max_parallel_downloads: int = 5,
|
|
268
283
|
cacher: Cacher | None = None,
|
|
269
284
|
gzip_files: bool = False,
|
|
285
|
+
all_isoforms: bool = False,
|
|
270
286
|
) -> list[AlphaFoldEntry]:
|
|
271
287
|
"""Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
|
|
272
288
|
|
|
@@ -277,6 +293,8 @@ def fetch_many(
|
|
|
277
293
|
max_parallel_downloads: The maximum number of parallel downloads.
|
|
278
294
|
cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
|
|
279
295
|
gzip_files: Whether to gzip the downloaded files.
|
|
296
|
+
all_isoforms: Whether to return all isoforms of each uniprot entry.
|
|
297
|
+
When False then returns only the canonical sequence of uniprot entry.
|
|
280
298
|
|
|
281
299
|
Returns:
|
|
282
300
|
A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
|
|
@@ -286,7 +304,13 @@ def fetch_many(
|
|
|
286
304
|
return [
|
|
287
305
|
entry
|
|
288
306
|
async for entry in fetch_many_async(
|
|
289
|
-
ids,
|
|
307
|
+
ids,
|
|
308
|
+
save_dir,
|
|
309
|
+
what,
|
|
310
|
+
max_parallel_downloads=max_parallel_downloads,
|
|
311
|
+
cacher=cacher,
|
|
312
|
+
gzip_files=gzip_files,
|
|
313
|
+
all_isoforms=all_isoforms,
|
|
290
314
|
)
|
|
291
315
|
]
|
|
292
316
|
|
|
@@ -304,13 +328,12 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
|
|
|
304
328
|
An AlphaFoldEntry instance with paths relative to the session directory.
|
|
305
329
|
"""
|
|
306
330
|
return AlphaFoldEntry(
|
|
307
|
-
|
|
331
|
+
uniprot_accession=entry.uniprot_accession,
|
|
308
332
|
summary=entry.summary,
|
|
309
333
|
summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
|
|
310
334
|
bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
|
|
311
335
|
cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
|
|
312
336
|
pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
|
|
313
|
-
pae_image_file=entry.pae_image_file.relative_to(session_dir) if entry.pae_image_file else None,
|
|
314
337
|
pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
|
|
315
338
|
am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
|
|
316
339
|
am_annotations_hg19_file=(
|
|
@@ -319,4 +342,6 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
|
|
|
319
342
|
am_annotations_hg38_file=(
|
|
320
343
|
entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
|
|
321
344
|
),
|
|
345
|
+
msa_file=entry.msa_file.relative_to(session_dir) if entry.msa_file else None,
|
|
346
|
+
plddt_doc_file=entry.plddt_doc_file.relative_to(session_dir) if entry.plddt_doc_file else None,
|
|
322
347
|
)
|