protein-quest 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,2 +1,2 @@
- __version__ = "0.6.0"
+ __version__ = "0.7.0"
  """The version of the package."""
@@ -8,33 +8,57 @@ from yarl import URL
  class EntrySummary:
  """Dataclass representing a summary of an AlphaFold entry.

- Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
+ Modelled after NewEntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
+ with URL types and without deprecated fields.
  """

- entryId: str
- uniprotAccession: str
- uniprotId: str
- uniprotDescription: str
- taxId: int
- organismScientificName: str
- uniprotStart: int
- uniprotEnd: int
- uniprotSequence: str
- modelCreatedDate: str
- latestVersion: int
  allVersions: list[int]
  bcifUrl: URL
  cifUrl: URL
- pdbUrl: URL
- paeImageUrl: URL
+ entityType: str
+ fractionPlddtConfident: float
+ fractionPlddtLow: float
+ fractionPlddtVeryHigh: float
+ fractionPlddtVeryLow: float
+ globalMetricValue: float
+ isUniProt: bool
+ latestVersion: int
+ modelCreatedDate: str
+ modelEntityId: str
  paeDocUrl: URL
- gene: str | None = None
- sequenceChecksum: str | None = None
- sequenceVersionDate: str | None = None
- amAnnotationsUrl: URL | None = None
+ pdbUrl: URL
+ providerId: str
+ sequence: str
+ sequenceChecksum: str
+ sequenceEnd: int
+ sequenceStart: int
+ sequenceVersionDate: str
+ toolUsed: str
+ alternativeNames: list[str] | None = None
  amAnnotationsHg19Url: URL | None = None
  amAnnotationsHg38Url: URL | None = None
- isReviewed: bool | None = None
- isReferenceProteome: bool | None = None
- # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
- # TODO like fractionPlddt* fields which can be used in filter_files_on_confidence()
+ amAnnotationsUrl: URL | None = None
+ catalyticActivities: list[str] | None = None
+ complexName: str | None = None
+ functions: list[str] | None = None
+ gene: str | None = None
+ geneSynonyms: list[str] | None = None
+ ipSAE: float | None = None
+ ipTM: float | None = None
+ isUniProtReferenceProteome: bool | None = None
+ isUniProtReviewed: bool | None = None
+ keywords: list[str] | None = None
+ msaUrl: URL | None = None
+ organismCommonNames: list[str] | None = None
+ organismScientificName: str | None = None
+ organismSynonyms: list[str] | None = None
+ plddtDocUrl: URL | None = None
+ proteinFullNames: list[str] | None = None
+ proteinShortNames: list[str] | None = None
+ stoichiometry: int | None = None
+ taxId: int | None = None
+ taxonomyLineage: list[str] | None = None
+ # uniprotAccession is isoform id (<uniprot_accession>-<isoform number>) when entry has multiple isoforms.
+ uniprotAccession: str | None = None
+ uniprotDescription: str | None = None
+ uniprotId: str | None = None
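The new fractionPlddt* fields answer the removed TODO comments: overall model confidence can now be judged from the summary alone. Below is a minimal, hypothetical helper (not part of protein-quest) that filters already-fetched summaries on those fields; the assumption that fractionPlddtConfident and fractionPlddtVeryHigh are disjoint residue fractions follows from the field names, not from this diff.

# Hypothetical helper, not part of the package: keep summaries whose models
# are mostly confident according to the new fractionPlddt* fields.
from collections.abc import Iterable


def mostly_confident(summaries: Iterable["EntrySummary"], min_fraction: float = 0.8) -> list["EntrySummary"]:
    # Assumption: the two fractions are disjoint pLDDT bands, so their sum is
    # the share of residues with pLDDT >= 70.
    return [s for s in summaries if s.fractionPlddtConfident + s.fractionPlddtVeryHigh >= min_fraction]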
@@ -7,8 +7,9 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Literal, cast, get_args

+ import aiofiles
+ from aiofiles.ospath import exists
  from aiohttp_retry import RetryClient
- from aiopath import AsyncPath
  from tqdm.asyncio import tqdm
  from yarl import URL

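The aiopath dependency is dropped in favour of aiofiles. A minimal equivalence sketch of the swap, using only the calls that appear later in this diff (aiofiles.ospath.exists and aiofiles.open); the file name is made up.

import asyncio

import aiofiles
from aiofiles.ospath import exists


async def read_bytes_if_exists(path: str) -> bytes | None:
    # Old style: `if await AsyncPath(path).exists(): return await AsyncPath(path).read_bytes()`
    if await exists(path):
        async with aiofiles.open(path, "rb") as f:
            return await f.read()
    return None


asyncio.run(read_bytes_if_exists("O60481.json"))  # file name is made up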
@@ -24,11 +25,12 @@ DownloadableFormat = Literal[
  "bcif",
  "cif",
  "pdb",
- "paeImage",
  "paeDoc",
  "amAnnotations",
  "amAnnotationsHg19",
  "amAnnotationsHg38",
+ "msaUrl",
+ "plddtDocUrl",
  ]
  """Types of formats that can be downloaded from the AlphaFold web service."""

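In other words, "paeImage" is no longer a valid download format, while "msaUrl" and "plddtDocUrl" are new (note that, unlike the older names, these two keep the Url suffix of the summary attribute). A small sketch of a `what` selection under this change, assuming DownloadableFormat is imported from the module shown in this diff:

# Accepted after 0.7.0 (format names taken from the Literal above):
what: set[DownloadableFormat] = {"summary", "cif", "paeDoc", "msaUrl", "plddtDocUrl"}

# No longer accepted: "paeImage" was removed, so files_to_download() below would
# raise ValueError because set(what) would not be a subset of downloadable_formats.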
@@ -43,22 +45,23 @@ def _camel_to_snake_case(name: str) -> str:

  @dataclass
  class AlphaFoldEntry:
- """AlphaFoldEntry represents a minimal single entry in the AlphaFold database.
+ """AlphaFold entry with summary object and optionally local files.

- See https://alphafold.ebi.ac.uk/api-docs for more details on the API and data structure.
+ See https://alphafold.ebi.ac.uk/api-docs for more details on the summary data structure.
  """

- uniprot_acc: str
- summary: EntrySummary | None
+ uniprot_accession: str
+ summary: EntrySummary
  summary_file: Path | None = None
  bcif_file: Path | None = None
  cif_file: Path | None = None
  pdb_file: Path | None = None
- pae_image_file: Path | None = None
  pae_doc_file: Path | None = None
  am_annotations_file: Path | None = None
  am_annotations_hg19_file: Path | None = None
  am_annotations_hg38_file: Path | None = None
+ msa_file: Path | None = None
+ plddt_doc_file: Path | None = None

  @classmethod
  def format2attr(cls, dl_format: DownloadableFormat) -> str:
@@ -120,25 +123,28 @@ async def fetch_summary(

  Returns:
  A list of EntrySummary objects representing the fetched summary.
+ When qualifier has multiple isoforms then multiple summaries are returned,
+ otherwise a list of a single summary is returned.
  """
  url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
- fn: AsyncPath | None = None
+ fn: Path | None = None
  if save_dir is not None:
- fn = AsyncPath(save_dir / f"{qualifier}.json")
- if await fn.exists():
+ fn = save_dir / f"{qualifier}.json"
+ if await exists(fn):
  logger.debug(f"File {fn} already exists. Skipping download from {url}.")
- raw_data = await fn.read_bytes()
+ async with aiofiles.open(fn, "rb") as f:
+ raw_data = await f.read()
  return converter.loads(raw_data, list[EntrySummary])
  cached_file = await cacher.copy_from_cache(Path(fn))
  if cached_file is not None:
  logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
- raw_data = await AsyncPath(cached_file).read_bytes()
+ async with aiofiles.open(cached_file, "rb") as f:
+ raw_data = await f.read()
  return converter.loads(raw_data, list[EntrySummary])
  async with semaphore, session.get(url) as response:
  response.raise_for_status()
  raw_data = await response.content.read()
  if fn is not None:
- # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
  await cacher.write_bytes(Path(fn), raw_data)
  return converter.loads(raw_data, list[EntrySummary])

@@ -148,7 +154,7 @@ async def fetch_summaries(
  save_dir: Path | None = None,
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
- ) -> AsyncGenerator[EntrySummary]:
+ ) -> AsyncGenerator[tuple[str, EntrySummary]]:
  semaphore = Semaphore(max_parallel_downloads)
  if save_dir is not None:
  save_dir.mkdir(parents=True, exist_ok=True)
@@ -159,9 +165,9 @@ async def fetch_summaries(
  summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
  *tasks, desc="Fetching Alphafold summaries"
  )
- for summaries in summaries_per_qualifier:
+ for qualifier, summaries in zip(qualifiers, summaries_per_qualifier, strict=True):
  for summary in summaries:
- yield summary
+ yield qualifier, summary


  async def fetch_many_async(
@@ -171,17 +177,20 @@ async def fetch_many_async(
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
+ all_isoforms: bool = False,
  ) -> AsyncGenerator[AlphaFoldEntry]:
  """Asynchronously fetches summaries and files from
  [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).

  Args:
- uniprot_accessions: A set of Uniprot acessions to fetch.
+ uniprot_accessions: A set of Uniprot accessions to fetch.
  save_dir: The directory to save the fetched files to.
  what: A set of formats to download.
  max_parallel_downloads: The maximum number of parallel downloads.
  cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
  gzip_files: Whether to gzip the downloaded files.
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
+ When False then yields only the canonical sequence of uniprot entry.

  Yields:
  A dataclass containing the summary, pdb file, and pae file.
@@ -193,8 +202,10 @@ async def fetch_many_async(
  async for s in fetch_summaries(
  uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
  )
+ # Filter out isoforms if all_isoforms is False
+ # O60481 is canonical and O60481-2 is isoform, so we skip the isoform
+ if all_isoforms or s[0] == s[1].uniprotAccession
  ]
-
  files = files_to_download(what, summaries, gzip_files)

  await retrieve_files(
@@ -205,16 +216,16 @@ async def fetch_many_async(
  cacher=cacher,
  gzip_files=gzip_files,
  )
+
  gzext = ".gz" if gzip_files else ""
- for summary in summaries:
+ for uniprot_accession, summary in summaries:
  yield AlphaFoldEntry(
- uniprot_acc=summary.uniprotAccession,
+ uniprot_accession=uniprot_accession,
  summary=summary,
- summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
+ summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
  bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
  cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
  pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
- pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
  pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
  am_annotations_file=(
  save_dir / (summary.amAnnotationsUrl.name + gzext)
@@ -231,11 +242,15 @@ async def fetch_many_async(
  if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
  else None
  ),
+ msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msaUrl" in what and summary.msaUrl else None),
+ plddt_doc_file=(
+ save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDocUrl" in what and summary.plddtDocUrl else None
+ ),
  )


  def files_to_download(
- what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
+ what: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
  ) -> set[tuple[URL, str]]:
  if not (set(what) <= downloadable_formats):
  msg = (
@@ -245,14 +260,14 @@ def files_to_download(
  raise ValueError(msg)

  url_filename_pairs: set[tuple[URL, str]] = set()
- for summary in summaries:
+ for _, summary in summaries:
  for fmt in what:
  if fmt == "summary":
  # summary is handled already in fetch_summary
  continue
  url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
  if url is None:
- logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
+ logger.warning(f"Summary {summary.modelEntityId} does not have a URL for format '{fmt}'. Skipping.")
  continue
  fn = url.name + (".gz" if gzip_files else "")
  url_filename_pair = (url, fn)
@@ -267,6 +282,7 @@ def fetch_many(
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
+ all_isoforms: bool = False,
  ) -> list[AlphaFoldEntry]:
  """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.

@@ -277,6 +293,8 @@ def fetch_many(
  max_parallel_downloads: The maximum number of parallel downloads.
  cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
  gzip_files: Whether to gzip the downloaded files.
+ all_isoforms: Whether to return all isoforms of each uniprot entry.
+ When False then returns only the canonical sequence of uniprot entry.

  Returns:
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -286,7 +304,13 @@ def fetch_many(
  return [
  entry
  async for entry in fetch_many_async(
- ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher, gzip_files=gzip_files
+ ids,
+ save_dir,
+ what,
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ all_isoforms=all_isoforms,
  )
  ]

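Putting the new pieces together, a hedged usage sketch of the synchronous wrapper after this release. The import path, accession, and save directory are assumptions; the parameters and format names come from the hunks above.

from pathlib import Path

# Import path is a guess based on the package name; adjust to the real module.
from protein_quest.alphafold.fetch import fetch_many

entries = fetch_many(
    {"O60481"},
    Path("downloads"),
    {"summary", "cif", "paeDoc", "plddtDocUrl"},
    gzip_files=False,
    all_isoforms=False,  # keep only the canonical entry per accession
)
for entry in entries:
    print(entry.uniprot_accession, entry.cif_file, entry.plddt_doc_file)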
@@ -304,13 +328,12 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
  An AlphaFoldEntry instance with paths relative to the session directory.
  """
  return AlphaFoldEntry(
- uniprot_acc=entry.uniprot_acc,
+ uniprot_accession=entry.uniprot_accession,
  summary=entry.summary,
  summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
  bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
  cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
  pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
- pae_image_file=entry.pae_image_file.relative_to(session_dir) if entry.pae_image_file else None,
  pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
  am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
  am_annotations_hg19_file=(
@@ -319,4 +342,6 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
  am_annotations_hg38_file=(
  entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
  ),
+ msa_file=entry.msa_file.relative_to(session_dir) if entry.msa_file else None,
+ plddt_doc_file=entry.plddt_doc_file.relative_to(session_dir) if entry.plddt_doc_file else None,
  )
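relative_to now carries the new msa_file and plddt_doc_file fields through as well. A short continuation of the fetch_many sketch above, assuming the downloaded files live under the directory that was passed as save_dir and that relative_to sits in the same (guessed) module:

from protein_quest.alphafold.fetch import relative_to  # import path is a guess

session_dir = Path("downloads")
for entry in entries:
    rel = relative_to(entry, session_dir)
    # All set paths, including msa_file and plddt_doc_file, are now relative
    # to session_dir; fields that are None stay None.
    print(rel.uniprot_accession, rel.cif_file, rel.plddt_doc_file)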