protein-quest 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
- __version__ = "0.6.0"
1
+ __version__ = "0.8.0"
2
2
  """The version of the package."""
@@ -8,33 +8,57 @@ from yarl import URL
8
8
  class EntrySummary:
9
9
  """Dataclass representing a summary of an AlphaFold entry.
10
10
 
11
- Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
11
+ Modelled after NewEntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
12
+ with URL types and without deprecated fields.
12
13
  """
13
14
 
14
- entryId: str
15
- uniprotAccession: str
16
- uniprotId: str
17
- uniprotDescription: str
18
- taxId: int
19
- organismScientificName: str
20
- uniprotStart: int
21
- uniprotEnd: int
22
- uniprotSequence: str
23
- modelCreatedDate: str
24
- latestVersion: int
25
15
  allVersions: list[int]
26
16
  bcifUrl: URL
27
17
  cifUrl: URL
28
- pdbUrl: URL
29
- paeImageUrl: URL
18
+ entityType: str
19
+ fractionPlddtConfident: float
20
+ fractionPlddtLow: float
21
+ fractionPlddtVeryHigh: float
22
+ fractionPlddtVeryLow: float
23
+ globalMetricValue: float
24
+ isUniProt: bool
25
+ latestVersion: int
26
+ modelCreatedDate: str
27
+ modelEntityId: str
30
28
  paeDocUrl: URL
31
- gene: str | None = None
32
- sequenceChecksum: str | None = None
33
- sequenceVersionDate: str | None = None
34
- amAnnotationsUrl: URL | None = None
29
+ pdbUrl: URL
30
+ providerId: str
31
+ sequence: str
32
+ sequenceChecksum: str
33
+ sequenceEnd: int
34
+ sequenceStart: int
35
+ sequenceVersionDate: str
36
+ toolUsed: str
37
+ alternativeNames: list[str] | None = None
35
38
  amAnnotationsHg19Url: URL | None = None
36
39
  amAnnotationsHg38Url: URL | None = None
37
- isReviewed: bool | None = None
38
- isReferenceProteome: bool | None = None
39
- # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
40
- # TODO like fractionPlddt* fields which can be used in filter_files_on_confidence()
40
+ amAnnotationsUrl: URL | None = None
41
+ catalyticActivities: list[str] | None = None
42
+ complexName: str | None = None
43
+ functions: list[str] | None = None
44
+ gene: str | None = None
45
+ geneSynonyms: list[str] | None = None
46
+ ipSAE: float | None = None
47
+ ipTM: float | None = None
48
+ isUniProtReferenceProteome: bool | None = None
49
+ isUniProtReviewed: bool | None = None
50
+ keywords: list[str] | None = None
51
+ msaUrl: URL | None = None
52
+ organismCommonNames: list[str] | None = None
53
+ organismScientificName: str | None = None
54
+ organismSynonyms: list[str] | None = None
55
+ plddtDocUrl: URL | None = None
56
+ proteinFullNames: list[str] | None = None
57
+ proteinShortNames: list[str] | None = None
58
+ stoichiometry: int | None = None
59
+ taxId: int | None = None
60
+ taxonomyLineage: list[str] | None = None
61
+ # uniprotAccession is isoform id (<uniprot_accession>-<isoform number>) when entry has multiple isoforms.
62
+ uniprotAccession: str | None = None
63
+ uniprotDescription: str | None = None
64
+ uniprotId: str | None = None
@@ -7,8 +7,9 @@ from dataclasses import dataclass
7
7
  from pathlib import Path
8
8
  from typing import Literal, cast, get_args
9
9
 
10
+ import aiofiles
11
+ from aiofiles.ospath import exists
10
12
  from aiohttp_retry import RetryClient
11
- from aiopath import AsyncPath
12
13
  from tqdm.asyncio import tqdm
13
14
  from yarl import URL
14
15
 
@@ -24,17 +25,23 @@ DownloadableFormat = Literal[
24
25
  "bcif",
25
26
  "cif",
26
27
  "pdb",
27
- "paeImage",
28
28
  "paeDoc",
29
29
  "amAnnotations",
30
30
  "amAnnotationsHg19",
31
31
  "amAnnotationsHg38",
32
+ "msa",
33
+ "plddtDoc",
32
34
  ]
33
35
  """Types of formats that can be downloaded from the AlphaFold web service."""
34
36
 
35
37
  downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
36
38
  """Set of formats that can be downloaded from the AlphaFold web service."""
37
39
 
40
+ UrlFileNamePair = tuple[URL, str]
41
+ """A tuple of a URL and a filename."""
42
+ UrlFileNamePairsOfFormats = dict[DownloadableFormat, UrlFileNamePair]
43
+ """A mapping of DownloadableFormat to UrlFileNamePair."""
44
+
38
45
 
39
46
  def _camel_to_snake_case(name: str) -> str:
40
47
  """Convert a camelCase string to snake_case."""
@@ -43,22 +50,23 @@ def _camel_to_snake_case(name: str) -> str:
43
50
 
44
51
  @dataclass
45
52
  class AlphaFoldEntry:
46
- """AlphaFoldEntry represents a minimal single entry in the AlphaFold database.
53
+ """AlphaFold entry with summary object and optionally local files.
47
54
 
48
- See https://alphafold.ebi.ac.uk/api-docs for more details on the API and data structure.
55
+ See https://alphafold.ebi.ac.uk/api-docs for more details on the summary data structure.
49
56
  """
50
57
 
51
- uniprot_acc: str
52
- summary: EntrySummary | None
58
+ uniprot_accession: str
59
+ summary: EntrySummary | None = None
53
60
  summary_file: Path | None = None
54
61
  bcif_file: Path | None = None
55
62
  cif_file: Path | None = None
56
63
  pdb_file: Path | None = None
57
- pae_image_file: Path | None = None
58
64
  pae_doc_file: Path | None = None
59
65
  am_annotations_file: Path | None = None
60
66
  am_annotations_hg19_file: Path | None = None
61
67
  am_annotations_hg38_file: Path | None = None
68
+ msa_file: Path | None = None
69
+ plddt_doc_file: Path | None = None
62
70
 
63
71
  @classmethod
64
72
  def format2attr(cls, dl_format: DownloadableFormat) -> str:
@@ -102,6 +110,35 @@ class AlphaFoldEntry:
102
110
  """
103
111
  return sum(1 for attr in vars(self) if attr.endswith("_file") and getattr(self, attr) is not None)
104
112
 
113
+ def relative_to(self, session_dir: Path) -> "AlphaFoldEntry":
114
+ """Convert paths in an AlphaFoldEntry to be relative to the session directory.
115
+
116
+ Args:
117
+ entry: An AlphaFoldEntry instance with absolute paths.
118
+ session_dir: The session directory to which the paths should be made relative.
119
+
120
+ Returns:
121
+ An AlphaFoldEntry instance with paths relative to the session directory.
122
+ """
123
+ return AlphaFoldEntry(
124
+ uniprot_accession=self.uniprot_accession,
125
+ summary=self.summary,
126
+ summary_file=self.summary_file.relative_to(session_dir) if self.summary_file else None,
127
+ bcif_file=self.bcif_file.relative_to(session_dir) if self.bcif_file else None,
128
+ cif_file=self.cif_file.relative_to(session_dir) if self.cif_file else None,
129
+ pdb_file=self.pdb_file.relative_to(session_dir) if self.pdb_file else None,
130
+ pae_doc_file=self.pae_doc_file.relative_to(session_dir) if self.pae_doc_file else None,
131
+ am_annotations_file=self.am_annotations_file.relative_to(session_dir) if self.am_annotations_file else None,
132
+ am_annotations_hg19_file=(
133
+ self.am_annotations_hg19_file.relative_to(session_dir) if self.am_annotations_hg19_file else None
134
+ ),
135
+ am_annotations_hg38_file=(
136
+ self.am_annotations_hg38_file.relative_to(session_dir) if self.am_annotations_hg38_file else None
137
+ ),
138
+ msa_file=self.msa_file.relative_to(session_dir) if self.msa_file else None,
139
+ plddt_doc_file=self.plddt_doc_file.relative_to(session_dir) if self.plddt_doc_file else None,
140
+ )
141
+
105
142
 
106
143
  async def fetch_summary(
107
144
  qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
@@ -120,25 +157,28 @@ async def fetch_summary(
120
157
 
121
158
  Returns:
122
159
  A list of EntrySummary objects representing the fetched summary.
160
+ When qualifier has multiple isoforms then multiple summaries are returned,
161
+ otherwise a list of a single summary is returned.
123
162
  """
124
163
  url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
125
- fn: AsyncPath | None = None
164
+ fn: Path | None = None
126
165
  if save_dir is not None:
127
- fn = AsyncPath(save_dir / f"{qualifier}.json")
128
- if await fn.exists():
166
+ fn = save_dir / f"{qualifier}.json"
167
+ if await exists(fn):
129
168
  logger.debug(f"File {fn} already exists. Skipping download from {url}.")
130
- raw_data = await fn.read_bytes()
169
+ async with aiofiles.open(fn, "rb") as f:
170
+ raw_data = await f.read()
131
171
  return converter.loads(raw_data, list[EntrySummary])
132
172
  cached_file = await cacher.copy_from_cache(Path(fn))
133
173
  if cached_file is not None:
134
174
  logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
135
- raw_data = await AsyncPath(cached_file).read_bytes()
175
+ async with aiofiles.open(cached_file, "rb") as f:
176
+ raw_data = await f.read()
136
177
  return converter.loads(raw_data, list[EntrySummary])
137
178
  async with semaphore, session.get(url) as response:
138
179
  response.raise_for_status()
139
180
  raw_data = await response.content.read()
140
181
  if fn is not None:
141
- # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
142
182
  await cacher.write_bytes(Path(fn), raw_data)
143
183
  return converter.loads(raw_data, list[EntrySummary])
144
184
 
@@ -148,7 +188,7 @@ async def fetch_summaries(
148
188
  save_dir: Path | None = None,
149
189
  max_parallel_downloads: int = 5,
150
190
  cacher: Cacher | None = None,
151
- ) -> AsyncGenerator[EntrySummary]:
191
+ ) -> AsyncGenerator[tuple[str, EntrySummary]]:
152
192
  semaphore = Semaphore(max_parallel_downloads)
153
193
  if save_dir is not None:
154
194
  save_dir.mkdir(parents=True, exist_ok=True)
@@ -159,43 +199,32 @@ async def fetch_summaries(
159
199
  summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
160
200
  *tasks, desc="Fetching Alphafold summaries"
161
201
  )
162
- for summaries in summaries_per_qualifier:
202
+ for qualifier, summaries in zip(qualifiers, summaries_per_qualifier, strict=True):
163
203
  for summary in summaries:
164
- yield summary
204
+ yield qualifier, summary
165
205
 
166
206
 
167
- async def fetch_many_async(
207
+ async def _fetch_many_async_with_summary(
168
208
  uniprot_accessions: Iterable[str],
169
209
  save_dir: Path,
170
- what: set[DownloadableFormat],
210
+ formats: set[DownloadableFormat],
171
211
  max_parallel_downloads: int = 5,
172
212
  cacher: Cacher | None = None,
173
213
  gzip_files: bool = False,
214
+ all_isoforms: bool = False,
174
215
  ) -> AsyncGenerator[AlphaFoldEntry]:
175
- """Asynchronously fetches summaries and files from
176
- [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
177
-
178
- Args:
179
- uniprot_accessions: A set of Uniprot acessions to fetch.
180
- save_dir: The directory to save the fetched files to.
181
- what: A set of formats to download.
182
- max_parallel_downloads: The maximum number of parallel downloads.
183
- cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
184
- gzip_files: Whether to gzip the downloaded files.
185
-
186
- Yields:
187
- A dataclass containing the summary, pdb file, and pae file.
188
- """
189
- save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
216
+ save_dir_for_summaries = save_dir if "summary" in formats else None
190
217
 
191
218
  summaries = [
192
219
  s
193
220
  async for s in fetch_summaries(
194
221
  uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
195
222
  )
223
+ # Filter out isoforms if all_isoforms is False
224
+ # O60481 is canonical and O60481-2 is isoform, so we skip the isoform
225
+ if all_isoforms or s[0] == s[1].uniprotAccession
196
226
  ]
197
-
198
- files = files_to_download(what, summaries, gzip_files)
227
+ files = files_to_download(formats, summaries, gzip_files)
199
228
 
200
229
  await retrieve_files(
201
230
  files,
@@ -205,54 +234,58 @@ async def fetch_many_async(
205
234
  cacher=cacher,
206
235
  gzip_files=gzip_files,
207
236
  )
237
+
208
238
  gzext = ".gz" if gzip_files else ""
209
- for summary in summaries:
239
+ for uniprot_accession, summary in summaries:
210
240
  yield AlphaFoldEntry(
211
- uniprot_acc=summary.uniprotAccession,
241
+ uniprot_accession=uniprot_accession,
212
242
  summary=summary,
213
- summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
214
- bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
215
- cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
216
- pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
217
- pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
218
- pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
243
+ summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
244
+ bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in formats else None,
245
+ cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in formats else None,
246
+ pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in formats else None,
247
+ pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in formats else None,
219
248
  am_annotations_file=(
220
249
  save_dir / (summary.amAnnotationsUrl.name + gzext)
221
- if "amAnnotations" in what and summary.amAnnotationsUrl
250
+ if "amAnnotations" in formats and summary.amAnnotationsUrl
222
251
  else None
223
252
  ),
224
253
  am_annotations_hg19_file=(
225
254
  save_dir / (summary.amAnnotationsHg19Url.name + gzext)
226
- if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
255
+ if "amAnnotationsHg19" in formats and summary.amAnnotationsHg19Url
227
256
  else None
228
257
  ),
229
258
  am_annotations_hg38_file=(
230
259
  save_dir / (summary.amAnnotationsHg38Url.name + gzext)
231
- if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
260
+ if "amAnnotationsHg38" in formats and summary.amAnnotationsHg38Url
232
261
  else None
233
262
  ),
263
+ msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msa" in formats and summary.msaUrl else None),
264
+ plddt_doc_file=(
265
+ save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDoc" in formats and summary.plddtDocUrl else None
266
+ ),
234
267
  )
235
268
 
236
269
 
237
270
  def files_to_download(
238
- what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
239
- ) -> set[tuple[URL, str]]:
240
- if not (set(what) <= downloadable_formats):
271
+ formats: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
272
+ ) -> set[UrlFileNamePair]:
273
+ if not (set(formats) <= downloadable_formats):
241
274
  msg = (
242
- f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
275
+ f"Invalid format(s) specified: {set(formats) - downloadable_formats}. "
243
276
  f"Valid formats are: {downloadable_formats}"
244
277
  )
245
278
  raise ValueError(msg)
246
279
 
247
- url_filename_pairs: set[tuple[URL, str]] = set()
248
- for summary in summaries:
249
- for fmt in what:
280
+ url_filename_pairs: set[UrlFileNamePair] = set()
281
+ for _, summary in summaries:
282
+ for fmt in formats:
250
283
  if fmt == "summary":
251
284
  # summary is handled already in fetch_summary
252
285
  continue
253
286
  url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
254
287
  if url is None:
255
- logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
288
+ logger.warning(f"Summary {summary.modelEntityId} does not have a URL for format '{fmt}'. Skipping.")
256
289
  continue
257
290
  fn = url.name + (".gz" if gzip_files else "")
258
291
  url_filename_pair = (url, fn)
@@ -260,23 +293,224 @@ def files_to_download(
260
293
  return url_filename_pairs
261
294
 
262
295
 
296
+ async def fetch_alphafold_db_version() -> str:
297
+ """Fetch the current version of the AlphaFold database.
298
+
299
+ Returns:
300
+ The current version of the AlphaFold database as a string. For example: "6".
301
+ """
302
+ url = "https://ftp.ebi.ac.uk/pub/databases/alphafold/accession_ids.csv"
303
+ headers = {"Range": "bytes=0-200"}
304
+ logger.debug(f"Detecting AlphaFold DB version from head of {url}")
305
+ async with friendly_session() as session, session.get(url, headers=headers) as response:
306
+ response.raise_for_status()
307
+ raw = await response.content.read(200)
308
+ text = raw.decode("utf-8")
309
+ first_line = text.splitlines()[1]
310
+ version = first_line.split(",")[-1]
311
+ logger.debug(f"Found current AlphaFold DB version is '{version}'")
312
+ return version
313
+
314
+
315
+ def _files_for_alphafold_entry(
316
+ uniprot_accession: str,
317
+ formats: set[DownloadableFormat],
318
+ db_version: str,
319
+ gzip_files: bool,
320
+ ) -> UrlFileNamePairsOfFormats:
321
+ templates: dict[DownloadableFormat, URL] = {
322
+ "bcif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.bcif"),
323
+ "cif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.cif"),
324
+ "pdb": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.pdb"),
325
+ "paeDoc": URL(
326
+ f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-predicted_aligned_error_v{db_version}.json"
327
+ ),
328
+ "amAnnotations": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-aa-substitutions.csv"),
329
+ "amAnnotationsHg19": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg19.csv"),
330
+ "amAnnotationsHg38": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg38.csv"),
331
+ "msa": URL(f"https://alphafold.ebi.ac.uk/files/msa/AF-{uniprot_accession}-F1-msa_v{db_version}.a3m"),
332
+ "plddtDoc": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-confidence_v{db_version}.json"),
333
+ }
334
+ url_filename_pairs = {}
335
+ for fmt in formats:
336
+ if fmt == "summary":
337
+ # Summaries are downloaded separately because they come from the API instead of static files
338
+ continue
339
+ if fmt not in templates:
340
+ logger.warning(f"No URL template found for format '{fmt}'. Skipping.")
341
+ continue
342
+ url = templates[cast("DownloadableFormat", fmt)]
343
+ fn = url.name
344
+ if gzip_files:
345
+ fn += ".gz"
346
+ url_filename_pair = (url, fn)
347
+ url_filename_pairs[fmt] = url_filename_pair
348
+ return url_filename_pairs
349
+
350
+
351
+ def files_for_alphafold_entries(
352
+ uniprot_accessions: Iterable[str],
353
+ formats: set[DownloadableFormat],
354
+ db_version: str,
355
+ gzip_files: bool,
356
+ ) -> dict[str, UrlFileNamePairsOfFormats]:
357
+ """Get the files to download for multiple AlphaFold entries.
358
+
359
+ Args:
360
+ uniprot_accessions: A set of Uniprot accessions.
361
+ formats: A set of formats to download.
362
+ db_version: The version of the AlphaFold database to use.
363
+ gzip_files: Whether to download gzipped files. Otherwise downloads uncompressed files.
364
+
365
+ Returns:
366
+ A mapping of Uniprot accession to a mapping of DownloadableFormat to UrlFileNamePair.
367
+ """
368
+ return {
369
+ uniprot_accession: _files_for_alphafold_entry(
370
+ uniprot_accession, formats=formats, db_version=db_version, gzip_files=gzip_files
371
+ )
372
+ for uniprot_accession in uniprot_accessions
373
+ }
374
+
375
+
376
+ async def _fetch_many_async_without_summary(
377
+ uniprot_accessions: Iterable[str],
378
+ save_dir: Path,
379
+ formats: set[DownloadableFormat],
380
+ db_version: str | None = None,
381
+ max_parallel_downloads: int = 5,
382
+ cacher: Cacher | None = None,
383
+ gzip_files: bool = False,
384
+ ) -> AsyncGenerator[AlphaFoldEntry]:
385
+ if db_version is None:
386
+ db_version = await fetch_alphafold_db_version()
387
+ nested_files = files_for_alphafold_entries(
388
+ uniprot_accessions, formats=formats, db_version=db_version, gzip_files=gzip_files
389
+ )
390
+ files: set[UrlFileNamePair] = set()
391
+ for uniprot_accession in uniprot_accessions:
392
+ files.update(nested_files[uniprot_accession].values())
393
+
394
+ retrieved_files = await retrieve_files(
395
+ files,
396
+ save_dir,
397
+ desc="Downloading AlphaFold files",
398
+ max_parallel_downloads=max_parallel_downloads,
399
+ cacher=cacher,
400
+ gzip_files=gzip_files,
401
+ raise_for_not_found=False,
402
+ )
403
+
404
+ retrieved_files_set = set(retrieved_files)
405
+ for uniprot_accession in uniprot_accessions:
406
+ entry = AlphaFoldEntry(
407
+ uniprot_accession=uniprot_accession,
408
+ )
409
+
410
+ for af_format, url_filename_pair in nested_files[uniprot_accession].items():
411
+ _, filename = url_filename_pair
412
+ filepath = save_dir / filename
413
+ if filepath in retrieved_files_set:
414
+ attr = AlphaFoldEntry.format2attr(af_format)
415
+ setattr(entry, attr, filepath)
416
+ # else: File was not found (404) during download, so we leave the attribute as None
417
+
418
+ yield entry
419
+
420
+
421
+ def fetch_many_async(
422
+ uniprot_accessions: Iterable[str],
423
+ save_dir: Path,
424
+ formats: set[DownloadableFormat],
425
+ db_version: str | None = None,
426
+ max_parallel_downloads: int = 5,
427
+ cacher: Cacher | None = None,
428
+ gzip_files: bool = False,
429
+ all_isoforms: bool = False,
430
+ ) -> AsyncGenerator[AlphaFoldEntry]:
431
+ """Asynchronously fetches summaries and/or files from
432
+ [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
433
+
434
+ Args:
435
+ uniprot_accessions: A set of Uniprot accessions to fetch.
436
+ save_dir: The directory to save the fetched files to.
437
+ formats: A set of formats to download.
438
+ If `summary` is in the set then summaries will be fetched using the API endpoint
439
+ and later the other files will be downloaded using static file URLs.
440
+ If `summary` is not in the set then all files will be downloaded using static file
441
+ URLs only.
442
+ db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
443
+ max_parallel_downloads: The maximum number of parallel downloads.
444
+ cacher: A cacher to use for caching the fetched files.
445
+ gzip_files: Whether to gzip the downloaded files.
446
+ Summaries are never gzipped.
447
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
448
+ When False then yields only the canonical sequence per uniprot entry.
449
+
450
+ Yields:
451
+ An AlphaFoldEntry holding the summary (only when 'summary' is in 'formats') and the paths of the downloaded files.
452
+
453
+ Raises:
454
+ ValueError: If 'formats' set is empty.
455
+ ValueError: If all_isoforms is True and 'summary' is not in 'formats' set.
456
+ """
457
+ if len(formats) == 0:
458
+ msg = "At least one format must be specified. The 'formats' argument is empty."
459
+ raise ValueError(msg)
460
+ if "summary" in formats:
461
+ if db_version is not None:
462
+ logger.warning("db_version is ignored when 'summary' is in 'formats' set. Always uses latest version.")
463
+ return _fetch_many_async_with_summary(
464
+ uniprot_accessions,
465
+ save_dir,
466
+ formats,
467
+ max_parallel_downloads=max_parallel_downloads,
468
+ cacher=cacher,
469
+ gzip_files=gzip_files,
470
+ all_isoforms=all_isoforms,
471
+ )
472
+ if all_isoforms:
473
+ msg = "Cannot fetch all isoforms when 'summary' is not in 'formats' set."
474
+ raise ValueError(msg)
475
+ return _fetch_many_async_without_summary(
476
+ uniprot_accessions,
477
+ save_dir,
478
+ formats,
479
+ db_version=db_version,
480
+ max_parallel_downloads=max_parallel_downloads,
481
+ cacher=cacher,
482
+ gzip_files=gzip_files,
483
+ )
484
+
485
+
263
486
  def fetch_many(
264
- ids: Iterable[str],
487
+ uniprot_accessions: Iterable[str],
265
488
  save_dir: Path,
266
- what: set[DownloadableFormat],
489
+ formats: set[DownloadableFormat],
490
+ db_version: str | None = None,
267
491
  max_parallel_downloads: int = 5,
268
492
  cacher: Cacher | None = None,
269
493
  gzip_files: bool = False,
494
+ all_isoforms: bool = False,
270
495
  ) -> list[AlphaFoldEntry]:
271
- """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
496
+ """Synchronously fetches summaries and/or files like cif from AlphaFold Protein Structure Database.
272
497
 
273
498
  Args:
274
- ids: A set of Uniprot IDs to fetch.
499
+ uniprot_accessions: A set of Uniprot accessions to fetch.
275
500
  save_dir: The directory to save the fetched files to.
276
- what: A set of formats to download.
501
+ formats: A set of formats to download.
502
+ If `summary` is in the set then summaries will be fetched using the API endpoint
503
+ and later the other files will be downloaded using static file URLs.
504
+ If `summary` is not in the set then all files will be downloaded using static file
505
+ URLs only.
506
+ Excluding 'summary' is much faster as it avoids slow API calls.
507
+ db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
277
508
  max_parallel_downloads: The maximum number of parallel downloads.
278
- cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
509
+ cacher: A cacher to use for caching the fetched files.
279
510
  gzip_files: Whether to gzip the downloaded files.
511
+ Summaries are never gzipped.
512
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
513
+ When False then yields only the canonical sequence per uniprot entry.
280
514
 
281
515
  Returns:
282
516
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -286,37 +520,15 @@ def fetch_many(
286
520
  return [
287
521
  entry
288
522
  async for entry in fetch_many_async(
289
- ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher, gzip_files=gzip_files
523
+ uniprot_accessions,
524
+ save_dir,
525
+ formats,
526
+ db_version=db_version,
527
+ max_parallel_downloads=max_parallel_downloads,
528
+ cacher=cacher,
529
+ gzip_files=gzip_files,
530
+ all_isoforms=all_isoforms,
290
531
  )
291
532
  ]
292
533
 
293
534
  return run_async(gather_entries())
294
-
295
-
296
- def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
297
- """Convert paths in an AlphaFoldEntry to be relative to the session directory.
298
-
299
- Args:
300
- entry: An AlphaFoldEntry instance with absolute paths.
301
- session_dir: The session directory to which the paths should be made relative.
302
-
303
- Returns:
304
- An AlphaFoldEntry instance with paths relative to the session directory.
305
- """
306
- return AlphaFoldEntry(
307
- uniprot_acc=entry.uniprot_acc,
308
- summary=entry.summary,
309
- summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
310
- bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
311
- cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
312
- pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
313
- pae_image_file=entry.pae_image_file.relative_to(session_dir) if entry.pae_image_file else None,
314
- pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
315
- am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
316
- am_annotations_hg19_file=(
317
- entry.am_annotations_hg19_file.relative_to(session_dir) if entry.am_annotations_hg19_file else None
318
- ),
319
- am_annotations_hg38_file=(
320
- entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
321
- ),
322
- )