protein-quest 0.7.0-py3-none-any.whl → 0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
protein_quest/__version__.py CHANGED
@@ -1,2 +1,2 @@
- __version__ = "0.7.0"
+ __version__ = "0.8.0"
  """The version of the package."""
protein_quest/alphafold/fetch.py CHANGED
@@ -29,14 +29,19 @@ DownloadableFormat = Literal[
  "amAnnotations",
  "amAnnotationsHg19",
  "amAnnotationsHg38",
- "msaUrl",
- "plddtDocUrl",
+ "msa",
+ "plddtDoc",
  ]
  """Types of formats that can be downloaded from the AlphaFold web service."""

  downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
  """Set of formats that can be downloaded from the AlphaFold web service."""

+ UrlFileNamePair = tuple[URL, str]
+ """A tuple of a URL and a filename."""
+ UrlFileNamePairsOfFormats = dict[DownloadableFormat, UrlFileNamePair]
+ """A mapping of DownloadableFormat to UrlFileNamePair."""
+

  def _camel_to_snake_case(name: str) -> str:
  """Convert a camelCase string to snake_case."""
@@ -51,7 +56,7 @@ class AlphaFoldEntry:
  """

  uniprot_accession: str
- summary: EntrySummary
+ summary: EntrySummary | None = None
  summary_file: Path | None = None
  bcif_file: Path | None = None
  cif_file: Path | None = None
@@ -105,6 +110,35 @@ class AlphaFoldEntry:
  """
  return sum(1 for attr in vars(self) if attr.endswith("_file") and getattr(self, attr) is not None)

+ def relative_to(self, session_dir: Path) -> "AlphaFoldEntry":
+ """Convert paths in an AlphaFoldEntry to be relative to the session directory.
+
+ Args:
+ entry: An AlphaFoldEntry instance with absolute paths.
+ session_dir: The session directory to which the paths should be made relative.
+
+ Returns:
+ An AlphaFoldEntry instance with paths relative to the session directory.
+ """
+ return AlphaFoldEntry(
+ uniprot_accession=self.uniprot_accession,
+ summary=self.summary,
+ summary_file=self.summary_file.relative_to(session_dir) if self.summary_file else None,
+ bcif_file=self.bcif_file.relative_to(session_dir) if self.bcif_file else None,
+ cif_file=self.cif_file.relative_to(session_dir) if self.cif_file else None,
+ pdb_file=self.pdb_file.relative_to(session_dir) if self.pdb_file else None,
+ pae_doc_file=self.pae_doc_file.relative_to(session_dir) if self.pae_doc_file else None,
+ am_annotations_file=self.am_annotations_file.relative_to(session_dir) if self.am_annotations_file else None,
+ am_annotations_hg19_file=(
+ self.am_annotations_hg19_file.relative_to(session_dir) if self.am_annotations_hg19_file else None
+ ),
+ am_annotations_hg38_file=(
+ self.am_annotations_hg38_file.relative_to(session_dir) if self.am_annotations_hg38_file else None
+ ),
+ msa_file=self.msa_file.relative_to(session_dir) if self.msa_file else None,
+ plddt_doc_file=self.plddt_doc_file.relative_to(session_dir) if self.plddt_doc_file else None,
+ )
+

  async def fetch_summary(
  qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
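
The module-level `relative_to` helper (removed at the end of this file, see below) became a method on the dataclass, and `summary` is now optional, so an entry can exist without an API summary. A small sketch under those assumptions; the accession and filename are illustrative:

```python
from pathlib import Path
from protein_quest.alphafold.fetch import AlphaFoldEntry

session_dir = Path("downloads-af")
entry = AlphaFoldEntry(
    uniprot_accession="P05067",  # illustrative accession
    cif_file=session_dir / "AF-P05067-F1-model_v6.cif",
)
# 0.7.0: relative_to(entry, session_dir); 0.8.0: a method on the entry
rel = entry.relative_to(session_dir)
assert rel.cif_file == Path("AF-P05067-F1-model_v6.cif")
```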
@@ -170,32 +204,16 @@ async def fetch_summaries(
  yield qualifier, summary


- async def fetch_many_async(
+ async def _fetch_many_async_with_summary(
  uniprot_accessions: Iterable[str],
  save_dir: Path,
- what: set[DownloadableFormat],
+ formats: set[DownloadableFormat],
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
  all_isoforms: bool = False,
  ) -> AsyncGenerator[AlphaFoldEntry]:
- """Asynchronously fetches summaries and files from
- [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
-
- Args:
- uniprot_accessions: A set of Uniprot accessions to fetch.
- save_dir: The directory to save the fetched files to.
- what: A set of formats to download.
- max_parallel_downloads: The maximum number of parallel downloads.
- cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
- gzip_files: Whether to gzip the downloaded files.
- all_isoforms: Whether to yield all isoforms of each uniprot entry.
- When False then yields only the canonical sequence of uniprot entry.
-
- Yields:
- A dataclass containing the summary, pdb file, and pae file.
- """
- save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+ save_dir_for_summaries = save_dir if "summary" in formats else None

  summaries = [
  s
@@ -206,7 +224,7 @@ async def fetch_many_async(
  # O60481 is canonical and O60481-2 is isoform, so we skip the isoform
  if all_isoforms or s[0] == s[1].uniprotAccession
  ]
- files = files_to_download(what, summaries, gzip_files)
+ files = files_to_download(formats, summaries, gzip_files)

  await retrieve_files(
  files,
@@ -223,45 +241,45 @@ async def fetch_many_async(
  uniprot_accession=uniprot_accession,
  summary=summary,
  summary_file=save_dir / f"{uniprot_accession}.json" if save_dir_for_summaries is not None else None,
- bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
- cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
- pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
- pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
+ bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in formats else None,
+ cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in formats else None,
+ pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in formats else None,
+ pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in formats else None,
  am_annotations_file=(
  save_dir / (summary.amAnnotationsUrl.name + gzext)
- if "amAnnotations" in what and summary.amAnnotationsUrl
+ if "amAnnotations" in formats and summary.amAnnotationsUrl
  else None
  ),
  am_annotations_hg19_file=(
  save_dir / (summary.amAnnotationsHg19Url.name + gzext)
- if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
+ if "amAnnotationsHg19" in formats and summary.amAnnotationsHg19Url
  else None
  ),
  am_annotations_hg38_file=(
  save_dir / (summary.amAnnotationsHg38Url.name + gzext)
- if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
+ if "amAnnotationsHg38" in formats and summary.amAnnotationsHg38Url
  else None
  ),
- msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msaUrl" in what and summary.msaUrl else None),
+ msa_file=(save_dir / (summary.msaUrl.name + gzext) if "msa" in formats and summary.msaUrl else None),
  plddt_doc_file=(
- save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDocUrl" in what and summary.plddtDocUrl else None
+ save_dir / (summary.plddtDocUrl.name + gzext) if "plddtDoc" in formats and summary.plddtDocUrl else None
  ),
  )


  def files_to_download(
- what: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
- ) -> set[tuple[URL, str]]:
- if not (set(what) <= downloadable_formats):
+ formats: set[DownloadableFormat], summaries: Iterable[tuple[str, EntrySummary]], gzip_files: bool
+ ) -> set[UrlFileNamePair]:
+ if not (set(formats) <= downloadable_formats):
  msg = (
- f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
+ f"Invalid format(s) specified: {set(formats) - downloadable_formats}. "
  f"Valid formats are: {downloadable_formats}"
  )
  raise ValueError(msg)

- url_filename_pairs: set[tuple[URL, str]] = set()
+ url_filename_pairs: set[UrlFileNamePair] = set()
  for _, summary in summaries:
- for fmt in what:
+ for fmt in formats:
  if fmt == "summary":
  # summary is handled already in fetch_summary
  continue
@@ -275,26 +293,224 @@ def files_to_download(
  return url_filename_pairs


+ async def fetch_alphafold_db_version() -> str:
+ """Fetch the current version of the AlphaFold database.
+
+ Returns:
+ The current version of the AlphaFold database as a string. For example: "6".
+ """
+ url = "https://ftp.ebi.ac.uk/pub/databases/alphafold/accession_ids.csv"
+ headers = {"Range": "bytes=0-200"}
+ logger.debug(f"Detecting AlphaFold DB version from head of {url}")
+ async with friendly_session() as session, session.get(url, headers=headers) as response:
+ response.raise_for_status()
+ raw = await response.content.read(200)
+ text = raw.decode("utf-8")
+ first_line = text.splitlines()[1]
+ version = first_line.split(",")[-1]
+ logger.debug(f"Found current AlphaFold DB version is '{version}'")
+ return version
+
+
+ def _files_for_alphafold_entry(
+ uniprot_accession: str,
+ formats: set[DownloadableFormat],
+ db_version: str,
+ gzip_files: bool,
+ ) -> UrlFileNamePairsOfFormats:
+ templates: dict[DownloadableFormat, URL] = {
+ "bcif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.bcif"),
+ "cif": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.cif"),
+ "pdb": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-model_v{db_version}.pdb"),
+ "paeDoc": URL(
+ f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-predicted_aligned_error_v{db_version}.json"
+ ),
+ "amAnnotations": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-aa-substitutions.csv"),
+ "amAnnotationsHg19": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg19.csv"),
+ "amAnnotationsHg38": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-hg38.csv"),
+ "msa": URL(f"https://alphafold.ebi.ac.uk/files/msa/AF-{uniprot_accession}-F1-msa_v{db_version}.a3m"),
+ "plddtDoc": URL(f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_accession}-F1-confidence_v{db_version}.json"),
+ }
+ url_filename_pairs = {}
+ for fmt in formats:
+ if fmt == "summary":
+ # Summaries are downloaded separately as its using API instead of static files
+ continue
+ if fmt not in templates:
+ logger.warning(f"No URL template found for format '{fmt}'. Skipping.")
+ continue
+ url = templates[cast("DownloadableFormat", fmt)]
+ fn = url.name
+ if gzip_files:
+ fn += ".gz"
+ url_filename_pair = (url, fn)
+ url_filename_pairs[fmt] = url_filename_pair
+ return url_filename_pairs
+
+
+ def files_for_alphafold_entries(
+ uniprot_accessions: Iterable[str],
+ formats: set[DownloadableFormat],
+ db_version: str,
+ gzip_files: bool,
+ ) -> dict[str, UrlFileNamePairsOfFormats]:
+ """Get the files to download for multiple AlphaFold entries.
+
+ Args:
+ uniprot_accessions: A set of Uniprot accessions.
+ formats: A set of formats to download.
+ db_version: The version of the AlphaFold database to use.
+ gzip_files: Whether to download gzipped files. Otherwise downloads uncompressed files.
+
+ Returns:
+ A mapping of Uniprot accession to a mapping of DownloadableFormat to UrlFileNamePair.
+ """
+ return {
+ uniprot_accession: _files_for_alphafold_entry(
+ uniprot_accession, formats=formats, db_version=db_version, gzip_files=gzip_files
+ )
+ for uniprot_accession in uniprot_accessions
+ }
+
+
+ async def _fetch_many_async_without_summary(
+ uniprot_accessions: Iterable[str],
+ save_dir: Path,
+ formats: set[DownloadableFormat],
+ db_version: str | None = None,
+ max_parallel_downloads: int = 5,
+ cacher: Cacher | None = None,
+ gzip_files: bool = False,
+ ) -> AsyncGenerator[AlphaFoldEntry]:
+ if db_version is None:
+ db_version = await fetch_alphafold_db_version()
+ nested_files = files_for_alphafold_entries(
+ uniprot_accessions, formats=formats, db_version=db_version, gzip_files=gzip_files
+ )
+ files: set[UrlFileNamePair] = set()
+ for uniprot_accession in uniprot_accessions:
+ files.update(nested_files[uniprot_accession].values())
+
+ retrieved_files = await retrieve_files(
+ files,
+ save_dir,
+ desc="Downloading AlphaFold files",
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ raise_for_not_found=False,
+ )
+
+ retrieved_files_set = set(retrieved_files)
+ for uniprot_accession in uniprot_accessions:
+ entry = AlphaFoldEntry(
+ uniprot_accession=uniprot_accession,
+ )
+
+ for af_format, url_filename_pair in nested_files[uniprot_accession].items():
+ _, filename = url_filename_pair
+ filepath = save_dir / filename
+ if filepath in retrieved_files_set:
+ attr = AlphaFoldEntry.format2attr(af_format)
+ setattr(entry, attr, filepath)
+ # else: File was not found (404) during download, so we leave the attribute as None
+
+ yield entry
+
+
+ def fetch_many_async(
+ uniprot_accessions: Iterable[str],
+ save_dir: Path,
+ formats: set[DownloadableFormat],
+ db_version: str | None = None,
+ max_parallel_downloads: int = 5,
+ cacher: Cacher | None = None,
+ gzip_files: bool = False,
+ all_isoforms: bool = False,
+ ) -> AsyncGenerator[AlphaFoldEntry]:
+ """Asynchronously fetches summaries and/or files from
+ [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
+
+ Args:
+ uniprot_accessions: A set of Uniprot accessions to fetch.
+ save_dir: The directory to save the fetched files to.
+ formats: A set of formats to download.
+ If `summary` is in the set then summaries will be fetched using the API endpoint.
+ and later the other files will be downloaded using static file URLs.
+ If `summary` is not in the set then all files will be downloaded using static file
+ URLs only.
+ db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
+ max_parallel_downloads: The maximum number of parallel downloads.
+ cacher: A cacher to use for caching the fetched files.
+ gzip_files: Whether to gzip the downloaded files.
+ Summaries are never gzipped.
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
+ When False then yields only the canonical sequence per uniprot entry.
+
+ Yields:
+ A dataclass containing the summary, pdb file, and pae file.
+
+ Raises:
+ ValueError: If 'formats' set is empty.
+ ValueError: If all_isoforms is True and 'summary' is not in 'formats' set.
+ """
+ if len(formats) == 0:
+ msg = "At least one format must be specified. The 'formats' argument is empty."
+ raise ValueError(msg)
+ if "summary" in formats:
+ if db_version is not None:
+ logger.warning("db_version is ignored when 'summary' is in 'formats' set. Always uses latest version.")
+ return _fetch_many_async_with_summary(
+ uniprot_accessions,
+ save_dir,
+ formats,
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ all_isoforms=all_isoforms,
+ )
+ if all_isoforms:
+ msg = "Cannot fetch all isoforms when 'summary' is not in 'formats' set."
+ raise ValueError(msg)
+ return _fetch_many_async_without_summary(
+ uniprot_accessions,
+ save_dir,
+ formats,
+ db_version=db_version,
+ max_parallel_downloads=max_parallel_downloads,
+ cacher=cacher,
+ gzip_files=gzip_files,
+ )
+
+
  def fetch_many(
- ids: Iterable[str],
+ uniprot_accessions: Iterable[str],
  save_dir: Path,
- what: set[DownloadableFormat],
+ formats: set[DownloadableFormat],
+ db_version: str | None = None,
  max_parallel_downloads: int = 5,
  cacher: Cacher | None = None,
  gzip_files: bool = False,
  all_isoforms: bool = False,
  ) -> list[AlphaFoldEntry]:
- """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
+ """Synchronously fetches summaries and/or files like cif from AlphaFold Protein Structure Database.

  Args:
- ids: A set of Uniprot IDs to fetch.
+ uniprot_accessions: A set of Uniprot accessions to fetch.
  save_dir: The directory to save the fetched files to.
- what: A set of formats to download.
+ formats: A set of formats to download.
+ If `summary` is in the set then summaries will be fetched using the API endpoint.
+ and later the other files will be downloaded using static file URLs.
+ If `summary` is not in the set then all files will be downloaded using static file
+ URLs only.
+ Excluding 'summary' is much faster as it avoids slow API calls.
+ db_version: The version of the AlphaFold database to use. If None, the latest version will be used.
  max_parallel_downloads: The maximum number of parallel downloads.
- cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
+ cacher: A cacher to use for caching the fetched files.
  gzip_files: Whether to gzip the downloaded files.
- all_isoforms: Whether to return all isoforms of each uniprot entry.
- When False then returns only the canonical sequence of uniprot entry.
+ Summaries are never gzipped.
+ all_isoforms: Whether to yield all isoforms of each uniprot entry.
+ When False then yields only the canonical sequence per uniprot entry.

  Returns:
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
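
When `summary` is not requested, 0.8.0 plans downloads from static file URLs instead of the API. A sketch of that path using the two new public helpers shown above; the accession and format choices are illustrative:

```python
import asyncio

from protein_quest.alphafold.fetch import (
    fetch_alphafold_db_version,
    files_for_alphafold_entries,
)

# detect the current AlphaFold DB version from the head of accession_ids.csv
db_version = asyncio.run(fetch_alphafold_db_version())  # e.g. "6"

# plan (URL, filename) pairs per accession and format, without any API calls
plan = files_for_alphafold_entries(
    ["P05067"],  # illustrative accession
    formats={"cif", "paeDoc"},
    db_version=db_version,
    gzip_files=False,
)
print(plan["P05067"]["cif"])
```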
@@ -304,9 +520,10 @@ def fetch_many(
  return [
  entry
  async for entry in fetch_many_async(
- ids,
+ uniprot_accessions,
  save_dir,
- what,
+ formats,
+ db_version=db_version,
  max_parallel_downloads=max_parallel_downloads,
  cacher=cacher,
  gzip_files=gzip_files,
@@ -315,33 +532,3 @@ def fetch_many(
  ]

  return run_async(gather_entries())
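
A hedged usage sketch of the synchronous wrapper with the renamed parameters (`uniprot_accessions`, `formats`) and the new `db_version`; values are illustrative:

```python
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

# omitting "summary" from formats makes 0.8.0 use static URLs only
entries = fetch_many(
    ["P05067"],  # illustrative accession
    Path("downloads-af"),
    formats={"cif"},
    db_version="6",  # pin a version; None auto-detects the latest
)
for entry in entries:
    print(entry.uniprot_accession, entry.cif_file)
```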
-
-
- def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
- """Convert paths in an AlphaFoldEntry to be relative to the session directory.
-
- Args:
- entry: An AlphaFoldEntry instance with absolute paths.
- session_dir: The session directory to which the paths should be made relative.
-
- Returns:
- An AlphaFoldEntry instance with paths relative to the session directory.
- """
- return AlphaFoldEntry(
- uniprot_accession=entry.uniprot_accession,
- summary=entry.summary,
- summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
- bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
- cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
- pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
- pae_doc_file=entry.pae_doc_file.relative_to(session_dir) if entry.pae_doc_file else None,
- am_annotations_file=entry.am_annotations_file.relative_to(session_dir) if entry.am_annotations_file else None,
- am_annotations_hg19_file=(
- entry.am_annotations_hg19_file.relative_to(session_dir) if entry.am_annotations_hg19_file else None
- ),
- am_annotations_hg38_file=(
- entry.am_annotations_hg38_file.relative_to(session_dir) if entry.am_annotations_hg38_file else None
- ),
- msa_file=entry.msa_file.relative_to(session_dir) if entry.msa_file else None,
- plddt_doc_file=entry.plddt_doc_file.relative_to(session_dir) if entry.plddt_doc_file else None,
- )
protein_quest/cli.py CHANGED
@@ -13,6 +13,7 @@ from io import BytesIO, TextIOWrapper
  from pathlib import Path
  from textwrap import dedent

+ import shtab
  from cattrs import structure
  from rich.console import Console
  from rich.logging import RichHandler
@@ -81,7 +82,7 @@ def _add_search_uniprot_parser(subparsers: argparse._SubParsersAction):
  "output",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output text file for UniProt accessions (one per line). Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--taxon-id", type=str, help="NCBI Taxon ID, e.g. 9606 for Homo Sapiens")
  parser.add_argument(
  "--reviewed",
@@ -124,7 +125,7 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
@@ -136,7 +137,7 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
  and `chain_length` is the length of the chain, for example `100`.
  Use `-` for stdout.
  """),
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--limit", type=int, default=10_000, help="Maximum number of PDB uniprot accessions combinations to return"
  )
@@ -150,6 +151,15 @@ def _add_search_pdbe_parser(subparsers: argparse._SubParsersAction):
  type=int,
  help="Maximum number of residues allowed in chain mapped to the UniProt accession.",
  )
+ parser.add_argument(
+ "--keep-invalid",
+ action="store_true",
+ help=dedent("""\
+ Keep PDB results when chain length could not be determined.
+ If not given, such results are dropped.
+ Only applies if min/max residues arguments are set.
+ """),
+ )
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")


@@ -165,12 +175,12 @@ def _add_search_alphafold_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with AlphaFold IDs per UniProt accession. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--min-sequence-length", type=int, help="Minimum length of the canonical sequence.")
  parser.add_argument("--max-sequence-length", type=int, help="Maximum length of the canonical sequence.")
  parser.add_argument(
@@ -194,12 +204,12 @@ def _add_search_emdb_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accs",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with EMDB IDs per UniProt accession. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--limit", type=int, default=10_000, help="Maximum number of EMDB entry identifiers to return")
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")

@@ -222,7 +232,7 @@ def _add_search_go_parser(subparsers: argparse._SubParsersAction):
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with GO term results. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--limit", type=int, default=100, help="Maximum number of GO term results to return")


@@ -244,7 +254,7 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with taxonomy results. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--field",
  type=str,
@@ -285,7 +295,7 @@ def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersActi
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
  )
@@ -316,12 +326,12 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV file with complex results. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")

@@ -354,12 +364,12 @@ def _add_search_uniprot_details_parser(subparsers: argparse._SubParsersAction):
  "uniprot_accessions",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="Text file with UniProt accessions (one per line). Use `-` for stdin.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "output_csv",
  type=argparse.FileType("w", encoding="UTF-8"),
  help="Output CSV with UniProt details. Use `-` for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
  parser.add_argument("--batch-size", type=int, default=1_000, help="Number of accessions to query per batch")

@@ -387,12 +397,13 @@ def _add_cacher_arguments(parser: argparse.ArgumentParser):
  action="store_true",
  help="Disable caching of files to central location.",
  )
- parser.add_argument(
+ cache_dir_action = parser.add_argument(
  "--cache-dir",
  type=Path,
  default=user_cache_root_dir(),
  help="Directory to use as cache for files.",
  )
+ cache_dir_action.complete = shtab.DIRECTORY # type: ignore[missing-attribute]
  _add_copy_method_arguments(parser)


@@ -411,8 +422,10 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
  "pdbe_csv",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `pdb_id` column. Other columns are ignored. Use `-` for stdin.",
- )
- parser.add_argument("output_dir", type=Path, help="Directory to store downloaded PDBe mmCIF files")
+ ).complete = shtab.FILE
+ parser.add_argument(
+ "output_dir", type=Path, help="Directory to store downloaded PDBe mmCIF files"
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--max-parallel-downloads",
  type=int,
@@ -434,15 +447,22 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
  "alphafold_csv",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `af_id` column. Other columns are ignored. Use `-` for stdin.",
- )
- parser.add_argument("output_dir", type=Path, help="Directory to store downloaded AlphaFold files")
+ ).complete = shtab.FILE
  parser.add_argument(
- "--what-formats",
+ "output_dir", type=Path, help="Directory to store downloaded AlphaFold files"
+ ).complete = shtab.DIRECTORY
+ parser.add_argument(
+ "--format",
  type=str,
  action="append",
  choices=sorted(downloadable_formats),
  help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
- Default is 'summary' and 'cif'."""),
+ Default is 'cif'."""),
+ )
+ parser.add_argument(
+ "--db-version",
+ type=str,
+ help="AlphaFold database version to use. If not given, the latest version is used. For example '6'.",
  )
  parser.add_argument(
  "--gzip-files",
@@ -481,8 +501,10 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
  "emdb_csv",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
- )
- parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
+ ).complete = shtab.FILE
+ parser.add_argument(
+ "output_dir", type=Path, help="Directory to store downloaded EMDB volume files"
+ ).complete = shtab.DIRECTORY
  _add_cacher_arguments(parser)


@@ -496,8 +518,12 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
  Passed files are written with residues below threshold removed."""),
  formatter_class=ArgumentDefaultsRichHelpFormatter,
  )
- parser.add_argument("input_dir", type=Path, help="Directory with AlphaFold mmcif/PDB files")
- parser.add_argument("output_dir", type=Path, help="Directory to write filtered mmcif/PDB files")
+ parser.add_argument(
+ "input_dir", type=Path, help="Directory with AlphaFold mmcif/PDB files"
+ ).complete = shtab.DIRECTORY
+ parser.add_argument(
+ "output_dir", type=Path, help="Directory to write filtered mmcif/PDB files"
+ ).complete = shtab.DIRECTORY
  parser.add_argument("--confidence-threshold", type=float, default=70, help="pLDDT confidence threshold (0-100)")
  parser.add_argument(
  "--min-residues", type=int, default=0, help="Minimum number of high-confidence residues a structure should have"
@@ -515,7 +541,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
  Write filter statistics to file.
  In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
  Use `-` for stdout."""),
- )
+ ).complete = shtab.FILE
  _add_copy_method_arguments(parser)


@@ -535,7 +561,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
  "chains",
  type=argparse.FileType("r", encoding="UTF-8"),
  help="CSV file with `pdb_id` and `chain` columns. Other columns are ignored.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "input_dir",
  type=Path,
@@ -543,13 +569,13 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
  Directory with PDB/mmCIF files.
  Expected filenames are `{pdb_id}.cif.gz`, `{pdb_id}.cif`, `{pdb_id}.pdb.gz` or `{pdb_id}.pdb`.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output_dir",
  type=Path,
  help=dedent("""\
  Directory to write the single-chain PDB/mmCIF files. Output files are in same format as input files."""),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--scheduler-address",
  help=dedent("""Address of the Dask scheduler to connect to.
@@ -569,14 +595,16 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
  """),
  formatter_class=ArgumentDefaultsRichHelpFormatter,
  )
- parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+ parser.add_argument(
+ "input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output_dir",
  type=Path,
  help=dedent("""\
  Directory to write filtered PDB/mmCIF files. Files are copied without modification.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
  parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
  parser.add_argument(
@@ -586,7 +614,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
  Write filter statistics to file.
  In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
  Use `-` for stdout."""),
- )
+ ).complete = shtab.FILE
  _add_copy_method_arguments(parser)


@@ -598,14 +626,16 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
  description="Filter PDB/mmCIF files by secondary structure",
  formatter_class=ArgumentDefaultsRichHelpFormatter,
  )
- parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+ parser.add_argument(
+ "input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')"
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output_dir",
  type=Path,
  help=dedent("""\
  Directory to write filtered PDB/mmCIF files. Files are copied without modification.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
  parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
  parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
@@ -623,7 +653,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
  <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
  Use `-` for stdout.
  """),
- )
+ ).complete = shtab.FILE
  _add_copy_method_arguments(parser)


@@ -687,12 +717,12 @@ def _add_convert_uniprot_parser(subparsers: argparse._SubParsersAction):
  "input_dir",
  type=Path,
  help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "output",
  type=argparse.FileType("wt", encoding="UTF-8"),
  help="Output text file with UniProt accessions (one per line). Use '-' for stdout.",
- )
+ ).complete = shtab.FILE
  parser.add_argument(
  "--grouped",
  action="store_true",
@@ -712,14 +742,14 @@ def _add_convert_structures_parser(subparsers: argparse._SubParsersAction):
  "input_dir",
  type=Path,
  help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--output-dir",
  type=Path,
  help=dedent("""\
  Directory to write converted structure files. If not given, files are written to `input_dir`.
  """),
- )
+ ).complete = shtab.DIRECTORY
  parser.add_argument(
  "--format",
  type=str,
@@ -768,6 +798,7 @@ def make_parser() -> argparse.ArgumentParser:
  )
  parser.add_argument("--log-level", default="WARNING", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
  parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+ shtab.add_argument_to(parser, ["--print-completion"])

  subparsers = parser.add_subparsers(dest="command", required=True)

@@ -825,6 +856,7 @@ def _handle_search_pdbe(args):
  output_csv = args.output_csv
  min_residues = converter.structure(args.min_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
  max_residues = converter.structure(args.max_residues, PositiveInt | None) # pyright: ignore[reportArgumentType]
+ keep_invalid = args.keep_invalid

  accs = set(_read_lines(uniprot_accessions))
  rprint(f"Finding PDB entries for {len(accs)} uniprot accessions")
@@ -833,7 +865,7 @@ def _handle_search_pdbe(args):
  raw_nr_results = len(results)
  raw_total_pdbs = sum([len(v) for v in results.values()])
  if min_residues or max_residues:
- results = filter_pdb_results_on_chain_length(results, min_residues, max_residues)
+ results = filter_pdb_results_on_chain_length(results, min_residues, max_residues, keep_invalid=keep_invalid)
  total_pdbs = sum([len(v) for v in results.values()])
  rprint(f"Before filtering found {raw_total_pdbs} PDB entries for {raw_nr_results} uniprot accessions.")
  rprint(
@@ -976,25 +1008,26 @@ def _handle_retrieve_pdbe(args: argparse.Namespace):

  def _handle_retrieve_alphafold(args):
  download_dir = args.output_dir
- what_formats = args.what_formats
+ raw_formats = args.format
  alphafold_csv = args.alphafold_csv
  max_parallel_downloads = args.max_parallel_downloads
  cacher = _initialize_cacher(args)
  gzip_files = args.gzip_files
  all_isoforms = args.all_isoforms
+ db_version = args.db_version

- if what_formats is None:
- what_formats = {"summary", "cif"}
+ if raw_formats is None:
+ raw_formats = {"cif"}

  # TODO besides `uniprot_accession,af_id\n` csv also allow headless single column format
- #
  af_ids = _read_column_from_csv(alphafold_csv, "af_id")
- validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
- rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
+ formats: set[DownloadableFormat] = structure(raw_formats, set[DownloadableFormat])
+ rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {formats}")
  afs = af_fetch(
  af_ids,
  download_dir,
- what=validated_what,
+ formats=formats,
+ db_version=db_version,
  max_parallel_downloads=max_parallel_downloads,
  cacher=cacher,
  gzip_files=gzip_files,
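
The shtab pattern used throughout this file is: mark each path-like argument with `shtab.FILE`/`shtab.DIRECTORY` and register a `--print-completion` flag on the parser. A standalone sketch of the same pattern, with illustrative argument names:

```python
import argparse

import shtab

parser = argparse.ArgumentParser(prog="example")
# mark arguments so generated completions suggest files or directories
parser.add_argument("output", help="Output file").complete = shtab.FILE
parser.add_argument("--cache-dir", help="Cache directory").complete = shtab.DIRECTORY
# adds a --print-completion {bash,zsh,tcsh} option to the parser
shtab.add_argument_to(parser, ["--print-completion"])
```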
protein_quest/mcp_server.py CHANGED
@@ -167,7 +167,7 @@ mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes"

  @mcp.tool
  def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[AlphaFoldEntry]:
- """Fetch the AlphaFold summary and mmcif file for given UniProt accessions.
+ """Fetch the AlphaFold mmCIF file for given UniProt accessions.

  Args:
  uniprot_accs: A set of UniProt accessions.
@@ -176,8 +176,8 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
  Returns:
  A list of AlphaFold entries.
  """
- what: set[DownloadableFormat] = {"summary", "cif"}
- return alphafold_fetch(uniprot_accs, save_dir, what)
+ formats: set[DownloadableFormat] = {"cif"}
+ return alphafold_fetch(uniprot_accs, save_dir, formats)


  @mcp.tool
protein_quest/uniprot.py CHANGED
@@ -93,6 +93,14 @@ def _chain_length_from_uniprot_chains(uniprot_chains: str) -> int:
  return total_length


+ class PdbChainLengthError(ValueError):
+ """Raised when a UniProt chain description does not yield a chain length."""
+
+ def __init__(self, pdb_id: str, uniprot_chains: str):
+ msg = f"Could not determine chain length of '{pdb_id}' from '{uniprot_chains}'"
+ super().__init__(msg)
+
+
  @dataclass(frozen=True)
  class PdbResult:
  """Result of a PDB search in UniProtKB.
@@ -117,7 +125,10 @@ class PdbResult:
  @cached_property
  def chain_length(self) -> int:
  """The length of the chain from the UniProt chains aka self.uniprot_chains."""
- return _chain_length_from_uniprot_chains(self.uniprot_chains)
+ try:
+ return _chain_length_from_uniprot_chains(self.uniprot_chains)
+ except ValueError as e:
+ raise PdbChainLengthError(self.id, self.uniprot_chains) from e


  type PdbResults = dict[str, set[PdbResult]]
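
A sketch of catching the new, more specific error; `pdb_entry` stands in for a `PdbResult` obtained from an earlier PDBe search step:

```python
from protein_quest.uniprot import PdbChainLengthError

try:
    length = pdb_entry.chain_length  # raises if uniprot_chains can not be parsed
except PdbChainLengthError as e:
    print(f"Skipping entry: {e}")
```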
@@ -128,6 +139,7 @@ def filter_pdb_results_on_chain_length(
  pdb_results: PdbResults,
  min_residues: int | None,
  max_residues: int | None,
+ keep_invalid: bool = False,
  ) -> PdbResults:
  """Filter PDB results based on chain length.

@@ -137,6 +149,9 @@ def filter_pdb_results_on_chain_length(
  If None, no minimum is applied.
  max_residues: Maximum number of residues allowed in chain mapped to the UniProt accession.
  If None, no maximum is applied.
+ keep_invalid: If True, PDB results with invalid chain length (could not be determined) are kept.
+ If False, PDB results with invalid chain length are filtered out.
+ Warnings are logged when length can not be determined.

  Returns:
  Filtered dictionary with protein IDs as keys and sets of PDB results as values.
@@ -149,12 +164,26 @@ def filter_pdb_results_on_chain_length(
  raise ValueError(msg)
  results: PdbResults = {}
  for uniprot_accession, pdb_entries in pdb_results.items():
- filtered_pdb_entries = {
- pdb_entry
- for pdb_entry in pdb_entries
- if (min_residues is None or pdb_entry.chain_length >= min_residues)
- and (max_residues is None or pdb_entry.chain_length <= max_residues)
- }
+ filtered_pdb_entries = set()
+ for pdb_entry in pdb_entries:
+ try:
+ if (min_residues is None or pdb_entry.chain_length >= min_residues) and (
+ max_residues is None or pdb_entry.chain_length <= max_residues
+ ):
+ filtered_pdb_entries.add(pdb_entry)
+ except PdbChainLengthError:
+ if keep_invalid:
+ logger.warning(
+ f"Could not determine chain length of '{pdb_entry.id}' from '{pdb_entry.uniprot_chains}' "
+ f"belonging to uniprot accession '{uniprot_accession}', "
+ "for completeness not filtering it out"
+ )
+ filtered_pdb_entries.add(pdb_entry)
+ else:
+ logger.warning(
+ f"Filtering out PDB entry '{pdb_entry.id}' belonging to uniprot accession "
+ f"'{uniprot_accession}' due to invalid chain length from '{pdb_entry.uniprot_chains}'"
+ )
  if filtered_pdb_entries:
  # Only include uniprot_accession if there are any pdb entries left after filtering
  results[uniprot_accession] = filtered_pdb_entries
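
How the new flag is meant to be used, as a sketch; `pdb_results` is assumed to come from an earlier PDBe search step, and the residue bounds are illustrative:

```python
from protein_quest.uniprot import filter_pdb_results_on_chain_length

filtered = filter_pdb_results_on_chain_length(
    pdb_results,
    min_residues=50,
    max_residues=500,
    keep_invalid=True,  # keep entries whose chain length could not be determined
)
```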
@@ -337,13 +366,13 @@ def _build_sparql_query_sequence_length_filter(min_length: int | None = None, ma
  # - http://purl.uniprot.org/isoforms/P42284-2 is ok
  # - http://purl.uniprot.org/isoforms/P42284-1 is not ok, because it is based on P42284-2
  # - http://purl.uniprot.org/isoforms/Q7KQZ4-1 is not ok, because it is from another uniprot entry
- # TODO use same approach as in retrieve_uniprot_details function
  header = dedent("""\
  ?protein up:sequence ?isoform .
- FILTER NOT EXISTS { ?isoform up:basedOn ?parent_isoform }
- FILTER(
- STRAFTER(STR(?protein), "http://purl.uniprot.org/uniprot/") =
- STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-"))
+ ?isoform a up:Simple_Sequence .
+ BIND (IRI(STRBEFORE(REPLACE(
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
+ ), "-")) AS ?ac_of_isoform)
+ FILTER (?protein = ?ac_of_isoform)
  ?isoform rdf:value ?sequence .
  BIND (STRLEN(?sequence) AS ?seq_length)
  """)
@@ -875,8 +904,10 @@ def map_uniprot_accessions2uniprot_details(
  ?protein up:sequence ?isoform .
  ?isoform a up:Simple_Sequence .
  ?isoform rdf:value ?sequence .
- BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
- FILTER(?ac_of_isoform = ?ac)
+ BIND (IRI(STRBEFORE(REPLACE(
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
+ ), "-")) AS ?ac_of_isoform)
+ FILTER(?ac_of_isoform = ?protein)
  }
  ```

@@ -898,17 +929,20 @@ def map_uniprot_accessions2uniprot_details(
  (STRLEN(?sequence) AS ?seq_length)
  """)
  where_clause = dedent("""
- ?protein a up:Protein .
  ?protein up:mnemonic ?uniprot_id .
  ?protein up:organism ?organism .
  ?organism up:scientificName ?taxon_name .
  ?protein up:reviewed ?reviewed .
+ OPTIONAL {
  ?protein up:recommendedName/up:fullName ?protein_name .
+ }
  ?protein up:sequence ?isoform .
  ?isoform a up:Simple_Sequence .
  ?isoform rdf:value ?sequence .
- BIND (STRBEFORE(STRAFTER(STR(?isoform), "http://purl.uniprot.org/isoforms/"), "-") AS ?ac_of_isoform)
- FILTER(?ac_of_isoform = ?ac)
+ BIND (IRI(STRBEFORE(REPLACE(
+ STR(?isoform), "http://purl.uniprot.org/isoforms/", "http://purl.uniprot.org/uniprot/"
+ ), "-")) AS ?ac_of_isoform)
+ FILTER(?ac_of_isoform = ?protein)
  """)
  total = len(uniprot_accessions)
  with tqdm(
@@ -927,12 +961,13 @@ def map_uniprot_accessions2uniprot_details(
  timeout=timeout,
  )
  for raw_result in raw_results:
+ protein_name = raw_result.get("protein_name", {}).get("value", "")
  result = UniprotDetails(
  uniprot_accession=raw_result["uniprot_accession"]["value"],
  uniprot_id=raw_result["uniprot_id"]["value"],
  sequence_length=int(raw_result["seq_length"]["value"]),
  reviewed=raw_result["reviewed"]["value"] == "true",
- protein_name=raw_result["protein_name"]["value"],
+ protein_name=protein_name,
  taxon_id=int(raw_result["taxon_id"]["value"]),
  taxon_name=raw_result["taxon_name"]["value"],
  )
protein_quest/utils.py CHANGED
@@ -266,6 +266,7 @@ async def retrieve_files(
  cacher: Cacher | None = None,
  chunk_size: int = 524288, # 512 KiB
  gzip_files: bool = False,
+ raise_for_not_found: bool = True,
  ) -> list[Path]:
  """Retrieve files from a list of URLs and save them to a directory.

@@ -279,6 +280,9 @@ async def retrieve_files(
  cacher: An optional cacher to use for caching files.
  chunk_size: The size of each chunk to read from the response.
  gzip_files: Whether to gzip the downloaded files.
+ This requires the server can send gzip encoded content.
+ raise_for_not_found: Whether to raise an error for HTTP 404 errors.
+ If false then function does not returns Path for which url gave HTTP 404 error and logs as debug message.

  Returns:
  A list of paths to the downloaded files.
@@ -295,11 +299,12 @@ async def retrieve_files(
  cacher=cacher,
  chunk_size=chunk_size,
  gzip_files=gzip_files,
+ raise_for_not_found=raise_for_not_found,
  )
  for url, filename in urls
  ]
- files: list[Path] = await tqdm.gather(*tasks, desc=desc)
- return files
+ raw_files: list[Path | None] = await tqdm.gather(*tasks, desc=desc)
+ return [f for f in raw_files if f is not None]


  class InvalidContentEncodingError(aiohttp.ClientResponseError):
@@ -314,7 +319,8 @@ async def _retrieve_file(
  cacher: Cacher | None = None,
  chunk_size: int = 524288, # 512 KiB
  gzip_files: bool = False,
- ) -> Path:
+ raise_for_not_found=True,
+ ) -> Path | None:
  """Retrieve a single file from a URL and save it to a specified path.

  Args:
@@ -325,6 +331,9 @@ async def _retrieve_file(
  cacher: An optional cacher to use for caching files.
  chunk_size: The size of each chunk to read from the response.
  gzip_files: Whether to gzip the downloaded file.
+ This requires the server can send gzip encoded content.
+ raise_for_not_found: Whether to raise an error for HTTP 404 errors.
+ If false then function returns None on HTTP 404 errors and logs as debug message.

  Returns:
  The path to the saved file.
@@ -348,6 +357,9 @@ async def _retrieve_file(
  semaphore,
  session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
  ):
+ if not raise_for_not_found and resp.status == 404:
+ logger.debug(f"File not found at {url}, skipping download.")
+ return None
  resp.raise_for_status()
  if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
  msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
protein_quest-0.8.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: protein_quest
- Version: 0.7.0
+ Version: 0.8.0
  Summary: Search/retrieve/filter proteins and protein structures
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -21,6 +21,7 @@ Requires-Dist: platformdirs>=4.3.8
  Requires-Dist: psutil>=7.0.0
  Requires-Dist: rich-argparse>=1.7.1
  Requires-Dist: rich>=14.0.0
+ Requires-Dist: shtab>=1.7.2
  Requires-Dist: sparqlwrapper>=2.0.0
  Requires-Dist: tqdm>=4.67.1
  Requires-Dist: yarl>=1.20.1
@@ -154,7 +155,7 @@ protein-quest retrieve pdbe pdbe.csv downloads-pdbe/
  protein-quest retrieve alphafold alphafold.csv downloads-af/
  ```

- For each entry downloads the summary.json and cif file.
+ For each entry downloads the cif file.

  ### To retrieve EMDB volume files

@@ -299,6 +300,26 @@ protein-quest mcp

  The mcp server contains an prompt template to search/retrieve/filter candidate structures.

+ ## Shell autocompletion
+
+ The `protein-quest` command line tool supports shell autocompletion using [shtab](https://shtab.readthedocs.io/).
+
+ Initialize for bash shell with:
+
+ ```shell
+ mkdir -p ~/.local/share/bash-completion/completions
+ protein-quest --print-completion bash > ~/.local/share/bash-completion/completions/protein-quest
+ ```
+
+ Initialize for zsh shell with:
+
+ ```shell
+ mkdir -p ~/.local/share/zsh/site-functions
+ protein-quest --print-completion zsh > ~/.local/share/zsh/site-functions/_protein-quest
+ fpath=("$HOME/.local/share/zsh/site-functions" $fpath)
+ autoload -Uz compinit && compinit
+ ```
+
  ## Contributing

  For development information and contribution guidelines, please see [CONTRIBUTING.md](CONTRIBUTING.md).
protein_quest-0.8.0.dist-info/RECORD CHANGED
@@ -1,27 +1,27 @@
  protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- protein_quest/__version__.py,sha256=F9kNagC7uEvuPDju8Gzo4Jt81LSvbf0VyONV3GMXT2M,56
- protein_quest/cli.py,sha256=082CmSSmxVZoWbnX35AmhqedA4T1dD9v-eMe0vsIDp4,55572
+ protein_quest/__version__.py,sha256=z22DsH46rJUgc917FJyc2z9XDmdScvBS92-z4i4GZ98,56
+ protein_quest/cli.py,sha256=bE0Xq93LjdMnDoHeRIDUXUU79LyWICnhX8B3m2Lk8ZE,57264
  protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
  protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
  protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
  protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
  protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
- protein_quest/mcp_server.py,sha256=tZkSG1yx4ocN1rlKgVlU8nUbs6LKpyLrNqP3y6fbJm0,8564
+ protein_quest/mcp_server.py,sha256=oHbNjN-Lctc2mY-sjEuo82yRsp1bBsHo2Ag5MwsWx8k,8547
  protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
  protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
  protein_quest/structure.py,sha256=QozElPz0kbPB_HW-J1WxArTT5e-1vRyBJoBSfHnwoRM,8117
  protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
- protein_quest/uniprot.py,sha256=mODAcneCnDvinvJ3jffyR11klsgq5b96T_4aVWd-Luw,35158
- protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
+ protein_quest/uniprot.py,sha256=kV1lOZ_ugcF-LUff9hvmJPaGwA_uaHPJCL_3DLBIvSE,36798
+ protein_quest/utils.py,sha256=5Ncdid-dslggy-Ti1yhOHwdAM7Bxpyia7Re-xDkc2P0,19909
  protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
  protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
  protein_quest/alphafold/entry_summary.py,sha256=Qhnw75RXFaoOU332g7axg_jYbbdZbUpsGPUOwPNDSeU,2114
- protein_quest/alphafold/fetch.py,sha256=l8pcXeuDfoXYiwpW5N_uB_9oZpomBgUeF9kROLrM11M,14038
+ protein_quest/alphafold/fetch.py,sha256=eKCQHkAMko-d36VvRHLCllLxuAXBdbBUhUONOSCPsds,21970
  protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
  protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
- protein_quest-0.7.0.dist-info/METADATA,sha256=JvsZl9XGN57iJn5oSBRIVNIqL6aYEHXQlGpE87nsSvQ,10722
- protein_quest-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- protein_quest-0.7.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
- protein_quest-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- protein_quest-0.7.0.dist-info/RECORD,,
+ protein_quest-0.8.0.dist-info/METADATA,sha256=jotRxaLadElgixAW72Axk8qL8wAvzl-cq26mYJBy9zc,11335
+ protein_quest-0.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ protein_quest-0.8.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+ protein_quest-0.8.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ protein_quest-0.8.0.dist-info/RECORD,,