protein-quest 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of protein-quest might be problematic.

@@ -1,2 +1,2 @@
-__version__ = "0.3.2"
+__version__ = "0.5.0"
 """The version of the package."""
@@ -14,7 +14,7 @@ from yarl import URL
 
 from protein_quest.alphafold.entry_summary import EntrySummary
 from protein_quest.converter import converter
-from protein_quest.utils import friendly_session, retrieve_files, run_async
+from protein_quest.utils import Cacher, PassthroughCacher, friendly_session, retrieve_files, run_async
 
 logger = logging.getLogger(__name__)
 
@@ -104,7 +104,7 @@ class AlphaFoldEntry:
 
 
 async def fetch_summary(
-    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
+    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
 ) -> list[EntrySummary]:
     """Fetches a summary from the AlphaFold database for a given qualifier.
 
@@ -116,6 +116,7 @@ async def fetch_summary(
         save_dir: An optional directory to save the fetched summary as a JSON file.
             If set and summary exists then summary will be loaded from disk instead of being fetched from the API.
            If not set then the summary will not be saved to disk and will always be fetched from the API.
+        cacher: A cacher to use for caching the fetched summary. Only used if save_dir is not None.
 
     Returns:
         A list of EntrySummary objects representing the fetched summary.
@@ -124,6 +125,11 @@ async def fetch_summary(
     fn: AsyncPath | None = None
     if save_dir is not None:
         fn = AsyncPath(save_dir / f"{qualifier}.json")
+        cached_file = await cacher.copy_from_cache(Path(fn))
+        if cached_file is not None:
+            logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
+            raw_data = await AsyncPath(cached_file).read_bytes()
+            return converter.loads(raw_data, list[EntrySummary])
         if await fn.exists():
             logger.debug(f"File {fn} already exists. Skipping download from {url}.")
             raw_data = await fn.read_bytes()
@@ -133,18 +139,23 @@ async def fetch_summary(
         raw_data = await response.content.read()
         if fn is not None:
             # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
-            await fn.write_bytes(raw_data)
+            await cacher.write_bytes(Path(fn), raw_data)
         return converter.loads(raw_data, list[EntrySummary])
 
 
 async def fetch_summaries(
-    qualifiers: Iterable[str], save_dir: Path | None = None, max_parallel_downloads: int = 5
+    qualifiers: Iterable[str],
+    save_dir: Path | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[EntrySummary]:
     semaphore = Semaphore(max_parallel_downloads)
     if save_dir is not None:
         save_dir.mkdir(parents=True, exist_ok=True)
+    if cacher is None:
+        cacher = PassthroughCacher()
     async with friendly_session() as session:
-        tasks = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
+        tasks = [fetch_summary(qualifier, session, semaphore, save_dir, cacher) for qualifier in qualifiers]
         summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
             *tasks, desc="Fetching Alphafold summaries"
         )
@@ -154,7 +165,11 @@ async def fetch_summaries(
 
 
 async def fetch_many_async(
-    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[AlphaFoldEntry]:
     """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -164,15 +179,17 @@ async def fetch_many_async(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
 
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
     save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+
     summaries = [
         s
         async for s in fetch_summaries(
-            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
         )
     ]
 
@@ -183,6 +200,7 @@ async def fetch_many_async(
         save_dir,
         desc="Downloading AlphaFold files",
         max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
     )
     for summary in summaries:
         yield AlphaFoldEntry(
@@ -236,7 +254,11 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
 
 
 def fetch_many(
-    ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+    ids: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> list[AlphaFoldEntry]:
     """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
 
@@ -245,6 +267,7 @@ def fetch_many(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
 
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -253,7 +276,9 @@ def fetch_many(
     async def gather_entries():
         return [
             entry
-            async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
+            async for entry in fetch_many_async(
+                ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+            )
         ]
 
     return run_async(gather_entries())
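
The new cacher parameter threads through fetch_summary, fetch_summaries, fetch_many_async and fetch_many, with PassthroughCacher as the fallback when none is given. Below is a minimal usage sketch, not taken from the package: the module path protein_quest.alphafold.fetch and the accession are assumptions, while the DirectoryCacher keyword arguments and the {"summary", "cif"} formats mirror what appears elsewhere in this diff.

from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many  # assumed module path for the hunks above
from protein_quest.utils import DirectoryCacher

# Keyword arguments mirror _initialize_cacher() in cli.py further down in this diff.
cacher = DirectoryCacher(cache_dir=Path("~/.cache/protein-quest").expanduser(), copy_method="hardlink")
entries = fetch_many(
    ["P12345"],               # hypothetical UniProt accession
    Path("downloads"),
    what={"summary", "cif"},  # same formats _handle_retrieve_alphafold uses as defaults
    cacher=cacher,            # omit or pass None to keep the old, cache-free behaviour
)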
protein_quest/cli.py CHANGED
@@ -15,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.markdown import Markdown
 from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm
@@ -31,8 +32,26 @@ from protein_quest.pdbe import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
-from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
-from protein_quest.utils import CopyMethod, copy_methods, copyfile
+from protein_quest.uniprot import (
+    ComplexPortalEntry,
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4interaction_partners,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
+from protein_quest.utils import (
+    Cacher,
+    CopyMethod,
+    DirectoryCacher,
+    PassthroughCacher,
+    copy_methods,
+    copyfile,
+    user_cache_root_dir,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -211,6 +230,73 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
     parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")
 
 
+def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
+    """Add search interaction partners subcommand parser."""
+    parser = subparsers.add_parser(
+        "interaction-partners",
+        help="Search for interaction partners of given UniProt accession",
+        description=dedent("""\
+            Search for interaction partners of given UniProt accession
+            in the Uniprot SPARQL endpoint and Complex Portal.
+        """),
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_acc",
+        type=str,
+        help="UniProt accession (for example P12345).",
+    )
+    parser.add_argument(
+        "--exclude",
+        type=str,
+        action="append",
+        help="UniProt accessions to exclude from the results. For example already known interaction partners.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
+    )
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
+def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
+    """Add search complexes subcommand parser."""
+    description = dedent("""\
+        Search for complexes in the Complex Portal.
+        https://www.ebi.ac.uk/complexportal/
+
+        The output CSV file has the following columns:
+
+        - query_protein: UniProt accession used as query
+        - complex_id: Complex Portal identifier
+        - complex_url: URL to the Complex Portal entry
+        - complex_title: Title of the complex
+        - members: Semicolon-separated list of UniProt accessions of complex members
+    """)
+    parser = subparsers.add_parser(
+        "complexes",
+        help="Search for complexes in the Complex Portal",
+        description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_accs",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV file with complex results. Use `-` for stdout.",
+    )
+    parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
 def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
     """Add retrieve pdbe subcommand parser."""
     parser = subparsers.add_parser(
@@ -234,6 +320,7 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)
 
 
 def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
@@ -264,6 +351,7 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)
 
 
 def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
@@ -283,22 +371,7 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
         help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
-
-
-def _add_copy_method_argument(parser: argparse.ArgumentParser):
-    """Add copy method argument to parser."""
-    default_copy_method = "symlink"
-    if os.name == "nt":
-        # On Windows you need developer mode or admin privileges to create symlinks
-        # so we default to copying files instead of symlinking
-        default_copy_method = "copy"
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default=default_copy_method,
-        help="How to copy files when no changes are needed to output file.",
-    )
+    _add_cacher_arguments(parser)
 
 
 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
@@ -331,7 +404,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
            In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
            Use `-` for stdout."""),
     )
-    _add_copy_method_argument(parser)
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -371,7 +444,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
            If not provided, will create a local cluster.
            If set to `sequential` will run tasks sequentially."""),
     )
-    _add_copy_method_argument(parser)
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -394,7 +467,6 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -403,6 +475,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
            In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
            Use `-` for stdout."""),
     )
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
@@ -429,7 +502,6 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -440,6 +512,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
            Use `-` for stdout.
            """),
     )
+    _add_copy_method_arguments(parser)
 
 
 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
@@ -458,6 +531,8 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     _add_search_emdb_parser(subsubparsers)
     _add_search_go_parser(subsubparsers)
     _add_search_taxonomy_parser(subsubparsers)
+    _add_search_interaction_partners_parser(subsubparsers)
+    _add_search_complexes_parser(subsubparsers)
 
 
 def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -505,6 +580,38 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
 
 
+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+        """),
+    )
+
+
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
+
+
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -636,14 +743,52 @@ def _handle_search_taxonomy(args):
     _write_taxonomy_csv(results, output_csv)
 
 
-def _handle_retrieve_pdbe(args):
+def _handle_search_interaction_partners(args: argparse.Namespace):
+    uniprot_acc: str = args.uniprot_acc
+    excludes: set[str] = set(args.exclude) if args.exclude else set()
+    limit: int = args.limit
+    timeout: int = args.timeout
+    output_csv: TextIOWrapper = args.output_csv
+
+    rprint(f"Searching for interaction partners of '{uniprot_acc}'")
+    results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+    _write_lines(output_csv, results.keys())
+
+
+def _handle_search_complexes(args: argparse.Namespace):
+    uniprot_accs = args.uniprot_accs
+    limit = args.limit
+    timeout = args.timeout
+    output_csv = args.output_csv
+
+    accs = _read_lines(uniprot_accs)
+    rprint(f"Finding complexes for {len(accs)} uniprot accessions")
+    results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+    _write_complexes_csv(results, output_csv)
+
+
+def _initialize_cacher(args: argparse.Namespace) -> Cacher:
+    if args.no_cache:
+        return PassthroughCacher()
+    return DirectoryCacher(
+        cache_dir=args.cache_dir,
+        copy_method=args.copy_method,
+    )
+
+
+def _handle_retrieve_pdbe(args: argparse.Namespace):
     pdbe_csv = args.pdbe_csv
     output_dir = args.output_dir
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)
 
     pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
     rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
-    result = asyncio.run(pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads))
+    result = asyncio.run(
+        pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
+    )
     rprint(f"Retrieved {len(result)} PDBe entries")
 
 
@@ -652,6 +797,7 @@ def _handle_retrieve_alphafold(args):
     what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)
 
     if what_formats is None:
         what_formats = {"summary", "cif"}
@@ -661,7 +807,9 @@ def _handle_retrieve_alphafold(args):
     af_ids = _read_column_from_csv(alphafold_csv, "af_id")
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
-    afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
+    afs = af_fetch(
+        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+    )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
 
@@ -669,10 +817,11 @@ def _handle_retrieve_alphafold(args):
 def _handle_retrieve_emdb(args):
     emdb_csv = args.emdb_csv
     output_dir = args.output_dir
+    cacher = _initialize_cacher(args)
 
     emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
     rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
-    result = asyncio.run(emdb_fetch(emdb_ids, output_dir))
+    result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
     rprint(f"Retrieved {len(result)} EMDB entries")
 
 
@@ -875,6 +1024,8 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
     ("search", "emdb"): _handle_search_emdb,
     ("search", "go"): _handle_search_go,
     ("search", "taxonomy"): _handle_search_taxonomy,
+    ("search", "interaction-partners"): _handle_search_interaction_partners,
+    ("search", "complexes"): _handle_search_complexes,
     ("retrieve", "pdbe"): _handle_retrieve_pdbe,
     ("retrieve", "alphafold"): _handle_retrieve_alphafold,
     ("retrieve", "emdb"): _handle_retrieve_emdb,
@@ -937,3 +1088,33 @@ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
 
 def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
     return {row[column] for row in _iter_csv_rows(file)}
+
+
+def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
+    """Write ComplexPortal information to a CSV file.
+
+    Args:
+        complexes: List of ComplexPortalEntry objects.
+        output_csv: TextIOWrapper to write the CSV data to.
+    """
+    writer = csv.writer(output_csv)
+    writer.writerow(
+        [
+            "query_protein",
+            "complex_id",
+            "complex_url",
+            "complex_title",
+            "members",
+        ]
+    )
+    for entry in complexes:
+        members_str = ";".join(sorted(entry.members))
+        writer.writerow(
+            [
+                entry.query_protein,
+                entry.complex_id,
+                entry.complex_url,
+                entry.complex_title,
+                members_str,
+            ]
+        )
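
The two new CLI handlers above are thin wrappers around the new search functions in protein_quest.uniprot. A hedged sketch of calling them directly from Python, based only on the signatures and attributes visible in this diff (the accession and the exact return types are assumptions):

from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes

# Mapping-like result keyed by partner UniProt accession (the handler iterates results.keys()).
partners = search4interaction_partners("P12345", excludes=set(), limit=10_000, timeout=1_800)
print(sorted(partners.keys()))

# List of ComplexPortalEntry objects with the fields that _write_complexes_csv writes out.
complexes = search4macromolecular_complexes(["P12345"], limit=100, timeout=1_800)
for entry in complexes:
    print(entry.complex_id, entry.complex_title, ";".join(sorted(entry.members)))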
@@ -13,6 +13,7 @@ type PositiveInt = int
 converter = make_converter()
 """cattrs converter to read JSON document or dict to Python objects."""
 converter.register_structure_hook(URL, lambda v, _: URL(v))
+converter.register_unstructure_hook(URL, lambda u: str(u))
 
 
 @converter.register_structure_hook
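
With the matching unstructure hook registered, the converter can now serialize URL fields back to plain strings as well as parse them. A small round-trip sketch under that assumption (the module path comes from the import in the alphafold hunks above):

from yarl import URL

from protein_quest.converter import converter

url = converter.structure("https://alphafold.ebi.ac.uk/", URL)          # existing structure hook: str -> URL
assert converter.unstructure(url) == "https://alphafold.ebi.ac.uk/"     # new unstructure hook: URL -> str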
protein_quest/emdb.py CHANGED
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path
 
-from protein_quest.utils import retrieve_files
+from protein_quest.utils import Cacher, retrieve_files
 
 
 def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
@@ -13,13 +13,16 @@ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
     return url, fn
 
 
-async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1) -> Mapping[str, Path]:
+async def fetch(
+    emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches volume files from the EMDB database.
 
     Args:
         emdb_ids: A list of EMDB IDs to fetch.
         save_dir: The directory to save the downloaded files.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.
 
     Returns:
         A mapping of EMDB IDs to their downloaded files.
@@ -30,5 +33,5 @@ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads:
 
     # TODO show progress of each item
     # TODO handle failed downloads, by skipping them instead of raising an error
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
     return id2paths
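
The EMDB fetcher gains the same optional cacher argument, passed straight through to retrieve_files; leaving it as None presumably keeps the previous behaviour. A hedged async usage sketch (the EMDB ID is a placeholder, and the DirectoryCacher settings are assumptions):

import asyncio
from pathlib import Path

from protein_quest.emdb import fetch as emdb_fetch
from protein_quest.utils import DirectoryCacher

cacher = DirectoryCacher(cache_dir=Path("~/.cache/protein-quest").expanduser(), copy_method="hardlink")
# Returns a mapping of EMDB ID to downloaded volume file, as documented in the docstring above.
id2path = asyncio.run(emdb_fetch(["EMD-1234"], Path("volumes"), cacher=cacher))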
@@ -32,6 +32,7 @@ Examples:
 
 """
 
+from collections.abc import Mapping
 from pathlib import Path
 from textwrap import dedent
 from typing import Annotated
@@ -48,7 +49,15 @@ from protein_quest.pdbe.fetch import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
 from protein_quest.ss import filter_file_on_secondary_structure
 from protein_quest.taxonomy import search_taxon
-from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+from protein_quest.uniprot import (
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
 
 mcp = FastMCP("protein-quest")
 
@@ -81,7 +90,18 @@ def search_pdb(
     return search4pdb(uniprot_accs, limit=limit)
 
 
-mcp.tool(pdbe_fetch, name="fetch_pdbe_structures")
+@mcp.tool
+async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch the PDBe structures for given PDB IDs.
+
+    Args:
+        pdb_ids: A set of PDB IDs.
+        save_dir: The directory to save the fetched files.
+
+    Returns:
+        A mapping of PDB ID to the path of the fetched structure file.
+    """
+    return await pdbe_fetch(pdb_ids, save_dir)
 
 
 @mcp.tool
@@ -137,6 +157,7 @@ def search_alphafolds(
 
 
 mcp.tool(search4emdb, name="search_emdb")
+mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes")
 
 
 @mcp.tool
@@ -154,7 +175,17 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     return alphafold_fetch(uniprot_accs, save_dir, what)
 
 
-mcp.tool(emdb_fetch, name="fetch_emdb_volumes")
+@mcp.tool
+async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch EMDB volumes for given EMDB IDs.
+
+    Args:
+        emdb_ids: A set of EMDB IDs.
+        save_dir: The directory to save the fetched files.
+    Returns:
+        A mapping of EMDB ID to the path of the fetched volume file.
+    """
+    return await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)
 
 
 @mcp.tool
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path
 
-from protein_quest.utils import retrieve_files, run_async
+from protein_quest.utils import Cacher, retrieve_files, run_async
 
 
 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -28,13 +28,16 @@ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
     return url, fn
 
 
-async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+async def fetch(
+    ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches mmCIF files from the PDBe database.
 
     Args:
         ids: A set of PDB IDs to fetch.
         save_dir: The directory to save the fetched mmCIF files to.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.
 
     Returns:
         A dict of id and paths to the downloaded mmCIF files.
@@ -47,7 +50,7 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int
     urls = list(id2urls.values())
     id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}
 
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files", cacher=cacher)
    return id2paths
 
 
protein_quest/ss.py CHANGED
@@ -111,6 +111,26 @@ class SecondaryStructureFilterQuery:
     ratio_min_sheet_residues: Ratio | None = None
     ratio_max_sheet_residues: Ratio | None = None
 
+    def is_actionable(self) -> bool:
+        """Check if the secondary structure query has any actionable filters.
+
+        Returns:
+            True if any of the filters are set, False otherwise.
+        """
+        return any(
+            field is not None
+            for field in [
+                self.abs_min_helix_residues,
+                self.abs_max_helix_residues,
+                self.abs_min_sheet_residues,
+                self.abs_max_sheet_residues,
+                self.ratio_min_helix_residues,
+                self.ratio_max_helix_residues,
+                self.ratio_min_sheet_residues,
+                self.ratio_max_sheet_residues,
+            ]
+        )
+
 
 def _check_range(min_val, max_val, label):
     if min_val is not None and max_val is not None and min_val >= max_val:
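
The new is_actionable method lets callers skip secondary-structure filtering when a query carries no constraints. A short sketch of the intended use, assuming SecondaryStructureFilterQuery is a dataclass whose optional fields (all defaulting to None, as shown above) can be passed as keyword arguments:

from protein_quest.ss import SecondaryStructureFilterQuery

empty = SecondaryStructureFilterQuery()
assert not empty.is_actionable()  # no filters set, nothing to do

helix_only = SecondaryStructureFilterQuery(abs_min_helix_residues=30)  # field name taken from the list above
assert helix_only.is_actionable()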