protein-quest 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/fetch.py +34 -9
- protein_quest/cli.py +207 -26
- protein_quest/converter.py +1 -0
- protein_quest/emdb.py +6 -3
- protein_quest/mcp_server.py +34 -3
- protein_quest/pdbe/fetch.py +6 -3
- protein_quest/ss.py +20 -0
- protein_quest/uniprot.py +157 -4
- protein_quest/utils.py +367 -23
- {protein_quest-0.3.2.dist-info → protein_quest-0.5.0.dist-info}/METADATA +41 -3
- protein_quest-0.5.0.dist-info/RECORD +26 -0
- protein_quest-0.3.2.dist-info/RECORD +0 -26
- {protein_quest-0.3.2.dist-info → protein_quest-0.5.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.2.dist-info → protein_quest-0.5.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.2.dist-info → protein_quest-0.5.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
@@ -1,2 +1,2 @@
-__version__ = "0.3.2"
+__version__ = "0.5.0"
 """The version of the package."""
protein_quest/alphafold/fetch.py
CHANGED
@@ -14,7 +14,7 @@ from yarl import URL

 from protein_quest.alphafold.entry_summary import EntrySummary
 from protein_quest.converter import converter
-from protein_quest.utils import friendly_session, retrieve_files, run_async
+from protein_quest.utils import Cacher, PassthroughCacher, friendly_session, retrieve_files, run_async

 logger = logging.getLogger(__name__)

@@ -104,7 +104,7 @@ class AlphaFoldEntry:


 async def fetch_summary(
-    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
+    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
 ) -> list[EntrySummary]:
     """Fetches a summary from the AlphaFold database for a given qualifier.

@@ -116,6 +116,7 @@ async def fetch_summary(
         save_dir: An optional directory to save the fetched summary as a JSON file.
             If set and summary exists then summary will be loaded from disk instead of being fetched from the API.
            If not set then the summary will not be saved to disk and will always be fetched from the API.
+        cacher: A cacher to use for caching the fetched summary. Only used if save_dir is not None.

     Returns:
         A list of EntrySummary objects representing the fetched summary.
@@ -124,6 +125,11 @@ async def fetch_summary(
     fn: AsyncPath | None = None
     if save_dir is not None:
         fn = AsyncPath(save_dir / f"{qualifier}.json")
+        cached_file = await cacher.copy_from_cache(Path(fn))
+        if cached_file is not None:
+            logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
+            raw_data = await AsyncPath(cached_file).read_bytes()
+            return converter.loads(raw_data, list[EntrySummary])
         if await fn.exists():
             logger.debug(f"File {fn} already exists. Skipping download from {url}.")
             raw_data = await fn.read_bytes()
@@ -133,18 +139,23 @@ async def fetch_summary(
             raw_data = await response.content.read()
             if fn is not None:
                 # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
-                await
+                await cacher.write_bytes(Path(fn), raw_data)
     return converter.loads(raw_data, list[EntrySummary])


 async def fetch_summaries(
-    qualifiers: Iterable[str],
+    qualifiers: Iterable[str],
+    save_dir: Path | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[EntrySummary]:
     semaphore = Semaphore(max_parallel_downloads)
     if save_dir is not None:
         save_dir.mkdir(parents=True, exist_ok=True)
+    if cacher is None:
+        cacher = PassthroughCacher()
     async with friendly_session() as session:
-        tasks = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
+        tasks = [fetch_summary(qualifier, session, semaphore, save_dir, cacher) for qualifier in qualifiers]
         summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
             *tasks, desc="Fetching Alphafold summaries"
         )
@@ -154,7 +165,11 @@ async def fetch_summaries(


 async def fetch_many_async(
-    uniprot_accessions: Iterable[str],
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[AlphaFoldEntry]:
     """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -164,15 +179,17 @@ async def fetch_many_async(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.

     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
     save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+
     summaries = [
         s
         async for s in fetch_summaries(
-            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
         )
     ]

@@ -183,6 +200,7 @@ async def fetch_many_async(
         save_dir,
         desc="Downloading AlphaFold files",
         max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
     )
     for summary in summaries:
         yield AlphaFoldEntry(
@@ -236,7 +254,11 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu


 def fetch_many(
-    ids: Iterable[str],
+    ids: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> list[AlphaFoldEntry]:
     """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.

@@ -245,6 +267,7 @@ def fetch_many(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.

     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -253,7 +276,9 @@ def fetch_many(
     async def gather_entries():
         return [
             entry
-            async for entry in fetch_many_async(
+            async for entry in fetch_many_async(
+                ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+            )
         ]

     return run_async(gather_entries())
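For context, a minimal usage sketch of the cacher-aware fetch_many signature introduced above. The cache location and accession are made-up examples; the DirectoryCacher keyword arguments mirror the _initialize_cacher helper added to cli.py in this release.

from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many
from protein_quest.utils import DirectoryCacher

# Hypothetical cache location; the CLI defaults to user_cache_root_dir().
cacher = DirectoryCacher(cache_dir=Path.home() / ".cache" / "protein-quest", copy_method="hardlink")

entries = fetch_many(
    ["P12345"],             # UniProt accessions (example value from the CLI help text)
    Path("downloads"),      # save_dir
    {"summary", "cif"},     # what: formats to download, same as the CLI default
    max_parallel_downloads=5,
    cacher=cacher,          # reuse files already present in the cache across runs
)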
protein_quest/cli.py
CHANGED
@@ -15,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.markdown import Markdown
 from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm
@@ -31,8 +32,26 @@ from protein_quest.pdbe import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
-from protein_quest.uniprot import
-
+from protein_quest.uniprot import (
+    ComplexPortalEntry,
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4interaction_partners,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
+from protein_quest.utils import (
+    Cacher,
+    CopyMethod,
+    DirectoryCacher,
+    PassthroughCacher,
+    copy_methods,
+    copyfile,
+    user_cache_root_dir,
+)

 logger = logging.getLogger(__name__)

@@ -211,6 +230,73 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
     parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")


+def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
+    """Add search interaction partners subcommand parser."""
+    parser = subparsers.add_parser(
+        "interaction-partners",
+        help="Search for interaction partners of given UniProt accession",
+        description=dedent("""\
+            Search for interaction partners of given UniProt accession
+            in the Uniprot SPARQL endpoint and Complex Portal.
+        """),
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_acc",
+        type=str,
+        help="UniProt accession (for example P12345).",
+    )
+    parser.add_argument(
+        "--exclude",
+        type=str,
+        action="append",
+        help="UniProt accessions to exclude from the results. For example already known interaction partners.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
+    )
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
+def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
+    """Add search complexes subcommand parser."""
+    description = dedent("""\
+        Search for complexes in the Complex Portal.
+        https://www.ebi.ac.uk/complexportal/
+
+        The output CSV file has the following columns:
+
+        - query_protein: UniProt accession used as query
+        - complex_id: Complex Portal identifier
+        - complex_url: URL to the Complex Portal entry
+        - complex_title: Title of the complex
+        - members: Semicolon-separated list of UniProt accessions of complex members
+    """)
+    parser = subparsers.add_parser(
+        "complexes",
+        help="Search for complexes in the Complex Portal",
+        description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_accs",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV file with complex results. Use `-` for stdout.",
+    )
+    parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
 def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
     """Add retrieve pdbe subcommand parser."""
     parser = subparsers.add_parser(
@@ -234,6 +320,7 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)


 def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
@@ -264,6 +351,7 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)


 def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
@@ -283,22 +371,7 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
         help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
-
-
-def _add_copy_method_argument(parser: argparse.ArgumentParser):
-    """Add copy method argument to parser."""
-    default_copy_method = "symlink"
-    if os.name == "nt":
-        # On Windows you need developer mode or admin privileges to create symlinks
-        # so we default to copying files instead of symlinking
-        default_copy_method = "copy"
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default=default_copy_method,
-        help="How to copy files when no changes are needed to output file.",
-    )
+    _add_cacher_arguments(parser)


 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
@@ -331,7 +404,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
-
+    _add_copy_method_arguments(parser)


 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -371,7 +444,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
             If not provided, will create a local cluster.
             If set to `sequential` will run tasks sequentially."""),
     )
-
+    _add_copy_method_arguments(parser)


 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -394,7 +467,6 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -403,6 +475,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
+    _add_copy_method_arguments(parser)


 def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
@@ -429,7 +502,6 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -440,6 +512,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
             Use `-` for stdout.
             """),
     )
+    _add_copy_method_arguments(parser)


 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
@@ -458,6 +531,8 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     _add_search_emdb_parser(subsubparsers)
     _add_search_go_parser(subsubparsers)
     _add_search_taxonomy_parser(subsubparsers)
+    _add_search_interaction_partners_parser(subsubparsers)
+    _add_search_complexes_parser(subsubparsers)


 def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -505,6 +580,38 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")


+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+            """),
+    )
+
+
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
+
+
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -636,14 +743,52 @@ def _handle_search_taxonomy(args):
     _write_taxonomy_csv(results, output_csv)


-def
+def _handle_search_interaction_partners(args: argparse.Namespace):
+    uniprot_acc: str = args.uniprot_acc
+    excludes: set[str] = set(args.exclude) if args.exclude else set()
+    limit: int = args.limit
+    timeout: int = args.timeout
+    output_csv: TextIOWrapper = args.output_csv
+
+    rprint(f"Searching for interaction partners of '{uniprot_acc}'")
+    results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+    _write_lines(output_csv, results.keys())
+
+
+def _handle_search_complexes(args: argparse.Namespace):
+    uniprot_accs = args.uniprot_accs
+    limit = args.limit
+    timeout = args.timeout
+    output_csv = args.output_csv
+
+    accs = _read_lines(uniprot_accs)
+    rprint(f"Finding complexes for {len(accs)} uniprot accessions")
+    results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+    _write_complexes_csv(results, output_csv)
+
+
+def _initialize_cacher(args: argparse.Namespace) -> Cacher:
+    if args.no_cache:
+        return PassthroughCacher()
+    return DirectoryCacher(
+        cache_dir=args.cache_dir,
+        copy_method=args.copy_method,
+    )
+
+
+def _handle_retrieve_pdbe(args: argparse.Namespace):
     pdbe_csv = args.pdbe_csv
     output_dir = args.output_dir
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)

     pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
     rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
-    result = asyncio.run(
+    result = asyncio.run(
+        pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
+    )
     rprint(f"Retrieved {len(result)} PDBe entries")


@@ -652,6 +797,7 @@ def _handle_retrieve_alphafold(args):
     what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)

     if what_formats is None:
         what_formats = {"summary", "cif"}
@@ -661,7 +807,9 @@ def _handle_retrieve_alphafold(args):
     af_ids = _read_column_from_csv(alphafold_csv, "af_id")
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
-    afs = af_fetch(
+    afs = af_fetch(
+        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+    )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")

@@ -669,10 +817,11 @@ def _handle_retrieve_alphafold(args):
 def _handle_retrieve_emdb(args):
     emdb_csv = args.emdb_csv
     output_dir = args.output_dir
+    cacher = _initialize_cacher(args)

     emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
     rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
-    result = asyncio.run(emdb_fetch(emdb_ids, output_dir))
+    result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
     rprint(f"Retrieved {len(result)} EMDB entries")


@@ -875,6 +1024,8 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
     ("search", "emdb"): _handle_search_emdb,
     ("search", "go"): _handle_search_go,
     ("search", "taxonomy"): _handle_search_taxonomy,
+    ("search", "interaction-partners"): _handle_search_interaction_partners,
+    ("search", "complexes"): _handle_search_complexes,
     ("retrieve", "pdbe"): _handle_retrieve_pdbe,
     ("retrieve", "alphafold"): _handle_retrieve_alphafold,
     ("retrieve", "emdb"): _handle_retrieve_emdb,
@@ -937,3 +1088,33 @@ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:

 def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
     return {row[column] for row in _iter_csv_rows(file)}
+
+
+def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
+    """Write ComplexPortal information to a CSV file.
+
+    Args:
+        complexes: List of ComplexPortalEntry objects.
+        output_csv: TextIOWrapper to write the CSV data to.
+    """
+    writer = csv.writer(output_csv)
+    writer.writerow(
+        [
+            "query_protein",
+            "complex_id",
+            "complex_url",
+            "complex_title",
+            "members",
+        ]
+    )
+    for entry in complexes:
+        members_str = ";".join(sorted(entry.members))
+        writer.writerow(
+            [
+                entry.query_protein,
+                entry.complex_id,
+                entry.complex_url,
+                entry.complex_title,
+                members_str,
+            ]
+        )
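A rough sketch of what the two new search handlers do with the library API. The keyword arguments mirror the handler code above; the accessions ("P12345", "Q99999") are placeholders, not real queries.

from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes

# Interaction partners of one accession, excluding partners we already know about.
partners = search4interaction_partners("P12345", excludes={"Q99999"}, limit=10_000, timeout=1_800)
print(f"{len(partners)} interaction partners found")

# Complex Portal entries for a batch of accessions, printed with the same
# fields that _write_complexes_csv writes out.
complexes = search4macromolecular_complexes(["P12345"], limit=100, timeout=1_800)
for entry in complexes:
    print(entry.complex_id, entry.complex_title, ";".join(sorted(entry.members)))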
protein_quest/converter.py
CHANGED
@@ -13,6 +13,7 @@ type PositiveInt = int
 converter = make_converter()
 """cattrs converter to read JSON document or dict to Python objects."""
 converter.register_structure_hook(URL, lambda v, _: URL(v))
+converter.register_unstructure_hook(URL, lambda u: str(u))


 @converter.register_structure_hook
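The added unstructure hook makes URL fields serializable back to plain strings. A small self-contained sketch of the round trip it enables, assuming a cattrs JSON converter like the one in converter.py (the Entry class is illustrative only):

from attrs import define
from cattrs.preconf.json import make_converter
from yarl import URL

converter = make_converter()
converter.register_structure_hook(URL, lambda v, _: URL(v))
converter.register_unstructure_hook(URL, lambda u: str(u))

@define
class Entry:
    page: URL

entry = converter.structure({"page": "https://alphafold.ebi.ac.uk/entry/P12345"}, Entry)
# Without the unstructure hook the URL would not be dumped back to a JSON-safe string.
assert converter.unstructure(entry) == {"page": "https://alphafold.ebi.ac.uk/entry/P12345"}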
protein_quest/emdb.py
CHANGED
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path

-from protein_quest.utils import retrieve_files
+from protein_quest.utils import Cacher, retrieve_files


 def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
@@ -13,13 +13,16 @@ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
     return url, fn


-async def fetch(
+async def fetch(
+    emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches volume files from the EMDB database.

     Args:
         emdb_ids: A list of EMDB IDs to fetch.
         save_dir: The directory to save the downloaded files.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.

     Returns:
         A mapping of EMDB IDs to their downloaded files.
@@ -30,5 +33,5 @@ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads:

     # TODO show progress of each item
     # TODO handle failed downloads, by skipping them instead of raising an error
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
     return id2paths
protein_quest/mcp_server.py
CHANGED
@@ -32,6 +32,7 @@ Examples:

 """

+from collections.abc import Mapping
 from pathlib import Path
 from textwrap import dedent
 from typing import Annotated
@@ -48,7 +49,15 @@ from protein_quest.pdbe.fetch import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
 from protein_quest.ss import filter_file_on_secondary_structure
 from protein_quest.taxonomy import search_taxon
-from protein_quest.uniprot import
+from protein_quest.uniprot import (
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)

 mcp = FastMCP("protein-quest")

@@ -81,7 +90,18 @@ def search_pdb(
     return search4pdb(uniprot_accs, limit=limit)


-mcp.tool
+@mcp.tool
+async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch the PDBe structures for given PDB IDs.
+
+    Args:
+        pdb_ids: A set of PDB IDs.
+        save_dir: The directory to save the fetched files.
+
+    Returns:
+        A mapping of PDB ID to the path of the fetched structure file.
+    """
+    return await pdbe_fetch(pdb_ids, save_dir)


 @mcp.tool
@@ -137,6 +157,7 @@ def search_alphafolds(


 mcp.tool(search4emdb, name="search_emdb")
+mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes")


 @mcp.tool
@@ -154,7 +175,17 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     return alphafold_fetch(uniprot_accs, save_dir, what)


-mcp.tool
+@mcp.tool
+async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch EMDB volumes for given EMDB IDs.
+
+    Args:
+        emdb_ids: A set of EMDB IDs.
+        save_dir: The directory to save the fetched files.
+    Returns:
+        A mapping of EMDB ID to the path of the fetched volume file.
+    """
+    return await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)


 @mcp.tool
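A minimal sketch of serving the updated tool set; the transport and port handling here are assumptions, the packaged entry point for this is `protein-quest mcp --port 8000`.

from protein_quest.mcp_server import mcp

if __name__ == "__main__":
    # Serves the existing search_* tools plus the new fetch_pdbe_structures,
    # fetch_emdb_volumes and search_macromolecular_complexes tools.
    mcp.run()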
protein_quest/pdbe/fetch.py
CHANGED
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path

-from protein_quest.utils import retrieve_files, run_async
+from protein_quest.utils import Cacher, retrieve_files, run_async


 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -28,13 +28,16 @@ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
     return url, fn


-async def fetch(
+async def fetch(
+    ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches mmCIF files from the PDBe database.

     Args:
         ids: A set of PDB IDs to fetch.
         save_dir: The directory to save the fetched mmCIF files to.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.

     Returns:
         A dict of id and paths to the downloaded mmCIF files.
@@ -47,7 +50,7 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int
     urls = list(id2urls.values())
     id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}

-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files", cacher=cacher)
     return id2paths

protein_quest/ss.py
CHANGED
@@ -111,6 +111,26 @@ class SecondaryStructureFilterQuery:
     ratio_min_sheet_residues: Ratio | None = None
     ratio_max_sheet_residues: Ratio | None = None

+    def is_actionable(self) -> bool:
+        """Check if the secondary structure query has any actionable filters.
+
+        Returns:
+            True if any of the filters are set, False otherwise.
+        """
+        return any(
+            field is not None
+            for field in [
+                self.abs_min_helix_residues,
+                self.abs_max_helix_residues,
+                self.abs_min_sheet_residues,
+                self.abs_max_sheet_residues,
+                self.ratio_min_helix_residues,
+                self.ratio_max_helix_residues,
+                self.ratio_min_sheet_residues,
+                self.ratio_max_sheet_residues,
+            ]
+        )
+

 def _check_range(min_val, max_val, label):
     if min_val is not None and max_val is not None and min_val >= max_val: