protein-quest 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/confidence.py +2 -2
- protein_quest/alphafold/fetch.py +28 -19
- protein_quest/cli.py +133 -68
- protein_quest/filters.py +2 -5
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +8 -5
- protein_quest/ss.py +3 -7
- protein_quest/{pdbe/io.py → structure.py} +53 -126
- protein_quest/utils.py +26 -2
- {protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/METADATA +12 -1
- protein_quest-0.6.0.dist-info/RECORD +27 -0
- protein_quest-0.5.1.dist-info/RECORD +0 -26
- {protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.5.1.dist-info → protein_quest-0.6.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
__version__ = "0.5.1"
|
|
1
|
+
__version__ = "0.6.0"
|
|
2
2
|
"""The version of the package."""
|
protein_quest/alphafold/confidence.py
CHANGED
|
@@ -8,7 +8,7 @@ from pathlib import Path
|
|
|
8
8
|
import gemmi
|
|
9
9
|
|
|
10
10
|
from protein_quest.converter import Percentage, PositiveInt, converter
|
|
11
|
-
from protein_quest.pdbe.io import write_structure
|
|
11
|
+
from protein_quest.io import read_structure, write_structure
|
|
12
12
|
from protein_quest.ss import nr_of_residues_in_total
|
|
13
13
|
from protein_quest.utils import CopyMethod, copyfile
|
|
14
14
|
|
|
@@ -127,7 +127,7 @@ def filter_file_on_residues(
|
|
|
127
127
|
result with filtered_file property set to Path where filtered PDB file is saved.
|
|
128
128
|
or None if structure was filtered out.
|
|
129
129
|
"""
|
|
130
|
-
structure = gemmi.read_structure(str(file))
|
|
130
|
+
structure = read_structure(file)
|
|
131
131
|
residues = set(find_high_confidence_residues(structure, query.confidence))
|
|
132
132
|
count = len(residues)
|
|
133
133
|
if count < query.min_residues or count > query.max_residues:
|
protein_quest/alphafold/fetch.py
CHANGED
|
@@ -125,15 +125,15 @@ async def fetch_summary(
|
|
|
125
125
|
fn: AsyncPath | None = None
|
|
126
126
|
if save_dir is not None:
|
|
127
127
|
fn = AsyncPath(save_dir / f"{qualifier}.json")
|
|
128
|
+
if await fn.exists():
|
|
129
|
+
logger.debug(f"File {fn} already exists. Skipping download from {url}.")
|
|
130
|
+
raw_data = await fn.read_bytes()
|
|
131
|
+
return converter.loads(raw_data, list[EntrySummary])
|
|
128
132
|
cached_file = await cacher.copy_from_cache(Path(fn))
|
|
129
133
|
if cached_file is not None:
|
|
130
134
|
logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
|
|
131
135
|
raw_data = await AsyncPath(cached_file).read_bytes()
|
|
132
136
|
return converter.loads(raw_data, list[EntrySummary])
|
|
133
|
-
if await fn.exists():
|
|
134
|
-
logger.debug(f"File {fn} already exists. Skipping download from {url}.")
|
|
135
|
-
raw_data = await fn.read_bytes()
|
|
136
|
-
return converter.loads(raw_data, list[EntrySummary])
|
|
137
137
|
async with semaphore, session.get(url) as response:
|
|
138
138
|
response.raise_for_status()
|
|
139
139
|
raw_data = await response.content.read()
|
|
@@ -170,6 +170,7 @@ async def fetch_many_async(
|
|
|
170
170
|
what: set[DownloadableFormat],
|
|
171
171
|
max_parallel_downloads: int = 5,
|
|
172
172
|
cacher: Cacher | None = None,
|
|
173
|
+
gzip_files: bool = False,
|
|
173
174
|
) -> AsyncGenerator[AlphaFoldEntry]:
|
|
174
175
|
"""Asynchronously fetches summaries and files from
|
|
175
176
|
[AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
|
|
@@ -180,6 +181,7 @@ async def fetch_many_async(
|
|
|
180
181
|
what: A set of formats to download.
|
|
181
182
|
max_parallel_downloads: The maximum number of parallel downloads.
|
|
182
183
|
cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
|
|
184
|
+
gzip_files: Whether to gzip the downloaded files.
|
|
183
185
|
|
|
184
186
|
Yields:
|
|
185
187
|
A dataclass containing the summary, pdb file, and pae file.
|
|
@@ -193,7 +195,7 @@ async def fetch_many_async(
|
|
|
193
195
|
)
|
|
194
196
|
]
|
|
195
197
|
|
|
196
|
-
files = files_to_download(what, summaries)
|
|
198
|
+
files = files_to_download(what, summaries, gzip_files)
|
|
197
199
|
|
|
198
200
|
await retrieve_files(
|
|
199
201
|
files,
|
|
@@ -201,36 +203,40 @@ async def fetch_many_async(
|
|
|
201
203
|
desc="Downloading AlphaFold files",
|
|
202
204
|
max_parallel_downloads=max_parallel_downloads,
|
|
203
205
|
cacher=cacher,
|
|
206
|
+
gzip_files=gzip_files,
|
|
204
207
|
)
|
|
208
|
+
gzext = ".gz" if gzip_files else ""
|
|
205
209
|
for summary in summaries:
|
|
206
210
|
yield AlphaFoldEntry(
|
|
207
211
|
uniprot_acc=summary.uniprotAccession,
|
|
208
212
|
summary=summary,
|
|
209
213
|
summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
|
|
210
|
-
bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
|
|
211
|
-
cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
|
|
212
|
-
pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
|
|
213
|
-
pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
|
|
214
|
-
pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
|
|
214
|
+
bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
|
|
215
|
+
cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
|
|
216
|
+
pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
|
|
217
|
+
pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
|
|
218
|
+
pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
|
|
215
219
|
am_annotations_file=(
|
|
216
|
-
save_dir / summary.amAnnotationsUrl.name
|
|
220
|
+
save_dir / (summary.amAnnotationsUrl.name + gzext)
|
|
217
221
|
if "amAnnotations" in what and summary.amAnnotationsUrl
|
|
218
222
|
else None
|
|
219
223
|
),
|
|
220
224
|
am_annotations_hg19_file=(
|
|
221
|
-
save_dir / summary.amAnnotationsHg19Url.name
|
|
225
|
+
save_dir / (summary.amAnnotationsHg19Url.name + gzext)
|
|
222
226
|
if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
|
|
223
227
|
else None
|
|
224
228
|
),
|
|
225
229
|
am_annotations_hg38_file=(
|
|
226
|
-
save_dir / summary.amAnnotationsHg38Url.name
|
|
230
|
+
save_dir / (summary.amAnnotationsHg38Url.name + gzext)
|
|
227
231
|
if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
|
|
228
232
|
else None
|
|
229
233
|
),
|
|
230
234
|
)
|
|
231
235
|
|
|
232
236
|
|
|
233
|
-
def files_to_download(
|
|
237
|
+
def files_to_download(
|
|
238
|
+
what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
|
|
239
|
+
) -> set[tuple[URL, str]]:
|
|
234
240
|
if not (set(what) <= downloadable_formats):
|
|
235
241
|
msg = (
|
|
236
242
|
f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
|
|
@@ -238,7 +244,7 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
|
|
|
238
244
|
)
|
|
239
245
|
raise ValueError(msg)
|
|
240
246
|
|
|
241
|
-
|
|
247
|
+
url_filename_pairs: set[tuple[URL, str]] = set()
|
|
242
248
|
for summary in summaries:
|
|
243
249
|
for fmt in what:
|
|
244
250
|
if fmt == "summary":
|
|
@@ -248,9 +254,10 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
|
|
|
248
254
|
if url is None:
|
|
249
255
|
logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
|
|
250
256
|
continue
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
257
|
+
fn = url.name + (".gz" if gzip_files else "")
|
|
258
|
+
url_filename_pair = (url, fn)
|
|
259
|
+
url_filename_pairs.add(url_filename_pair)
|
|
260
|
+
return url_filename_pairs
|
|
254
261
|
|
|
255
262
|
|
|
256
263
|
def fetch_many(
|
|
@@ -259,6 +266,7 @@ def fetch_many(
|
|
|
259
266
|
what: set[DownloadableFormat],
|
|
260
267
|
max_parallel_downloads: int = 5,
|
|
261
268
|
cacher: Cacher | None = None,
|
|
269
|
+
gzip_files: bool = False,
|
|
262
270
|
) -> list[AlphaFoldEntry]:
|
|
263
271
|
"""Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
|
|
264
272
|
|
|
@@ -268,6 +276,7 @@ def fetch_many(
|
|
|
268
276
|
what: A set of formats to download.
|
|
269
277
|
max_parallel_downloads: The maximum number of parallel downloads.
|
|
270
278
|
cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
|
|
279
|
+
gzip_files: Whether to gzip the downloaded files.
|
|
271
280
|
|
|
272
281
|
Returns:
|
|
273
282
|
A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
|
|
@@ -277,7 +286,7 @@ def fetch_many(
|
|
|
277
286
|
return [
|
|
278
287
|
entry
|
|
279
288
|
async for entry in fetch_many_async(
|
|
280
|
-
ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
|
|
289
|
+
ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher, gzip_files=gzip_files
|
|
281
290
|
)
|
|
282
291
|
]
|
|
283
292
|
|
protein_quest/cli.py
CHANGED
|
@@ -28,8 +28,13 @@ from protein_quest.converter import converter
|
|
|
28
28
|
from protein_quest.emdb import fetch as emdb_fetch
|
|
29
29
|
from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
|
|
30
30
|
from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
|
|
31
|
+
from protein_quest.io import (
|
|
32
|
+
convert_to_cif_files,
|
|
33
|
+
glob_structure_files,
|
|
34
|
+
locate_structure_file,
|
|
35
|
+
valid_structure_file_extensions,
|
|
36
|
+
)
|
|
31
37
|
from protein_quest.pdbe import fetch as pdbe_fetch
|
|
32
|
-
from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
|
|
33
38
|
from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
|
|
34
39
|
from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
|
|
35
40
|
from protein_quest.uniprot import (
|
|
@@ -297,6 +302,38 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
|
|
|
297
302
|
parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
|
|
298
303
|
|
|
299
304
|
|
|
305
|
+
def _add_copy_method_arguments(parser):
|
|
306
|
+
parser.add_argument(
|
|
307
|
+
"--copy-method",
|
|
308
|
+
type=str,
|
|
309
|
+
choices=copy_methods,
|
|
310
|
+
default="hardlink",
|
|
311
|
+
help=dedent("""\
|
|
312
|
+
How to make target file be same file as source file.
|
|
313
|
+
By default uses hardlinks to save disk space.
|
|
314
|
+
Note that hardlinks only work within the same filesystem and are harder to track.
|
|
315
|
+
If you want to track cached files easily then use 'symlink'.
|
|
316
|
+
On Windows you need developer mode or admin privileges to create symlinks.
|
|
317
|
+
"""),
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _add_cacher_arguments(parser: argparse.ArgumentParser):
|
|
322
|
+
"""Add cacher arguments to parser."""
|
|
323
|
+
parser.add_argument(
|
|
324
|
+
"--no-cache",
|
|
325
|
+
action="store_true",
|
|
326
|
+
help="Disable caching of files to central location.",
|
|
327
|
+
)
|
|
328
|
+
parser.add_argument(
|
|
329
|
+
"--cache-dir",
|
|
330
|
+
type=Path,
|
|
331
|
+
default=user_cache_root_dir(),
|
|
332
|
+
help="Directory to use as cache for files.",
|
|
333
|
+
)
|
|
334
|
+
_add_copy_method_arguments(parser)
|
|
335
|
+
|
|
336
|
+
|
|
300
337
|
def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
|
|
301
338
|
"""Add retrieve pdbe subcommand parser."""
|
|
302
339
|
parser = subparsers.add_parser(
|
|
@@ -345,6 +382,11 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
|
|
|
345
382
|
help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
|
|
346
383
|
Default is 'summary' and 'cif'."""),
|
|
347
384
|
)
|
|
385
|
+
parser.add_argument(
|
|
386
|
+
"--gzip-files",
|
|
387
|
+
action="store_true",
|
|
388
|
+
help="Whether to gzip the downloaded files. Excludes summary files, they are always uncompressed.",
|
|
389
|
+
)
|
|
348
390
|
parser.add_argument(
|
|
349
391
|
"--max-parallel-downloads",
|
|
350
392
|
type=int,
|
|
@@ -561,6 +603,33 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
|
|
|
561
603
|
_add_filter_ss_parser(subsubparsers)
|
|
562
604
|
|
|
563
605
|
|
|
606
|
+
def _add_convert_subcommands(subparsers: argparse._SubParsersAction):
|
|
607
|
+
"""Add convert command."""
|
|
608
|
+
parser = subparsers.add_parser(
|
|
609
|
+
"convert", help="Convert structure files between formats", formatter_class=ArgumentDefaultsRichHelpFormatter
|
|
610
|
+
)
|
|
611
|
+
parser.add_argument(
|
|
612
|
+
"input_dir",
|
|
613
|
+
type=Path,
|
|
614
|
+
help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
|
|
615
|
+
)
|
|
616
|
+
parser.add_argument(
|
|
617
|
+
"--output-dir",
|
|
618
|
+
type=Path,
|
|
619
|
+
help=dedent("""\
|
|
620
|
+
Directory to write converted structure files. If not given, files are written to `input_dir`.
|
|
621
|
+
"""),
|
|
622
|
+
)
|
|
623
|
+
parser.add_argument(
|
|
624
|
+
"--format",
|
|
625
|
+
type=str,
|
|
626
|
+
choices=("cif",),
|
|
627
|
+
default="cif",
|
|
628
|
+
help="Output format to convert to.",
|
|
629
|
+
)
|
|
630
|
+
_add_copy_method_arguments(parser)
|
|
631
|
+
|
|
632
|
+
|
|
564
633
|
def _add_mcp_command(subparsers: argparse._SubParsersAction):
|
|
565
634
|
"""Add MCP command."""
|
|
566
635
|
|
|
@@ -580,38 +649,6 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
|
|
|
580
649
|
parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
|
|
581
650
|
|
|
582
651
|
|
|
583
|
-
def _add_copy_method_arguments(parser):
|
|
584
|
-
parser.add_argument(
|
|
585
|
-
"--copy-method",
|
|
586
|
-
type=str,
|
|
587
|
-
choices=copy_methods,
|
|
588
|
-
default="hardlink",
|
|
589
|
-
help=dedent("""\
|
|
590
|
-
How to make target file be same file as source file.
|
|
591
|
-
By default uses hardlinks to save disk space.
|
|
592
|
-
Note that hardlinks only work within the same filesystem and are harder to track.
|
|
593
|
-
If you want to track cached files easily then use 'symlink'.
|
|
594
|
-
On Windows you need developer mode or admin privileges to create symlinks.
|
|
595
|
-
"""),
|
|
596
|
-
)
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
def _add_cacher_arguments(parser: argparse.ArgumentParser):
|
|
600
|
-
"""Add cacher arguments to parser."""
|
|
601
|
-
parser.add_argument(
|
|
602
|
-
"--no-cache",
|
|
603
|
-
action="store_true",
|
|
604
|
-
help="Disable caching of files to central location.",
|
|
605
|
-
)
|
|
606
|
-
parser.add_argument(
|
|
607
|
-
"--cache-dir",
|
|
608
|
-
type=Path,
|
|
609
|
-
default=user_cache_root_dir(),
|
|
610
|
-
help="Directory to use as cache for files.",
|
|
611
|
-
)
|
|
612
|
-
_add_copy_method_arguments(parser)
|
|
613
|
-
|
|
614
|
-
|
|
615
652
|
def make_parser() -> argparse.ArgumentParser:
|
|
616
653
|
parser = argparse.ArgumentParser(
|
|
617
654
|
description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
|
|
@@ -624,27 +661,12 @@ def make_parser() -> argparse.ArgumentParser:
|
|
|
624
661
|
_add_search_subcommands(subparsers)
|
|
625
662
|
_add_retrieve_subcommands(subparsers)
|
|
626
663
|
_add_filter_subcommands(subparsers)
|
|
664
|
+
_add_convert_subcommands(subparsers)
|
|
627
665
|
_add_mcp_command(subparsers)
|
|
628
666
|
|
|
629
667
|
return parser
|
|
630
668
|
|
|
631
669
|
|
|
632
|
-
def main():
|
|
633
|
-
"""Main entry point for the CLI."""
|
|
634
|
-
parser = make_parser()
|
|
635
|
-
args = parser.parse_args()
|
|
636
|
-
logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
|
|
637
|
-
|
|
638
|
-
# Dispatch table to reduce complexity
|
|
639
|
-
cmd = args.command
|
|
640
|
-
sub = getattr(args, f"{cmd}_cmd", None)
|
|
641
|
-
handler = HANDLERS.get((cmd, sub))
|
|
642
|
-
if handler is None:
|
|
643
|
-
msg = f"Unknown command: {cmd} {sub}"
|
|
644
|
-
raise SystemExit(msg)
|
|
645
|
-
handler(args)
|
|
646
|
-
|
|
647
|
-
|
|
648
670
|
def _handle_search_uniprot(args):
|
|
649
671
|
taxon_id = args.taxon_id
|
|
650
672
|
reviewed = args.reviewed
|
|
@@ -798,6 +820,7 @@ def _handle_retrieve_alphafold(args):
|
|
|
798
820
|
alphafold_csv = args.alphafold_csv
|
|
799
821
|
max_parallel_downloads = args.max_parallel_downloads
|
|
800
822
|
cacher = _initialize_cacher(args)
|
|
823
|
+
gzip_files = args.gzip_files
|
|
801
824
|
|
|
802
825
|
if what_formats is None:
|
|
803
826
|
what_formats = {"summary", "cif"}
|
|
@@ -808,7 +831,12 @@ def _handle_retrieve_alphafold(args):
|
|
|
808
831
|
validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
|
|
809
832
|
rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
|
|
810
833
|
afs = af_fetch(
|
|
811
|
-
af_ids,
|
|
834
|
+
af_ids,
|
|
835
|
+
download_dir,
|
|
836
|
+
what=validated_what,
|
|
837
|
+
max_parallel_downloads=max_parallel_downloads,
|
|
838
|
+
cacher=cacher,
|
|
839
|
+
gzip_files=gzip_files,
|
|
812
840
|
)
|
|
813
841
|
total_nr_files = sum(af.nr_of_files() for af in afs)
|
|
814
842
|
rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
|
|
@@ -1017,24 +1045,24 @@ def _handle_mcp(args):
|
|
|
1017
1045
|
mcp.run(transport=args.transport, host=args.host, port=args.port)
|
|
1018
1046
|
|
|
1019
1047
|
|
|
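1020
|
-
HANDLERS: dict[tuple[str, str | None], Callable] = {
|
|
1021
|
-
("search", "uniprot"): _handle_search_uniprot,
|
|
1022
|
-
("search", "pdbe"): _handle_search_pdbe,
|
|
1023
|
-
("search", "alphafold"): _handle_search_alphafold,
|
|
1024
|
-
("search", "emdb"): _handle_search_emdb,
|
|
1025
|
-
("search", "go"): _handle_search_go,
|
|
1026
|
-
("search", "taxonomy"): _handle_search_taxonomy,
|
|
1027
|
-
("search", "interaction-partners"): _handle_search_interaction_partners,
|
|
1028
|
-
("search", "complexes"): _handle_search_complexes,
|
|
1029
|
-
("retrieve", "pdbe"): _handle_retrieve_pdbe,
|
|
1030
|
-
("retrieve", "alphafold"): _handle_retrieve_alphafold,
|
|
1031
|
-
("retrieve", "emdb"): _handle_retrieve_emdb,
|
|
1032
|
-
("filter", "confidence"): _handle_filter_confidence,
|
|
1033
|
-
("filter", "chain"): _handle_filter_chain,
|
|
1034
|
-
("filter", "residue"): _handle_filter_residue,
|
|
1035
|
-
("filter", "secondary-structure"): _handle_filter_ss,
|
|
1036
|
-
("mcp", None): _handle_mcp,
|
|
1037
|
-
}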
|
|
1048
|
+
def _handle_convert(args):
|
|
1049
|
+
input_dir = structure(args.input_dir, Path)
|
|
1050
|
+
output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
|
|
1051
|
+
copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
|
|
1052
|
+
|
|
1053
|
+
input_files = sorted(glob_structure_files(input_dir))
|
|
1054
|
+
rprint(f"Converting {len(input_files)} files in {input_dir} directory to cif format.")
|
|
1055
|
+
for _ in tqdm(
|
|
1056
|
+
convert_to_cif_files(
|
|
1057
|
+
input_files,
|
|
1058
|
+
output_dir,
|
|
1059
|
+
copy_method=copy_method,
|
|
1060
|
+
),
|
|
1061
|
+
total=len(input_files),
|
|
1062
|
+
unit="file",
|
|
1063
|
+
):
|
|
1064
|
+
pass
|
|
1065
|
+
rprint(f"Converted {len(input_files)} files into {output_dir}.")
|
|
1038
1066
|
|
|
1039
1067
|
|
|
1040
1068
|
def _read_lines(file: TextIOWrapper) -> list[str]:
|
|
@@ -1118,3 +1146,40 @@ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIO
|
|
|
1118
1146
|
members_str,
|
|
1119
1147
|
]
|
|
1120
1148
|
)
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
HANDLERS: dict[tuple[str, str | None], Callable] = {
|
|
1152
|
+
("search", "uniprot"): _handle_search_uniprot,
|
|
1153
|
+
("search", "pdbe"): _handle_search_pdbe,
|
|
1154
|
+
("search", "alphafold"): _handle_search_alphafold,
|
|
1155
|
+
("search", "emdb"): _handle_search_emdb,
|
|
1156
|
+
("search", "go"): _handle_search_go,
|
|
1157
|
+
("search", "taxonomy"): _handle_search_taxonomy,
|
|
1158
|
+
("search", "interaction-partners"): _handle_search_interaction_partners,
|
|
1159
|
+
("search", "complexes"): _handle_search_complexes,
|
|
1160
|
+
("retrieve", "pdbe"): _handle_retrieve_pdbe,
|
|
1161
|
+
("retrieve", "alphafold"): _handle_retrieve_alphafold,
|
|
1162
|
+
("retrieve", "emdb"): _handle_retrieve_emdb,
|
|
1163
|
+
("filter", "confidence"): _handle_filter_confidence,
|
|
1164
|
+
("filter", "chain"): _handle_filter_chain,
|
|
1165
|
+
("filter", "residue"): _handle_filter_residue,
|
|
1166
|
+
("filter", "secondary-structure"): _handle_filter_ss,
|
|
1167
|
+
("mcp", None): _handle_mcp,
|
|
1168
|
+
("convert", None): _handle_convert,
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
|
|
1172
|
+
def main():
|
|
1173
|
+
"""Main entry point for the CLI."""
|
|
1174
|
+
parser = make_parser()
|
|
1175
|
+
args = parser.parse_args()
|
|
1176
|
+
logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
|
|
1177
|
+
|
|
1178
|
+
# Dispatch table to reduce complexity
|
|
1179
|
+
cmd = args.command
|
|
1180
|
+
sub = getattr(args, f"{cmd}_cmd", None)
|
|
1181
|
+
handler = HANDLERS.get((cmd, sub))
|
|
1182
|
+
if handler is None:
|
|
1183
|
+
msg = f"Unknown command: {cmd} {sub}"
|
|
1184
|
+
raise SystemExit(msg)
|
|
1185
|
+
handler(args)
|
protein_quest/filters.py
CHANGED
|
@@ -11,10 +11,7 @@ from distributed.deploy.cluster import Cluster
|
|
|
11
11
|
from tqdm.auto import tqdm
|
|
12
12
|
|
|
13
13
|
from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
|
|
14
|
-
from protein_quest.pdbe.io import (
|
|
15
|
-
nr_residues_in_chain,
|
|
16
|
-
write_single_chain_pdb_file,
|
|
17
|
-
)
|
|
14
|
+
from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
|
|
18
15
|
from protein_quest.utils import CopyMethod, copyfile
|
|
19
16
|
|
|
20
17
|
logger = logging.getLogger(__name__)
|
|
@@ -38,7 +35,7 @@ def filter_file_on_chain(
|
|
|
38
35
|
input_file, chain_id = file_and_chain
|
|
39
36
|
logger.debug("Filtering %s on chain %s", input_file, chain_id)
|
|
40
37
|
try:
|
|
41
|
-
output_file = write_single_chain_pdb_file(
|
|
38
|
+
output_file = write_single_chain_structure_file(
|
|
42
39
|
input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
|
|
43
40
|
)
|
|
44
41
|
return ChainFilterStatistics(
|
protein_quest/io.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"""Module for structure file input/output."""
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import logging
|
|
5
|
+
import shutil
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import Generator, Iterable
|
|
8
|
+
from io import StringIO
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Literal, get_args
|
|
11
|
+
from urllib.request import urlopen
|
|
12
|
+
|
|
13
|
+
import gemmi
|
|
14
|
+
from mmcif.api.DictionaryApi import DictionaryApi
|
|
15
|
+
from mmcif.io.BinaryCifReader import BinaryCifReader
|
|
16
|
+
from mmcif.io.BinaryCifWriter import BinaryCifWriter
|
|
17
|
+
from mmcif.io.PdbxReader import PdbxReader
|
|
18
|
+
from mmcif.io.PdbxWriter import PdbxWriter
|
|
19
|
+
|
|
20
|
+
from protein_quest.utils import CopyMethod, copyfile, user_cache_root_dir
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# TODO remove once v0.7.4 of gemmi is released,
|
|
25
|
+
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
26
|
+
# Swallow gemmi leaked function warnings
|
|
27
|
+
gemmi.set_leak_warnings(False)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
StructureFileExtensions = Literal[".pdb", ".pdb.gz", ".ent", ".ent.gz", ".cif", ".cif.gz", ".bcif", ".bcif.gz"]
|
|
31
|
+
"""Type of supported structure file extensions."""
|
|
32
|
+
valid_structure_file_extensions: set[str] = set(get_args(StructureFileExtensions))
|
|
33
|
+
"""Set of valid structure file extensions."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def write_structure(structure: gemmi.Structure, path: Path):
|
|
37
|
+
"""Write a gemmi structure to a file.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
structure: The gemmi structure to write.
|
|
41
|
+
path: The file path to write the structure to.
|
|
42
|
+
The format depends on the file extension.
|
|
43
|
+
See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
|
|
44
|
+
for supported extensions.
|
|
45
|
+
|
|
46
|
+
Raises:
|
|
47
|
+
ValueError: If the file extension is not supported.
|
|
48
|
+
"""
|
|
49
|
+
if path.name.endswith(".pdb") or path.name.endswith(".ent"):
|
|
50
|
+
body: str = structure.make_pdb_string()
|
|
51
|
+
path.write_text(body)
|
|
52
|
+
elif path.name.endswith(".pdb.gz") or path.name.endswith(".ent.gz"):
|
|
53
|
+
body: str = structure.make_pdb_string()
|
|
54
|
+
with gzip.open(path, "wt") as f:
|
|
55
|
+
f.write(body)
|
|
56
|
+
elif path.name.endswith(".cif"):
|
|
57
|
+
# do not write chem_comp so it is viewable by molstar
|
|
58
|
+
# see https://github.com/project-gemmi/gemmi/discussions/362
|
|
59
|
+
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
60
|
+
doc.write_file(str(path))
|
|
61
|
+
elif path.name.endswith(".cif.gz"):
|
|
62
|
+
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
63
|
+
cif_str = doc.as_string()
|
|
64
|
+
with gzip.open(path, "wt") as f:
|
|
65
|
+
f.write(cif_str)
|
|
66
|
+
elif path.name.endswith(".bcif"):
|
|
67
|
+
structure2bcif(structure, path)
|
|
68
|
+
elif path.name.endswith(".bcif.gz"):
|
|
69
|
+
structure2bcifgz(structure, path)
|
|
70
|
+
else:
|
|
71
|
+
msg = f"Unsupported file extension in {path.name}. Supported extensions are: {valid_structure_file_extensions}"
|
|
72
|
+
raise ValueError(msg)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def read_structure(file: Path) -> gemmi.Structure:
|
|
76
|
+
"""Read a structure from a file.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
file: Path to the input structure file.
|
|
80
|
+
See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
|
|
81
|
+
for supported extensions.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
A gemmi Structure object representing the structure in the file.
|
|
85
|
+
"""
|
|
86
|
+
if file.name.endswith(".bcif"):
|
|
87
|
+
return bcif2structure(file)
|
|
88
|
+
if file.name.endswith(".bcif.gz"):
|
|
89
|
+
return bcifgz2structure(file)
|
|
90
|
+
return gemmi.read_structure(str(file))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def bcif2cif(bcif_file: Path) -> str:
|
|
94
|
+
"""Convert a binary CIF (bcif) file to a CIF string.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
bcif_file: Path to the binary CIF file.
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
A string containing the CIF representation of the structure.
|
|
101
|
+
"""
|
|
102
|
+
reader = BinaryCifReader()
|
|
103
|
+
container = reader.deserialize(str(bcif_file))
|
|
104
|
+
capture = StringIO()
|
|
105
|
+
writer = PdbxWriter(capture)
|
|
106
|
+
writer.write(container)
|
|
107
|
+
return capture.getvalue()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def bcifgz2structure(bcif_gz_file: Path) -> gemmi.Structure:
|
|
111
|
+
"""Read a binary CIF (bcif) gzipped file and return a gemmi Structure object.
|
|
112
|
+
|
|
113
|
+
This is slower than other formats because gemmi does not support reading bcif files directly.
|
|
114
|
+
So we first gunzip the file to a temporary location, convert it to a cif string using mmcif package,
|
|
115
|
+
and then read the cif string using gemmi.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
bcif_gz_file: Path to the binary CIF gzipped file.
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
A gemmi Structure object representing the structure in the bcif.gz file.
|
|
122
|
+
"""
|
|
123
|
+
with tempfile.NamedTemporaryFile(suffix=".bcif", delete=True) as tmp_bcif:
|
|
124
|
+
tmp_path = Path(tmp_bcif.name)
|
|
125
|
+
gunzip_file(bcif_gz_file, output_file=tmp_path, keep_original=True)
|
|
126
|
+
return bcif2structure(tmp_path)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def bcif2structure(bcif_file: Path) -> gemmi.Structure:
|
|
130
|
+
"""Read a binary CIF (bcif) file and return a gemmi Structure object.
|
|
131
|
+
|
|
132
|
+
This is slower than other formats because gemmi does not support reading bcif files directly.
|
|
133
|
+
So we convert it to a cif string first using mmcif package and then read the cif string using gemmi.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
bcif_file: Path to the binary CIF file.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
A gemmi Structure object representing the structure in the bcif file.
|
|
140
|
+
"""
|
|
141
|
+
cif_content = bcif2cif(bcif_file)
|
|
142
|
+
doc = gemmi.cif.read_string(cif_content)
|
|
143
|
+
block = doc.sole_block()
|
|
144
|
+
return gemmi.make_structure_from_block(block)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _initialize_dictionary_api(containers) -> DictionaryApi:
|
|
148
|
+
dict_local = user_cache_root_dir() / "mmcif_pdbx_v5_next.dic"
|
|
149
|
+
if not dict_local.exists():
|
|
150
|
+
dict_url = "https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v5_next.dic"
|
|
151
|
+
logger.info("Downloading mmcif dictionary from %s to %s", dict_url, dict_local)
|
|
152
|
+
dict_local.parent.mkdir(parents=True, exist_ok=True)
|
|
153
|
+
with dict_local.open("wb") as f, urlopen(dict_url) as response: # noqa: S310 url is hardcoded and https
|
|
154
|
+
f.write(response.read())
|
|
155
|
+
return DictionaryApi(containerList=containers, consolidate=True)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def structure2bcif(structure: gemmi.Structure, bcif_file: Path):
|
|
159
|
+
"""Write a gemmi Structure object to a binary CIF (bcif) file.
|
|
160
|
+
|
|
161
|
+
This is slower than other formats because gemmi does not support writing bcif files directly.
|
|
162
|
+
So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
structure: The gemmi Structure object to write.
|
|
166
|
+
bcif_file: Path to the output binary CIF file.
|
|
167
|
+
"""
|
|
168
|
+
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
169
|
+
containers = []
|
|
170
|
+
with StringIO(doc.as_string()) as sio:
|
|
171
|
+
reader = PdbxReader(sio)
|
|
172
|
+
reader.read(containers)
|
|
173
|
+
dict_api = _initialize_dictionary_api(containers)
|
|
174
|
+
writer = BinaryCifWriter(dictionaryApi=dict_api)
|
|
175
|
+
writer.serialize(str(bcif_file), containers)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def gunzip_file(gz_file: Path, output_file: Path | None = None, keep_original: bool = True) -> Path:
|
|
179
|
+
"""Unzip a .gz file.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
gz_file: Path to the .gz file.
|
|
183
|
+
output_file: Optional path to the output unzipped file. If None, the .gz suffix is removed from gz_file.
|
|
184
|
+
keep_original: Whether to keep the original .gz file. Default is True.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
Path to the unzipped file.
|
|
188
|
+
|
|
189
|
+
Raises:
|
|
190
|
+
ValueError: If output_file is None and gz_file does not end with .gz.
|
|
191
|
+
"""
|
|
192
|
+
if output_file is None and not gz_file.name.endswith(".gz"):
|
|
193
|
+
msg = f"If output_file is not provided, {gz_file} must end with .gz"
|
|
194
|
+
raise ValueError(msg)
|
|
195
|
+
out_file = output_file or gz_file.with_suffix("")
|
|
196
|
+
with gzip.open(gz_file, "rb") as f_in, out_file.open("wb") as f_out:
|
|
197
|
+
shutil.copyfileobj(f_in, f_out)
|
|
198
|
+
if not keep_original:
|
|
199
|
+
gz_file.unlink()
|
|
200
|
+
return out_file
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def structure2bcifgz(structure: gemmi.Structure, bcif_gz_file: Path):
    """Write a gemmi Structure object to a binary CIF gzipped (bcif.gz) file.

    This is slower than other formats because gemmi does not support writing bcif files directly.
    So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
    Finally, we gzip the bcif file.

    Args:
        structure: The gemmi Structure object to write.
        bcif_gz_file: Path to the output binary CIF gzipped file.
    """
    # Use a temporary directory instead of an open NamedTemporaryFile:
    # re-opening a named temporary file by its name while it is still open
    # is platform dependent (it fails on Windows), whereas a fresh path inside
    # a temporary directory can be written and read by name everywhere.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "structure.bcif"
        structure2bcif(structure, tmp_path)
        with tmp_path.open("rb") as f_in, gzip.open(bcif_gz_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def convert_to_cif_files(
    input_files: Iterable[Path], output_dir: Path, copy_method: CopyMethod
) -> Generator[tuple[Path, Path]]:
    """Lazily convert a series of structure files to .cif format.

    Each file is only converted when the generator is advanced to it.

    Args:
        input_files: Iterable of structure files to convert.
        output_dir: Directory to save the converted .cif files.
        copy_method: How to copy when no changes are needed to output file.

    Yields:
        A tuple of the input file and the output file.
    """
    yield from (
        (structure_file, convert_to_cif_file(structure_file, output_dir, copy_method))
        for structure_file in input_files
    )
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def convert_to_cif_file(input_file: Path, output_dir: Path, copy_method: CopyMethod) -> Path:
    """Convert a single structure file to .cif format.

    An already existing output file is never overwritten, the conversion is skipped instead.

    Args:
        input_file: The structure file to convert.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.
        output_dir: Directory to save the converted .cif file.
        copy_method: How to copy when no changes are needed to output file.

    Returns:
        Path to the converted .cif file.

    Raises:
        ValueError: If the extension of input_file can not be converted.
    """
    name, extension = split_name_and_extension(input_file.name)
    output_file = output_dir / f"{name}.cif"
    if output_file.exists():
        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
        return output_file

    if extension in {".pdb", ".pdb.gz", ".ent", ".ent.gz"}:
        structure = read_structure(input_file)
        write_structure(structure, output_file)
    elif extension == ".cif":
        logger.info("File %s is already in .cif format, copying to %s", input_file, output_dir)
        copyfile(input_file, output_file, copy_method)
    elif extension == ".cif.gz":
        gunzip_file(input_file, output_file=output_file, keep_original=True)
    elif extension == ".bcif":
        # Convert before creating the output file: if the conversion fails no empty
        # .cif is left behind, which a later run would skip as "already exists".
        cif_text = bcif2cif(input_file)
        with output_file.open("w") as f:
            f.write(cif_text)
    else:
        msg = (
            f"Unsupported file extension {extension} in {input_file}. "
            f"Supported extensions are {valid_structure_file_extensions}."
        )
        raise ValueError(msg)
    return output_file
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def split_name_and_extension(name: str) -> tuple[str, str]:
    """Split a filename into its name and extension.

    `.gz` is considered part of the extension if present.
    A leading dot (hidden file) or a trailing dot does not start an extension.

    Examples:
        Some example usages.

        >>> from protein_quest.io import split_name_and_extension
        >>> split_name_and_extension("1234.pdb")
        ('1234', '.pdb')
        >>> split_name_and_extension("1234.pdb.gz")
        ('1234', '.pdb.gz')

    Args:
        name: The filename to split.

    Returns:
        A tuple containing the name and the extension.
    """
    gz_suffix = ""
    if name.endswith(".gz"):
        gz_suffix = ".gz"
        name = name.removesuffix(".gz")
    dot = name.rfind(".")
    # The dot must have at least one character on both sides to count as
    # the start of an extension (so ".hidden" and "1234." are left intact).
    if 0 < dot < len(name) - 1:
        return name[:dot], name[dot:] + gz_suffix
    return name, gz_suffix
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def locate_structure_file(root: Path, pdb_id: str) -> Path:
    """Find the structure file that belongs to a PDB ID inside a directory.

    Every extension in [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
    is tried in turn. For each extension the PDB ID is tried as given, lower-cased,
    upper-cased and lower-cased with a `pdb` prefix. The first existing file wins.

    Args:
        root: The root directory to search in.
        pdb_id: The PDB ID to locate.

    Returns:
        The path to the located structure file.

    Raises:
        FileNotFoundError: If no structure file is found for the given PDB ID.
    """
    # Spellings of the file stem, in order of preference.
    stems = (pdb_id, pdb_id.lower(), pdb_id.upper(), f"pdb{pdb_id.lower()}")
    for ext in valid_structure_file_extensions:
        for stem in stems:
            candidate = root / f"{stem}{ext}"
            if candidate.exists():
                return candidate
    raise FileNotFoundError(f"No structure file found for {pdb_id} in {root}")
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def glob_structure_files(input_dir: Path) -> Generator[Path]:
    """Iterate over the structure files that sit directly in a directory.

    A file counts as a structure file when its name ends with one of the
    [StructureFileExtensions][protein_quest.io.StructureFileExtensions].
    Sub-directories are not searched.

    Args:
        input_dir: The input directory to search for structure files.

    Yields:
        Paths to the found structure files.
    """
    for ext in valid_structure_file_extensions:
        pattern = f"*{ext}"
        for structure_file in input_dir.glob(pattern):
            yield structure_file
|
protein_quest/mcp_server.py
CHANGED
|
@@ -45,9 +45,10 @@ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
|
|
|
45
45
|
from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
|
|
46
46
|
from protein_quest.emdb import fetch as emdb_fetch
|
|
47
47
|
from protein_quest.go import search_gene_ontology_term
|
|
48
|
+
from protein_quest.io import convert_to_cif_file, glob_structure_files
|
|
48
49
|
from protein_quest.pdbe.fetch import fetch as pdbe_fetch
|
|
49
|
-
from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
|
|
50
50
|
from protein_quest.ss import filter_file_on_secondary_structure
|
|
51
|
+
from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
|
|
51
52
|
from protein_quest.taxonomy import search_taxon
|
|
52
53
|
from protein_quest.uniprot import (
|
|
53
54
|
PdbResult,
|
|
@@ -112,18 +113,18 @@ def extract_single_chain_from_structure(
|
|
|
112
113
|
out_chain: str = "A",
|
|
113
114
|
) -> Path:
|
|
114
115
|
"""
|
|
115
|
-
Extract a single chain from a mmCIF
|
|
116
|
+
Extract a single chain from a structure (mmCIF or pdb) file and write to a new file.
|
|
116
117
|
|
|
117
118
|
Args:
|
|
118
|
-
input_file: Path to the input mmCIF
|
|
119
|
+
input_file: Path to the input structure (mmCIF or pdb) file.
|
|
119
120
|
chain2keep: The chain to keep.
|
|
120
121
|
output_dir: Directory to save the output file.
|
|
121
122
|
out_chain: The chain identifier for the output file.
|
|
122
123
|
|
|
123
124
|
Returns:
|
|
124
|
-
Path to the output mmCIF
|
|
125
|
+
Path to the output structure (mmCIF or pdb) file
|
|
125
126
|
"""
|
|
126
|
-
return
|
|
127
|
+
return write_single_chain_structure_file(input_file, chain2keep, output_dir, out_chain)
|
|
127
128
|
|
|
128
129
|
|
|
129
130
|
@mcp.tool
|
|
@@ -199,6 +200,8 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
|
|
|
199
200
|
|
|
200
201
|
mcp.tool(filter_file_on_secondary_structure)
|
|
201
202
|
|
|
203
|
+
mcp.tool(convert_to_cif_file)
|
|
204
|
+
|
|
202
205
|
|
|
203
206
|
@mcp.prompt
|
|
204
207
|
def candidate_structures(
|
protein_quest/ss.py
CHANGED
|
@@ -5,17 +5,13 @@ from collections.abc import Generator, Iterable
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
from gemmi import Structure
|
|
8
|
+
from gemmi import Structure
|
|
9
9
|
|
|
10
10
|
from protein_quest.converter import PositiveInt, Ratio, converter
|
|
11
|
+
from protein_quest.io import read_structure
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger(__name__)
|
|
13
14
|
|
|
14
|
-
# TODO remove once v0.7.4 of gemmi is released,
|
|
15
|
-
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
16
|
-
# Swallow gemmi leaked function warnings
|
|
17
|
-
set_leak_warnings(False)
|
|
18
|
-
|
|
19
15
|
# TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
|
|
20
16
|
# https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
|
|
21
17
|
# gemmi executable is in https://pypi.org/project/gemmi-program/
|
|
@@ -261,7 +257,7 @@ def filter_file_on_secondary_structure(
|
|
|
261
257
|
Returns:
|
|
262
258
|
Filtering statistics and whether file passed.
|
|
263
259
|
"""
|
|
264
|
-
structure = read_structure(
|
|
260
|
+
structure = read_structure(file_path)
|
|
265
261
|
return filter_on_secondary_structure(structure, query)
|
|
266
262
|
|
|
267
263
|
|
|
@@ -1,51 +1,29 @@
|
|
|
1
|
-
"""Module for
|
|
1
|
+
"""Module for querying and modifying [gemmi structures][gemmi.Structure]."""
|
|
2
2
|
|
|
3
|
-
import gzip
|
|
4
3
|
import logging
|
|
5
|
-
from collections.abc import
|
|
4
|
+
from collections.abc import Iterable
|
|
6
5
|
from datetime import UTC, datetime
|
|
7
6
|
from pathlib import Path
|
|
8
7
|
|
|
9
8
|
import gemmi
|
|
10
9
|
|
|
11
10
|
from protein_quest.__version__ import __version__
|
|
11
|
+
from protein_quest.io import read_structure, split_name_and_extension, write_structure
|
|
12
12
|
from protein_quest.utils import CopyMethod, copyfile
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
16
|
-
# TODO remove once v0.7.4 of gemmi is released,
|
|
17
|
-
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
18
|
-
# Swallow gemmi leaked function warnings
|
|
19
|
-
gemmi.set_leak_warnings(False)
|
|
20
16
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"""Returns the number of residues in a specific chain from a mmCIF/pdb file.
|
|
17
|
+
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
|
|
18
|
+
"""Find a chain in a model.
|
|
24
19
|
|
|
25
20
|
Args:
|
|
26
|
-
|
|
27
|
-
|
|
21
|
+
model: The gemmi model to search in.
|
|
22
|
+
wanted_chain: The chain identifier to search for.
|
|
28
23
|
|
|
29
24
|
Returns:
|
|
30
|
-
The
|
|
25
|
+
The found chain or None if not found.
|
|
31
26
|
"""
|
|
32
|
-
structure = gemmi.read_structure(str(file))
|
|
33
|
-
gchain = find_chain_in_structure(structure, chain)
|
|
34
|
-
if gchain is None:
|
|
35
|
-
logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
|
|
36
|
-
return 0
|
|
37
|
-
return len(gchain)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
|
|
41
|
-
for model in structure:
|
|
42
|
-
chain = find_chain_in_model(model, wanted_chain)
|
|
43
|
-
if chain is not None:
|
|
44
|
-
return chain
|
|
45
|
-
return None
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
|
|
49
27
|
chain = model.find_chain(wanted_chain)
|
|
50
28
|
if chain is None:
|
|
51
29
|
# For chain A in 4v92 the find_chain method returns None,
|
|
@@ -57,106 +35,39 @@ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain |
|
|
|
57
35
|
return chain
|
|
58
36
|
|
|
59
37
|
|
|
60
|
-
def
|
|
61
|
-
"""
|
|
62
|
-
|
|
63
|
-
Args:
|
|
64
|
-
structure: The gemmi structure to write.
|
|
65
|
-
path: The file path to write the structure to.
|
|
66
|
-
The format depends on the file extension.
|
|
67
|
-
Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.
|
|
68
|
-
|
|
69
|
-
Raises:
|
|
70
|
-
ValueError: If the file extension is not supported.
|
|
71
|
-
"""
|
|
72
|
-
if path.name.endswith(".pdb"):
|
|
73
|
-
body: str = structure.make_pdb_string()
|
|
74
|
-
path.write_text(body)
|
|
75
|
-
elif path.name.endswith(".pdb.gz"):
|
|
76
|
-
body: str = structure.make_pdb_string()
|
|
77
|
-
with gzip.open(path, "wt") as f:
|
|
78
|
-
f.write(body)
|
|
79
|
-
elif path.name.endswith(".cif"):
|
|
80
|
-
# do not write chem_comp so it is viewable by molstar
|
|
81
|
-
# see https://github.com/project-gemmi/gemmi/discussions/362
|
|
82
|
-
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
83
|
-
doc.write_file(str(path))
|
|
84
|
-
elif path.name.endswith(".cif.gz"):
|
|
85
|
-
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
86
|
-
cif_str = doc.as_string()
|
|
87
|
-
with gzip.open(path, "wt") as f:
|
|
88
|
-
f.write(cif_str)
|
|
89
|
-
else:
|
|
90
|
-
msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
|
|
91
|
-
raise ValueError(msg)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def _split_name_and_extension(name: str) -> tuple[str, str]:
|
|
95
|
-
# 1234.pdb -> (1234, .pdb)
|
|
96
|
-
# 1234.pdb.gz -> (1234, .pdb.gz)
|
|
97
|
-
# 1234.cif -> (1234, .cif)
|
|
98
|
-
# 1234.cif.gz -> (1234, .cif.gz)
|
|
99
|
-
if name.endswith(".pdb.gz"):
|
|
100
|
-
return name.replace(".pdb.gz", ""), ".pdb.gz"
|
|
101
|
-
if name.endswith(".cif.gz"):
|
|
102
|
-
return name.replace(".cif.gz", ""), ".cif.gz"
|
|
103
|
-
if name.endswith(".pdb"):
|
|
104
|
-
return name.replace(".pdb", ""), ".pdb"
|
|
105
|
-
if name.endswith(".cif"):
|
|
106
|
-
return name.replace(".cif", ""), ".cif"
|
|
107
|
-
|
|
108
|
-
msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
|
|
109
|
-
raise ValueError(msg)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def locate_structure_file(root: Path, pdb_id: str) -> Path:
|
|
113
|
-
"""Locate a structure file for a given PDB ID in the specified directory.
|
|
38
|
+
def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
|
|
39
|
+
"""Find a chain in a structure.
|
|
114
40
|
|
|
115
41
|
Args:
|
|
116
|
-
|
|
117
|
-
|
|
42
|
+
structure: The gemmi structure to search in.
|
|
43
|
+
wanted_chain: The chain identifier to search for.
|
|
118
44
|
|
|
119
45
|
Returns:
|
|
120
|
-
The
|
|
121
|
-
|
|
122
|
-
Raises:
|
|
123
|
-
FileNotFoundError: If no structure file is found for the given PDB ID.
|
|
46
|
+
The found chain or None if not found.
|
|
124
47
|
"""
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
root / f"{pdb_id.upper()}{ext}",
|
|
131
|
-
root / f"pdb{pdb_id.lower()}{ext}",
|
|
132
|
-
)
|
|
133
|
-
for candidate in candidates:
|
|
134
|
-
if candidate.exists():
|
|
135
|
-
return candidate
|
|
136
|
-
msg = f"No structure file found for {pdb_id} in {root}"
|
|
137
|
-
raise FileNotFoundError(msg)
|
|
48
|
+
for model in structure:
|
|
49
|
+
chain = find_chain_in_model(model, wanted_chain)
|
|
50
|
+
if chain is not None:
|
|
51
|
+
return chain
|
|
52
|
+
return None
|
|
138
53
|
|
|
139
54
|
|
|
140
|
-
def
|
|
141
|
-
"""
|
|
55
|
+
def nr_residues_in_chain(file: Path, chain: str = "A") -> int:
|
|
56
|
+
"""Returns the number of residues in a specific chain from a structure file.
|
|
142
57
|
|
|
143
58
|
Args:
|
|
144
|
-
|
|
59
|
+
file: Path to the input structure file.
|
|
60
|
+
chain: Chain to count residues of.
|
|
145
61
|
|
|
146
|
-
|
|
147
|
-
|
|
62
|
+
Returns:
|
|
63
|
+
The number of residues in the specified chain.
|
|
148
64
|
"""
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
|
|
157
|
-
super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
|
|
158
|
-
self.chain_id = chain
|
|
159
|
-
self.file = file
|
|
65
|
+
structure = read_structure(file)
|
|
66
|
+
gchain = find_chain_in_structure(structure, chain)
|
|
67
|
+
if gchain is None:
|
|
68
|
+
logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
|
|
69
|
+
return 0
|
|
70
|
+
return len(gchain)
|
|
160
71
|
|
|
161
72
|
|
|
162
73
|
def _dedup_helices(structure: gemmi.Structure):
|
|
@@ -198,18 +109,34 @@ def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain:
|
|
|
198
109
|
|
|
199
110
|
|
|
200
111
|
def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
|
|
201
|
-
"""Get a list of chains in a structure.
|
|
112
|
+
"""Get a list of chains in a structure.
|
|
113
|
+
|
|
114
|
+
Args:
|
|
115
|
+
structure: The gemmi structure to get chains from.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
A set of chains in the structure.
|
|
119
|
+
"""
|
|
202
120
|
return {c for model in structure for c in model}
|
|
203
121
|
|
|
204
122
|
|
|
205
|
-
|
|
123
|
+
class ChainNotFoundError(IndexError):
|
|
124
|
+
"""Exception raised when a chain is not found in a structure."""
|
|
125
|
+
|
|
126
|
+
def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
|
|
127
|
+
super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
|
|
128
|
+
self.chain_id = chain
|
|
129
|
+
self.file = file
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def write_single_chain_structure_file(
|
|
206
133
|
input_file: Path,
|
|
207
134
|
chain2keep: str,
|
|
208
135
|
output_dir: Path,
|
|
209
136
|
out_chain: str = "A",
|
|
210
137
|
copy_method: CopyMethod = "copy",
|
|
211
138
|
) -> Path:
|
|
212
|
-
"""Write a single chain from a
|
|
139
|
+
"""Write a single chain from a structure file to a new structure file.
|
|
213
140
|
|
|
214
141
|
Also
|
|
215
142
|
|
|
@@ -226,14 +153,14 @@ def write_single_chain_pdb_file(
|
|
|
226
153
|
```
|
|
227
154
|
|
|
228
155
|
Args:
|
|
229
|
-
input_file: Path to the input
|
|
156
|
+
input_file: Path to the input structure file.
|
|
230
157
|
chain2keep: The chain to keep.
|
|
231
158
|
output_dir: Directory to save the output file.
|
|
232
159
|
out_chain: The chain identifier for the output file.
|
|
233
160
|
copy_method: How to copy when no changes are needed to output file.
|
|
234
161
|
|
|
235
162
|
Returns:
|
|
236
|
-
Path to the output
|
|
163
|
+
Path to the output structure file
|
|
237
164
|
|
|
238
165
|
Raises:
|
|
239
166
|
FileNotFoundError: If the input file does not exist.
|
|
@@ -241,7 +168,7 @@ def write_single_chain_pdb_file(
|
|
|
241
168
|
"""
|
|
242
169
|
|
|
243
170
|
logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
|
|
244
|
-
structure =
|
|
171
|
+
structure = read_structure(input_file)
|
|
245
172
|
structure.setup_entities()
|
|
246
173
|
|
|
247
174
|
chain = find_chain_in_structure(structure, chain2keep)
|
|
@@ -249,7 +176,7 @@ def write_single_chain_pdb_file(
|
|
|
249
176
|
if chain is None:
|
|
250
177
|
raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
|
|
251
178
|
chain_name = chain.name
|
|
252
|
-
name, extension =
|
|
179
|
+
name, extension = split_name_and_extension(input_file.name)
|
|
253
180
|
output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
|
|
254
181
|
|
|
255
182
|
if output_file.exists():
|
protein_quest/utils.py
CHANGED
|
@@ -265,6 +265,7 @@ async def retrieve_files(
|
|
|
265
265
|
desc: str = "Downloading files",
|
|
266
266
|
cacher: Cacher | None = None,
|
|
267
267
|
chunk_size: int = 524288, # 512 KiB
|
|
268
|
+
gzip_files: bool = False,
|
|
268
269
|
) -> list[Path]:
|
|
269
270
|
"""Retrieve files from a list of URLs and save them to a directory.
|
|
270
271
|
|
|
@@ -277,6 +278,7 @@ async def retrieve_files(
|
|
|
277
278
|
desc: Description for the progress bar.
|
|
278
279
|
cacher: An optional cacher to use for caching files.
|
|
279
280
|
chunk_size: The size of each chunk to read from the response.
|
|
281
|
+
gzip_files: Whether to gzip the downloaded files.
|
|
280
282
|
|
|
281
283
|
Returns:
|
|
282
284
|
A list of paths to the downloaded files.
|
|
@@ -292,6 +294,7 @@ async def retrieve_files(
|
|
|
292
294
|
semaphore=semaphore,
|
|
293
295
|
cacher=cacher,
|
|
294
296
|
chunk_size=chunk_size,
|
|
297
|
+
gzip_files=gzip_files,
|
|
295
298
|
)
|
|
296
299
|
for url, filename in urls
|
|
297
300
|
]
|
|
@@ -299,6 +302,10 @@ async def retrieve_files(
|
|
|
299
302
|
return files
|
|
300
303
|
|
|
301
304
|
|
|
305
|
+
class InvalidContentEncodingError(aiohttp.ClientResponseError):
    """Error for a response whose content encoding is not the required one."""
|
|
307
|
+
|
|
308
|
+
|
|
302
309
|
async def _retrieve_file(
|
|
303
310
|
session: RetryClient,
|
|
304
311
|
url: URL | str,
|
|
@@ -306,6 +313,7 @@ async def _retrieve_file(
|
|
|
306
313
|
semaphore: asyncio.Semaphore,
|
|
307
314
|
cacher: Cacher | None = None,
|
|
308
315
|
chunk_size: int = 524288, # 512 KiB
|
|
316
|
+
gzip_files: bool = False,
|
|
309
317
|
) -> Path:
|
|
310
318
|
"""Retrieve a single file from a URL and save it to a specified path.
|
|
311
319
|
|
|
@@ -316,6 +324,7 @@ async def _retrieve_file(
|
|
|
316
324
|
semaphore: A semaphore to limit the number of concurrent downloads.
|
|
317
325
|
cacher: An optional cacher to use for caching files.
|
|
318
326
|
chunk_size: The size of each chunk to read from the response.
|
|
327
|
+
gzip_files: Whether to gzip the downloaded file.
|
|
319
328
|
|
|
320
329
|
Returns:
|
|
321
330
|
The path to the saved file.
|
|
@@ -330,12 +339,27 @@ async def _retrieve_file(
|
|
|
330
339
|
logger.debug(f"File {save_path} was copied from cache {cached_file}. Skipping download from {url}.")
|
|
331
340
|
return save_path
|
|
332
341
|
|
|
342
|
+
# Alphafold server and many other web servers can return gzipped responses,
|
|
343
|
+
# when we want to save as *.gz, we use raw stream
|
|
344
|
+
# otherwise aiohttp will decompress it automatically for us.
|
|
345
|
+
auto_decompress = not gzip_files
|
|
346
|
+
headers = {"Accept-Encoding": "gzip"}
|
|
333
347
|
async with (
|
|
334
348
|
semaphore,
|
|
335
|
-
session.get(url) as resp,
|
|
349
|
+
session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
|
|
336
350
|
):
|
|
337
351
|
resp.raise_for_status()
|
|
338
|
-
|
|
352
|
+
if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
|
|
353
|
+
msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
|
|
354
|
+
raise InvalidContentEncodingError(
|
|
355
|
+
request_info=resp.request_info,
|
|
356
|
+
history=resp.history,
|
|
357
|
+
status=415,
|
|
358
|
+
message=msg,
|
|
359
|
+
headers=resp.headers,
|
|
360
|
+
)
|
|
361
|
+
iterator = resp.content.iter_chunked(chunk_size)
|
|
362
|
+
await cacher.write_iter(save_path, iterator)
|
|
339
363
|
return save_path
|
|
340
364
|
|
|
341
365
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -17,6 +17,7 @@ Requires-Dist: cattrs[orjson]>=24.1.3
|
|
|
17
17
|
Requires-Dist: dask>=2025.5.1
|
|
18
18
|
Requires-Dist: distributed>=2025.5.1
|
|
19
19
|
Requires-Dist: gemmi>=0.7.3
|
|
20
|
+
Requires-Dist: mmcif>=0.92.0
|
|
20
21
|
Requires-Dist: platformdirs>=4.3.8
|
|
21
22
|
Requires-Dist: psutil>=7.0.0
|
|
22
23
|
Requires-Dist: rich-argparse>=1.7.1
|
|
@@ -71,6 +72,7 @@ graph TB;
|
|
|
71
72
|
fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
|
|
72
73
|
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
73
74
|
residuefilter --> |mmcif_files| ssfilter
|
|
75
|
+
ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
|
|
74
76
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
75
77
|
goterm:::dashedBorder
|
|
76
78
|
taxonomy:::dashedBorder
|
|
@@ -78,6 +80,7 @@ graph TB;
|
|
|
78
80
|
fetchemdb:::dashedBorder
|
|
79
81
|
searchintactionpartners:::dashedBorder
|
|
80
82
|
searchcomplexes:::dashedBorder
|
|
83
|
+
convert2cif:::dashedBorder
|
|
81
84
|
```
|
|
82
85
|
|
|
83
86
|
(Dotted nodes and edges are side-quests.)
|
|
@@ -242,6 +245,14 @@ query_protein,complex_id,complex_url,complex_title,members
|
|
|
242
245
|
Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
|
|
243
246
|
```
|
|
244
247
|
|
|
248
|
+
### Convert structure files to .cif format
|
|
249
|
+
|
|
250
|
+
Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with `.cif` files and not `*.cif.gz` or `*.bcif` files.
|
|
251
|
+
|
|
252
|
+
```shell
|
|
253
|
+
protein-quest convert --output-dir ./filtered-cif ./filtered-ss
|
|
254
|
+
```
|
|
255
|
+
|
|
245
256
|
## Model Context Protocol (MCP) server
|
|
246
257
|
|
|
247
258
|
Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
protein_quest/__version__.py,sha256=z_nR_Ti0YfIwFSKDD18DIrz_r3zxWQ8EGCNr2XUWkY0,56
|
|
3
|
+
protein_quest/cli.py,sha256=pWwMIzWBrtqhZbvTIkvd1XhA5u9J-WAAg7A3hJZGtlk,46201
|
|
4
|
+
protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
|
|
5
|
+
protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
|
|
6
|
+
protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
|
|
7
|
+
protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
|
|
8
|
+
protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
|
|
9
|
+
protein_quest/mcp_server.py,sha256=rQv2srhF3_SYYK1TD3htIyxNiunU7a8FDC7CYT_oJFE,8269
|
|
10
|
+
protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
|
|
11
|
+
protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
|
|
13
|
+
protein_quest/structure.py,sha256=1FTKN0mYKTwZHlyIB4ORSAgSHFKK-UAK7T-qoFo1vyI,7162
|
|
14
|
+
protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
|
|
15
|
+
protein_quest/uniprot.py,sha256=92G5YiJAJwUBKJQHPrM6DZlaLe-XG4qBg0zy0BDGFYY,24354
|
|
16
|
+
protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
|
|
17
|
+
protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
|
|
18
|
+
protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
|
|
19
|
+
protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
|
|
20
|
+
protein_quest/alphafold/fetch.py,sha256=n5SlqbQfU1PE4X8saV4O1nCrKRn3Q2UcMlrNw5-163w,12801
|
|
21
|
+
protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
|
|
22
|
+
protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
|
|
23
|
+
protein_quest-0.6.0.dist-info/METADATA,sha256=8rX0ixi4Xl516LkxOlOKKRe364nKIjP7mKn67xuOcDA,9623
|
|
24
|
+
protein_quest-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
25
|
+
protein_quest-0.6.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
|
|
26
|
+
protein_quest-0.6.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
27
|
+
protein_quest-0.6.0.dist-info/RECORD,,
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
protein_quest/__version__.py,sha256=iRjDp09jO2JFmZdsWS3ikyYYQ8S33AzhMdrr00gEG9g,56
|
|
3
|
-
protein_quest/cli.py,sha256=xiXt_2l3MxbTbmxm2sz0w8_OdJr8gz_B68GBVv5wHjE,44182
|
|
4
|
-
protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
|
|
5
|
-
protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
|
|
6
|
-
protein_quest/filters.py,sha256=-gasSXR4g5SzYSYbkfcDwR-tm2KCAhCMdpIVJrUPR1w,5224
|
|
7
|
-
protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
|
|
8
|
-
protein_quest/mcp_server.py,sha256=PCXxcU3GElKg2sjMlxbsM63OiFxg9AtmfKwBJ1_0AQE,8130
|
|
9
|
-
protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
|
|
10
|
-
protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
protein_quest/ss.py,sha256=qOr0aMycNAtZmXXvhCN-KZH3Qp4EejnBcE6fsFgCrmY,10343
|
|
12
|
-
protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
|
|
13
|
-
protein_quest/uniprot.py,sha256=92G5YiJAJwUBKJQHPrM6DZlaLe-XG4qBg0zy0BDGFYY,24354
|
|
14
|
-
protein_quest/utils.py,sha256=2lQ7jPHWtDySBTYnoL9VTKl5XUgQVYgp9Prb7qEnjtQ,17982
|
|
15
|
-
protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
|
|
16
|
-
protein_quest/alphafold/confidence.py,sha256=pYIuwYdkuPuHLagcX1dSvSyZ_84xboRLfHUxkEoc4MY,6766
|
|
17
|
-
protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
|
|
18
|
-
protein_quest/alphafold/fetch.py,sha256=wIsgPZmtnE5EoAL9G22Y6Ehx9d0md53Mw88-6LLGp0Q,12298
|
|
19
|
-
protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
|
|
20
|
-
protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
|
|
21
|
-
protein_quest/pdbe/io.py,sha256=iGLvmsD-eEYnrgZDYfkGWIDCzwDRRD5dwqB480talCs,10037
|
|
22
|
-
protein_quest-0.5.1.dist-info/METADATA,sha256=MPfZLLa8XC1tZ3okRYIT3Hs3pMvd8ShA17Yy2axKBe8,9236
|
|
23
|
-
protein_quest-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
24
|
-
protein_quest-0.5.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
|
|
25
|
-
protein_quest-0.5.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
26
|
-
protein_quest-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|