protein-quest 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

@@ -1,2 +1,2 @@
1
- __version__ = "0.5.1"
1
+ __version__ = "0.6.0"
2
2
  """The version of the package."""
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
  import gemmi
9
9
 
10
10
  from protein_quest.converter import Percentage, PositiveInt, converter
11
- from protein_quest.pdbe.io import write_structure
11
+ from protein_quest.io import read_structure, write_structure
12
12
  from protein_quest.ss import nr_of_residues_in_total
13
13
  from protein_quest.utils import CopyMethod, copyfile
14
14
 
@@ -127,7 +127,7 @@ def filter_file_on_residues(
127
127
  result with filtered_file property set to Path where filtered PDB file is saved.
128
128
  or None if structure was filtered out.
129
129
  """
130
- structure = gemmi.read_structure(str(file))
130
+ structure = read_structure(file)
131
131
  residues = set(find_high_confidence_residues(structure, query.confidence))
132
132
  count = len(residues)
133
133
  if count < query.min_residues or count > query.max_residues:
@@ -125,15 +125,15 @@ async def fetch_summary(
125
125
  fn: AsyncPath | None = None
126
126
  if save_dir is not None:
127
127
  fn = AsyncPath(save_dir / f"{qualifier}.json")
128
+ if await fn.exists():
129
+ logger.debug(f"File {fn} already exists. Skipping download from {url}.")
130
+ raw_data = await fn.read_bytes()
131
+ return converter.loads(raw_data, list[EntrySummary])
128
132
  cached_file = await cacher.copy_from_cache(Path(fn))
129
133
  if cached_file is not None:
130
134
  logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
131
135
  raw_data = await AsyncPath(cached_file).read_bytes()
132
136
  return converter.loads(raw_data, list[EntrySummary])
133
- if await fn.exists():
134
- logger.debug(f"File {fn} already exists. Skipping download from {url}.")
135
- raw_data = await fn.read_bytes()
136
- return converter.loads(raw_data, list[EntrySummary])
137
137
  async with semaphore, session.get(url) as response:
138
138
  response.raise_for_status()
139
139
  raw_data = await response.content.read()
@@ -170,6 +170,7 @@ async def fetch_many_async(
170
170
  what: set[DownloadableFormat],
171
171
  max_parallel_downloads: int = 5,
172
172
  cacher: Cacher | None = None,
173
+ gzip_files: bool = False,
173
174
  ) -> AsyncGenerator[AlphaFoldEntry]:
174
175
  """Asynchronously fetches summaries and files from
175
176
  [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -180,6 +181,7 @@ async def fetch_many_async(
180
181
  what: A set of formats to download.
181
182
  max_parallel_downloads: The maximum number of parallel downloads.
182
183
  cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
184
+ gzip_files: Whether to gzip the downloaded files.
183
185
 
184
186
  Yields:
185
187
  A dataclass containing the summary, pdb file, and pae file.
@@ -193,7 +195,7 @@ async def fetch_many_async(
193
195
  )
194
196
  ]
195
197
 
196
- files = files_to_download(what, summaries)
198
+ files = files_to_download(what, summaries, gzip_files)
197
199
 
198
200
  await retrieve_files(
199
201
  files,
@@ -201,36 +203,40 @@ async def fetch_many_async(
201
203
  desc="Downloading AlphaFold files",
202
204
  max_parallel_downloads=max_parallel_downloads,
203
205
  cacher=cacher,
206
+ gzip_files=gzip_files,
204
207
  )
208
+ gzext = ".gz" if gzip_files else ""
205
209
  for summary in summaries:
206
210
  yield AlphaFoldEntry(
207
211
  uniprot_acc=summary.uniprotAccession,
208
212
  summary=summary,
209
213
  summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
210
- bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
211
- cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
212
- pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
213
- pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
214
- pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
214
+ bcif_file=save_dir / (summary.bcifUrl.name + gzext) if "bcif" in what else None,
215
+ cif_file=save_dir / (summary.cifUrl.name + gzext) if "cif" in what else None,
216
+ pdb_file=save_dir / (summary.pdbUrl.name + gzext) if "pdb" in what else None,
217
+ pae_image_file=save_dir / (summary.paeImageUrl.name + gzext) if "paeImage" in what else None,
218
+ pae_doc_file=save_dir / (summary.paeDocUrl.name + gzext) if "paeDoc" in what else None,
215
219
  am_annotations_file=(
216
- save_dir / summary.amAnnotationsUrl.name
220
+ save_dir / (summary.amAnnotationsUrl.name + gzext)
217
221
  if "amAnnotations" in what and summary.amAnnotationsUrl
218
222
  else None
219
223
  ),
220
224
  am_annotations_hg19_file=(
221
- save_dir / summary.amAnnotationsHg19Url.name
225
+ save_dir / (summary.amAnnotationsHg19Url.name + gzext)
222
226
  if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
223
227
  else None
224
228
  ),
225
229
  am_annotations_hg38_file=(
226
- save_dir / summary.amAnnotationsHg38Url.name
230
+ save_dir / (summary.amAnnotationsHg38Url.name + gzext)
227
231
  if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
228
232
  else None
229
233
  ),
230
234
  )
231
235
 
232
236
 
233
- def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
237
+ def files_to_download(
238
+ what: set[DownloadableFormat], summaries: Iterable[EntrySummary], gzip_files: bool
239
+ ) -> set[tuple[URL, str]]:
234
240
  if not (set(what) <= downloadable_formats):
235
241
  msg = (
236
242
  f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -238,7 +244,7 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
238
244
  )
239
245
  raise ValueError(msg)
240
246
 
241
- files: set[tuple[URL, str]] = set()
247
+ url_filename_pairs: set[tuple[URL, str]] = set()
242
248
  for summary in summaries:
243
249
  for fmt in what:
244
250
  if fmt == "summary":
@@ -248,9 +254,10 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
248
254
  if url is None:
249
255
  logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
250
256
  continue
251
- file = (url, url.name)
252
- files.add(file)
253
- return files
257
+ fn = url.name + (".gz" if gzip_files else "")
258
+ url_filename_pair = (url, fn)
259
+ url_filename_pairs.add(url_filename_pair)
260
+ return url_filename_pairs
254
261
 
255
262
 
256
263
  def fetch_many(
@@ -259,6 +266,7 @@ def fetch_many(
259
266
  what: set[DownloadableFormat],
260
267
  max_parallel_downloads: int = 5,
261
268
  cacher: Cacher | None = None,
269
+ gzip_files: bool = False,
262
270
  ) -> list[AlphaFoldEntry]:
263
271
  """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
264
272
 
@@ -268,6 +276,7 @@ def fetch_many(
268
276
  what: A set of formats to download.
269
277
  max_parallel_downloads: The maximum number of parallel downloads.
270
278
  cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
279
+ gzip_files: Whether to gzip the downloaded files.
271
280
 
272
281
  Returns:
273
282
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -277,7 +286,7 @@ def fetch_many(
277
286
  return [
278
287
  entry
279
288
  async for entry in fetch_many_async(
280
- ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
289
+ ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher, gzip_files=gzip_files
281
290
  )
282
291
  ]
283
292
 
protein_quest/cli.py CHANGED
@@ -28,8 +28,13 @@ from protein_quest.converter import converter
28
28
  from protein_quest.emdb import fetch as emdb_fetch
29
29
  from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
30
30
  from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
31
+ from protein_quest.io import (
32
+ convert_to_cif_files,
33
+ glob_structure_files,
34
+ locate_structure_file,
35
+ valid_structure_file_extensions,
36
+ )
31
37
  from protein_quest.pdbe import fetch as pdbe_fetch
32
- from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
33
38
  from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
34
39
  from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
35
40
  from protein_quest.uniprot import (
@@ -297,6 +302,38 @@ def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
297
302
  parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
298
303
 
299
304
 
305
+ def _add_copy_method_arguments(parser):
306
+ parser.add_argument(
307
+ "--copy-method",
308
+ type=str,
309
+ choices=copy_methods,
310
+ default="hardlink",
311
+ help=dedent("""\
312
+ How to make target file be same file as source file.
313
+ By default uses hardlinks to save disk space.
314
+ Note that hardlinks only work within the same filesystem and are harder to track.
315
+ If you want to track cached files easily then use 'symlink'.
316
+ On Windows you need developer mode or admin privileges to create symlinks.
317
+ """),
318
+ )
319
+
320
+
321
+ def _add_cacher_arguments(parser: argparse.ArgumentParser):
322
+ """Add cacher arguments to parser."""
323
+ parser.add_argument(
324
+ "--no-cache",
325
+ action="store_true",
326
+ help="Disable caching of files to central location.",
327
+ )
328
+ parser.add_argument(
329
+ "--cache-dir",
330
+ type=Path,
331
+ default=user_cache_root_dir(),
332
+ help="Directory to use as cache for files.",
333
+ )
334
+ _add_copy_method_arguments(parser)
335
+
336
+
300
337
  def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
301
338
  """Add retrieve pdbe subcommand parser."""
302
339
  parser = subparsers.add_parser(
@@ -345,6 +382,11 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
345
382
  help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
346
383
  Default is 'summary' and 'cif'."""),
347
384
  )
385
+ parser.add_argument(
386
+ "--gzip-files",
387
+ action="store_true",
388
+ help="Whether to gzip the downloaded files. Excludes summary files, they are always uncompressed.",
389
+ )
348
390
  parser.add_argument(
349
391
  "--max-parallel-downloads",
350
392
  type=int,
@@ -561,6 +603,33 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
561
603
  _add_filter_ss_parser(subsubparsers)
562
604
 
563
605
 
606
+ def _add_convert_subcommands(subparsers: argparse._SubParsersAction):
607
+ """Add convert command."""
608
+ parser = subparsers.add_parser(
609
+ "convert", help="Convert structure files between formats", formatter_class=ArgumentDefaultsRichHelpFormatter
610
+ )
611
+ parser.add_argument(
612
+ "input_dir",
613
+ type=Path,
614
+ help=f"Directory with structure files. Supported extensions are {valid_structure_file_extensions}",
615
+ )
616
+ parser.add_argument(
617
+ "--output-dir",
618
+ type=Path,
619
+ help=dedent("""\
620
+ Directory to write converted structure files. If not given, files are written to `input_dir`.
621
+ """),
622
+ )
623
+ parser.add_argument(
624
+ "--format",
625
+ type=str,
626
+ choices=("cif",),
627
+ default="cif",
628
+ help="Output format to convert to.",
629
+ )
630
+ _add_copy_method_arguments(parser)
631
+
632
+
564
633
  def _add_mcp_command(subparsers: argparse._SubParsersAction):
565
634
  """Add MCP command."""
566
635
 
@@ -580,38 +649,6 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
580
649
  parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
581
650
 
582
651
 
583
- def _add_copy_method_arguments(parser):
584
- parser.add_argument(
585
- "--copy-method",
586
- type=str,
587
- choices=copy_methods,
588
- default="hardlink",
589
- help=dedent("""\
590
- How to make target file be same file as source file.
591
- By default uses hardlinks to save disk space.
592
- Note that hardlinks only work within the same filesystem and are harder to track.
593
- If you want to track cached files easily then use 'symlink'.
594
- On Windows you need developer mode or admin privileges to create symlinks.
595
- """),
596
- )
597
-
598
-
599
- def _add_cacher_arguments(parser: argparse.ArgumentParser):
600
- """Add cacher arguments to parser."""
601
- parser.add_argument(
602
- "--no-cache",
603
- action="store_true",
604
- help="Disable caching of files to central location.",
605
- )
606
- parser.add_argument(
607
- "--cache-dir",
608
- type=Path,
609
- default=user_cache_root_dir(),
610
- help="Directory to use as cache for files.",
611
- )
612
- _add_copy_method_arguments(parser)
613
-
614
-
615
652
  def make_parser() -> argparse.ArgumentParser:
616
653
  parser = argparse.ArgumentParser(
617
654
  description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -624,27 +661,12 @@ def make_parser() -> argparse.ArgumentParser:
624
661
  _add_search_subcommands(subparsers)
625
662
  _add_retrieve_subcommands(subparsers)
626
663
  _add_filter_subcommands(subparsers)
664
+ _add_convert_subcommands(subparsers)
627
665
  _add_mcp_command(subparsers)
628
666
 
629
667
  return parser
630
668
 
631
669
 
632
- def main():
633
- """Main entry point for the CLI."""
634
- parser = make_parser()
635
- args = parser.parse_args()
636
- logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
637
-
638
- # Dispatch table to reduce complexity
639
- cmd = args.command
640
- sub = getattr(args, f"{cmd}_cmd", None)
641
- handler = HANDLERS.get((cmd, sub))
642
- if handler is None:
643
- msg = f"Unknown command: {cmd} {sub}"
644
- raise SystemExit(msg)
645
- handler(args)
646
-
647
-
648
670
  def _handle_search_uniprot(args):
649
671
  taxon_id = args.taxon_id
650
672
  reviewed = args.reviewed
@@ -798,6 +820,7 @@ def _handle_retrieve_alphafold(args):
798
820
  alphafold_csv = args.alphafold_csv
799
821
  max_parallel_downloads = args.max_parallel_downloads
800
822
  cacher = _initialize_cacher(args)
823
+ gzip_files = args.gzip_files
801
824
 
802
825
  if what_formats is None:
803
826
  what_formats = {"summary", "cif"}
@@ -808,7 +831,12 @@ def _handle_retrieve_alphafold(args):
808
831
  validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
809
832
  rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
810
833
  afs = af_fetch(
811
- af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
834
+ af_ids,
835
+ download_dir,
836
+ what=validated_what,
837
+ max_parallel_downloads=max_parallel_downloads,
838
+ cacher=cacher,
839
+ gzip_files=gzip_files,
812
840
  )
813
841
  total_nr_files = sum(af.nr_of_files() for af in afs)
814
842
  rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
@@ -1017,24 +1045,24 @@ def _handle_mcp(args):
1017
1045
  mcp.run(transport=args.transport, host=args.host, port=args.port)
1018
1046
 
1019
1047
 
1020
- HANDLERS: dict[tuple[str, str | None], Callable] = {
1021
- ("search", "uniprot"): _handle_search_uniprot,
1022
- ("search", "pdbe"): _handle_search_pdbe,
1023
- ("search", "alphafold"): _handle_search_alphafold,
1024
- ("search", "emdb"): _handle_search_emdb,
1025
- ("search", "go"): _handle_search_go,
1026
- ("search", "taxonomy"): _handle_search_taxonomy,
1027
- ("search", "interaction-partners"): _handle_search_interaction_partners,
1028
- ("search", "complexes"): _handle_search_complexes,
1029
- ("retrieve", "pdbe"): _handle_retrieve_pdbe,
1030
- ("retrieve", "alphafold"): _handle_retrieve_alphafold,
1031
- ("retrieve", "emdb"): _handle_retrieve_emdb,
1032
- ("filter", "confidence"): _handle_filter_confidence,
1033
- ("filter", "chain"): _handle_filter_chain,
1034
- ("filter", "residue"): _handle_filter_residue,
1035
- ("filter", "secondary-structure"): _handle_filter_ss,
1036
- ("mcp", None): _handle_mcp,
1037
- }
1048
+ def _handle_convert(args):
1049
+ input_dir = structure(args.input_dir, Path)
1050
+ output_dir = input_dir if args.output_dir is None else structure(args.output_dir, Path)
1051
+ copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
1052
+
1053
+ input_files = sorted(glob_structure_files(input_dir))
1054
+ rprint(f"Converting {len(input_files)} files in {input_dir} directory to cif format.")
1055
+ for _ in tqdm(
1056
+ convert_to_cif_files(
1057
+ input_files,
1058
+ output_dir,
1059
+ copy_method=copy_method,
1060
+ ),
1061
+ total=len(input_files),
1062
+ unit="file",
1063
+ ):
1064
+ pass
1065
+ rprint(f"Converted {len(input_files)} files into {output_dir}.")
1038
1066
 
1039
1067
 
1040
1068
  def _read_lines(file: TextIOWrapper) -> list[str]:
@@ -1118,3 +1146,40 @@ def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIO
1118
1146
  members_str,
1119
1147
  ]
1120
1148
  )
1149
+
1150
+
1151
+ HANDLERS: dict[tuple[str, str | None], Callable] = {
1152
+ ("search", "uniprot"): _handle_search_uniprot,
1153
+ ("search", "pdbe"): _handle_search_pdbe,
1154
+ ("search", "alphafold"): _handle_search_alphafold,
1155
+ ("search", "emdb"): _handle_search_emdb,
1156
+ ("search", "go"): _handle_search_go,
1157
+ ("search", "taxonomy"): _handle_search_taxonomy,
1158
+ ("search", "interaction-partners"): _handle_search_interaction_partners,
1159
+ ("search", "complexes"): _handle_search_complexes,
1160
+ ("retrieve", "pdbe"): _handle_retrieve_pdbe,
1161
+ ("retrieve", "alphafold"): _handle_retrieve_alphafold,
1162
+ ("retrieve", "emdb"): _handle_retrieve_emdb,
1163
+ ("filter", "confidence"): _handle_filter_confidence,
1164
+ ("filter", "chain"): _handle_filter_chain,
1165
+ ("filter", "residue"): _handle_filter_residue,
1166
+ ("filter", "secondary-structure"): _handle_filter_ss,
1167
+ ("mcp", None): _handle_mcp,
1168
+ ("convert", None): _handle_convert,
1169
+ }
1170
+
1171
+
1172
+ def main():
1173
+ """Main entry point for the CLI."""
1174
+ parser = make_parser()
1175
+ args = parser.parse_args()
1176
+ logging.basicConfig(level=args.log_level, handlers=[RichHandler(show_level=False)])
1177
+
1178
+ # Dispatch table to reduce complexity
1179
+ cmd = args.command
1180
+ sub = getattr(args, f"{cmd}_cmd", None)
1181
+ handler = HANDLERS.get((cmd, sub))
1182
+ if handler is None:
1183
+ msg = f"Unknown command: {cmd} {sub}"
1184
+ raise SystemExit(msg)
1185
+ handler(args)
protein_quest/filters.py CHANGED
@@ -11,10 +11,7 @@ from distributed.deploy.cluster import Cluster
11
11
  from tqdm.auto import tqdm
12
12
 
13
13
  from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
14
- from protein_quest.pdbe.io import (
15
- nr_residues_in_chain,
16
- write_single_chain_pdb_file,
17
- )
14
+ from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
18
15
  from protein_quest.utils import CopyMethod, copyfile
19
16
 
20
17
  logger = logging.getLogger(__name__)
@@ -38,7 +35,7 @@ def filter_file_on_chain(
38
35
  input_file, chain_id = file_and_chain
39
36
  logger.debug("Filtering %s on chain %s", input_file, chain_id)
40
37
  try:
41
- output_file = write_single_chain_pdb_file(
38
+ output_file = write_single_chain_structure_file(
42
39
  input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
43
40
  )
44
41
  return ChainFilterStatistics(
protein_quest/io.py ADDED
@@ -0,0 +1,350 @@
1
+ """Module for structure file input/output."""
2
+
3
+ import gzip
4
+ import logging
5
+ import shutil
6
+ import tempfile
7
+ from collections.abc import Generator, Iterable
8
+ from io import StringIO
9
+ from pathlib import Path
10
+ from typing import Literal, get_args
11
+ from urllib.request import urlopen
12
+
13
+ import gemmi
14
+ from mmcif.api.DictionaryApi import DictionaryApi
15
+ from mmcif.io.BinaryCifReader import BinaryCifReader
16
+ from mmcif.io.BinaryCifWriter import BinaryCifWriter
17
+ from mmcif.io.PdbxReader import PdbxReader
18
+ from mmcif.io.PdbxWriter import PdbxWriter
19
+
20
+ from protein_quest.utils import CopyMethod, copyfile, user_cache_root_dir
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # TODO: remove this call once gemmi v0.7.4 is released,
25
+ # because the development build (0.7.4.dev0, installed with `uv pip install git+https://github.com/project-gemmi/gemmi.git`) does not print leak warnings.
26
+ # Until then, swallow gemmi's leaked-function warnings.
27
+ gemmi.set_leak_warnings(False)
28
+
29
+
30
+ StructureFileExtensions = Literal[".pdb", ".pdb.gz", ".ent", ".ent.gz", ".cif", ".cif.gz", ".bcif", ".bcif.gz"]
31
+ """Type of supported structure file extensions."""
32
+ valid_structure_file_extensions: set[str] = set(get_args(StructureFileExtensions))
33
+ """Set of valid structure file extensions."""
34
+
35
+
36
+ def write_structure(structure: gemmi.Structure, path: Path):
37
+ """Write a gemmi structure to a file.
38
+
39
+ Args:
40
+ structure: The gemmi structure to write.
41
+ path: The file path to write the structure to.
42
+ The format depends on the file extension.
43
+ See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
44
+ for supported extensions.
45
+
46
+ Raises:
47
+ ValueError: If the file extension is not supported.
48
+ """
49
+ if path.name.endswith(".pdb") or path.name.endswith(".ent"):
50
+ body: str = structure.make_pdb_string()
51
+ path.write_text(body)
52
+ elif path.name.endswith(".pdb.gz") or path.name.endswith(".ent.gz"):
53
+ body: str = structure.make_pdb_string()
54
+ with gzip.open(path, "wt") as f:
55
+ f.write(body)
56
+ elif path.name.endswith(".cif"):
57
+ # do not write chem_comp so it is viewable by molstar
58
+ # see https://github.com/project-gemmi/gemmi/discussions/362
59
+ doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
60
+ doc.write_file(str(path))
61
+ elif path.name.endswith(".cif.gz"):
62
+ doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
63
+ cif_str = doc.as_string()
64
+ with gzip.open(path, "wt") as f:
65
+ f.write(cif_str)
66
+ elif path.name.endswith(".bcif"):
67
+ structure2bcif(structure, path)
68
+ elif path.name.endswith(".bcif.gz"):
69
+ structure2bcifgz(structure, path)
70
+ else:
71
+ msg = f"Unsupported file extension in {path.name}. Supported extensions are: {valid_structure_file_extensions}"
72
+ raise ValueError(msg)
73
+
74
+
75
+ def read_structure(file: Path) -> gemmi.Structure:
76
+ """Read a structure from a file.
77
+
78
+ Args:
79
+ file: Path to the input structure file.
80
+ See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
81
+ for supported extensions.
82
+
83
+ Returns:
84
+ A gemmi Structure object representing the structure in the file.
85
+ """
86
+ if file.name.endswith(".bcif"):
87
+ return bcif2structure(file)
88
+ if file.name.endswith(".bcif.gz"):
89
+ return bcifgz2structure(file)
90
+ return gemmi.read_structure(str(file))
91
+
92
+
93
+ def bcif2cif(bcif_file: Path) -> str:
94
+ """Convert a binary CIF (bcif) file to a CIF string.
95
+
96
+ Args:
97
+ bcif_file: Path to the binary CIF file.
98
+
99
+ Returns:
100
+ A string containing the CIF representation of the structure.
101
+ """
102
+ reader = BinaryCifReader()
103
+ container = reader.deserialize(str(bcif_file))
104
+ capture = StringIO()
105
+ writer = PdbxWriter(capture)
106
+ writer.write(container)
107
+ return capture.getvalue()
108
+
109
+
110
+ def bcifgz2structure(bcif_gz_file: Path) -> gemmi.Structure:
111
+ """Read a binary CIF (bcif) gzipped file and return a gemmi Structure object.
112
+
113
+ This is slower than other formats because gemmi does not support reading bcif files directly.
114
+ So we first gunzip the file to a temporary location, convert it to a cif string using mmcif package,
115
+ and then read the cif string using gemmi.
116
+
117
+ Args:
118
+ bcif_gz_file: Path to the binary CIF gzipped file.
119
+
120
+ Returns:
121
+ A gemmi Structure object representing the structure in the bcif.gz file.
122
+ """
123
+ with tempfile.NamedTemporaryFile(suffix=".bcif", delete=True) as tmp_bcif:
124
+ tmp_path = Path(tmp_bcif.name)
125
+ gunzip_file(bcif_gz_file, output_file=tmp_path, keep_original=True)
126
+ return bcif2structure(tmp_path)
127
+
128
+
129
+ def bcif2structure(bcif_file: Path) -> gemmi.Structure:
130
+ """Read a binary CIF (bcif) file and return a gemmi Structure object.
131
+
132
+ This is slower than other formats because gemmi does not support reading bcif files directly.
133
+ So we convert it to a cif string first using mmcif package and then read the cif string using gemmi.
134
+
135
+ Args:
136
+ bcif_file: Path to the binary CIF file.
137
+
138
+ Returns:
139
+ A gemmi Structure object representing the structure in the bcif file.
140
+ """
141
+ cif_content = bcif2cif(bcif_file)
142
+ doc = gemmi.cif.read_string(cif_content)
143
+ block = doc.sole_block()
144
+ return gemmi.make_structure_from_block(block)
145
+
146
+
147
+ def _initialize_dictionary_api(containers) -> DictionaryApi:
148
+ dict_local = user_cache_root_dir() / "mmcif_pdbx_v5_next.dic"
149
+ if not dict_local.exists():
150
+ dict_url = "https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v5_next.dic"
151
+ logger.info("Downloading mmcif dictionary from %s to %s", dict_url, dict_local)
152
+ dict_local.parent.mkdir(parents=True, exist_ok=True)
153
+ with dict_local.open("wb") as f, urlopen(dict_url) as response: # noqa: S310 url is hardcoded and https
154
+ f.write(response.read())
155
+ return DictionaryApi(containerList=containers, consolidate=True)
156
+
157
+
158
+ def structure2bcif(structure: gemmi.Structure, bcif_file: Path):
159
+ """Write a gemmi Structure object to a binary CIF (bcif) file.
160
+
161
+ This is slower than other formats because gemmi does not support writing bcif files directly.
162
+ So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
163
+
164
+ Args:
165
+ structure: The gemmi Structure object to write.
166
+ bcif_file: Path to the output binary CIF file.
167
+ """
168
+ doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
169
+ containers = []
170
+ with StringIO(doc.as_string()) as sio:
171
+ reader = PdbxReader(sio)
172
+ reader.read(containers)
173
+ dict_api = _initialize_dictionary_api(containers)
174
+ writer = BinaryCifWriter(dictionaryApi=dict_api)
175
+ writer.serialize(str(bcif_file), containers)
176
+
177
+
178
+ def gunzip_file(gz_file: Path, output_file: Path | None = None, keep_original: bool = True) -> Path:
179
+ """Unzip a .gz file.
180
+
181
+ Args:
182
+ gz_file: Path to the .gz file.
183
+ output_file: Optional path to the output unzipped file. If None, the .gz suffix is removed from gz_file.
184
+ keep_original: Whether to keep the original .gz file. Default is True.
185
+
186
+ Returns:
187
+ Path to the unzipped file.
188
+
189
+ Raises:
190
+ ValueError: If output_file is None and gz_file does not end with .gz.
191
+ """
192
+ if output_file is None and not gz_file.name.endswith(".gz"):
193
+ msg = f"If output_file is not provided, {gz_file} must end with .gz"
194
+ raise ValueError(msg)
195
+ out_file = output_file or gz_file.with_suffix("")
196
+ with gzip.open(gz_file, "rb") as f_in, out_file.open("wb") as f_out:
197
+ shutil.copyfileobj(f_in, f_out)
198
+ if not keep_original:
199
+ gz_file.unlink()
200
+ return out_file
201
+
202
+
203
+ def structure2bcifgz(structure: gemmi.Structure, bcif_gz_file: Path):
204
+ """Write a gemmi Structure object to a binary CIF gzipped (bcif.gz) file.
205
+
206
+ This is slower than other formats because gemmi does not support writing bcif files directly.
207
+ So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
208
+ Finally, we gzip the bcif file.
209
+
210
+ Args:
211
+ structure: The gemmi Structure object to write.
212
+ bcif_gz_file: Path to the output binary CIF gzipped file.
213
+ """
214
+ with tempfile.NamedTemporaryFile(suffix=".bcif", delete=True) as tmp_bcif:
215
+ tmp_path = Path(tmp_bcif.name)
216
+ structure2bcif(structure, tmp_path)
217
+ with tmp_path.open("rb") as f_in, gzip.open(bcif_gz_file, "wb") as f_out:
218
+ shutil.copyfileobj(f_in, f_out)
219
+
220
+
221
+ def convert_to_cif_files(
222
+ input_files: Iterable[Path], output_dir: Path, copy_method: CopyMethod
223
+ ) -> Generator[tuple[Path, Path]]:
224
+ """Convert structure files to .cif format.
225
+
226
+ Args:
227
+ input_files: Iterable of structure files to convert.
228
+ output_dir: Directory to save the converted .cif files.
229
+ copy_method: How to copy an input file that is already in .cif format and therefore needs no conversion.
230
+
231
+ Yields:
232
+ A tuple of the input file and the output file.
233
+ """
234
+ for input_file in input_files:
235
+ output_file = convert_to_cif_file(input_file, output_dir, copy_method)
236
+ yield input_file, output_file
237
+
238
+
239
+ def convert_to_cif_file(input_file: Path, output_dir: Path, copy_method: CopyMethod) -> Path:
240
+ """Convert a single structure file to .cif format.
241
+
242
+ Args:
243
+ input_file: The structure file to convert.
244
+ See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
245
+ for supported extensions.
246
+ output_dir: Directory to save the converted .cif file.
247
+ copy_method: How to copy the input file when it is already in .cif format and therefore needs no conversion.
248
+
249
+ Returns:
250
+ Path to the converted .cif file.
251
+ """
252
+ name, extension = split_name_and_extension(input_file.name)
253
+ output_file = output_dir / f"{name}.cif"
254
+ if output_file.exists():
255
+ logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
256
+ elif extension in {".pdb", ".pdb.gz", ".ent", ".ent.gz"}:
257
+ structure = read_structure(input_file)
258
+ write_structure(structure, output_file)
259
+ elif extension == ".cif":
260
+ logger.info("File %s is already in .cif format, copying to %s", input_file, output_dir)
261
+ copyfile(input_file, output_file, copy_method)
262
+ elif extension == ".cif.gz":
263
+ gunzip_file(input_file, output_file=output_file, keep_original=True)
264
+ elif extension == ".bcif":
265
+ with output_file.open("w") as f:
266
+ f.write(bcif2cif(input_file))
267
+ else:
268
+ msg = (
269
+ f"Unsupported file extension {extension} in {input_file}. "
270
+ f"Supported extensions are {valid_structure_file_extensions}."
271
+ )
272
+ raise ValueError(msg)
273
+ return output_file
274
+
275
+
276
+ def split_name_and_extension(name: str) -> tuple[str, str]:
277
+ """Split a filename into its name and extension.
278
+
279
+ `.gz` is considered part of the extension if present.
280
+
281
+ Examples:
282
+ Some example usages.
283
+
284
+ >>> from protein_quest.io import split_name_and_extension
285
+ >>> split_name_and_extension("1234.pdb")
286
+ ('1234', '.pdb')
287
+ >>> split_name_and_extension("1234.pdb.gz")
288
+ ('1234', '.pdb.gz')
289
+
290
+ Args:
291
+ name: The filename to split.
292
+
293
+ Returns:
294
+ A tuple containing the name and the extension.
295
+ """
296
+ ext = ""
297
+ if name.endswith(".gz"):
298
+ ext = ".gz"
299
+ name = name.removesuffix(".gz")
300
+ i = name.rfind(".")
301
+ if 0 < i < len(name) - 1:
302
+ ext = name[i:] + ext
303
+ name = name[:i]
304
+ return name, ext
305
+
306
+
307
+ def locate_structure_file(root: Path, pdb_id: str) -> Path:
308
+ """Locate a structure file for a given PDB ID in the specified directory.
309
+
310
+ Uses [StructureFileExtensions][protein_quest.io.StructureFileExtensions] as potential extensions.
311
+ Also tries different casing of the PDB ID.
312
+
313
+ Args:
314
+ root: The root directory to search in.
315
+ pdb_id: The PDB ID to locate.
316
+
317
+ Returns:
318
+ The path to the located structure file.
319
+
320
+ Raises:
321
+ FileNotFoundError: If no structure file is found for the given PDB ID.
322
+ """
323
+ for ext in valid_structure_file_extensions:
324
+ candidates = (
325
+ root / f"{pdb_id}{ext}",
326
+ root / f"{pdb_id.lower()}{ext}",
327
+ root / f"{pdb_id.upper()}{ext}",
328
+ root / f"pdb{pdb_id.lower()}{ext}",
329
+ )
330
+ for candidate in candidates:
331
+ if candidate.exists():
332
+ return candidate
333
+ msg = f"No structure file found for {pdb_id} in {root}"
334
+ raise FileNotFoundError(msg)
335
+
336
+
337
+ def glob_structure_files(input_dir: Path) -> Generator[Path]:
338
+ """Glob for structure files in a directory.
339
+
340
+ Uses [StructureFileExtensions][protein_quest.io.StructureFileExtensions] as valid extensions.
341
+ Does not search recursively.
342
+
343
+ Args:
344
+ input_dir: The input directory to search for structure files.
345
+
346
+ Yields:
347
+ Paths to the found structure files.
348
+ """
349
+ for ext in valid_structure_file_extensions:
350
+ yield from input_dir.glob(f"*{ext}")
@@ -45,9 +45,10 @@ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
45
45
  from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
46
46
  from protein_quest.emdb import fetch as emdb_fetch
47
47
  from protein_quest.go import search_gene_ontology_term
48
+ from protein_quest.io import convert_to_cif_file, glob_structure_files
48
49
  from protein_quest.pdbe.fetch import fetch as pdbe_fetch
49
- from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
50
50
  from protein_quest.ss import filter_file_on_secondary_structure
51
+ from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
51
52
  from protein_quest.taxonomy import search_taxon
52
53
  from protein_quest.uniprot import (
53
54
  PdbResult,
@@ -112,18 +113,18 @@ def extract_single_chain_from_structure(
112
113
  out_chain: str = "A",
113
114
  ) -> Path:
114
115
  """
115
- Extract a single chain from a mmCIF/pdb file and write to a new file.
116
+ Extract a single chain from a structure (mmCIF or pdb) file and write to a new file.
116
117
 
117
118
  Args:
118
- input_file: Path to the input mmCIF/pdb file.
119
+ input_file: Path to the input structure (mmCIF or pdb) file.
119
120
  chain2keep: The chain to keep.
120
121
  output_dir: Directory to save the output file.
121
122
  out_chain: The chain identifier for the output file.
122
123
 
123
124
  Returns:
124
- Path to the output mmCIF/pdb file
125
+ Path to the output structure (mmCIF or pdb) file
125
126
  """
126
- return write_single_chain_pdb_file(input_file, chain2keep, output_dir, out_chain)
127
+ return write_single_chain_structure_file(input_file, chain2keep, output_dir, out_chain)
127
128
 
128
129
 
129
130
  @mcp.tool
@@ -199,6 +200,8 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
199
200
 
200
201
  mcp.tool(filter_file_on_secondary_structure)
201
202
 
203
+ mcp.tool(convert_to_cif_file)
204
+
202
205
 
203
206
  @mcp.prompt
204
207
  def candidate_structures(
protein_quest/ss.py CHANGED
@@ -5,17 +5,13 @@ from collections.abc import Generator, Iterable
5
5
  from dataclasses import dataclass
6
6
  from pathlib import Path
7
7
 
8
- from gemmi import Structure, read_structure, set_leak_warnings
8
+ from gemmi import Structure
9
9
 
10
10
  from protein_quest.converter import PositiveInt, Ratio, converter
11
+ from protein_quest.io import read_structure
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
14
- # TODO remove once v0.7.4 of gemmi is released,
15
- # as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
16
- # Swallow gemmi leaked function warnings
17
- set_leak_warnings(False)
18
-
19
15
  # TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
20
16
  # https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
21
17
  # gemmi executable is in https://pypi.org/project/gemmi-program/
@@ -261,7 +257,7 @@ def filter_file_on_secondary_structure(
261
257
  Returns:
262
258
  Filtering statistics and whether file passed.
263
259
  """
264
- structure = read_structure(str(file_path))
260
+ structure = read_structure(file_path)
265
261
  return filter_on_secondary_structure(structure, query)
266
262
 
267
263
 
@@ -1,51 +1,29 @@
1
- """Module for structure file input/output."""
1
+ """Module for querying and modifying [gemmi structures][gemmi.Structure]."""
2
2
 
3
- import gzip
4
3
  import logging
5
- from collections.abc import Generator, Iterable
4
+ from collections.abc import Iterable
6
5
  from datetime import UTC, datetime
7
6
  from pathlib import Path
8
7
 
9
8
  import gemmi
10
9
 
11
10
  from protein_quest.__version__ import __version__
11
+ from protein_quest.io import read_structure, split_name_and_extension, write_structure
12
12
  from protein_quest.utils import CopyMethod, copyfile
13
13
 
14
14
  logger = logging.getLogger(__name__)
15
15
 
16
- # TODO remove once v0.7.4 of gemmi is released,
17
- # as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
18
- # Swallow gemmi leaked function warnings
19
- gemmi.set_leak_warnings(False)
20
16
 
21
-
22
- def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
23
- """Returns the number of residues in a specific chain from a mmCIF/pdb file.
17
+ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
18
+ """Find a chain in a model.
24
19
 
25
20
  Args:
26
- file: Path to the input mmCIF/pdb file.
27
- chain: Chain to count residues of.
21
+ model: The gemmi model to search in.
22
+ wanted_chain: The chain identifier to search for.
28
23
 
29
24
  Returns:
30
- The number of residues in the specified chain.
25
+ The found chain or None if not found.
31
26
  """
32
- structure = gemmi.read_structure(str(file))
33
- gchain = find_chain_in_structure(structure, chain)
34
- if gchain is None:
35
- logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
36
- return 0
37
- return len(gchain)
38
-
39
-
40
- def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
41
- for model in structure:
42
- chain = find_chain_in_model(model, wanted_chain)
43
- if chain is not None:
44
- return chain
45
- return None
46
-
47
-
48
- def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
49
27
  chain = model.find_chain(wanted_chain)
50
28
  if chain is None:
51
29
  # For chain A in 4v92 the find_chain method returns None,
@@ -57,106 +35,39 @@ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain |
57
35
  return chain
58
36
 
59
37
 
60
- def write_structure(structure: gemmi.Structure, path: Path):
61
- """Write a gemmi structure to a file.
62
-
63
- Args:
64
- structure: The gemmi structure to write.
65
- path: The file path to write the structure to.
66
- The format depends on the file extension.
67
- Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.
68
-
69
- Raises:
70
- ValueError: If the file extension is not supported.
71
- """
72
- if path.name.endswith(".pdb"):
73
- body: str = structure.make_pdb_string()
74
- path.write_text(body)
75
- elif path.name.endswith(".pdb.gz"):
76
- body: str = structure.make_pdb_string()
77
- with gzip.open(path, "wt") as f:
78
- f.write(body)
79
- elif path.name.endswith(".cif"):
80
- # do not write chem_comp so it is viewable by molstar
81
- # see https://github.com/project-gemmi/gemmi/discussions/362
82
- doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
83
- doc.write_file(str(path))
84
- elif path.name.endswith(".cif.gz"):
85
- doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
86
- cif_str = doc.as_string()
87
- with gzip.open(path, "wt") as f:
88
- f.write(cif_str)
89
- else:
90
- msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
91
- raise ValueError(msg)
92
-
93
-
94
- def _split_name_and_extension(name: str) -> tuple[str, str]:
95
- # 1234.pdb -> (1234, .pdb)
96
- # 1234.pdb.gz -> (1234, .pdb.gz)
97
- # 1234.cif -> (1234, .cif)
98
- # 1234.cif.gz -> (1234, .cif.gz)
99
- if name.endswith(".pdb.gz"):
100
- return name.replace(".pdb.gz", ""), ".pdb.gz"
101
- if name.endswith(".cif.gz"):
102
- return name.replace(".cif.gz", ""), ".cif.gz"
103
- if name.endswith(".pdb"):
104
- return name.replace(".pdb", ""), ".pdb"
105
- if name.endswith(".cif"):
106
- return name.replace(".cif", ""), ".cif"
107
-
108
- msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
109
- raise ValueError(msg)
110
-
111
-
112
- def locate_structure_file(root: Path, pdb_id: str) -> Path:
113
- """Locate a structure file for a given PDB ID in the specified directory.
38
+ def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
39
+ """Find a chain in a structure.
114
40
 
115
41
  Args:
116
- root: The root directory to search in.
117
- pdb_id: The PDB ID to locate.
42
+ structure: The gemmi structure to search in.
43
+ wanted_chain: The chain identifier to search for.
118
44
 
119
45
  Returns:
120
- The path to the located structure file.
121
-
122
- Raises:
123
- FileNotFoundError: If no structure file is found for the given PDB ID.
46
+ The found chain or None if not found.
124
47
  """
125
- exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb", ".ent", ".ent.gz"]
126
- for ext in exts:
127
- candidates = (
128
- root / f"{pdb_id}{ext}",
129
- root / f"{pdb_id.lower()}{ext}",
130
- root / f"{pdb_id.upper()}{ext}",
131
- root / f"pdb{pdb_id.lower()}{ext}",
132
- )
133
- for candidate in candidates:
134
- if candidate.exists():
135
- return candidate
136
- msg = f"No structure file found for {pdb_id} in {root}"
137
- raise FileNotFoundError(msg)
48
+ for model in structure:
49
+ chain = find_chain_in_model(model, wanted_chain)
50
+ if chain is not None:
51
+ return chain
52
+ return None
138
53
 
139
54
 
140
- def glob_structure_files(input_dir: Path) -> Generator[Path]:
141
- """Glob for structure files in a directory.
55
+ def nr_residues_in_chain(file: Path, chain: str = "A") -> int:
56
+ """Returns the number of residues in a specific chain from a structure file.
142
57
 
143
58
  Args:
144
- input_dir: The input directory to search for structure files.
59
+ file: Path to the input structure file.
60
+ chain: Chain to count residues of.
145
61
 
146
- Yields:
147
- Paths to the found structure files.
62
+ Returns:
63
+ The number of residues in the specified chain.
148
64
  """
149
- for ext in [".cif.gz", ".cif", ".pdb.gz", ".pdb"]:
150
- yield from input_dir.glob(f"*{ext}")
151
-
152
-
153
- class ChainNotFoundError(IndexError):
154
- """Exception raised when a chain is not found in a structure."""
155
-
156
- def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
157
- super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
158
- self.chain_id = chain
159
- self.file = file
65
+ structure = read_structure(file)
66
+ gchain = find_chain_in_structure(structure, chain)
67
+ if gchain is None:
68
+ logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
69
+ return 0
70
+ return len(gchain)
160
71
 
161
72
 
162
73
  def _dedup_helices(structure: gemmi.Structure):
@@ -198,18 +109,34 @@ def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain:
198
109
 
199
110
 
200
111
  def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
201
- """Get a list of chains in a structure."""
112
+ """Get a list of chains in a structure.
113
+
114
+ Args:
115
+ structure: The gemmi structure to get chains from.
116
+
117
+ Returns:
118
+ A set of chains in the structure.
119
+ """
202
120
  return {c for model in structure for c in model}
203
121
 
204
122
 
205
- def write_single_chain_pdb_file(
123
+ class ChainNotFoundError(IndexError):
124
+ """Exception raised when a chain is not found in a structure."""
125
+
126
+ def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
127
+ super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
128
+ self.chain_id = chain
129
+ self.file = file
130
+
131
+
132
+ def write_single_chain_structure_file(
206
133
  input_file: Path,
207
134
  chain2keep: str,
208
135
  output_dir: Path,
209
136
  out_chain: str = "A",
210
137
  copy_method: CopyMethod = "copy",
211
138
  ) -> Path:
212
- """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.
139
+ """Write a single chain from a structure file to a new structure file.
213
140
 
214
141
  Also
215
142
 
@@ -226,14 +153,14 @@ def write_single_chain_pdb_file(
226
153
  ```
227
154
 
228
155
  Args:
229
- input_file: Path to the input mmCIF/pdb file.
156
+ input_file: Path to the input structure file.
230
157
  chain2keep: The chain to keep.
231
158
  output_dir: Directory to save the output file.
232
159
  out_chain: The chain identifier for the output file.
233
160
  copy_method: How to copy when no changes are needed to output file.
234
161
 
235
162
  Returns:
236
- Path to the output mmCIF/pdb file
163
+ Path to the output structure file
237
164
 
238
165
  Raises:
239
166
  FileNotFoundError: If the input file does not exist.
@@ -241,7 +168,7 @@ def write_single_chain_pdb_file(
241
168
  """
242
169
 
243
170
  logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
244
- structure = gemmi.read_structure(str(input_file))
171
+ structure = read_structure(input_file)
245
172
  structure.setup_entities()
246
173
 
247
174
  chain = find_chain_in_structure(structure, chain2keep)
@@ -249,7 +176,7 @@ def write_single_chain_pdb_file(
249
176
  if chain is None:
250
177
  raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
251
178
  chain_name = chain.name
252
- name, extension = _split_name_and_extension(input_file.name)
179
+ name, extension = split_name_and_extension(input_file.name)
253
180
  output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
254
181
 
255
182
  if output_file.exists():
protein_quest/utils.py CHANGED
@@ -265,6 +265,7 @@ async def retrieve_files(
265
265
  desc: str = "Downloading files",
266
266
  cacher: Cacher | None = None,
267
267
  chunk_size: int = 524288, # 512 KiB
268
+ gzip_files: bool = False,
268
269
  ) -> list[Path]:
269
270
  """Retrieve files from a list of URLs and save them to a directory.
270
271
 
@@ -277,6 +278,7 @@ async def retrieve_files(
277
278
  desc: Description for the progress bar.
278
279
  cacher: An optional cacher to use for caching files.
279
280
  chunk_size: The size of each chunk to read from the response.
281
+ gzip_files: Whether to gzip the downloaded files.
280
282
 
281
283
  Returns:
282
284
  A list of paths to the downloaded files.
@@ -292,6 +294,7 @@ async def retrieve_files(
292
294
  semaphore=semaphore,
293
295
  cacher=cacher,
294
296
  chunk_size=chunk_size,
297
+ gzip_files=gzip_files,
295
298
  )
296
299
  for url, filename in urls
297
300
  ]
@@ -299,6 +302,10 @@ async def retrieve_files(
299
302
  return files
300
303
 
301
304
 
305
class InvalidContentEncodingError(aiohttp.ClientResponseError):
    """Content encoding is invalid.

    Raised when a download should be saved gzipped but the server's response
    does not have a `Content-Encoding` header equal to `gzip`.
    """
307
+
308
+
302
309
  async def _retrieve_file(
303
310
  session: RetryClient,
304
311
  url: URL | str,
@@ -306,6 +313,7 @@ async def _retrieve_file(
306
313
  semaphore: asyncio.Semaphore,
307
314
  cacher: Cacher | None = None,
308
315
  chunk_size: int = 524288, # 512 KiB
316
+ gzip_files: bool = False,
309
317
  ) -> Path:
310
318
  """Retrieve a single file from a URL and save it to a specified path.
311
319
 
@@ -316,6 +324,7 @@ async def _retrieve_file(
316
324
  semaphore: A semaphore to limit the number of concurrent downloads.
317
325
  cacher: An optional cacher to use for caching files.
318
326
  chunk_size: The size of each chunk to read from the response.
327
+ gzip_files: Whether to gzip the downloaded file.
319
328
 
320
329
  Returns:
321
330
  The path to the saved file.
@@ -330,12 +339,27 @@ async def _retrieve_file(
330
339
  logger.debug(f"File {save_path} was copied from cache {cached_file}. Skipping download from {url}.")
331
340
  return save_path
332
341
 
342
+ # The AlphaFold server and many other web servers can return gzip-compressed responses.
343
+ # When we want to save the file as *.gz, we keep the raw (still compressed) stream;
344
+ # otherwise aiohttp decompresses the response automatically for us.
345
+ auto_decompress = not gzip_files
346
+ headers = {"Accept-Encoding": "gzip"}
333
347
  async with (
334
348
  semaphore,
335
- session.get(url) as resp,
349
+ session.get(url, headers=headers, auto_decompress=auto_decompress) as resp,
336
350
  ):
337
351
  resp.raise_for_status()
338
- await cacher.write_iter(save_path, resp.content.iter_chunked(chunk_size))
352
+ if gzip_files and resp.headers.get("Content-Encoding") != "gzip":
353
+ msg = f"Server did not send gzip encoded content for {url}, can not save as gzipped file."
354
+ raise InvalidContentEncodingError(
355
+ request_info=resp.request_info,
356
+ history=resp.history,
357
+ status=415,
358
+ message=msg,
359
+ headers=resp.headers,
360
+ )
361
+ iterator = resp.content.iter_chunked(chunk_size)
362
+ await cacher.write_iter(save_path, iterator)
339
363
  return save_path
340
364
 
341
365
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: protein_quest
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Search/retrieve/filter proteins and protein structures
5
5
  Project-URL: Homepage, https://github.com/haddocking/protein-quest
6
6
  Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -17,6 +17,7 @@ Requires-Dist: cattrs[orjson]>=24.1.3
17
17
  Requires-Dist: dask>=2025.5.1
18
18
  Requires-Dist: distributed>=2025.5.1
19
19
  Requires-Dist: gemmi>=0.7.3
20
+ Requires-Dist: mmcif>=0.92.0
20
21
  Requires-Dist: platformdirs>=4.3.8
21
22
  Requires-Dist: psutil>=7.0.0
22
23
  Requires-Dist: rich-argparse>=1.7.1
@@ -71,6 +72,7 @@ graph TB;
71
72
  fetchad -->|mmcif_files| confidencefilter{{Filter out low confidence}}
72
73
  confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
73
74
  residuefilter --> |mmcif_files| ssfilter
75
+ ssfilter -. mmcif_files .-> convert2cif([Convert to cif])
74
76
  classDef dashedBorder stroke-dasharray: 5 5;
75
77
  goterm:::dashedBorder
76
78
  taxonomy:::dashedBorder
@@ -78,6 +80,7 @@ graph TB;
78
80
  fetchemdb:::dashedBorder
79
81
  searchintactionpartners:::dashedBorder
80
82
  searchcomplexes:::dashedBorder
83
+ convert2cif:::dashedBorder
81
84
  ```
82
85
 
83
86
  (Dotted nodes and edges are side-quests.)
@@ -242,6 +245,14 @@ query_protein,complex_id,complex_url,complex_title,members
242
245
  Q05471,CPX-2122,https://www.ebi.ac.uk/complexportal/complex/CPX-2122,Swr1 chromatin remodelling complex,P31376;P35817;P38326;P53201;P53930;P60010;P80428;Q03388;Q03433;Q03940;Q05471;Q06707;Q12464;Q12509
243
246
  ```
244
247
 
248
+ ### Convert structure files to .cif format
249
+
250
+ Some tools (for example [powerfit](https://github.com/haddocking/powerfit)) only work with uncompressed `*.cif` files, not with `*.cif.gz` or `*.bcif` files.
251
+
252
+ ```shell
253
+ protein-quest convert --output-dir ./filtered-cif ./filtered-ss
254
+ ```
255
+
245
256
  ## Model Context Protocol (MCP) server
246
257
 
247
258
  Protein quest can also help LLMs like Claude Sonnet 4 by providing a [set of tools](https://modelcontextprotocol.io/docs/learn/server-concepts#tools-ai-actions) for protein structures.
@@ -0,0 +1,27 @@
1
+ protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ protein_quest/__version__.py,sha256=z_nR_Ti0YfIwFSKDD18DIrz_r3zxWQ8EGCNr2XUWkY0,56
3
+ protein_quest/cli.py,sha256=pWwMIzWBrtqhZbvTIkvd1XhA5u9J-WAAg7A3hJZGtlk,46201
4
+ protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
5
+ protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
6
+ protein_quest/filters.py,sha256=Xr-cJTtbNjHKuzmXLBf7yZfqKf_U3RTivcVbr620LVU,5225
7
+ protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
8
+ protein_quest/io.py,sha256=ngV_HU2HIQFO-bP2xQj_fhgv0MYjW4puqz_9CxGpBv8,13017
9
+ protein_quest/mcp_server.py,sha256=rQv2srhF3_SYYK1TD3htIyxNiunU7a8FDC7CYT_oJFE,8269
10
+ protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
11
+ protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ protein_quest/ss.py,sha256=4ZGIHfjTlodYTXqGUKhMnGbgaStYOGaWg2oYrWIjdgo,10118
13
+ protein_quest/structure.py,sha256=1FTKN0mYKTwZHlyIB4ORSAgSHFKK-UAK7T-qoFo1vyI,7162
14
+ protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
15
+ protein_quest/uniprot.py,sha256=92G5YiJAJwUBKJQHPrM6DZlaLe-XG4qBg0zy0BDGFYY,24354
16
+ protein_quest/utils.py,sha256=6OF8X4ia_z1HOYiXy6e-zEWlp_bF1DoZCVrCSg1qivY,19076
17
+ protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
18
+ protein_quest/alphafold/confidence.py,sha256=mVAYTIzdbR8xBjRiUzA0at8wJq9vpfEQWPz5cJefLKs,6766
19
+ protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
20
+ protein_quest/alphafold/fetch.py,sha256=n5SlqbQfU1PE4X8saV4O1nCrKRn3Q2UcMlrNw5-163w,12801
21
+ protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
22
+ protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
23
+ protein_quest-0.6.0.dist-info/METADATA,sha256=8rX0ixi4Xl516LkxOlOKKRe364nKIjP7mKn67xuOcDA,9623
24
+ protein_quest-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ protein_quest-0.6.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
26
+ protein_quest-0.6.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
+ protein_quest-0.6.0.dist-info/RECORD,,
@@ -1,26 +0,0 @@
1
- protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- protein_quest/__version__.py,sha256=iRjDp09jO2JFmZdsWS3ikyYYQ8S33AzhMdrr00gEG9g,56
3
- protein_quest/cli.py,sha256=xiXt_2l3MxbTbmxm2sz0w8_OdJr8gz_B68GBVv5wHjE,44182
4
- protein_quest/converter.py,sha256=Y-Oxf7lDNbEicL6GS-IpNWDwaAiHgIgs5bFAcEHCKdQ,1441
5
- protein_quest/emdb.py,sha256=641c6RwNYnu-0GBFyCFBiI58fNc0jMkd0ZZ9MW9-Jmc,1501
6
- protein_quest/filters.py,sha256=-gasSXR4g5SzYSYbkfcDwR-tm2KCAhCMdpIVJrUPR1w,5224
7
- protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
8
- protein_quest/mcp_server.py,sha256=PCXxcU3GElKg2sjMlxbsM63OiFxg9AtmfKwBJ1_0AQE,8130
9
- protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
10
- protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- protein_quest/ss.py,sha256=qOr0aMycNAtZmXXvhCN-KZH3Qp4EejnBcE6fsFgCrmY,10343
12
- protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
13
- protein_quest/uniprot.py,sha256=92G5YiJAJwUBKJQHPrM6DZlaLe-XG4qBg0zy0BDGFYY,24354
14
- protein_quest/utils.py,sha256=2lQ7jPHWtDySBTYnoL9VTKl5XUgQVYgp9Prb7qEnjtQ,17982
15
- protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
16
- protein_quest/alphafold/confidence.py,sha256=pYIuwYdkuPuHLagcX1dSvSyZ_84xboRLfHUxkEoc4MY,6766
17
- protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
18
- protein_quest/alphafold/fetch.py,sha256=wIsgPZmtnE5EoAL9G22Y6Ehx9d0md53Mw88-6LLGp0Q,12298
19
- protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
20
- protein_quest/pdbe/fetch.py,sha256=e8CHWDX2QzWnVLmYXCfNrscw1UcN1lI9Uz6Z5HmEOEQ,2510
21
- protein_quest/pdbe/io.py,sha256=iGLvmsD-eEYnrgZDYfkGWIDCzwDRRD5dwqB480talCs,10037
22
- protein_quest-0.5.1.dist-info/METADATA,sha256=MPfZLLa8XC1tZ3okRYIT3Hs3pMvd8ShA17Yy2axKBe8,9236
23
- protein_quest-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
24
- protein_quest-0.5.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
25
- protein_quest-0.5.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
26
- protein_quest-0.5.1.dist-info/RECORD,,