protein-quest 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ """Dask helper functions."""
2
+
3
+ import logging
4
+ import os
5
+ from collections.abc import Callable, Collection
6
+ from typing import Concatenate, ParamSpec, cast
7
+
8
+ from dask.distributed import Client, LocalCluster, progress
9
+ from distributed.deploy.cluster import Cluster
10
+ from psutil import cpu_count
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def configure_dask_scheduler(
    scheduler_address: str | Cluster | None,
    name: str,
    nproc: int = 1,
) -> str | Cluster:
    """Return the Dask scheduler to use, starting a local cluster when none is given.

    Args:
        scheduler_address: Address of an existing scheduler or a cluster to reuse.
            When None a new local cluster is created.
        name: Name for the local cluster. Only used when a cluster is created.
        nproc: Number of CPU cores to reserve per worker of the local cluster.
            Only used when a cluster is created.

    Returns:
        The given scheduler address or cluster, or the newly created local cluster.
    """
    # Reuse whatever the caller handed in; nothing to set up in that case.
    if scheduler_address is not None:
        return scheduler_address

    local_cluster = _configure_cpu_dask_scheduler(nproc, name)
    logger.info(f"Using local Dask cluster: {local_cluster}")
    return local_cluster
35
+
36
+
37
def nr_cpus() -> int:
    """Determine the number of CPU cores to use.

    If the environment variable SLURM_CPUS_PER_TASK or OMP_NUM_THREADS is set
    to a positive integer, the first such value is used. A value that is not a
    positive integer is ignored with a warning. Otherwise, the number of
    physical CPU cores is returned.

    Returns:
        The number of CPU cores to use.

    Raises:
        ValueError: If the number of physical CPU cores cannot be determined.
    """
    physical_cores = cpu_count(logical=False)
    if physical_cores is None:
        msg = "Cannot determine number of physical CPU cores."
        raise ValueError(msg)
    for var in ("SLURM_CPUS_PER_TASK", "OMP_NUM_THREADS"):
        value = os.environ.get(var)
        if value is None:
            continue
        try:
            requested = int(value)
        except ValueError:
            # Treat unparsable text (including an empty string) like any other unusable value.
            requested = 0
        if requested < 1:
            # Zero or negative cores would later yield a cluster without workers.
            logger.warning(
                'Ignoring environment variable "%s", its value %r is not a positive integer.',
                var,
                value,
            )
            continue
        logger.warning(
            'Not using all CPU cores (%s) of machine, environment variable "%s" is set to %s.',
            physical_cores,
            var,
            requested,
        )
        return requested
    return physical_cores
64
+
65
+
66
def _configure_cpu_dask_scheduler(nproc: int, name: str) -> LocalCluster:
    """Create a local Dask cluster with single-threaded workers.

    The number of workers is the number of usable CPU cores divided by ``nproc``,
    but never less than one.

    Args:
        nproc: Number of CPU cores to reserve per worker. Must be at least 1.
        name: Name for the Dask cluster.

    Returns:
        The newly created local cluster.

    Raises:
        ValueError: If ``nproc`` is smaller than 1.
    """
    if nproc < 1:
        msg = f"nproc must be at least 1, got {nproc}"
        raise ValueError(msg)
    total_cpus = nr_cpus()
    # When nproc exceeds the available cores the floor division gives 0;
    # a cluster without workers would accept tasks but never run them.
    n_workers = max(1, total_cpus // nproc)
    # Use single thread per worker to prevent GIL slowing down the computations
    return LocalCluster(name=name, threads_per_worker=1, n_workers=n_workers)
71
+
72
+
73
+ # Generic type parameters used across helpers
74
+ P = ParamSpec("P")
75
+
76
+
77
+ def dask_map_with_progress[T, R, **P](
78
+ client: Client,
79
+ func: Callable[Concatenate[T, P], R],
80
+ iterable: Collection[T],
81
+ *args: P.args,
82
+ **kwargs: P.kwargs,
83
+ ) -> list[R]:
84
+ """
85
+ Wrapper for map, progress, and gather of Dask that returns a correctly typed list.
86
+
87
+ Args:
88
+ client: Dask client.
89
+ func: Function to map; first parameter comes from ``iterable`` and any
90
+ additional parameters can be provided positionally via ``*args`` or
91
+ as keyword arguments via ``**kwargs``.
92
+ iterable: Collection of arguments to map over.
93
+ *args: Additional positional arguments to pass to client.map().
94
+ **kwargs: Additional keyword arguments to pass to client.map().
95
+
96
+ Returns:
97
+ List of results of type returned by `func` function.
98
+ """
99
+ if client.dashboard_link:
100
+ logger.info(f"Follow progress on dask dashboard at: {client.dashboard_link}")
101
+ futures = client.map(func, iterable, *args, **kwargs)
102
+ progress(futures)
103
+ results = client.gather(futures)
104
+ return cast("list[R]", results)
@@ -0,0 +1 @@
1
+ """Modules related to PDBe (Protein Data Bank in Europe)."""
@@ -0,0 +1,68 @@
1
+ """Module for fetching structures from PDBe."""
2
+
3
+ from collections.abc import Iterable, Mapping
4
+ from pathlib import Path
5
+
6
+ from protein_quest.utils import Cacher, retrieve_files, run_async
7
+
8
+
9
+ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
10
+ """
11
+ Map PDB id to a download gzipped mmCIF url and file.
12
+
13
+ For example for PDB id "8WAS", the url will be
14
+ "https://www.ebi.ac.uk/pdbe/entry-files/download/8was.cif.gz" and the file will be "8was.cif.gz".
15
+
16
+ Args:
17
+ pdb_id: The PDB ID to map.
18
+
19
+ Returns:
20
+ A tuple containing the URL to download the mmCIF file and the filename.
21
+ """
22
+ fn = f"{pdb_id.lower()}.cif.gz"
23
+ # On PDBe you can sometimes download an updated mmCIF file,
24
+ # Current url is for the archive mmCIF file
25
+ # TODO check if archive is OK, or if we should try to download the updated file
26
+ # this will cause many more requests, so we should only do this if needed
27
+ url = f"https://www.ebi.ac.uk/pdbe/entry-files/download/{fn}"
28
+ return url, fn
29
+
30
+
31
async def fetch(
    ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
) -> Mapping[str, Path]:
    """Download the mmCIF files of the given PDB ids from PDBe.

    Args:
        ids: The PDB IDs to fetch. An id that occurs more than once is fetched once.
        save_dir: The directory to save the fetched mmCIF files to.
        max_parallel_downloads: The maximum number of parallel downloads.
        cacher: An optional cacher to use for caching downloaded files.

    Returns:
        A dict with for each id the path of its mmCIF file inside ``save_dir``.
    """
    # The downloads can finish in a different order than the input ids,
    # so remember which url and filename belong to which id.
    url_and_file_by_id: dict[str, tuple[str, str]] = {}
    for pdb_id in ids:
        url_and_file_by_id[pdb_id] = _map_id_mmcif(pdb_id)

    downloads = [*url_and_file_by_id.values()]
    path_by_id = {pdb_id: save_dir / filename for pdb_id, (_url, filename) in url_and_file_by_id.items()}

    await retrieve_files(
        downloads,
        save_dir,
        max_parallel_downloads,
        desc="Downloading PDBe mmCIF files",
        cacher=cacher,
    )
    return path_by_id
55
+
56
+
57
def sync_fetch(
    ids: Iterable[str],
    save_dir: Path,
    max_parallel_downloads: int = 5,
    cacher: Cacher | None = None,
) -> Mapping[str, Path]:
    """Synchronously fetches mmCIF files from the PDBe database.

    Blocking counterpart of the asynchronous ``fetch`` function, it accepts the same arguments.

    Args:
        ids: A set of PDB IDs to fetch.
        save_dir: The directory to save the fetched mmCIF files to.
        max_parallel_downloads: The maximum number of parallel downloads.
        cacher: An optional cacher to use for caching downloaded files.

    Returns:
        A dict of id and paths to the downloaded mmCIF files.
    """
    return run_async(fetch(ids, save_dir, max_parallel_downloads, cacher=cacher))
protein_quest/py.typed ADDED
File without changes
protein_quest/ss.py ADDED
@@ -0,0 +1,280 @@
1
+ """Module for dealing with secondary structure."""
2
+
3
+ import logging
4
+ from collections.abc import Generator, Iterable
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ from gemmi import Structure
9
+
10
+ from protein_quest.converter import PositiveInt, Ratio, converter
11
+ from protein_quest.io import read_structure
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
16
+ # https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
17
+ # gemmi executable is in https://pypi.org/project/gemmi-program/
18
+ # `gemmi ss` only prints secondary structure to stdout with `-v` flag.
19
+
20
+
21
def nr_of_residues_in_total(structure: Structure) -> int:
    """Count the residues of every chain in every model of the structure.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The sum of the lengths of all chains over all models.
    """
    return sum(len(chain) for model in structure for chain in model)
35
+
36
+
37
def nr_of_residues_in_helix(structure: Structure) -> int:
    """Sum the lengths of all helices listed in the structure.

    Requires structure to have secondary structure information.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The number of residues in helices.
    """
    # cif files from AlphaFold have helix.length set to -1,
    # so the length is derived from the sequence numbers of the end points instead.
    total = 0
    for helix in structure.helices:
        last = helix.end.res_id.seqid.num
        first = helix.start.res_id.seqid.num
        if last is not None and first is not None:
            # Both end points are part of the helix, hence the + 1.
            total += last - first + 1
        else:
            logger.warning(f"Invalid helix coordinates: {helix.end} or {helix.start}")
    return total
60
+
61
+
62
def nr_of_residues_in_sheet(structure: Structure) -> int:
    """Sum the lengths of all strands of all sheets listed in the structure.

    Requires structure to have secondary structure information.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The number of residues in beta sheets.
    """
    total = 0
    all_strands = (strand for sheet in structure.sheets for strand in sheet.strands)
    for strand in all_strands:
        last = strand.end.res_id.seqid.num
        first = strand.start.res_id.seqid.num
        if last is not None and first is not None:
            # Both end points are part of the strand, hence the + 1.
            total += last - first + 1
        else:
            logger.warning(f"Invalid strand coordinates: {strand.end} or {strand.start}")
    return total
84
+
85
+
86
@dataclass
class SecondaryStructureFilterQuery:
    """Thresholds a structure must satisfy to pass the secondary structure filter.

    Every threshold is optional, a threshold that is None is not applied.
    The ratio thresholds are compared against the number of helix or sheet residues
    divided by the total number of residues.

    Parameters:
        abs_min_helix_residues: Minimum number of residues in helices (absolute).
        abs_max_helix_residues: Maximum number of residues in helices (absolute).
        abs_min_sheet_residues: Minimum number of residues in sheets (absolute).
        abs_max_sheet_residues: Maximum number of residues in sheets (absolute).
        ratio_min_helix_residues: Minimum number of residues in helices (relative).
        ratio_max_helix_residues: Maximum number of residues in helices (relative).
        ratio_min_sheet_residues: Minimum number of residues in sheets (relative).
        ratio_max_sheet_residues: Maximum number of residues in sheets (relative).
    """

    abs_min_helix_residues: PositiveInt | None = None
    abs_max_helix_residues: PositiveInt | None = None
    abs_min_sheet_residues: PositiveInt | None = None
    abs_max_sheet_residues: PositiveInt | None = None
    ratio_min_helix_residues: Ratio | None = None
    ratio_max_helix_residues: Ratio | None = None
    ratio_min_sheet_residues: Ratio | None = None
    ratio_max_sheet_residues: Ratio | None = None

    def is_actionable(self) -> bool:
        """Tell whether at least one threshold has been set.

        Returns:
            False when every threshold is None, True otherwise.
        """
        thresholds = (
            self.abs_min_helix_residues,
            self.abs_max_helix_residues,
            self.abs_min_sheet_residues,
            self.abs_max_sheet_residues,
            self.ratio_min_helix_residues,
            self.ratio_max_helix_residues,
            self.ratio_min_sheet_residues,
            self.ratio_max_sheet_residues,
        )
        # Compare against None explicitly: a threshold of 0 is still a threshold.
        return not all(threshold is None for threshold in thresholds)
129
+
130
+
131
+ def _check_range(min_val, max_val, label):
132
+ if min_val is not None and max_val is not None and min_val >= max_val:
133
+ msg = f"Invalid {label} range: min {min_val} must be smaller than max {max_val}"
134
+ raise ValueError(msg)
135
+
136
+
137
# Fetched before the hook below is registered; presumably so the hook can
# delegate to the converter's own hook instead of to itself.
base_query_hook = converter.get_structure_hook(SecondaryStructureFilterQuery)


@converter.register_structure_hook
def secondary_structure_filter_query_hook(value, _type) -> SecondaryStructureFilterQuery:
    """Structure a query with the base hook and validate its min/max pairs.

    Args:
        value: The unstructured value to build the query from.
        _type: The type to structure to, passed on to the base hook.

    Returns:
        The structured query.

    Raises:
        ValueError: If the minimum of a pair is not smaller than its maximum.
    """
    query: SecondaryStructureFilterQuery = base_query_hook(value, _type)
    bound_pairs = (
        (query.abs_min_helix_residues, query.abs_max_helix_residues, "absolute helix residue"),
        (query.abs_min_sheet_residues, query.abs_max_sheet_residues, "absolute sheet residue"),
        (query.ratio_min_helix_residues, query.ratio_max_helix_residues, "ratio helix residue"),
        (query.ratio_min_sheet_residues, query.ratio_max_sheet_residues, "ratio sheet residue"),
    )
    for lower, upper, label in bound_pairs:
        _check_range(lower, upper, label)
    return query
148
+
149
+
150
@dataclass
class SecondaryStructureStats:
    """Residue counts of a structure, in total and per kind of secondary structure.

    Parameters:
        nr_residues: Total number of residues in the structure.
        nr_helix_residues: Number of residues in helices.
        nr_sheet_residues: Number of residues in sheets.
        helix_ratio: Number of helix residues divided by the total number of residues.
        sheet_ratio: Number of sheet residues divided by the total number of residues.
    """

    # Absolute counts
    nr_residues: PositiveInt
    nr_helix_residues: PositiveInt
    nr_sheet_residues: PositiveInt
    # Counts relative to nr_residues
    helix_ratio: Ratio
    sheet_ratio: Ratio
167
+
168
+
169
@dataclass
class SecondaryStructureFilterResult:
    """Outcome of applying a secondary structure filter to one structure.

    Parameters:
        stats: The secondary structure statistics of the structure.
        passed: Whether the structure satisfied all filtering criteria. Defaults to False.
    """

    stats: SecondaryStructureStats
    passed: bool = False
180
+
181
+
182
def _gather_stats(structure: Structure) -> SecondaryStructureStats:
    """Count the total, helix and sheet residues of a structure and derive their ratios.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The counts and the helix and sheet ratios.

    Raises:
        ValueError: If the structure has no residues, as the ratios would be undefined.
    """
    total = nr_of_residues_in_total(structure)
    in_helix = nr_of_residues_in_helix(structure)
    in_sheet = nr_of_residues_in_sheet(structure)
    if total == 0:
        msg = "Structure has zero residues; cannot compute secondary structure ratios."
        raise ValueError(msg)
    return SecondaryStructureStats(
        nr_residues=total,
        nr_helix_residues=in_helix,
        nr_sheet_residues=in_sheet,
        helix_ratio=in_helix / total,
        sheet_ratio=in_sheet / total,
    )
198
+
199
+
200
def filter_on_secondary_structure(
    structure: Structure,
    query: SecondaryStructureFilterQuery,
) -> SecondaryStructureFilterResult:
    """Check a structure against the secondary structure thresholds of a query.

    A structure passes when it satisfies every threshold that is set in the query.
    Minimum and maximum thresholds are inclusive.

    Args:
        structure: The gemmi Structure object to analyze.
        query: The filtering criteria to apply.

    Returns:
        Filtering statistics and whether structure passed.

    Raises:
        ValueError: If the query has no threshold set.
    """
    stats = _gather_stats(structure)

    # Each row is: observed value, its minimum threshold, its maximum threshold.
    checks = (
        # Helix absolute thresholds
        (stats.nr_helix_residues, query.abs_min_helix_residues, query.abs_max_helix_residues),
        # Helix ratio thresholds
        (stats.helix_ratio, query.ratio_min_helix_residues, query.ratio_max_helix_residues),
        # Sheet absolute thresholds
        (stats.nr_sheet_residues, query.abs_min_sheet_residues, query.abs_max_sheet_residues),
        # Sheet ratio thresholds
        (stats.sheet_ratio, query.ratio_min_sheet_residues, query.ratio_max_sheet_residues),
    )
    outcomes: list[bool] = []
    for observed, lower, upper in checks:
        if lower is not None:
            outcomes.append(observed >= lower)
        if upper is not None:
            outcomes.append(observed <= upper)

    if not outcomes:
        msg = "No filtering conditions provided. Please specify at least one condition."
        raise ValueError(msg)
    return SecondaryStructureFilterResult(stats=stats, passed=all(outcomes))
245
+
246
+
247
def filter_file_on_secondary_structure(
    file_path: Path,
    query: SecondaryStructureFilterQuery,
) -> SecondaryStructureFilterResult:
    """Read a structure file and check it against the secondary structure thresholds of a query.

    Args:
        file_path: The path to the structure file to analyze.
        query: The filtering criteria to apply.

    Returns:
        Filtering statistics and whether file passed.
    """
    return filter_on_secondary_structure(read_structure(file_path), query)
262
+
263
+
264
def filter_files_on_secondary_structure(
    file_paths: Iterable[Path],
    query: SecondaryStructureFilterQuery,
) -> Generator[tuple[Path, SecondaryStructureFilterResult]]:
    """Lazily check structure files, one by one, against the thresholds of a query.

    Args:
        file_paths: The paths of the structure files to analyze.
        query: The filtering criteria to apply.

    Yields:
        For each file its path together with the filtering statistics and whether it passed.
    """
    # TODO check if quick enough in serial mode, if not switch to dask map
    for path in file_paths:
        yield path, filter_file_on_secondary_structure(path, query)
@@ -0,0 +1,232 @@
1
+ """Module for querying and modifying [gemmi structures][gemmi.Structure]."""
2
+
3
+ import logging
4
+ from collections.abc import Iterable
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+
8
+ import gemmi
9
+
10
+ from protein_quest.__version__ import __version__
11
+ from protein_quest.io import read_structure, split_name_and_extension, write_structure
12
+ from protein_quest.utils import CopyMethod, copyfile
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
    """Look up a chain in a model, by exact name first and by name suffix second.

    Args:
        model: The gemmi model to search in.
        wanted_chain: The chain identifier to search for.

    Returns:
        The chain with the wanted name, else the first chain whose name ends with
        the wanted name, else None.
    """
    exact_match = model.find_chain(wanted_chain)
    if exact_match is not None:
        return exact_match
    # For chain A in 4v92 the find_chain method returns None,
    # however the chain name is prefixed with 'B',
    # so fall back to matching on the end of the chain name.
    return next((candidate for candidate in model if candidate.name.endswith(wanted_chain)), None)
36
+
37
+
38
def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
    """Look up a chain in the models of a structure.

    The models are searched in order and the first match is returned.

    Args:
        structure: The gemmi structure to search in.
        wanted_chain: The chain identifier to search for.

    Returns:
        The found chain or None if no model has it.
    """
    matches_per_model = (find_chain_in_model(model, wanted_chain) for model in structure)
    return next((match for match in matches_per_model if match is not None), None)
53
+
54
+
55
def nr_residues_in_chain(file: Path, chain: str = "A") -> int:
    """Count the residues of one chain of a structure file.

    Args:
        file: Path to the input structure file.
        chain: Chain to count residues of.

    Returns:
        The number of residues in the chain, or 0 (with a logged warning) when
        the chain is not in the file.
    """
    gchain = find_chain_in_structure(read_structure(file), chain)
    if gchain is not None:
        return len(gchain)
    logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
    return 0
71
+
72
+
73
+ def _dedup_helices(structure: gemmi.Structure):
74
+ helix_starts: set[str] = set()
75
+ duplicate_helix_indexes: list[int] = []
76
+ for hindex, helix in enumerate(structure.helices):
77
+ if str(helix.start) in helix_starts:
78
+ logger.debug(f"Duplicate start helix found: {hindex} {helix.start}, removing")
79
+ duplicate_helix_indexes.append(hindex)
80
+ else:
81
+ helix_starts.add(str(helix.start))
82
+ for helix_index in reversed(duplicate_helix_indexes):
83
+ structure.helices.pop(helix_index)
84
+
85
+
86
+ def _dedup_sheets(structure: gemmi.Structure, chain2keep: str):
87
+ duplicate_sheet_indexes: list[int] = []
88
+ for sindex, sheet in enumerate(structure.sheets):
89
+ if sheet.name != chain2keep:
90
+ duplicate_sheet_indexes.append(sindex)
91
+ for sheet_index in reversed(duplicate_sheet_indexes):
92
+ structure.sheets.pop(sheet_index)
93
+
94
+
95
def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain: str):
    """Record in the structure metadata that it was derived from a single chain.

    Renames the structure, overwrites its entry id, title and keywords, and
    appends a software item with the current package version and today's UTC date.

    Args:
        structure: The gemmi structure to modify in place.
        chain2keep: Name of the chain that was kept.
        out_chain: Name the chain has in the output.
    """
    original_id = structure.name
    derived_id = f"{original_id}{chain2keep}2{out_chain}"
    title = f"From {original_id} chain {chain2keep} to {out_chain}"

    structure.name = derived_id
    structure.info["_entry.id"] = derived_id
    structure.info["_struct.title"] = title
    structure.info["_struct_keywords.pdbx_keywords"] = title.upper()

    software = gemmi.SoftwareItem()
    software.classification = gemmi.SoftwareItem.Classification.DataExtraction
    # NOTE(review): this name does not match the function that is visible in this
    # module (write_single_chain_structure_file), confirm whether it is stale.
    software.name = "protein-quest.pdbe.io.write_single_chain_pdb_file"
    software.version = str(__version__)
    software.date = str(datetime.now(tz=UTC).date())
    # Assign a new list instead of appending to the existing one.
    structure.meta.software = list(structure.meta.software) + [software]
109
+
110
+
111
def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
    """Collect the chains of all models of a structure.

    Args:
        structure: The gemmi structure to get chains from.

    Returns:
        A set of chains in the structure.
    """
    chains: set[gemmi.Chain] = set()
    for model in structure:
        chains.update(model)
    return chains
121
+
122
+
123
class ChainNotFoundError(IndexError):
    """Raised when a wanted chain is not present in a structure file.

    Attributes:
        chain_id: The chain that was searched for.
        file: The file that was searched in.
    """

    def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
        """Build the error message and remember the chain and the file.

        Args:
            chain: The chain that was searched for.
            file: The file that was searched in.
            available_chains: The chains that are in the file, only used in the message.
        """
        message = f"Chain {chain} not found in {file}. Available chains are: {available_chains}"
        super().__init__(message)
        self.file = file
        self.chain_id = chain
130
+
131
+
132
def write_single_chain_structure_file(
    input_file: Path,
    chain2keep: str,
    output_dir: Path,
    out_chain: str = "A",
    copy_method: CopyMethod = "copy",
) -> Path:
    """Extract one chain of a structure file into a new structure file.

    The output file is written to ``output_dir`` and named after the input file,
    the name of the kept chain and ``out_chain``. When that file already exists
    nothing is written. When the input file only has the wanted chain and it is
    already named ``out_chain`` the input file is copied as is.

    Otherwise also

    - removes ligands and waters
    - renumbers atoms ids
    - removes chem_comp section from cif files
    - adds provenance information to the header like software and input file+chain

    This function is equivalent to the following gemmi commands:

    ```shell
    gemmi convert --remove-lig-wat --select=B --to=cif chain-in/3JRS.cif - | \\
    gemmi convert --from=cif --rename-chain=B:A - chain-out/3JRS_B2A.gemmi.cif
    ```

    Args:
        input_file: Path to the input structure file.
        chain2keep: The chain to keep.
        output_dir: Directory to save the output file.
        out_chain: The chain identifier for the output file.
        copy_method: How to copy when no changes are needed to output file.

    Returns:
        Path to the output structure file

    Raises:
        FileNotFoundError: If the input file does not exist
            (presumably raised while reading the structure).
        ChainNotFoundError: If the specified chain is not found in the input file.
    """
    logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
    structure = read_structure(input_file)
    structure.setup_entities()

    found_chain = find_chain_in_structure(structure, chain2keep)
    available_names = {c.name for c in chains_in_structure(structure)}
    if found_chain is None:
        raise ChainNotFoundError(chain2keep, input_file, available_names)

    # The chain can have been found on the end of its name only,
    # so continue with the name of the chain that was found.
    source_chain = found_chain.name
    stem, suffix = split_name_and_extension(input_file.name)
    output_file = output_dir / f"{stem}_{source_chain}2{out_chain}{suffix}"

    if output_file.exists():
        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
        return output_file

    nothing_to_change = source_chain == out_chain and len(available_names) == 1
    if nothing_to_change:
        logger.info(
            "%s only has chain %s and out_chain is also %s. Copying file to %s.",
            input_file,
            source_chain,
            out_chain,
            output_file,
        )
        copyfile(input_file, output_file, copy_method)
        return output_file

    # Strip everything except the wanted chain, then rename and clean up.
    keep_only_chain = gemmi.Selection(source_chain)
    keep_only_chain.remove_not_selected(structure)
    for model in structure:
        model.remove_ligands_and_waters()
    structure.setup_entities()
    structure.rename_chain(source_chain, out_chain)
    _dedup_helices(structure)
    _dedup_sheets(structure, out_chain)
    _add_provenance_info(structure, source_chain, out_chain)

    write_structure(structure, output_file)
    return output_file
209
+
210
+
211
def structure2uniprot_accessions(structure: gemmi.Structure) -> set[str]:
    """Collect the UniProt accessions referenced by a gemmi Structure object.

    The accessions are read from the rows of the ``_struct_ref.`` mmCIF category
    that have "UNP" as database name.
    Logs a warning and returns an empty set if no accessions are found in structure.

    Args:
        structure: The gemmi Structure object to extract UniProt accessions from.

    Returns:
        A set of UniProt accessions found in the structure.
    """
    # Only the struct_ref group is written to the block, all other groups are switched off.
    only_struct_ref = gemmi.MmcifOutputGroups(False, struct_ref=True)
    struct_ref = structure.make_mmcif_block(only_struct_ref).get_mmcif_category("_struct_ref.")
    accessions: set[str] = {
        struct_ref["pdbx_db_accession"][row]
        for row, source_db in enumerate(struct_ref["db_name"])
        if source_db == "UNP"
    }
    if not accessions:
        logger.warning("No UniProt accessions found in structure %s", structure.name)
    return accessions