protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

@@ -1,51 +1,29 @@
1
- """Module for structure file input/output."""
1
+ """Module for querying and modifying [gemmi structures][gemmi.Structure]."""
2
2
 
3
- import gzip
4
3
  import logging
5
- from collections.abc import Generator, Iterable
4
+ from collections.abc import Iterable
6
5
  from datetime import UTC, datetime
7
6
  from pathlib import Path
8
7
 
9
8
  import gemmi
10
9
 
11
10
  from protein_quest.__version__ import __version__
11
+ from protein_quest.io import read_structure, split_name_and_extension, write_structure
12
12
  from protein_quest.utils import CopyMethod, copyfile
13
13
 
14
14
  logger = logging.getLogger(__name__)
15
15
 
16
- # TODO remove once v0.7.4 of gemmi is released,
17
- # as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
18
- # Swallow gemmi leaked function warnings
19
- gemmi.set_leak_warnings(False)
20
16
 
21
-
22
- def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
23
- """Returns the number of residues in a specific chain from a mmCIF/pdb file.
17
+ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
18
+ """Find a chain in a model.
24
19
 
25
20
  Args:
26
- file: Path to the input mmCIF/pdb file.
27
- chain: Chain to count residues of.
21
+ model: The gemmi model to search in.
22
+ wanted_chain: The chain identifier to search for.
28
23
 
29
24
  Returns:
30
- The number of residues in the specified chain.
25
+ The found chain or None if not found.
31
26
  """
32
- structure = gemmi.read_structure(str(file))
33
- gchain = find_chain_in_structure(structure, chain)
34
- if gchain is None:
35
- logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
36
- return 0
37
- return len(gchain)
38
-
39
-
40
- def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
41
- for model in structure:
42
- chain = find_chain_in_model(model, wanted_chain)
43
- if chain is not None:
44
- return chain
45
- return None
46
-
47
-
48
- def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
49
27
  chain = model.find_chain(wanted_chain)
50
28
  if chain is None:
51
29
  # For chain A in 4v92 the find_chain method returns None,
@@ -57,106 +35,39 @@ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain |
57
35
  return chain
58
36
 
59
37
 
60
- def write_structure(structure: gemmi.Structure, path: Path):
61
- """Write a gemmi structure to a file.
62
-
63
- Args:
64
- structure: The gemmi structure to write.
65
- path: The file path to write the structure to.
66
- The format depends on the file extension.
67
- Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.
68
-
69
- Raises:
70
- ValueError: If the file extension is not supported.
71
- """
72
- if path.name.endswith(".pdb"):
73
- body: str = structure.make_pdb_string()
74
- path.write_text(body)
75
- elif path.name.endswith(".pdb.gz"):
76
- body: str = structure.make_pdb_string()
77
- with gzip.open(path, "wt") as f:
78
- f.write(body)
79
- elif path.name.endswith(".cif"):
80
- # do not write chem_comp so it is viewable by molstar
81
- # see https://github.com/project-gemmi/gemmi/discussions/362
82
- doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
83
- doc.write_file(str(path))
84
- elif path.name.endswith(".cif.gz"):
85
- doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
86
- cif_str = doc.as_string()
87
- with gzip.open(path, "wt") as f:
88
- f.write(cif_str)
89
- else:
90
- msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
91
- raise ValueError(msg)
92
-
93
-
94
- def _split_name_and_extension(name: str) -> tuple[str, str]:
95
- # 1234.pdb -> (1234, .pdb)
96
- # 1234.pdb.gz -> (1234, .pdb.gz)
97
- # 1234.cif -> (1234, .cif)
98
- # 1234.cif.gz -> (1234, .cif.gz)
99
- if name.endswith(".pdb.gz"):
100
- return name.replace(".pdb.gz", ""), ".pdb.gz"
101
- if name.endswith(".cif.gz"):
102
- return name.replace(".cif.gz", ""), ".cif.gz"
103
- if name.endswith(".pdb"):
104
- return name.replace(".pdb", ""), ".pdb"
105
- if name.endswith(".cif"):
106
- return name.replace(".cif", ""), ".cif"
107
-
108
- msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
109
- raise ValueError(msg)
110
-
111
-
112
- def locate_structure_file(root: Path, pdb_id: str) -> Path:
113
- """Locate a structure file for a given PDB ID in the specified directory.
38
+ def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
39
+ """Find a chain in a structure.
114
40
 
115
41
  Args:
116
- root: The root directory to search in.
117
- pdb_id: The PDB ID to locate.
42
+ structure: The gemmi structure to search in.
43
+ wanted_chain: The chain identifier to search for.
118
44
 
119
45
  Returns:
120
- The path to the located structure file.
121
-
122
- Raises:
123
- FileNotFoundError: If no structure file is found for the given PDB ID.
46
+ The found chain or None if not found.
124
47
  """
125
- exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb", ".ent", ".ent.gz"]
126
- for ext in exts:
127
- candidates = (
128
- root / f"{pdb_id}{ext}",
129
- root / f"{pdb_id.lower()}{ext}",
130
- root / f"{pdb_id.upper()}{ext}",
131
- root / f"pdb{pdb_id.lower()}{ext}",
132
- )
133
- for candidate in candidates:
134
- if candidate.exists():
135
- return candidate
136
- msg = f"No structure file found for {pdb_id} in {root}"
137
- raise FileNotFoundError(msg)
48
+ for model in structure:
49
+ chain = find_chain_in_model(model, wanted_chain)
50
+ if chain is not None:
51
+ return chain
52
+ return None
138
53
 
139
54
 
140
- def glob_structure_files(input_dir: Path) -> Generator[Path]:
141
- """Glob for structure files in a directory.
55
+ def nr_residues_in_chain(file: Path, chain: str = "A") -> int:
56
+ """Returns the number of residues in a specific chain from a structure file.
142
57
 
143
58
  Args:
144
- input_dir: The input directory to search for structure files.
59
+ file: Path to the input structure file.
60
+ chain: Chain to count residues of.
145
61
 
146
- Yields:
147
- Paths to the found structure files.
62
+ Returns:
63
+ The number of residues in the specified chain.
148
64
  """
149
- for ext in [".cif.gz", ".cif", ".pdb.gz", ".pdb"]:
150
- yield from input_dir.glob(f"*{ext}")
151
-
152
-
153
- class ChainNotFoundError(IndexError):
154
- """Exception raised when a chain is not found in a structure."""
155
-
156
- def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
157
- super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
158
- self.chain_id = chain
159
- self.file = file
65
+ structure = read_structure(file)
66
+ gchain = find_chain_in_structure(structure, chain)
67
+ if gchain is None:
68
+ logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
69
+ return 0
70
+ return len(gchain)
160
71
 
161
72
 
162
73
  def _dedup_helices(structure: gemmi.Structure):
@@ -198,18 +109,34 @@ def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain:
198
109
 
199
110
 
200
111
  def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
201
- """Get a list of chains in a structure."""
112
+ """Get a list of chains in a structure.
113
+
114
+ Args:
115
+ structure: The gemmi structure to get chains from.
116
+
117
+ Returns:
118
+ A set of chains in the structure.
119
+ """
202
120
  return {c for model in structure for c in model}
203
121
 
204
122
 
205
- def write_single_chain_pdb_file(
123
+ class ChainNotFoundError(IndexError):
124
+ """Exception raised when a chain is not found in a structure."""
125
+
126
+ def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
127
+ super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
128
+ self.chain_id = chain
129
+ self.file = file
130
+
131
+
132
+ def write_single_chain_structure_file(
206
133
  input_file: Path,
207
134
  chain2keep: str,
208
135
  output_dir: Path,
209
136
  out_chain: str = "A",
210
137
  copy_method: CopyMethod = "copy",
211
138
  ) -> Path:
212
- """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.
139
+ """Write a single chain from a structure file to a new structure file.
213
140
 
214
141
  Also
215
142
 
@@ -226,14 +153,14 @@ def write_single_chain_pdb_file(
226
153
  ```
227
154
 
228
155
  Args:
229
- input_file: Path to the input mmCIF/pdb file.
156
+ input_file: Path to the input structure file.
230
157
  chain2keep: The chain to keep.
231
158
  output_dir: Directory to save the output file.
232
159
  out_chain: The chain identifier for the output file.
233
160
  copy_method: How to copy when no changes are needed to output file.
234
161
 
235
162
  Returns:
236
- Path to the output mmCIF/pdb file
163
+ Path to the output structure file
237
164
 
238
165
  Raises:
239
166
  FileNotFoundError: If the input file does not exist.
@@ -241,7 +168,7 @@ def write_single_chain_pdb_file(
241
168
  """
242
169
 
243
170
  logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
244
- structure = gemmi.read_structure(str(input_file))
171
+ structure = read_structure(input_file)
245
172
  structure.setup_entities()
246
173
 
247
174
  chain = find_chain_in_structure(structure, chain2keep)
@@ -249,7 +176,7 @@ def write_single_chain_pdb_file(
249
176
  if chain is None:
250
177
  raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
251
178
  chain_name = chain.name
252
- name, extension = _split_name_and_extension(input_file.name)
179
+ name, extension = split_name_and_extension(input_file.name)
253
180
  output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
254
181
 
255
182
  if output_file.exists():
@@ -279,3 +206,27 @@ def write_single_chain_pdb_file(
279
206
  write_structure(structure, output_file)
280
207
 
281
208
  return output_file
209
+
210
+
211
+ def structure2uniprot_accessions(structure: gemmi.Structure) -> set[str]:
212
+ """Extract UniProt accessions from a gemmi Structure object.
213
+
214
+ Logs a warning and returns an empty set if no accessions are found in structure.
215
+
216
+ Args:
217
+ structure: The gemmi Structure object to extract UniProt accessions from.
218
+
219
+ Returns:
220
+ A set of UniProt accessions found in the structure.
221
+ """
222
+ block = structure.make_mmcif_block(gemmi.MmcifOutputGroups(False, struct_ref=True))
223
+ struct_ref = block.get_mmcif_category("_struct_ref.")
224
+ uniprot_accessions: set[str] = set()
225
+ for i, db_name in enumerate(struct_ref["db_name"]):
226
+ if db_name != "UNP":
227
+ continue
228
+ pdbx_db_accession = struct_ref["pdbx_db_accession"][i]
229
+ uniprot_accessions.add(pdbx_db_accession)
230
+ if not uniprot_accessions:
231
+ logger.warning("No UniProt accessions found in structure %s", structure.name)
232
+ return uniprot_accessions