protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/confidence.py +2 -2
- protein_quest/alphafold/entry_summary.py +46 -22
- protein_quest/alphafold/fetch.py +76 -42
- protein_quest/cli.py +385 -114
- protein_quest/filters.py +2 -5
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +21 -7
- protein_quest/ss.py +3 -7
- protein_quest/{pdbe/io.py → structure.py} +77 -126
- protein_quest/uniprot.py +287 -15
- protein_quest/utils.py +26 -2
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/METADATA +42 -5
- protein_quest-0.7.0.dist-info/RECORD +27 -0
- protein_quest-0.5.1.dist-info/RECORD +0 -26
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,51 +1,29 @@
|
|
|
1
|
-
"""Module for
|
|
1
|
+
"""Module for querying and modifying [gemmi structures][gemmi.Structure]."""
|
|
2
2
|
|
|
3
|
-
import gzip
|
|
4
3
|
import logging
|
|
5
|
-
from collections.abc import
|
|
4
|
+
from collections.abc import Iterable
|
|
6
5
|
from datetime import UTC, datetime
|
|
7
6
|
from pathlib import Path
|
|
8
7
|
|
|
9
8
|
import gemmi
|
|
10
9
|
|
|
11
10
|
from protein_quest.__version__ import __version__
|
|
11
|
+
from protein_quest.io import read_structure, split_name_and_extension, write_structure
|
|
12
12
|
from protein_quest.utils import CopyMethod, copyfile
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
16
|
-
# TODO remove once v0.7.4 of gemmi is released,
|
|
17
|
-
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
18
|
-
# Swallow gemmi leaked function warnings
|
|
19
|
-
gemmi.set_leak_warnings(False)
|
|
20
16
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"""Returns the number of residues in a specific chain from a mmCIF/pdb file.
|
|
17
|
+
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
|
|
18
|
+
"""Find a chain in a model.
|
|
24
19
|
|
|
25
20
|
Args:
|
|
26
|
-
|
|
27
|
-
|
|
21
|
+
model: The gemmi model to search in.
|
|
22
|
+
wanted_chain: The chain identifier to search for.
|
|
28
23
|
|
|
29
24
|
Returns:
|
|
30
|
-
The
|
|
25
|
+
The found chain or None if not found.
|
|
31
26
|
"""
|
|
32
|
-
structure = gemmi.read_structure(str(file))
|
|
33
|
-
gchain = find_chain_in_structure(structure, chain)
|
|
34
|
-
if gchain is None:
|
|
35
|
-
logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
|
|
36
|
-
return 0
|
|
37
|
-
return len(gchain)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
|
|
41
|
-
for model in structure:
|
|
42
|
-
chain = find_chain_in_model(model, wanted_chain)
|
|
43
|
-
if chain is not None:
|
|
44
|
-
return chain
|
|
45
|
-
return None
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
|
|
49
27
|
chain = model.find_chain(wanted_chain)
|
|
50
28
|
if chain is None:
|
|
51
29
|
# For chain A in 4v92 the find_chain method returns None,
|
|
@@ -57,106 +35,39 @@ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain |
|
|
|
57
35
|
return chain
|
|
58
36
|
|
|
59
37
|
|
|
60
|
-
def
|
|
61
|
-
"""
|
|
62
|
-
|
|
63
|
-
Args:
|
|
64
|
-
structure: The gemmi structure to write.
|
|
65
|
-
path: The file path to write the structure to.
|
|
66
|
-
The format depends on the file extension.
|
|
67
|
-
Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.
|
|
68
|
-
|
|
69
|
-
Raises:
|
|
70
|
-
ValueError: If the file extension is not supported.
|
|
71
|
-
"""
|
|
72
|
-
if path.name.endswith(".pdb"):
|
|
73
|
-
body: str = structure.make_pdb_string()
|
|
74
|
-
path.write_text(body)
|
|
75
|
-
elif path.name.endswith(".pdb.gz"):
|
|
76
|
-
body: str = structure.make_pdb_string()
|
|
77
|
-
with gzip.open(path, "wt") as f:
|
|
78
|
-
f.write(body)
|
|
79
|
-
elif path.name.endswith(".cif"):
|
|
80
|
-
# do not write chem_comp so it is viewable by molstar
|
|
81
|
-
# see https://github.com/project-gemmi/gemmi/discussions/362
|
|
82
|
-
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
83
|
-
doc.write_file(str(path))
|
|
84
|
-
elif path.name.endswith(".cif.gz"):
|
|
85
|
-
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
86
|
-
cif_str = doc.as_string()
|
|
87
|
-
with gzip.open(path, "wt") as f:
|
|
88
|
-
f.write(cif_str)
|
|
89
|
-
else:
|
|
90
|
-
msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
|
|
91
|
-
raise ValueError(msg)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def _split_name_and_extension(name: str) -> tuple[str, str]:
|
|
95
|
-
# 1234.pdb -> (1234, .pdb)
|
|
96
|
-
# 1234.pdb.gz -> (1234, .pdb.gz)
|
|
97
|
-
# 1234.cif -> (1234, .cif)
|
|
98
|
-
# 1234.cif.gz -> (1234, .cif.gz)
|
|
99
|
-
if name.endswith(".pdb.gz"):
|
|
100
|
-
return name.replace(".pdb.gz", ""), ".pdb.gz"
|
|
101
|
-
if name.endswith(".cif.gz"):
|
|
102
|
-
return name.replace(".cif.gz", ""), ".cif.gz"
|
|
103
|
-
if name.endswith(".pdb"):
|
|
104
|
-
return name.replace(".pdb", ""), ".pdb"
|
|
105
|
-
if name.endswith(".cif"):
|
|
106
|
-
return name.replace(".cif", ""), ".cif"
|
|
107
|
-
|
|
108
|
-
msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
|
|
109
|
-
raise ValueError(msg)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def locate_structure_file(root: Path, pdb_id: str) -> Path:
|
|
113
|
-
"""Locate a structure file for a given PDB ID in the specified directory.
|
|
38
|
+
def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
|
|
39
|
+
"""Find a chain in a structure.
|
|
114
40
|
|
|
115
41
|
Args:
|
|
116
|
-
|
|
117
|
-
|
|
42
|
+
structure: The gemmi structure to search in.
|
|
43
|
+
wanted_chain: The chain identifier to search for.
|
|
118
44
|
|
|
119
45
|
Returns:
|
|
120
|
-
The
|
|
121
|
-
|
|
122
|
-
Raises:
|
|
123
|
-
FileNotFoundError: If no structure file is found for the given PDB ID.
|
|
46
|
+
The found chain or None if not found.
|
|
124
47
|
"""
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
root / f"{pdb_id.upper()}{ext}",
|
|
131
|
-
root / f"pdb{pdb_id.lower()}{ext}",
|
|
132
|
-
)
|
|
133
|
-
for candidate in candidates:
|
|
134
|
-
if candidate.exists():
|
|
135
|
-
return candidate
|
|
136
|
-
msg = f"No structure file found for {pdb_id} in {root}"
|
|
137
|
-
raise FileNotFoundError(msg)
|
|
48
|
+
for model in structure:
|
|
49
|
+
chain = find_chain_in_model(model, wanted_chain)
|
|
50
|
+
if chain is not None:
|
|
51
|
+
return chain
|
|
52
|
+
return None
|
|
138
53
|
|
|
139
54
|
|
|
140
|
-
def
|
|
141
|
-
"""
|
|
55
|
+
def nr_residues_in_chain(file: Path, chain: str = "A") -> int:
|
|
56
|
+
"""Returns the number of residues in a specific chain from a structure file.
|
|
142
57
|
|
|
143
58
|
Args:
|
|
144
|
-
|
|
59
|
+
file: Path to the input structure file.
|
|
60
|
+
chain: Chain to count residues of.
|
|
145
61
|
|
|
146
|
-
|
|
147
|
-
|
|
62
|
+
Returns:
|
|
63
|
+
The number of residues in the specified chain.
|
|
148
64
|
"""
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
|
|
157
|
-
super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
|
|
158
|
-
self.chain_id = chain
|
|
159
|
-
self.file = file
|
|
65
|
+
structure = read_structure(file)
|
|
66
|
+
gchain = find_chain_in_structure(structure, chain)
|
|
67
|
+
if gchain is None:
|
|
68
|
+
logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
|
|
69
|
+
return 0
|
|
70
|
+
return len(gchain)
|
|
160
71
|
|
|
161
72
|
|
|
162
73
|
def _dedup_helices(structure: gemmi.Structure):
|
|
@@ -198,18 +109,34 @@ def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain:
|
|
|
198
109
|
|
|
199
110
|
|
|
200
111
|
def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
|
|
201
|
-
"""Get a list of chains in a structure.
|
|
112
|
+
"""Get a list of chains in a structure.
|
|
113
|
+
|
|
114
|
+
Args:
|
|
115
|
+
structure: The gemmi structure to get chains from.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
A set of chains in the structure.
|
|
119
|
+
"""
|
|
202
120
|
return {c for model in structure for c in model}
|
|
203
121
|
|
|
204
122
|
|
|
205
|
-
|
|
123
|
+
class ChainNotFoundError(IndexError):
|
|
124
|
+
"""Exception raised when a chain is not found in a structure."""
|
|
125
|
+
|
|
126
|
+
def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
|
|
127
|
+
super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
|
|
128
|
+
self.chain_id = chain
|
|
129
|
+
self.file = file
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def write_single_chain_structure_file(
|
|
206
133
|
input_file: Path,
|
|
207
134
|
chain2keep: str,
|
|
208
135
|
output_dir: Path,
|
|
209
136
|
out_chain: str = "A",
|
|
210
137
|
copy_method: CopyMethod = "copy",
|
|
211
138
|
) -> Path:
|
|
212
|
-
"""Write a single chain from a
|
|
139
|
+
"""Write a single chain from a structure file to a new structure file.
|
|
213
140
|
|
|
214
141
|
Also
|
|
215
142
|
|
|
@@ -226,14 +153,14 @@ def write_single_chain_pdb_file(
|
|
|
226
153
|
```
|
|
227
154
|
|
|
228
155
|
Args:
|
|
229
|
-
input_file: Path to the input
|
|
156
|
+
input_file: Path to the input structure file.
|
|
230
157
|
chain2keep: The chain to keep.
|
|
231
158
|
output_dir: Directory to save the output file.
|
|
232
159
|
out_chain: The chain identifier for the output file.
|
|
233
160
|
copy_method: How to copy when no changes are needed to output file.
|
|
234
161
|
|
|
235
162
|
Returns:
|
|
236
|
-
Path to the output
|
|
163
|
+
Path to the output structure file
|
|
237
164
|
|
|
238
165
|
Raises:
|
|
239
166
|
FileNotFoundError: If the input file does not exist.
|
|
@@ -241,7 +168,7 @@ def write_single_chain_pdb_file(
|
|
|
241
168
|
"""
|
|
242
169
|
|
|
243
170
|
logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
|
|
244
|
-
structure =
|
|
171
|
+
structure = read_structure(input_file)
|
|
245
172
|
structure.setup_entities()
|
|
246
173
|
|
|
247
174
|
chain = find_chain_in_structure(structure, chain2keep)
|
|
@@ -249,7 +176,7 @@ def write_single_chain_pdb_file(
|
|
|
249
176
|
if chain is None:
|
|
250
177
|
raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
|
|
251
178
|
chain_name = chain.name
|
|
252
|
-
name, extension =
|
|
179
|
+
name, extension = split_name_and_extension(input_file.name)
|
|
253
180
|
output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
|
|
254
181
|
|
|
255
182
|
if output_file.exists():
|
|
@@ -279,3 +206,27 @@ def write_single_chain_pdb_file(
|
|
|
279
206
|
write_structure(structure, output_file)
|
|
280
207
|
|
|
281
208
|
return output_file
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def structure2uniprot_accessions(structure: gemmi.Structure) -> set[str]:
|
|
212
|
+
"""Extract UniProt accessions from a gemmi Structure object.
|
|
213
|
+
|
|
214
|
+
Logs a warning and returns an empty set if no accessions are found in structure.
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
structure: The gemmi Structure object to extract UniProt accessions from.
|
|
218
|
+
|
|
219
|
+
Returns:
|
|
220
|
+
A set of UniProt accessions found in the structure.
|
|
221
|
+
"""
|
|
222
|
+
block = structure.make_mmcif_block(gemmi.MmcifOutputGroups(False, struct_ref=True))
|
|
223
|
+
struct_ref = block.get_mmcif_category("_struct_ref.")
|
|
224
|
+
uniprot_accessions: set[str] = set()
|
|
225
|
+
for i, db_name in enumerate(struct_ref["db_name"]):
|
|
226
|
+
if db_name != "UNP":
|
|
227
|
+
continue
|
|
228
|
+
pdbx_db_accession = struct_ref["pdbx_db_accession"][i]
|
|
229
|
+
uniprot_accessions.add(pdbx_db_accession)
|
|
230
|
+
if not uniprot_accessions:
|
|
231
|
+
logger.warning("No UniProt accessions found in structure %s", structure.name)
|
|
232
|
+
return uniprot_accessions
|