protein-quest 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/confidence.py +2 -2
- protein_quest/alphafold/entry_summary.py +46 -22
- protein_quest/alphafold/fetch.py +76 -42
- protein_quest/cli.py +385 -114
- protein_quest/filters.py +2 -5
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +21 -7
- protein_quest/ss.py +3 -7
- protein_quest/{pdbe/io.py → structure.py} +77 -126
- protein_quest/uniprot.py +287 -15
- protein_quest/utils.py +26 -2
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/METADATA +42 -5
- protein_quest-0.7.0.dist-info/RECORD +27 -0
- protein_quest-0.5.1.dist-info/RECORD +0 -26
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.5.1.dist-info → protein_quest-0.7.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/filters.py
CHANGED
|
@@ -11,10 +11,7 @@ from distributed.deploy.cluster import Cluster
|
|
|
11
11
|
from tqdm.auto import tqdm
|
|
12
12
|
|
|
13
13
|
from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
|
|
14
|
-
from protein_quest.
|
|
15
|
-
nr_residues_in_chain,
|
|
16
|
-
write_single_chain_pdb_file,
|
|
17
|
-
)
|
|
14
|
+
from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
|
|
18
15
|
from protein_quest.utils import CopyMethod, copyfile
|
|
19
16
|
|
|
20
17
|
logger = logging.getLogger(__name__)
|
|
@@ -38,7 +35,7 @@ def filter_file_on_chain(
|
|
|
38
35
|
input_file, chain_id = file_and_chain
|
|
39
36
|
logger.debug("Filtering %s on chain %s", input_file, chain_id)
|
|
40
37
|
try:
|
|
41
|
-
output_file =
|
|
38
|
+
output_file = write_single_chain_structure_file(
|
|
42
39
|
input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
|
|
43
40
|
)
|
|
44
41
|
return ChainFilterStatistics(
|
protein_quest/io.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"""Module for structure file input/output."""
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import logging
|
|
5
|
+
import shutil
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import Generator, Iterable
|
|
8
|
+
from io import StringIO
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Literal, get_args
|
|
11
|
+
from urllib.request import urlopen
|
|
12
|
+
|
|
13
|
+
import gemmi
|
|
14
|
+
from mmcif.api.DictionaryApi import DictionaryApi
|
|
15
|
+
from mmcif.io.BinaryCifReader import BinaryCifReader
|
|
16
|
+
from mmcif.io.BinaryCifWriter import BinaryCifWriter
|
|
17
|
+
from mmcif.io.PdbxReader import PdbxReader
|
|
18
|
+
from mmcif.io.PdbxWriter import PdbxWriter
|
|
19
|
+
|
|
20
|
+
from protein_quest.utils import CopyMethod, copyfile, user_cache_root_dir
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# TODO remove once v0.7.4 of gemmi is released,
|
|
25
|
+
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
26
|
+
# Swallow gemmi leaked function warnings
|
|
27
|
+
gemmi.set_leak_warnings(False)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
StructureFileExtensions = Literal[".pdb", ".pdb.gz", ".ent", ".ent.gz", ".cif", ".cif.gz", ".bcif", ".bcif.gz"]
|
|
31
|
+
"""Type of supported structure file extensions."""
|
|
32
|
+
valid_structure_file_extensions: set[str] = set(get_args(StructureFileExtensions))
|
|
33
|
+
"""Set of valid structure file extensions."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def write_structure(structure: gemmi.Structure, path: Path):
    """Serialize a gemmi structure to disk, choosing the format from the file name.

    Args:
        structure: The gemmi structure to write.
        path: Destination file. The extension of its name selects the output format.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.

    Raises:
        ValueError: If the file extension is not supported.
    """
    filename = path.name

    if filename.endswith((".pdb", ".ent")):
        pdb_text: str = structure.make_pdb_string()
        path.write_text(pdb_text)
        return

    if filename.endswith((".pdb.gz", ".ent.gz")):
        pdb_text = structure.make_pdb_string()
        with gzip.open(path, "wt") as gz_out:
            gz_out.write(pdb_text)
        return

    if filename.endswith((".cif", ".cif.gz")):
        # do not write chem_comp so it is viewable by molstar
        # see https://github.com/project-gemmi/gemmi/discussions/362
        document = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
        if filename.endswith(".gz"):
            cif_text = document.as_string()
            with gzip.open(path, "wt") as gz_out:
                gz_out.write(cif_text)
        else:
            document.write_file(str(path))
        return

    if filename.endswith(".bcif"):
        structure2bcif(structure, path)
        return

    if filename.endswith(".bcif.gz"):
        structure2bcifgz(structure, path)
        return

    msg = f"Unsupported file extension in {path.name}. Supported extensions are: {valid_structure_file_extensions}"
    raise ValueError(msg)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def read_structure(file: Path) -> gemmi.Structure:
    """Load a structure file into a gemmi Structure.

    Binary CIF files (plain or gzipped) are routed through the bcif helpers of
    this module; every other file is handed to `gemmi.read_structure`.

    Args:
        file: Path to the input structure file.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.

    Returns:
        A gemmi Structure object representing the structure in the file.
    """
    filename = file.name
    # The two suffix checks are mutually exclusive, so their order does not matter.
    if filename.endswith(".bcif.gz"):
        return bcifgz2structure(file)
    if filename.endswith(".bcif"):
        return bcif2structure(file)
    return gemmi.read_structure(str(file))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def bcif2cif(bcif_file: Path) -> str:
    """Render a binary CIF (bcif) file as CIF text.

    The file is deserialized with the mmcif `BinaryCifReader` and the resulting
    containers are written to an in-memory buffer with `PdbxWriter`.

    Args:
        bcif_file: Path to the binary CIF file.

    Returns:
        A string containing the CIF representation of the structure.
    """
    containers = BinaryCifReader().deserialize(str(bcif_file))
    buffer = StringIO()
    PdbxWriter(buffer).write(containers)
    return buffer.getvalue()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def bcifgz2structure(bcif_gz_file: Path) -> gemmi.Structure:
    """Read a binary CIF (bcif) gzipped file and return a gemmi Structure object.

    This is slower than other formats because gemmi does not support reading bcif files directly.
    So we first gunzip the file to a temporary location, convert it to a cif string using mmcif package,
    and then read the cif string using gemmi.

    Args:
        bcif_gz_file: Path to the binary CIF gzipped file.

    Returns:
        A gemmi Structure object representing the structure in the bcif.gz file.
    """
    # Use a temporary directory instead of an open NamedTemporaryFile: the
    # unzipped file is re-opened by name (by gunzip_file and by the bcif reader),
    # and re-opening a still-open, delete-on-close temporary file is not
    # possible on Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "structure.bcif"
        gunzip_file(bcif_gz_file, output_file=tmp_path, keep_original=True)
        return bcif2structure(tmp_path)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def bcif2structure(bcif_file: Path) -> gemmi.Structure:
    """Load a binary CIF (bcif) file as a gemmi Structure.

    gemmi does not read bcif directly, so the file is first converted to CIF text
    with the mmcif package and that text is then parsed by gemmi. This makes it
    slower than the other formats.

    Args:
        bcif_file: Path to the binary CIF file.

    Returns:
        A gemmi Structure object representing the structure in the bcif file.
    """
    cif_text = bcif2cif(bcif_file)
    # Keep the document bound to a name while its only block is in use.
    document = gemmi.cif.read_string(cif_text)
    return gemmi.make_structure_from_block(document.sole_block())
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _initialize_dictionary_api(containers) -> DictionaryApi:
    """Build the mmcif `DictionaryApi` that is handed to the binary CIF writer.

    Ensures a copy of the PDBx/mmCIF dictionary exists in the user cache
    directory, downloading it on first use.

    Args:
        containers: Containers passed to `DictionaryApi` as `containerList`.

    Returns:
        A `DictionaryApi` constructed from `containers`.
    """
    dict_local = user_cache_root_dir() / "mmcif_pdbx_v5_next.dic"
    if not dict_local.exists():
        dict_url = "https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v5_next.dic"
        logger.info("Downloading mmcif dictionary from %s to %s", dict_url, dict_local)
        dict_local.parent.mkdir(parents=True, exist_ok=True)
        # Download fully before touching the cache location. Creating the cache
        # file first would leave an empty/truncated file behind when the request
        # fails, and the exists() check above would then never retry.
        with urlopen(dict_url) as response:  # noqa: S310 url is hardcoded and https
            payload = response.read()
        # Write to a uniquely named sibling and rename it into place, so other
        # processes only ever see a complete dictionary file.
        with tempfile.NamedTemporaryFile(dir=dict_local.parent, suffix=".part", delete=False) as tmp:
            tmp.write(payload)
        Path(tmp.name).replace(dict_local)
    # NOTE(review): the cached dictionary file is never read here; the API is
    # built from the data containers passed in. Confirm whether the dictionary
    # containers were meant to be loaded from dict_local instead.
    return DictionaryApi(containerList=containers, consolidate=True)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def structure2bcif(structure: gemmi.Structure, bcif_file: Path):
    """Save a gemmi Structure as a binary CIF (bcif) file.

    gemmi cannot write bcif itself, so the structure is rendered to CIF text by
    gemmi, parsed into mmcif containers, and serialized with the mmcif
    `BinaryCifWriter`. This makes it slower than the other formats.

    Args:
        structure: The gemmi Structure object to write.
        bcif_file: Path to the output binary CIF file.
    """
    # chem_comp is left out, the same as for the text CIF output of this module.
    cif_text = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False)).as_string()
    data_containers = []
    with StringIO(cif_text) as cif_stream:
        PdbxReader(cif_stream).read(data_containers)
    bcif_writer = BinaryCifWriter(dictionaryApi=_initialize_dictionary_api(data_containers))
    bcif_writer.serialize(str(bcif_file), data_containers)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def gunzip_file(gz_file: Path, output_file: Path | None = None, keep_original: bool = True) -> Path:
|
|
179
|
+
"""Unzip a .gz file.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
gz_file: Path to the .gz file.
|
|
183
|
+
output_file: Optional path to the output unzipped file. If None, the .gz suffix is removed from gz_file.
|
|
184
|
+
keep_original: Whether to keep the original .gz file. Default is True.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
Path to the unzipped file.
|
|
188
|
+
|
|
189
|
+
Raises:
|
|
190
|
+
ValueError: If output_file is None and gz_file does not end with .gz.
|
|
191
|
+
"""
|
|
192
|
+
if output_file is None and not gz_file.name.endswith(".gz"):
|
|
193
|
+
msg = f"If output_file is not provided, {gz_file} must end with .gz"
|
|
194
|
+
raise ValueError(msg)
|
|
195
|
+
out_file = output_file or gz_file.with_suffix("")
|
|
196
|
+
with gzip.open(gz_file, "rb") as f_in, out_file.open("wb") as f_out:
|
|
197
|
+
shutil.copyfileobj(f_in, f_out)
|
|
198
|
+
if not keep_original:
|
|
199
|
+
gz_file.unlink()
|
|
200
|
+
return out_file
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def structure2bcifgz(structure: gemmi.Structure, bcif_gz_file: Path):
    """Write a gemmi Structure object to a binary CIF gzipped (bcif.gz) file.

    This is slower than other formats because gemmi does not support writing bcif files directly.
    So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
    Finally, we gzip the bcif file.

    Args:
        structure: The gemmi Structure object to write.
        bcif_gz_file: Path to the output binary CIF gzipped file.
    """
    # Use a temporary directory instead of an open NamedTemporaryFile: the
    # intermediate file is written and re-read by name, and re-opening a
    # still-open, delete-on-close temporary file is not possible on Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "structure.bcif"
        structure2bcif(structure, tmp_path)
        with tmp_path.open("rb") as f_in, gzip.open(bcif_gz_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def convert_to_cif_files(
    input_files: Iterable[Path], output_dir: Path, copy_method: CopyMethod
) -> Generator[tuple[Path, Path]]:
    """Lazily convert a series of structure files to .cif format.

    Each file is converted only when the generator is advanced to it.

    Args:
        input_files: Iterable of structure files to convert.
        output_dir: Directory to save the converted .cif files.
        copy_method: How to copy when no changes are needed to output file.

    Yields:
        A tuple of the input file and the output file.
    """
    for source in input_files:
        yield source, convert_to_cif_file(source, output_dir, copy_method)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def convert_to_cif_file(input_file: Path, output_dir: Path, copy_method: CopyMethod) -> Path:
    """Convert a single structure file to .cif format.

    If the output file already exists the conversion is skipped.

    Args:
        input_file: The structure file to convert.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.
        output_dir: Directory to save the converted .cif file.
        copy_method: How to copy when no changes are needed to output file.

    Returns:
        Path to the converted .cif file.

    Raises:
        ValueError: If the extension of the input file is not supported.
    """
    name, extension = split_name_and_extension(input_file.name)
    output_file = output_dir / f"{name}.cif"
    if output_file.exists():
        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
    elif extension in {".pdb", ".pdb.gz", ".ent", ".ent.gz"}:
        structure = read_structure(input_file)
        write_structure(structure, output_file)
    elif extension == ".cif":
        logger.info("File %s is already in .cif format, copying to %s", input_file, output_dir)
        copyfile(input_file, output_file, copy_method)
    elif extension == ".cif.gz":
        gunzip_file(input_file, output_file=output_file, keep_original=True)
    elif extension == ".bcif":
        # Convert before opening the output, so a failed conversion does not
        # leave an empty .cif behind that later runs would skip as "done".
        cif_text = bcif2cif(input_file)
        output_file.write_text(cif_text)
    elif extension == ".bcif.gz":
        # .bcif.gz is a listed supported extension: unzip to a scratch location
        # and convert the same way as a plain .bcif file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_bcif = gunzip_file(input_file, output_file=Path(tmp_dir) / "input.bcif", keep_original=True)
            cif_text = bcif2cif(tmp_bcif)
        output_file.write_text(cif_text)
    else:
        msg = (
            f"Unsupported file extension {extension} in {input_file}. "
            f"Supported extensions are {valid_structure_file_extensions}."
        )
        raise ValueError(msg)
    return output_file
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def split_name_and_extension(name: str) -> tuple[str, str]:
    """Split a filename into its name and extension.

    `.gz` is considered part of the extension if present.
    A leading dot (hidden file) or a trailing dot is not treated as the start
    of an extension.

    Examples:
        Some example usages.

        >>> from protein_quest.io import split_name_and_extension
        >>> split_name_and_extension("1234.pdb")
        ('1234', '.pdb')
        >>> split_name_and_extension("1234.pdb.gz")
        ('1234', '.pdb.gz')

    Args:
        name: The filename to split.

    Returns:
        A tuple containing the name and the extension.
    """
    ext = ""
    if name.endswith(".gz"):
        ext = ".gz"
        name = name.removesuffix(".gz")
    i = name.rfind(".")
    # The dot must have at least one character on each side to start an extension.
    if 0 < i < len(name) - 1:
        ext = name[i:] + ext
        name = name[:i]
    return name, ext
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def locate_structure_file(root: Path, pdb_id: str) -> Path:
    """Locate a structure file for a given PDB ID in the specified directory.

    Uses [StructureFileExtensions][protein_quest.io.StructureFileExtensions] as potential extensions,
    tried in their declared order so the result is the same on every run
    when files in several formats exist for the same ID.
    Also tries different casing of the PDB ID and a `pdb` prefixed lowercase name.

    Args:
        root: The root directory to search in.
        pdb_id: The PDB ID to locate.

    Returns:
        The path to the located structure file.

    Raises:
        FileNotFoundError: If no structure file is found for the given PDB ID.
    """
    # Iterate the Literal's arguments rather than the derived set: set iteration
    # order of strings differs between interpreter runs.
    for ext in get_args(StructureFileExtensions):
        candidates = (
            root / f"{pdb_id}{ext}",
            root / f"{pdb_id.lower()}{ext}",
            root / f"{pdb_id.upper()}{ext}",
            root / f"pdb{pdb_id.lower()}{ext}",
        )
        for candidate in candidates:
            if candidate.exists():
                return candidate
    msg = f"No structure file found for {pdb_id} in {root}"
    raise FileNotFoundError(msg)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def glob_structure_files(input_dir: Path) -> Generator[Path]:
    """Find the structure files that sit directly inside a directory.

    A file counts as a structure file when its name ends with one of the
    [StructureFileExtensions][protein_quest.io.StructureFileExtensions].
    Sub-directories are not searched.

    Args:
        input_dir: The input directory to search for structure files.

    Yields:
        Paths to the found structure files.
    """
    for extension in valid_structure_file_extensions:
        pattern = f"*{extension}"
        for match in input_dir.glob(pattern):
            yield match
|
protein_quest/mcp_server.py
CHANGED
|
@@ -45,9 +45,14 @@ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
|
|
|
45
45
|
from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
|
|
46
46
|
from protein_quest.emdb import fetch as emdb_fetch
|
|
47
47
|
from protein_quest.go import search_gene_ontology_term
|
|
48
|
+
from protein_quest.io import convert_to_cif_file, glob_structure_files, read_structure
|
|
48
49
|
from protein_quest.pdbe.fetch import fetch as pdbe_fetch
|
|
49
|
-
from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
|
|
50
50
|
from protein_quest.ss import filter_file_on_secondary_structure
|
|
51
|
+
from protein_quest.structure import (
|
|
52
|
+
nr_residues_in_chain,
|
|
53
|
+
structure2uniprot_accessions,
|
|
54
|
+
write_single_chain_structure_file,
|
|
55
|
+
)
|
|
51
56
|
from protein_quest.taxonomy import search_taxon
|
|
52
57
|
from protein_quest.uniprot import (
|
|
53
58
|
PdbResult,
|
|
@@ -112,23 +117,23 @@ def extract_single_chain_from_structure(
|
|
|
112
117
|
out_chain: str = "A",
|
|
113
118
|
) -> Path:
|
|
114
119
|
"""
|
|
115
|
-
Extract a single chain from a mmCIF
|
|
120
|
+
Extract a single chain from a structure (mmCIF or pdb) file and write to a new file.
|
|
116
121
|
|
|
117
122
|
Args:
|
|
118
|
-
input_file: Path to the input mmCIF
|
|
123
|
+
input_file: Path to the input structure (mmCIF or pdb) file.
|
|
119
124
|
chain2keep: The chain to keep.
|
|
120
125
|
output_dir: Directory to save the output file.
|
|
121
126
|
out_chain: The chain identifier for the output file.
|
|
122
127
|
|
|
123
128
|
Returns:
|
|
124
|
-
Path to the output mmCIF
|
|
129
|
+
Path to the output structure (mmCIF or pdb) file
|
|
125
130
|
"""
|
|
126
|
-
return
|
|
131
|
+
return write_single_chain_structure_file(input_file, chain2keep, output_dir, out_chain)
|
|
127
132
|
|
|
128
133
|
|
|
129
134
|
@mcp.tool
|
|
130
135
|
def list_structure_files(path: Path) -> list[Path]:
|
|
131
|
-
"""List structure files (.pdb, .pdb.gz, .cif, .cif.gz) in the specified directory."""
|
|
136
|
+
"""List structure files (.pdb, .pdb.gz, .ent, .ent.gz, .cif, .cif.gz, .bcif, .bcif.gz) in the specified directory."""
|
|
132
137
|
return list(glob_structure_files(path))
|
|
133
138
|
|
|
134
139
|
|
|
@@ -149,7 +154,7 @@ def search_alphafolds(
|
|
|
149
154
|
Field(description="Set of uniprot accessions which have an AlphaFold entry"),
|
|
150
155
|
]:
|
|
151
156
|
"""Search for AlphaFold entries in UniProtKB accessions."""
|
|
152
|
-
# each uniprot
|
|
157
|
+
# each uniprot accession can have one or more AlphaFold IDs
|
|
153
158
|
# an AlphaFold ID is the same as the uniprot accession
|
|
154
159
|
# so we return a subset of uniprot_accs
|
|
155
160
|
results = search4af(uniprot_accs, limit)
|
|
@@ -199,6 +204,15 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
|
|
|
199
204
|
|
|
200
205
|
mcp.tool(filter_file_on_secondary_structure)
|
|
201
206
|
|
|
207
|
+
mcp.tool(convert_to_cif_file)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@mcp.tool
def uniprot_accessions_of_structure_file(file: Path) -> set[str]:
    """Extract UniProt accessions from structure file."""
    # Parse the file with the shared reader, then collect the accessions from it.
    return structure2uniprot_accessions(read_structure(file))
|
|
215
|
+
|
|
202
216
|
|
|
203
217
|
@mcp.prompt
|
|
204
218
|
def candidate_structures(
|
protein_quest/ss.py
CHANGED
|
@@ -5,17 +5,13 @@ from collections.abc import Generator, Iterable
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
from gemmi import Structure
|
|
8
|
+
from gemmi import Structure
|
|
9
9
|
|
|
10
10
|
from protein_quest.converter import PositiveInt, Ratio, converter
|
|
11
|
+
from protein_quest.io import read_structure
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger(__name__)
|
|
13
14
|
|
|
14
|
-
# TODO remove once v0.7.4 of gemmi is released,
|
|
15
|
-
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
16
|
-
# Swallow gemmi leaked function warnings
|
|
17
|
-
set_leak_warnings(False)
|
|
18
|
-
|
|
19
15
|
# TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
|
|
20
16
|
# https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
|
|
21
17
|
# gemmi executable is in https://pypi.org/project/gemmi-program/
|
|
@@ -261,7 +257,7 @@ def filter_file_on_secondary_structure(
|
|
|
261
257
|
Returns:
|
|
262
258
|
Filtering statistics and whether file passed.
|
|
263
259
|
"""
|
|
264
|
-
structure = read_structure(
|
|
260
|
+
structure = read_structure(file_path)
|
|
265
261
|
return filter_on_secondary_structure(structure, query)
|
|
266
262
|
|
|
267
263
|
|