protein-quest 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__init__.py +0 -0
- protein_quest/__version__.py +1 -0
- protein_quest/alphafold/__init__.py +1 -0
- protein_quest/alphafold/confidence.py +153 -0
- protein_quest/alphafold/entry_summary.py +38 -0
- protein_quest/alphafold/fetch.py +314 -0
- protein_quest/cli.py +782 -0
- protein_quest/emdb.py +34 -0
- protein_quest/filters.py +107 -0
- protein_quest/go.py +168 -0
- protein_quest/mcp_server.py +208 -0
- protein_quest/parallel.py +68 -0
- protein_quest/pdbe/__init__.py +1 -0
- protein_quest/pdbe/fetch.py +51 -0
- protein_quest/pdbe/io.py +185 -0
- protein_quest/py.typed +0 -0
- protein_quest/taxonomy.py +139 -0
- protein_quest/uniprot.py +511 -0
- protein_quest/utils.py +105 -0
- protein_quest-0.3.0.dist-info/METADATA +219 -0
- protein_quest-0.3.0.dist-info/RECORD +24 -0
- protein_quest-0.3.0.dist-info/WHEEL +4 -0
- protein_quest-0.3.0.dist-info/entry_points.txt +2 -0
- protein_quest-0.3.0.dist-info/licenses/LICENSE +201 -0
protein_quest/pdbe/io.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Module for structure file input/output."""
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import Generator
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import gemmi
|
|
9
|
+
|
|
10
|
+
from protein_quest import __version__
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
    """Count the residues of one chain in a mmCIF/pdb file.

    Args:
        file: Path to the input mmCIF/pdb file.
        chain: Chain to count residues of.

    Returns:
        The number of residues in the specified chain.
    """
    model = gemmi.read_structure(str(file))[0]
    found_chain = find_chain_in_model(model, chain)
    if found_chain is not None:
        return len(found_chain)
    logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
    return 0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
    """Look up a chain in a gemmi model, tolerating prefixed chain names.

    Args:
        model: The gemmi model to search in.
        wanted_chain: Name of the chain to find.

    Returns:
        The matching gemmi chain, or None when no chain matches.
    """
    chain = model.find_chain(wanted_chain)
    if chain is not None:
        return chain
    # For chain A in 4v92 the find_chain method returns None,
    # however it is prefixed with 'B',
    # so we try again as last char of chain name
    return next((c for c in model if c.name.endswith(wanted_chain)), None)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def write_structure(structure: gemmi.Structure, path: Path):
    """Write a gemmi structure to a file.

    Args:
        structure: The gemmi structure to write.
        path: The file path to write the structure to.
            The format depends on the file extension.
            Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.

    Raises:
        ValueError: If the file extension is not supported.
    """
    name = path.name
    if name.endswith((".pdb", ".pdb.gz")):
        # PDB format: serialize once, then pick plain or gzipped output.
        body: str = structure.make_pdb_string()
        if name.endswith(".gz"):
            with gzip.open(path, "wt") as f:
                f.write(body)
        else:
            path.write_text(body)
    elif name.endswith(".cif.gz"):
        with gzip.open(path, "wt") as f:
            f.write(structure.make_mmcif_document().as_string())
    elif name.endswith(".cif"):
        structure.make_mmcif_document().write_file(str(path))
    else:
        msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
        raise ValueError(msg)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _split_name_and_extension(name: str) -> tuple[str, str]:
|
|
79
|
+
# 1234.pdb -> (1234, .pdb)
|
|
80
|
+
# 1234.pdb.gz -> (1234, .pdb.gz)
|
|
81
|
+
# 1234.cif -> (1234, .cif)
|
|
82
|
+
# 1234.cif.gz -> (1234, .cif.gz)
|
|
83
|
+
if name.endswith(".pdb.gz"):
|
|
84
|
+
return name.replace(".pdb.gz", ""), ".pdb.gz"
|
|
85
|
+
if name.endswith(".cif.gz"):
|
|
86
|
+
return name.replace(".cif.gz", ""), ".cif.gz"
|
|
87
|
+
if name.endswith(".pdb"):
|
|
88
|
+
return name.replace(".pdb", ""), ".pdb"
|
|
89
|
+
if name.endswith(".cif"):
|
|
90
|
+
return name.replace(".cif", ""), ".cif"
|
|
91
|
+
|
|
92
|
+
msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
|
|
93
|
+
raise ValueError(msg)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def locate_structure_file(root: Path, pdb_id: str) -> Path:
    """Find the structure file for a given PDB ID inside a directory.

    Args:
        root: The root directory to search in.
        pdb_id: The PDB ID to locate.

    Returns:
        The path to the located structure file.

    Raises:
        FileNotFoundError: If no structure file is found for the given PDB ID.
    """
    # files downloaded from https://www.ebi.ac.uk/pdbe/ website
    # have file names like pdb6t5y.ent or pdb6t5y.ent.gz for a PDB formatted file.
    # TODO support pdb6t5y.ent or pdb6t5y.ent.gz file names
    stem = pdb_id.lower()
    candidates = (root / f"{stem}{ext}" for ext in (".cif.gz", ".cif", ".pdb.gz", ".pdb"))
    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is None:
        msg = f"No structure file found for {pdb_id} in {root}"
        raise FileNotFoundError(msg)
    return found
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def glob_structure_files(input_dir: Path) -> Generator[Path]:
    """Iterate over the structure files present in a directory.

    Args:
        input_dir: The input directory to search for structure files.

    Yields:
        Paths to the found structure files.
    """
    patterns = ("*.cif.gz", "*.cif", "*.pdb.gz", "*.pdb")
    for pattern in patterns:
        yield from input_dir.glob(pattern)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def write_single_chain_pdb_file(
    input_file: Path, chain2keep: str, output_dir: Path, out_chain: str = "A"
) -> Path | None:
    """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.

    Args:
        input_file: Path to the input mmCIF/pdb file.
        chain2keep: The chain to keep.
        output_dir: Directory to save the output file.
        out_chain: The chain identifier for the output file.

    Returns:
        Path to the output mmCIF/pdb file or None if not created.
    """

    structure = gemmi.read_structure(str(input_file))
    # NOTE(review): only the first model is used — confirm multi-model inputs are out of scope.
    model = structure[0]

    # Only count residues of polymer
    model.remove_ligands_and_waters()

    chain = find_chain_in_model(model, chain2keep)
    if chain is None:
        logger.warning(
            "Chain %s not found in %s. Skipping.",
            chain2keep,
            input_file,
        )
        return None
    # chain.name can differ from chain2keep when the chain was located
    # by suffix match (see find_chain_in_model); the output name records it.
    name, extension = _split_name_and_extension(input_file.name)
    output_file = output_dir / f"{name}_{chain.name}2{out_chain}{extension}"

    # Build a fresh structure containing only the kept chain, carrying over
    # the resolution and recording provenance in the mmCIF info fields.
    new_structure = gemmi.Structure()
    new_structure.resolution = structure.resolution
    new_id = structure.name + f"{chain2keep}2{out_chain}"
    new_structure.name = new_id
    new_structure.info["_entry.id"] = new_id
    new_title = f"From {structure.info['_entry.id']} chain {chain2keep} to {out_chain}"
    new_structure.info["_struct.title"] = new_title
    new_structure.info["_struct_keywords.pdbx_keywords"] = new_title.upper()
    # Record this package (and its version) as the extracting software.
    new_si = gemmi.SoftwareItem()
    new_si.classification = gemmi.SoftwareItem.Classification.DataExtraction
    new_si.name = "protein-quest"
    new_si.version = str(__version__)
    new_structure.meta.software.append(new_si)
    new_model = gemmi.Model(1)
    # Rename the kept chain to the requested output chain identifier
    # before attaching it to the new model.
    chain.name = out_chain
    new_model.add_chain(chain)
    new_structure.add_model(new_model)
    write_structure(new_structure, output_file)

    return output_file
|
protein_quest/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Module for searching taxon information from UniProt."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import gzip
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Literal, get_args
|
|
8
|
+
|
|
9
|
+
from aiohttp.client import ClientResponse
|
|
10
|
+
from aiohttp_retry import RetryClient
|
|
11
|
+
from cattrs.gen import make_dict_structure_fn, override
|
|
12
|
+
from cattrs.preconf.orjson import make_converter
|
|
13
|
+
from yarl import URL
|
|
14
|
+
|
|
15
|
+
from protein_quest.go import TextIOWrapper
|
|
16
|
+
from protein_quest.utils import friendly_session
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True, slots=True)
class Taxon:
    """A single taxonomy record returned by the UniProt taxonomy service."""

    # Taxon identifier as returned by UniProt (kept as a string), e.g. "9606".
    taxon_id: str
    # Scientific (Latin) name, e.g. "Homo sapiens".
    scientific_name: str
    # Taxonomic rank, e.g. "species".
    rank: str
    # Common (vernacular) name, when UniProt provides one.
    common_name: str | None = None
    # Alternative names, when UniProt provides any.
    other_names: set[str] | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True, slots=True)
class SearchTaxonResponse:
    """Shape of one JSON response page from the UniProt taxonomy search endpoint."""

    # Taxon records of the current result page.
    results: list[Taxon]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# cattrs converter (orjson preconf) used to parse UniProt JSON responses.
converter = make_converter()

# UniProt returns camelCase keys; map them onto the snake_case Taxon fields.
converter.register_structure_hook(
    Taxon,
    make_dict_structure_fn(
        Taxon,
        converter,
        taxon_id=override(rename="taxonId"),
        scientific_name=override(rename="scientificName"),
        common_name=override(rename="commonName"),
        other_names=override(rename="otherNames"),
    ),
)

# Query fields accepted by the UniProt taxonomy search API.
SearchField = Literal["tax_id", "scientific", "common", "parent"]
# Valid values for search_taxon's `field` argument; None means search all fields.
search_fields: set[SearchField | None] = set(get_args(SearchField)) | {None}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_next_page(response: ClientResponse) -> URL | str | None:
    """Extract the next-page URL from a response's Link header.

    Args:
        response: The HTTP response to inspect.

    Returns:
        The URL of the next page, or None when there is no next page.
    """
    link = response.links.getone("next", None)
    return None if link is None else link.getone("url", None)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def _fetch_page(url: URL | str, session: RetryClient) -> tuple[list[Taxon], URL | str | None]:
    """Fetch and decode one page of UniProt taxonomy search results.

    Args:
        url: URL of the page to fetch.
        session: HTTP client session used for the request.

    Returns:
        Tuple of the taxons on this page and the URL of the next page (if any).
    """
    async with session.get(url) as response:
        response.raise_for_status()
        compressed = await response.read()
        next_page = _get_next_page(response)
        payload = gzip.decompress(compressed)
        results = converter.loads(payload, SearchTaxonResponse).results
        return results, next_page
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def search_taxon(query: str, field: SearchField | None = None, limit: int = 100) -> list[Taxon]:
    """Search for taxon information in UniProt.

    Uses <https://www.uniprot.org/taxonomy?query=*>.

    Args:
        query: Search query for the taxon.
        field: Field to search in.
            If None, searches in all fields.
            If "tax_id" then searches by taxon ID.
            If "parent" then given a parent taxon ID returns all its children.
            For example, if the parent taxon ID is 9606 (Human), it will return Neanderthal and Denisovan.
        limit: Maximum number of results to return.

    Returns:
        List of Taxon objects matching the search query.

    Raises:
        ValueError: If the search field is invalid.
    """
    # https://rest.uniprot.org/taxonomy/search?compressed=true&format=json&query=%28Mouse%29&size=500
    page_limit = 100
    if field not in search_fields:
        msg = f"Invalid search field: {field}. Must be one of {search_fields}."
        raise ValueError(msg)
    if field is not None:
        # ((common:"house+mouse"))
        query = f'(({field}:"{query}"))'
    params = {"query": query, "limit": str(page_limit), "compressed": "true", "format": "json"}
    url = URL("https://rest.uniprot.org/taxonomy/search").with_query(params)
    logger.debug("Fetching uniprot taxonomy from %s with params %s", url, params)
    taxons: list[Taxon] = []
    next_page: URL | str | None = url
    async with friendly_session() as session:
        # Single pagination loop replaces the previous duplicated
        # first-page/next-page handling with redundant termination checks.
        while next_page is not None and len(taxons) < limit:
            page_taxons, next_page = await _fetch_page(next_page, session)
            taxons.extend(page_taxons)
            if next_page is not None and len(taxons) < limit:
                logger.debug("Fetching next page of uniprot taxonomy from %s", next_page)
    # A partially consumed last page may overshoot; trim to the requested maximum.
    return taxons[:limit]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _write_taxonomy_csv(taxons: list[Taxon], output_csv: TextIOWrapper) -> None:
    """Dump taxon records as CSV rows to an open text file.

    Args:
        taxons: List of Taxon objects to write to the CSV file.
        output_csv: File object for the output CSV file.
    """
    header = ["taxon_id", "scientific_name", "common_name", "rank", "other_names"]
    writer = csv.writer(output_csv)
    writer.writerow(header)
    for entry in taxons:
        joined_other_names = ";".join(entry.other_names) if entry.other_names else ""
        row = [entry.taxon_id, entry.scientific_name, entry.common_name, entry.rank, joined_other_names]
        writer.writerow(row)
|