protein-quest 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__init__.py +0 -0
- protein_quest/__version__.py +2 -0
- protein_quest/alphafold/__init__.py +1 -0
- protein_quest/alphafold/confidence.py +226 -0
- protein_quest/alphafold/entry_summary.py +64 -0
- protein_quest/alphafold/fetch.py +534 -0
- protein_quest/cli.py +1428 -0
- protein_quest/converter.py +46 -0
- protein_quest/emdb.py +37 -0
- protein_quest/filters.py +163 -0
- protein_quest/go.py +165 -0
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +256 -0
- protein_quest/parallel.py +104 -0
- protein_quest/pdbe/__init__.py +1 -0
- protein_quest/pdbe/fetch.py +68 -0
- protein_quest/py.typed +0 -0
- protein_quest/ss.py +280 -0
- protein_quest/structure.py +232 -0
- protein_quest/taxonomy.py +149 -0
- protein_quest/uniprot.py +975 -0
- protein_quest/utils.py +547 -0
- protein_quest-0.9.0.dist-info/METADATA +325 -0
- protein_quest-0.9.0.dist-info/RECORD +27 -0
- protein_quest-0.9.0.dist-info/WHEEL +4 -0
- protein_quest-0.9.0.dist-info/entry_points.txt +2 -0
- protein_quest-0.9.0.dist-info/licenses/LICENSE +201 -0
protein_quest/converter.py
ADDED

@@ -0,0 +1,46 @@

"""Convert json or dict to Python objects."""

from cattrs.preconf.orjson import make_converter
from yarl import URL

type Percentage = float
"""Type alias for percentage values (0.0-100.0)."""
type Ratio = float
"""Type alias for ratio values (0.0-1.0)."""
type PositiveInt = int
"""Type alias for positive integer values (>= 0)."""

converter = make_converter()
"""cattrs converter to read JSON document or dict to Python objects."""
converter.register_structure_hook(URL, lambda v, _: URL(v))
converter.register_unstructure_hook(URL, lambda u: str(u))


@converter.register_structure_hook
def percentage_hook(val, _) -> Percentage:
    """Cattrs hook to validate percentage values."""
    value = float(val)
    if not 0.0 <= value <= 100.0:
        msg = f"Value {value} is not a valid percentage (0.0-100.0)"
        raise ValueError(msg)
    return value


@converter.register_structure_hook
def ratio_hook(val, _) -> Ratio:
    """Cattrs hook to validate ratio values."""
    value = float(val)
    if not 0.0 <= value <= 1.0:
        msg = f"Value {value} is not a valid ratio (0.0-1.0)"
        raise ValueError(msg)
    return value


@converter.register_structure_hook
def positive_int_hook(val, _) -> PositiveInt:
    """Cattrs hook to validate positive integer values."""
    value = int(val)
    if value < 0:
        msg = f"Value {value} is not a valid positive integer (>= 0)"
        raise ValueError(msg)
    return value
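The hooks above are dispatched by cattrs on the aliased types, so callers can validate values while structuring. A minimal usage sketch (the literal values and the URL are only examples; out-of-range values are expected to raise ValueError from the hooks):

from yarl import URL

from protein_quest.converter import Percentage, Ratio, converter

# The registered URL hooks convert between plain strings and yarl.URL objects
url = converter.structure("https://www.ebi.ac.uk/QuickGO", URL)
print(type(url), converter.unstructure(url))

# percentage_hook and ratio_hook validate ranges while structuring;
# values outside 0.0-100.0 or 0.0-1.0 raise ValueError
print(converter.structure(87.5, Percentage))
print(converter.structure(0.91, Ratio))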
protein_quest/emdb.py
ADDED

@@ -0,0 +1,37 @@

"""Module dealing with Electron Microscopy Data Bank (EMDB)."""

from collections.abc import Iterable, Mapping
from pathlib import Path

from protein_quest.utils import Cacher, retrieve_files


def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
    # https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-19583/map/emd_19583.map.gz
    fn = emdb_id.lower().replace("emd-", "emd_") + ".map.gz"
    url = f"https://ftp.ebi.ac.uk/pub/databases/emdb/structures/{emdb_id}/map/{fn}"
    return url, fn


async def fetch(
    emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
) -> Mapping[str, Path]:
    """Fetches volume files from the EMDB database.

    Args:
        emdb_ids: A list of EMDB IDs to fetch.
        save_dir: The directory to save the downloaded files.
        max_parallel_downloads: The maximum number of parallel downloads.
        cacher: An optional cacher to use for caching downloaded files.

    Returns:
        A mapping of EMDB IDs to their downloaded files.
    """
    id2urls = {emdb_id: _map_id2volume_url(emdb_id) for emdb_id in emdb_ids}
    urls = list(id2urls.values())
    id2paths = {emdb_id: save_dir / fn for emdb_id, (_, fn) in id2urls.items()}

    # TODO show progress of each item
    # TODO handle failed downloads, by skipping them instead of raising an error
    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
    return id2paths
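A short sketch of driving the async fetch helper above from a script; the EMD accession is taken from the module comment and the download directory is a placeholder:

import asyncio
from pathlib import Path

from protein_quest.emdb import fetch


async def main() -> None:
    # Any valid "EMD-xxxxx" accession should work the same way
    save_dir = Path("emdb_volumes")
    save_dir.mkdir(parents=True, exist_ok=True)
    id2path = await fetch(["EMD-19583"], save_dir, max_parallel_downloads=1)
    for emdb_id, path in id2path.items():
        print(emdb_id, "->", path)


if __name__ == "__main__":
    asyncio.run(main())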
protein_quest/filters.py
ADDED

@@ -0,0 +1,163 @@

"""Module for filtering structure files and their contents."""

import logging
from collections.abc import Collection, Generator
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

from dask.distributed import Client
from distributed.deploy.cluster import Cluster
from tqdm.auto import tqdm

from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
from protein_quest.utils import CopyMethod, copyfile

logger = logging.getLogger(__name__)


@dataclass
class ChainFilterStatistics:
    input_file: Path
    chain_id: str
    passed: bool = False
    output_file: Path | None = None
    discard_reason: Exception | None = None


def filter_file_on_chain(
    file_and_chain: tuple[Path, str],
    output_dir: Path,
    out_chain: str = "A",
    copy_method: CopyMethod = "copy",
) -> ChainFilterStatistics:
    input_file, chain_id = file_and_chain
    logger.debug("Filtering %s on chain %s", input_file, chain_id)
    try:
        output_file = write_single_chain_structure_file(
            input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
        )
        return ChainFilterStatistics(
            input_file=input_file,
            chain_id=chain_id,
            output_file=output_file,
            passed=True,
        )
    except Exception as e:  # noqa: BLE001 - error is handled downstream
        return ChainFilterStatistics(input_file=input_file, chain_id=chain_id, discard_reason=e)


def _filter_files_on_chain_sequentially(
    file2chains: Collection[tuple[Path, str]],
    output_dir: Path,
    out_chain: str = "A",
    copy_method: CopyMethod = "copy",
) -> list[ChainFilterStatistics]:
    results = []
    for file_and_chain in tqdm(file2chains, unit="file"):
        result = filter_file_on_chain(
            file_and_chain,
            output_dir=output_dir,
            out_chain=out_chain,
            copy_method=copy_method,
        )
        results.append(result)
    return results


def filter_files_on_chain(
    file2chains: Collection[tuple[Path, str]],
    output_dir: Path,
    out_chain: str = "A",
    scheduler_address: str | Cluster | Literal["sequential"] | None = None,
    copy_method: CopyMethod = "copy",
) -> list[ChainFilterStatistics]:
    """Filter mmcif/PDB files by chain.

    Args:
        file2chains: Which chain to keep for each PDB file.
            First item is the PDB file path, second item is the chain ID.
        output_dir: The directory where the filtered files will be written.
        out_chain: Under what name to write the kept chain.
        scheduler_address: The address of the Dask scheduler.
            If not provided, will create a local cluster.
            If set to `sequential` will run tasks sequentially.
        copy_method: How to copy when a direct copy is possible.

    Returns:
        Result of the filtering process.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    if scheduler_address == "sequential":
        return _filter_files_on_chain_sequentially(
            file2chains, output_dir, out_chain=out_chain, copy_method=copy_method
        )

    # TODO make logger.debug in filter_file_on_chain show to user when --log
    # GPT-5 generated a fairly difficult setup with a WorkerPlugin, need to find a simpler approach
    scheduler_address = configure_dask_scheduler(
        scheduler_address,
        name="filter-chain",
    )

    with Client(scheduler_address) as client:
        client.forward_logging()
        return dask_map_with_progress(
            client,
            filter_file_on_chain,
            file2chains,
            output_dir=output_dir,
            out_chain=out_chain,
            copy_method=copy_method,
        )


@dataclass
class ResidueFilterStatistics:
    """Statistics for filtering files based on residue count in a specific chain.

    Parameters:
        input_file: The path to the input file.
        residue_count: The number of residues.
        passed: Whether the file passed the filtering criteria.
        output_file: The path to the output file, if passed.
    """

    input_file: Path
    residue_count: int
    passed: bool
    output_file: Path | None


def filter_files_on_residues(
    input_files: list[Path],
    output_dir: Path,
    min_residues: int,
    max_residues: int,
    chain: str = "A",
    copy_method: CopyMethod = "copy",
) -> Generator[ResidueFilterStatistics]:
    """Filter PDB/mmCIF files by number of residues in given chain.

    Args:
        input_files: The list of input PDB/mmCIF files.
        output_dir: The directory where the filtered files will be written.
        min_residues: The minimum number of residues in chain.
        max_residues: The maximum number of residues in chain.
        chain: The chain to count residues of.
        copy_method: How to copy passed files to output directory.

    Yields:
        Objects containing information about the filtering process for each input file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    for input_file in tqdm(input_files, unit="file"):
        residue_count = nr_residues_in_chain(input_file, chain=chain)
        passed = min_residues <= residue_count <= max_residues
        if passed:
            output_file = output_dir / input_file.name
            copyfile(input_file, output_file, copy_method)
            yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
        else:
            yield ResidueFilterStatistics(input_file, residue_count, False, None)
protein_quest/go.py
ADDED

@@ -0,0 +1,165 @@

"""Module for Gene Ontology (GO) functions."""

import csv
import logging
from collections.abc import Generator
from dataclasses import dataclass
from io import TextIOWrapper
from typing import Literal, get_args

from cattrs.gen import make_dict_structure_fn, override

from protein_quest.converter import converter
from protein_quest.utils import friendly_session

logger = logging.getLogger(__name__)

Aspect = Literal["cellular_component", "biological_process", "molecular_function"]
"""The aspect of the GO term."""
allowed_aspects = set(get_args(Aspect))
"""Allowed aspects for GO terms."""


@dataclass(frozen=True, slots=True)
class GoTerm:
    """A Gene Ontology (GO) term.

    Parameters:
        id: The unique identifier for the GO term, e.g., 'GO:0043293'.
        is_obsolete: Whether the GO term is obsolete.
        name: The name of the GO term.
        definition: The definition of the GO term.
        aspect: The aspect of the GO term.
    """

    id: str
    is_obsolete: bool
    name: str
    definition: str
    aspect: Aspect


@dataclass(frozen=True, slots=True)
class PageInfo:
    current: int
    total: int


@dataclass(frozen=True, slots=True)
class SearchResponse:
    results: list[GoTerm]
    number_of_hits: int
    page_info: PageInfo


def flatten_definition(definition, _context) -> str:
    return definition["text"]


# Use hook to convert incoming camelCase to snake_case
# and to flatten definition {text} to text
# see https://catt.rs/en/stable/customizing.html#rename
converter.register_structure_hook(
    GoTerm,
    make_dict_structure_fn(
        GoTerm,
        converter,
        is_obsolete=override(rename="isObsolete"),
        definition=override(struct_hook=flatten_definition),
    ),
)
converter.register_structure_hook(
    SearchResponse,
    make_dict_structure_fn(
        SearchResponse, converter, number_of_hits=override(rename="numberOfHits"), page_info=override(rename="pageInfo")
    ),
)


async def search_gene_ontology_term(
    term: str, aspect: Aspect | None = None, include_obsolete: bool = False, limit: int = 100
) -> list[GoTerm]:
    """Search for a Gene Ontology (GO) term by its name or ID.

    Calls the EBI QuickGO API at https://www.ebi.ac.uk/QuickGO/api/index.html .

    Examples:
        To search for `apoptosome` terms do:

        >>> from protein_quest.go import search_gene_ontology_term
        >>> r = await search_gene_ontology_term('apoptosome')
        >>> len(r)
        5
        >>> r[0]
        GoTerm(id='GO:0043293', is_obsolete=False, name='apoptosome', definition='A multisubunit protein ...')

    Args:
        term: The GO term to search for. For example `nucleus` or `GO:0006816`.
        aspect: The aspect to filter by. If not given, all aspects are included.
        include_obsolete: Whether to include obsolete terms. By default, obsolete terms are excluded.
        limit: The maximum number of results to return.

    Returns:
        List of GO terms

    Raises:
        ValueError: If the aspect is invalid.
    """
    url = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/search"
    page_limit = 100
    params = {"query": term, "limit": str(page_limit), "page": "1"}
    if aspect is not None and aspect not in allowed_aspects:
        msg = f"Invalid aspect: {aspect}. Allowed aspects are: {allowed_aspects} or None."
        raise ValueError(msg)
    logger.debug("Fetching GO terms from %s with params %s", url, params)
    async with friendly_session() as session:
        # Fetch first page to learn how many pages there are
        async with session.get(url, params=params) as response:
            response.raise_for_status()
            raw_data = await response.read()
            data = converter.loads(raw_data, SearchResponse)

        terms = list(_filter_go_terms(data.results, aspect, include_obsolete))
        if len(terms) >= limit:
            # Do not fetch additional pages if we have enough results
            return terms[:limit]
        total_pages = data.page_info.total
        logger.debug("GO search returned %s pages (current=%s)", total_pages, data.page_info.current)

        # Retrieve remaining pages (if any) and extend results
        if total_pages > 1:
            for page in range(2, total_pages + 1):
                params["page"] = str(page)
                logger.debug("Fetching additional GO terms page %s/%s with params %s", page, total_pages, params)
                async with session.get(url, params=params) as response:
                    response.raise_for_status()
                    raw_data = await response.read()
                    data = converter.loads(raw_data, SearchResponse)
                terms.extend(_filter_go_terms(data.results, aspect, include_obsolete))
                if len(terms) >= limit:
                    # Do not fetch additional pages if we have enough results
                    break

    return terms[:limit]


def _filter_go_terms(terms: list[GoTerm], aspect: Aspect | None, include_obsolete: bool) -> Generator[GoTerm]:
    for oboterm in terms:
        if not include_obsolete and oboterm.is_obsolete:
            continue
        if aspect and oboterm.aspect != aspect:
            continue
        yield oboterm


def write_go_terms_to_csv(terms: list[GoTerm], csv_file: TextIOWrapper) -> None:
    """Write a list of GO terms to a CSV file.

    Args:
        terms: The list of GO terms to write.
        csv_file: The CSV file to write to.
    """
    writer = csv.writer(csv_file)
    writer.writerow(["id", "name", "aspect", "definition"])
    for term in terms:
        writer.writerow([term.id, term.name, term.aspect, term.definition])
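A short sketch of searching QuickGO and saving the hits, following the docstring example above; the query, aspect, and output filename are illustrative:

import asyncio

from protein_quest.go import search_gene_ontology_term, write_go_terms_to_csv


async def main() -> None:
    # Restrict the search to cellular_component terms and cap the number of hits
    terms = await search_gene_ontology_term("apoptosome", aspect="cellular_component", limit=10)
    with open("go_terms.csv", "w", newline="") as fh:
        write_go_terms_to_csv(terms, fh)
    for term in terms:
        print(term.id, term.name)


if __name__ == "__main__":
    asyncio.run(main())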
|