protein_quest-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
protein_quest/converter.py ADDED
@@ -0,0 +1,46 @@
+ """Convert json or dict to Python objects."""
+ 
+ from cattrs.preconf.orjson import make_converter
+ from yarl import URL
+ 
+ type Percentage = float
+ """Type alias for percentage values (0.0-100.0)."""
+ type Ratio = float
+ """Type alias for ratio values (0.0-1.0)."""
+ type PositiveInt = int
+ """Type alias for positive integer values (>= 0)."""
+ 
+ converter = make_converter()
+ """cattrs converter to read JSON document or dict to Python objects."""
+ converter.register_structure_hook(URL, lambda v, _: URL(v))
+ converter.register_unstructure_hook(URL, lambda u: str(u))
+ 
+ 
+ @converter.register_structure_hook
+ def percentage_hook(val, _) -> Percentage:
+     """Cattrs hook to validate percentage values."""
+     value = float(val)
+     if not 0.0 <= value <= 100.0:
+         msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+         raise ValueError(msg)
+     return value
+ 
+ 
+ @converter.register_structure_hook
+ def ratio_hook(val, _) -> Ratio:
+     """Cattrs hook to validate ratio values."""
+     value = float(val)
+     if not 0.0 <= value <= 1.0:
+         msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+         raise ValueError(msg)
+     return value
+ 
+ 
+ @converter.register_structure_hook
+ def positive_int_hook(val, _) -> PositiveInt:
+     """Cattrs hook to validate positive integer values."""
+     value = int(val)
+     if value < 0:
+         msg = f"Value {value} is not a valid positive integer (>= 0)"
+         raise ValueError(msg)
+     return value
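
Usage note (not part of the package): a minimal sketch of how the converter above can be exercised, assuming the installed cattrs resolves the PEP 695 type aliases registered here; the values are illustrative.

    from protein_quest.converter import Percentage, Ratio, converter
    from yarl import URL

    coverage = converter.structure("87.5", Percentage)   # validated float, 87.5
    fraction = converter.structure(0.42, Ratio)          # validated float, 0.42
    endpoint = converter.structure("https://www.ebi.ac.uk", URL)
    as_text = converter.unstructure(endpoint)            # back to a plain string
    # Out-of-range values raise ValueError, e.g. converter.structure(1.5, Ratio)
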
protein_quest/emdb.py ADDED
@@ -0,0 +1,37 @@
+ """Module dealing with Electron Microscopy Data Bank (EMDB)."""
+ 
+ from collections.abc import Iterable, Mapping
+ from pathlib import Path
+ 
+ from protein_quest.utils import Cacher, retrieve_files
+ 
+ 
+ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
+     # https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-19583/map/emd_19583.map.gz
+     fn = emdb_id.lower().replace("emd-", "emd_") + ".map.gz"
+     url = f"https://ftp.ebi.ac.uk/pub/databases/emdb/structures/{emdb_id}/map/{fn}"
+     return url, fn
+ 
+ 
+ async def fetch(
+     emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+ ) -> Mapping[str, Path]:
+     """Fetches volume files from the EMDB database.
+ 
+     Args:
+         emdb_ids: A list of EMDB IDs to fetch.
+         save_dir: The directory to save the downloaded files.
+         max_parallel_downloads: The maximum number of parallel downloads.
+         cacher: An optional cacher to use for caching downloaded files.
+ 
+     Returns:
+         A mapping of EMDB IDs to their downloaded files.
+     """
+     id2urls = {emdb_id: _map_id2volume_url(emdb_id) for emdb_id in emdb_ids}
+     urls = list(id2urls.values())
+     id2paths = {emdb_id: save_dir / fn for emdb_id, (_, fn) in id2urls.items()}
+ 
+     # TODO show progress of each item
+     # TODO handle failed downloads, by skipping them instead of raising an error
+     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
+     return id2paths
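
Usage note (not part of the package): a minimal sketch of calling the fetch coroutine above; the EMDB ID comes from the module's own comment, the output directory is illustrative, and network access to ftp.ebi.ac.uk is assumed.

    import asyncio
    from pathlib import Path

    from protein_quest.emdb import fetch

    async def main() -> None:
        save_dir = Path("emdb_volumes")  # illustrative output directory
        save_dir.mkdir(parents=True, exist_ok=True)
        # Expected to download emd_19583.map.gz and map the ID to its local path
        paths = await fetch(["EMD-19583"], save_dir)
        print(paths)

    asyncio.run(main())
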
@@ -0,0 +1,163 @@
+ """Module for filtering structure files and their contents."""
+ 
+ import logging
+ from collections.abc import Collection, Generator
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Literal
+ 
+ from dask.distributed import Client
+ from distributed.deploy.cluster import Cluster
+ from tqdm.auto import tqdm
+ 
+ from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
+ from protein_quest.structure import nr_residues_in_chain, write_single_chain_structure_file
+ from protein_quest.utils import CopyMethod, copyfile
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ @dataclass
+ class ChainFilterStatistics:
+     input_file: Path
+     chain_id: str
+     passed: bool = False
+     output_file: Path | None = None
+     discard_reason: Exception | None = None
+ 
+ 
+ def filter_file_on_chain(
+     file_and_chain: tuple[Path, str],
+     output_dir: Path,
+     out_chain: str = "A",
+     copy_method: CopyMethod = "copy",
+ ) -> ChainFilterStatistics:
+     input_file, chain_id = file_and_chain
+     logger.debug("Filtering %s on chain %s", input_file, chain_id)
+     try:
+         output_file = write_single_chain_structure_file(
+             input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
+         )
+         return ChainFilterStatistics(
+             input_file=input_file,
+             chain_id=chain_id,
+             output_file=output_file,
+             passed=True,
+         )
+     except Exception as e:  # noqa: BLE001 - error is handled downstream
+         return ChainFilterStatistics(input_file=input_file, chain_id=chain_id, discard_reason=e)
+ 
+ 
+ def _filter_files_on_chain_sequentially(
+     file2chains: Collection[tuple[Path, str]],
+     output_dir: Path,
+     out_chain: str = "A",
+     copy_method: CopyMethod = "copy",
+ ) -> list[ChainFilterStatistics]:
+     results = []
+     for file_and_chain in tqdm(file2chains, unit="file"):
+         result = filter_file_on_chain(
+             file_and_chain,
+             output_dir=output_dir,
+             out_chain=out_chain,
+             copy_method=copy_method,
+         )
+         results.append(result)
+     return results
+ 
+ 
+ def filter_files_on_chain(
+     file2chains: Collection[tuple[Path, str]],
+     output_dir: Path,
+     out_chain: str = "A",
+     scheduler_address: str | Cluster | Literal["sequential"] | None = None,
+     copy_method: CopyMethod = "copy",
+ ) -> list[ChainFilterStatistics]:
+     """Filter mmcif/PDB files by chain.
+ 
+     Args:
+         file2chains: Which chain to keep for each PDB file.
+             First item is the PDB file path, second item is the chain ID.
+         output_dir: The directory where the filtered files will be written.
+         out_chain: Under what name to write the kept chain.
+         scheduler_address: The address of the Dask scheduler.
+             If not provided, will create a local cluster.
+             If set to `sequential` will run tasks sequentially.
+         copy_method: How to copy when a direct copy is possible.
+ 
+     Returns:
+         Result of the filtering process.
+     """
+     output_dir.mkdir(parents=True, exist_ok=True)
+     if scheduler_address == "sequential":
+         return _filter_files_on_chain_sequentially(
+             file2chains, output_dir, out_chain=out_chain, copy_method=copy_method
+         )
+ 
+     # TODO make logger.debug in filter_file_on_chain show to user when --log
+     # GPT-5 generated a fairly difficult setup with a WorkerPlugin, need to find a simpler approach
+     scheduler_address = configure_dask_scheduler(
+         scheduler_address,
+         name="filter-chain",
+     )
+ 
+     with Client(scheduler_address) as client:
+         client.forward_logging()
+         return dask_map_with_progress(
+             client,
+             filter_file_on_chain,
+             file2chains,
+             output_dir=output_dir,
+             out_chain=out_chain,
+             copy_method=copy_method,
+         )
+ 
+ 
+ @dataclass
+ class ResidueFilterStatistics:
+     """Statistics for filtering files based on residue count in a specific chain.
+ 
+     Parameters:
+         input_file: The path to the input file.
+         residue_count: The number of residues.
+         passed: Whether the file passed the filtering criteria.
+         output_file: The path to the output file, if passed.
+     """
+ 
+     input_file: Path
+     residue_count: int
+     passed: bool
+     output_file: Path | None
+ 
+ 
+ def filter_files_on_residues(
+     input_files: list[Path],
+     output_dir: Path,
+     min_residues: int,
+     max_residues: int,
+     chain: str = "A",
+     copy_method: CopyMethod = "copy",
+ ) -> Generator[ResidueFilterStatistics]:
+     """Filter PDB/mmCIF files by number of residues in given chain.
+ 
+     Args:
+         input_files: The list of input PDB/mmCIF files.
+         output_dir: The directory where the filtered files will be written.
+         min_residues: The minimum number of residues in chain.
+         max_residues: The maximum number of residues in chain.
+         chain: The chain to count residues of.
+         copy_method: How to copy passed files to the output directory.
+ 
+     Yields:
+         Objects containing information about the filtering process for each input file.
+     """
+     output_dir.mkdir(parents=True, exist_ok=True)
+     for input_file in tqdm(input_files, unit="file"):
+         residue_count = nr_residues_in_chain(input_file, chain=chain)
+         passed = min_residues <= residue_count <= max_residues
+         if passed:
+             output_file = output_dir / input_file.name
+             copyfile(input_file, output_file, copy_method)
+             yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
+         else:
+             yield ResidueFilterStatistics(input_file, residue_count, False, None)
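
Usage note (not part of the package): a minimal sketch of the residue filter above. The module path is assumed (this hunk lacks a filename header), and the directories and residue bounds are illustrative.

    from pathlib import Path

    from protein_quest.filters import filter_files_on_residues  # module path assumed

    input_files = sorted(Path("structures").glob("*.cif"))
    stats = filter_files_on_residues(input_files, Path("filtered"), min_residues=50, max_residues=500)
    for stat in stats:
        print(stat.input_file.name, stat.residue_count, "kept" if stat.passed else "dropped")
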
protein_quest/go.py ADDED
@@ -0,0 +1,165 @@
+ """Module for Gene Ontology (GO) functions."""
+ 
+ import csv
+ import logging
+ from collections.abc import Generator
+ from dataclasses import dataclass
+ from io import TextIOWrapper
+ from typing import Literal, get_args
+ 
+ from cattrs.gen import make_dict_structure_fn, override
+ 
+ from protein_quest.converter import converter
+ from protein_quest.utils import friendly_session
+ 
+ logger = logging.getLogger(__name__)
+ 
+ Aspect = Literal["cellular_component", "biological_process", "molecular_function"]
+ """The aspect of the GO term."""
+ allowed_aspects = set(get_args(Aspect))
+ """Allowed aspects for GO terms."""
+ 
+ 
+ @dataclass(frozen=True, slots=True)
+ class GoTerm:
+     """A Gene Ontology (GO) term.
+ 
+     Parameters:
+         id: The unique identifier for the GO term, e.g., 'GO:0043293'.
+         is_obsolete: Whether the GO term is obsolete.
+         name: The name of the GO term.
+         definition: The definition of the GO term.
+         aspect: The aspect of the GO term.
+     """
+ 
+     id: str
+     is_obsolete: bool
+     name: str
+     definition: str
+     aspect: Aspect
+ 
+ 
+ @dataclass(frozen=True, slots=True)
+ class PageInfo:
+     current: int
+     total: int
+ 
+ 
+ @dataclass(frozen=True, slots=True)
+ class SearchResponse:
+     results: list[GoTerm]
+     number_of_hits: int
+     page_info: PageInfo
+ 
+ 
+ def flatten_definition(definition, _context) -> str:
+     return definition["text"]
+ 
+ 
+ # Use hook to convert incoming camelCase to snake_case
+ # and to flatten definition {text} to text
+ # see https://catt.rs/en/stable/customizing.html#rename
+ converter.register_structure_hook(
+     GoTerm,
+     make_dict_structure_fn(
+         GoTerm,
+         converter,
+         is_obsolete=override(rename="isObsolete"),
+         definition=override(struct_hook=flatten_definition),
+     ),
+ )
+ converter.register_structure_hook(
+     SearchResponse,
+     make_dict_structure_fn(
+         SearchResponse, converter, number_of_hits=override(rename="numberOfHits"), page_info=override(rename="pageInfo")
+     ),
+ )
+ 
+ 
+ async def search_gene_ontology_term(
+     term: str, aspect: Aspect | None = None, include_obsolete: bool = False, limit: int = 100
+ ) -> list[GoTerm]:
+     """Search for a Gene Ontology (GO) term by its name or ID.
+ 
+     Calls the EBI QuickGO API at https://www.ebi.ac.uk/QuickGO/api/index.html .
+ 
+     Examples:
+         To search for `apoptosome` terms do:
+ 
+         >>> from protein_quest.go import search_gene_ontology_term
+         >>> r = await search_gene_ontology_term('apoptosome')
+         >>> len(r)
+         5
+         >>> r[0]
+         GoTerm(id='GO:0043293', is_obsolete=False, name='apoptosome', definition='A multisubunit protein ...')
+ 
+     Args:
+         term: The GO term to search for. For example `nucleus` or `GO:0006816`.
+         aspect: The aspect to filter by. If not given, all aspects are included.
+         include_obsolete: Whether to include obsolete terms. By default, obsolete terms are excluded.
+         limit: The maximum number of results to return.
+ 
+     Returns:
+         List of GO terms
+ 
+     Raises:
+         ValueError: If the aspect is invalid.
+     """
+     url = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/search"
+     page_limit = 100
+     params = {"query": term, "limit": str(page_limit), "page": "1"}
+     if aspect is not None and aspect not in allowed_aspects:
+         msg = f"Invalid aspect: {aspect}. Allowed aspects are: {allowed_aspects} or None."
+         raise ValueError(msg)
+     logger.debug("Fetching GO terms from %s with params %s", url, params)
+     async with friendly_session() as session:
+         # Fetch first page to learn how many pages there are
+         async with session.get(url, params=params) as response:
+             response.raise_for_status()
+             raw_data = await response.read()
+             data = converter.loads(raw_data, SearchResponse)
+ 
+         terms = list(_filter_go_terms(data.results, aspect, include_obsolete))
+         if len(terms) >= limit:
+             # Do not fetch additional pages if we have enough results
+             return terms[:limit]
+         total_pages = data.page_info.total
+         logger.debug("GO search returned %s pages (current=%s)", total_pages, data.page_info.current)
+ 
+         # Retrieve remaining pages (if any) and extend results
+         if total_pages > 1:
+             for page in range(2, total_pages + 1):
+                 params["page"] = str(page)
+                 logger.debug("Fetching additional GO terms page %s/%s with params %s", page, total_pages, params)
+                 async with session.get(url, params=params) as response:
+                     response.raise_for_status()
+                     raw_data = await response.read()
+                     data = converter.loads(raw_data, SearchResponse)
+                 terms.extend(_filter_go_terms(data.results, aspect, include_obsolete))
+                 if len(terms) >= limit:
+                     # Do not fetch additional pages if we have enough results
+                     break
+ 
+     return terms[:limit]
+ 
+ 
+ def _filter_go_terms(terms: list[GoTerm], aspect: Aspect | None, include_obsolete: bool) -> Generator[GoTerm]:
+     for oboterm in terms:
+         if not include_obsolete and oboterm.is_obsolete:
+             continue
+         if aspect and oboterm.aspect != aspect:
+             continue
+         yield oboterm
+ 
+ 
+ def write_go_terms_to_csv(terms: list[GoTerm], csv_file: TextIOWrapper) -> None:
+     """Write a list of GO terms to a CSV file.
+ 
+     Args:
+         terms: The list of GO terms to write.
+         csv_file: The CSV file to write to.
+     """
+     writer = csv.writer(csv_file)
+     writer.writerow(["id", "name", "aspect", "definition"])
+     for term in terms:
+         writer.writerow([term.id, term.name, term.aspect, term.definition])
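
Usage note (not part of the package): a minimal sketch combining search_gene_ontology_term and write_go_terms_to_csv above; the search term and output filename are illustrative, and network access to the QuickGO API is assumed.

    import asyncio

    from protein_quest.go import search_gene_ontology_term, write_go_terms_to_csv

    async def main() -> None:
        terms = await search_gene_ontology_term("apoptosome", aspect="cellular_component", limit=10)
        with open("go_terms.csv", "w", newline="") as fh:
            write_go_terms_to_csv(terms, fh)

    asyncio.run(main())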