protein-quest 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,2 @@
1
+ __version__ = "0.9.0"
2
+ """The version of the package."""
@@ -0,0 +1 @@
1
+ """Modules related to AlphaFold Knowledge Base."""
@@ -0,0 +1,226 @@
1
+ """Module for filtering alphafold structures on confidence."""
2
+
3
+ import logging
4
+ from collections.abc import Generator
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ import gemmi
10
+ from dask.distributed import Client
11
+ from distributed.deploy.cluster import Cluster
12
+ from tqdm.auto import tqdm
13
+
14
+ from protein_quest.converter import Percentage, PositiveInt, converter
15
+ from protein_quest.io import read_structure, write_structure
16
+ from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
17
+ from protein_quest.ss import nr_of_residues_in_total
18
+ from protein_quest.utils import CopyMethod, copyfile
19
+
20
+ """
21
+ Methods to filter AlphaFoldDB structures on confidence scores.
22
+
23
+ In AlphaFold PDB files, the b-factor column has the
24
+ predicted local distance difference test (pLDDT).
25
+
26
+ See https://www.ebi.ac.uk/training/online/courses/alphafold/inputs-and-outputs/evaluating-alphafolds-predicted-structures-using-confidence-scores/plddt-understanding-local-confidence/
27
+ """
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
def find_high_confidence_residues(structure: gemmi.Structure, confidence: float) -> Generator[int]:
    """Yield sequence numbers of residues whose pLDDT exceeds a threshold.

    The pLDDT of a residue is taken from the b-factor (`b_iso`) of its first atom.

    Args:
        structure: The AlphaFoldDB structure to scan.
        confidence: The pLDDT cut-off; only residues strictly above it are reported.

    Yields:
        The sequence number of each residue scoring above the cut-off,
        in model, chain, residue order. Residues without a sequence number are skipped.
    """
    every_residue = (residue for model in structure for chain in model for residue in chain)
    for residue in every_residue:
        plddt = residue[0].b_iso
        if not plddt > confidence:
            continue
        number = residue.seqid.num
        if number is None:
            continue
        yield number
50
+
51
+
52
def filter_out_low_confidence_residues(structure: gemmi.Structure, allowed_residues: set[int]) -> gemmi.Structure:
    """Return a copy of the structure that only holds the allowed residues.

    The input structure is cloned and left untouched. In every model of the clone,
    each chain is replaced by a chain of the same name holding only the residues
    whose sequence number is in `allowed_residues`.

    Args:
        structure: The AlphaFoldDB structure to filter.
        allowed_residues: The sequence numbers of the residues to keep.

    Returns:
        A new structure from which all other residues have been removed.
    """
    filtered = structure.clone()
    for model in filtered:
        # First pass: build the trimmed replacement for every chain in the model.
        replacements = []
        for chain in model:
            trimmed = gemmi.Chain(chain.name)
            for residue in chain:
                if residue.seqid.num not in allowed_residues:
                    continue
                trimmed.add_residue(residue)
            replacements.append(trimmed)
        # Second pass: swap each chain for its trimmed replacement.
        for trimmed in replacements:
            model.remove_chain(trimmed.name)
            model.add_chain(trimmed)
    return filtered
75
+
76
+
77
@dataclass
class ConfidenceFilterQuery:
    """Criteria used to filter AlphaFoldDB structures on confidence.

    Parameters:
        confidence: The pLDDT (b-factor) threshold.
            A residue counts as high confidence when its pLDDT is above this value.
        min_residues: The lowest number of high-confidence residues a structure may have to be kept.
        max_residues: The highest number of high-confidence residues a structure may have to be kept.
    """

    confidence: Percentage
    min_residues: PositiveInt
    max_residues: PositiveInt
91
+
92
+
93
# Keep a handle on the converter's existing hook so the custom hook below can delegate to it.
base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)


@converter.register_structure_hook
def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
    """Structure a ConfidenceFilterQuery and check that its residue bounds are ordered.

    Args:
        val: The unstructured value to convert.
        _type: The target type, passed through to the base hook.

    Returns:
        The structured query.

    Raises:
        ValueError: If `min_residues` is larger than `max_residues`.
    """
    query: ConfidenceFilterQuery = base_query_hook(val, _type)
    bounds_are_swapped = query.min_residues > query.max_residues
    if bounds_are_swapped:
        msg = f"min_residues {query.min_residues} cannot be larger than max_residues {query.max_residues}"
        raise ValueError(msg)
    return query
103
+
104
+
105
@dataclass
class ConfidenceFilterResult:
    """Outcome of filtering one AlphaFoldDB structure on confidence (pLDDT).

    Parameters:
        input_file: The name of the processed mmcif/PDB file.
        count: How many residues have a pLDDT above the confidence threshold.
        filtered_file: Where the filtered mmcif/PDB file was written,
            or None when the structure did not pass the filter.
    """

    input_file: str
    count: PositiveInt
    filtered_file: Path | None = None
119
+
120
def filter_file_on_confidence(
    file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
) -> ConfidenceFilterResult:
    """Apply the confidence filter to one AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]).

    Args:
        file: The structure file to filter.
        query: The confidence filter query.
        filtered_dir: The directory in which the filtered file is saved.
        copy_method: How to copy the file when no residues have to be removed.

    Returns:
        A result holding the number of high confidence residues.
        Its filtered_file property is the path of the saved file,
        or None when the structure was filtered out.
    """
    structure = read_structure(file)
    high_confidence = set(find_high_confidence_residues(structure, query.confidence))
    count = len(high_confidence)

    too_few = count < query.min_residues
    too_many = count > query.max_residues
    if too_few or too_many:
        # Outside the requested range: report the count only, write nothing.
        return ConfidenceFilterResult(input_file=file.name, count=count)

    target = filtered_dir / file.name
    if count == nr_of_residues_in_total(structure):
        # Nothing to remove, so copy the input instead of the slower gemmi writing.
        copyfile(file, target, copy_method)
    else:
        write_structure(filter_out_low_confidence_residues(structure, high_confidence), target)
    return ConfidenceFilterResult(input_file=file.name, count=count, filtered_file=target)
161
+
162
+
163
def _filter_files_on_confidence_sequentially(
    alphafold_pdb_files: list[Path],
    query: ConfidenceFilterQuery,
    filtered_dir: Path,
    copy_method: CopyMethod = "copy",
) -> list[ConfidenceFilterResult]:
    """Filter the files one after another while showing a progress bar.

    Args:
        alphafold_pdb_files: The structure files to filter.
        query: The confidence filter query.
        filtered_dir: The directory in which filtered files are saved.
        copy_method: How to copy a file when no residues have to be removed.

    Returns:
        One result per input file, in input order.
    """
    progress = tqdm(
        alphafold_pdb_files,
        total=len(alphafold_pdb_files),
        desc="Filtering on confidence",
        unit="file",
    )
    return [filter_file_on_confidence(file, query, filtered_dir, copy_method) for file in progress]
179
+
180
+
181
def filter_files_on_confidence(
    alphafold_pdb_files: list[Path],
    query: ConfidenceFilterQuery,
    filtered_dir: Path,
    copy_method: CopyMethod = "copy",
    scheduler_address: str | Cluster | Literal["sequential"] | None = None,
) -> list[ConfidenceFilterResult]:
    """Filter a list of AlphaFoldDB structures on confidence.

    The output directory is created when it does not exist yet.

    Args:
        alphafold_pdb_files: The mmcif/PDB files from AlphaFoldDB to filter.
        query: The confidence filter query holding the thresholds.
        filtered_dir: The directory in which the filtered mmcif/PDB files are saved.
        copy_method: How to copy when a direct copy is possible.
        scheduler_address: The address of the Dask scheduler.
            If not provided, will create a local cluster.
            If set to `sequential` will run tasks sequentially.

    Returns:
        For each mmcif/PDB file, whether it passed the filter
        and its number of residues with a pLDDT above the confidence threshold.
    """
    filtered_dir.mkdir(parents=True, exist_ok=True)

    run_sequentially = scheduler_address == "sequential"
    if run_sequentially:
        return _filter_files_on_confidence_sequentially(
            alphafold_pdb_files, query, filtered_dir, copy_method=copy_method
        )

    address = configure_dask_scheduler(scheduler_address, name="filter-confidence")
    with Client(address) as client:
        client.forward_logging()
        return dask_map_with_progress(
            client,
            filter_file_on_confidence,
            alphafold_pdb_files,
            query=query,
            filtered_dir=filtered_dir,
            copy_method=copy_method,
        )
@@ -0,0 +1,64 @@
1
+ # ruff: noqa: N815 allow camelCase follow what api returns
2
+ from dataclasses import dataclass
3
+
4
+ from yarl import URL
5
+
6
+
7
@dataclass
class EntrySummary:
    """Summary of a single AlphaFold entry.

    Mirrors the NewEntrySummary schema of [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json),
    except that links are typed as URL and deprecated fields are left out.
    Field names are kept in the camelCase the API returns.
    """

    allVersions: list[int]
    bcifUrl: URL
    cifUrl: URL
    entityType: str
    fractionPlddtConfident: float
    fractionPlddtLow: float
    fractionPlddtVeryHigh: float
    fractionPlddtVeryLow: float
    globalMetricValue: float
    isUniProt: bool
    latestVersion: int
    modelCreatedDate: str
    modelEntityId: str
    paeDocUrl: URL
    pdbUrl: URL
    providerId: str
    sequence: str
    sequenceChecksum: str
    sequenceEnd: int
    sequenceStart: int
    sequenceVersionDate: str
    toolUsed: str
    alternativeNames: list[str] | None = None
    amAnnotationsHg19Url: URL | None = None
    amAnnotationsHg38Url: URL | None = None
    amAnnotationsUrl: URL | None = None
    catalyticActivities: list[str] | None = None
    complexName: str | None = None
    functions: list[str] | None = None
    gene: str | None = None
    geneSynonyms: list[str] | None = None
    ipSAE: float | None = None
    ipTM: float | None = None
    isUniProtReferenceProteome: bool | None = None
    isUniProtReviewed: bool | None = None
    keywords: list[str] | None = None
    msaUrl: URL | None = None
    organismCommonNames: list[str] | None = None
    organismScientificName: str | None = None
    organismSynonyms: list[str] | None = None
    plddtDocUrl: URL | None = None
    proteinFullNames: list[str] | None = None
    proteinShortNames: list[str] | None = None
    stoichiometry: int | None = None
    taxId: int | None = None
    taxonomyLineage: list[str] | None = None
    # For entries with multiple isoforms, uniprotAccession holds the isoform id
    # (<uniprot_accession>-<isoform number>).
    uniprotAccession: str | None = None
    uniprotDescription: str | None = None
    uniprotId: str | None = None