protein-quest 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__init__.py +0 -0
- protein_quest/__version__.py +2 -0
- protein_quest/alphafold/__init__.py +1 -0
- protein_quest/alphafold/confidence.py +226 -0
- protein_quest/alphafold/entry_summary.py +64 -0
- protein_quest/alphafold/fetch.py +534 -0
- protein_quest/cli.py +1428 -0
- protein_quest/converter.py +46 -0
- protein_quest/emdb.py +37 -0
- protein_quest/filters.py +163 -0
- protein_quest/go.py +165 -0
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +256 -0
- protein_quest/parallel.py +104 -0
- protein_quest/pdbe/__init__.py +1 -0
- protein_quest/pdbe/fetch.py +68 -0
- protein_quest/py.typed +0 -0
- protein_quest/ss.py +280 -0
- protein_quest/structure.py +232 -0
- protein_quest/taxonomy.py +149 -0
- protein_quest/uniprot.py +975 -0
- protein_quest/utils.py +547 -0
- protein_quest-0.9.0.dist-info/METADATA +325 -0
- protein_quest-0.9.0.dist-info/RECORD +27 -0
- protein_quest-0.9.0.dist-info/WHEEL +4 -0
- protein_quest-0.9.0.dist-info/entry_points.txt +2 -0
- protein_quest-0.9.0.dist-info/licenses/LICENSE +201 -0
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Modules related to AlphaFold Knowledge Base."""
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""Module for filtering alphafold structures on confidence."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Generator
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
import gemmi
|
|
10
|
+
from dask.distributed import Client
|
|
11
|
+
from distributed.deploy.cluster import Cluster
|
|
12
|
+
from tqdm.auto import tqdm
|
|
13
|
+
|
|
14
|
+
from protein_quest.converter import Percentage, PositiveInt, converter
|
|
15
|
+
from protein_quest.io import read_structure, write_structure
|
|
16
|
+
from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
|
|
17
|
+
from protein_quest.ss import nr_of_residues_in_total
|
|
18
|
+
from protein_quest.utils import CopyMethod, copyfile
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
Methods to filter AlphaFoldDB structures on confidence scores.
|
|
22
|
+
|
|
23
|
+
In AlphaFold PDB files, the B-factor column stores the per-residue
|
|
24
|
+
predicted local distance difference test (pLDDT).
|
|
25
|
+
|
|
26
|
+
See https://www.ebi.ac.uk/training/online/courses/alphafold/inputs-and-outputs/evaluating-alphafolds-predicted-structures-using-confidence-scores/plddt-understanding-local-confidence/
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def find_high_confidence_residues(structure: gemmi.Structure, confidence: float) -> Generator[int]:
    """Find residues in the structure with pLDDT confidence above the given threshold.

    The pLDDT of a residue is read from the B-factor (`b_iso`) of its first atom.
    Residues that contain no atoms carry no pLDDT value and are skipped.

    Args:
        structure: The AlphaFoldDB structure to search.
        confidence: The confidence threshold (pLDDT) to use for filtering.
            The comparison is strict: a residue exactly at the threshold is not yielded.

    Yields:
        The sequence numbers of residues with pLDDT above the confidence threshold.
        A sequence number is yielded once per matching residue, so the same number
        can appear more than once when it occurs in several models or chains.
    """
    for model in structure:
        for chain in model:
            for res in chain:
                if len(res) == 0:
                    # Without atoms there is no B-factor to read; res[0] would raise IndexError.
                    continue
                if res[0].b_iso > confidence:
                    seqid = res.seqid.num
                    if seqid is not None:
                        yield seqid
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def filter_out_low_confidence_residues(structure: gemmi.Structure, allowed_residues: set[int]) -> gemmi.Structure:
    """Return a copy of the structure that only holds the allowed residues.

    The input structure is cloned first, so it is left untouched.
    In every model of the clone each chain is replaced by a chain with the same name
    that only contains the residues whose sequence number is in `allowed_residues`.

    Args:
        structure: The AlphaFoldDB structure to filter.
        allowed_residues: The set of residue sequence numbers to keep.

    Returns:
        A new AlphaFoldDB structure with low confidence residues removed.
    """
    filtered = structure.clone()
    for model in filtered:
        # First build all replacement chains, so the model is not modified while iterating over it.
        replacements = []
        for old_chain in model:
            kept = gemmi.Chain(old_chain.name)
            for residue in old_chain:
                if residue.seqid.num not in allowed_residues:
                    continue
                kept.add_residue(residue)
            replacements.append(kept)
        # Then swap each original chain for its filtered counterpart.
        for kept in replacements:
            model.remove_chain(kept.name)
            model.add_chain(kept)
    return filtered
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class ConfidenceFilterQuery:
    """Criteria used to filter AlphaFoldDB structures on confidence.

    Parameters:
        confidence: The pLDDT threshold.
            A residue counts as high confidence when its pLDDT (b-factor) is above this value.
        min_residues: The minimum number of high-confidence residues
            a structure must have to be kept.
        max_residues: The maximum number of high-confidence residues
            a structure is allowed to have to be kept.
    """

    confidence: Percentage
    min_residues: PositiveInt
    max_residues: PositiveInt
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Grab the converter's existing structure hook before the validating hook below is registered,
# so the validating hook can delegate the actual structuring to it.
base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)


@converter.register_structure_hook
def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
    """Structure a ConfidenceFilterQuery and check that its residue bounds are consistent.

    Args:
        val: The unstructured value to turn into a query.
        _type: The target type, passed through to the base hook.

    Returns:
        The structured query.

    Raises:
        ValueError: If min_residues is larger than max_residues.
    """
    query: ConfidenceFilterQuery = base_query_hook(val, _type)
    if query.min_residues <= query.max_residues:
        return query
    msg = f"min_residues {query.min_residues} cannot be larger than max_residues {query.max_residues}"
    raise ValueError(msg)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class ConfidenceFilterResult:
    """Outcome of filtering a single AlphaFoldDB structure file on confidence (pLDDT).

    Parameters:
        input_file: The name of the mmcif/PDB file that was processed.
        count: The number of residues with a pLDDT above the confidence threshold.
        filtered_file: The path to the filtered mmcif/PDB file.
            Stays None when the structure did not pass the filter.
    """

    input_file: str
    count: PositiveInt
    filtered_file: Path | None = None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def filter_file_on_confidence(
    file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
) -> ConfidenceFilterResult:
    """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

    The structure is kept when its number of high confidence residues lies
    between `query.min_residues` and `query.max_residues` (both inclusive).
    A kept structure is stored in `filtered_dir` under the same file name as the input.

    Args:
        file: The path to the structure file to filter.
        query: The confidence filter query.
        filtered_dir: The directory to save the filtered structure file in.
        copy_method: How to copy when no residues have to be removed.

    Returns:
        Result with the number of high confidence residues and
        the filtered_file property set to the path where the filtered file is saved,
        or left as None if the structure was filtered out.
    """
    structure = read_structure(file)
    confident_seqids = set(find_high_confidence_residues(structure, query.confidence))
    count = len(confident_seqids)

    if not (query.min_residues <= count <= query.max_residues):
        # Outside the requested range: nothing is written, only the count is reported.
        return ConfidenceFilterResult(input_file=file.name, count=count)

    filtered_file = filtered_dir / file.name
    if count != nr_of_residues_in_total(structure):
        trimmed = filter_out_low_confidence_residues(structure, confident_seqids)
        write_structure(trimmed, filtered_file)
    else:
        # Every residue is kept, so copy the input instead of the slower gemmi writing.
        copyfile(file, filtered_file, copy_method)

    return ConfidenceFilterResult(input_file=file.name, count=count, filtered_file=filtered_file)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _filter_files_on_confidence_sequentially(
    alphafold_pdb_files: list[Path],
    query: ConfidenceFilterQuery,
    filtered_dir: Path,
    copy_method: CopyMethod = "copy",
) -> list[ConfidenceFilterResult]:
    """Filter the files one after another while showing a progress bar.

    Args:
        alphafold_pdb_files: The structure files to filter.
        query: The confidence filter query.
        filtered_dir: The directory to save the filtered files in.
        copy_method: How to copy when no residues have to be removed.

    Returns:
        One result per input file, in the order of the input files.
    """
    progress = tqdm(
        alphafold_pdb_files,
        total=len(alphafold_pdb_files),
        desc="Filtering on confidence",
        unit="file",
    )
    return [filter_file_on_confidence(path, query, filtered_dir, copy_method) for path in progress]
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def filter_files_on_confidence(
    alphafold_pdb_files: list[Path],
    query: ConfidenceFilterQuery,
    filtered_dir: Path,
    copy_method: CopyMethod = "copy",
    scheduler_address: str | Cluster | Literal["sequential"] | None = None,
) -> list[ConfidenceFilterResult]:
    """Filter AlphaFoldDB structures based on confidence.

    The output directory is created when it does not exist yet.

    Args:
        alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
        query: The confidence filter query containing the confidence thresholds.
        filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
        copy_method: How to copy when a direct copy is possible.
        scheduler_address: The address of the Dask scheduler.
            If not provided, will create a local cluster.
            If set to `sequential` will run tasks sequentially.

    Returns:
        For each mmcif/PDB file whether it was filtered or not,
        and the number of residues with pLDDT above the confidence threshold.
    """
    filtered_dir.mkdir(parents=True, exist_ok=True)

    if scheduler_address == "sequential":
        # No Dask involved: process the files one by one.
        return _filter_files_on_confidence_sequentially(
            alphafold_pdb_files, query, filtered_dir, copy_method=copy_method
        )

    address = configure_dask_scheduler(scheduler_address, name="filter-confidence")
    with Client(address) as client:
        client.forward_logging()
        return dask_map_with_progress(
            client,
            filter_file_on_confidence,
            alphafold_pdb_files,
            query=query,
            filtered_dir=filtered_dir,
            copy_method=copy_method,
        )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# ruff: noqa: N815 allow camelCase follow what api returns
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from yarl import URL
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class EntrySummary:
    """Summary of a single AlphaFold entry.

    Modelled after NewEntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
    with URL types and without deprecated fields.

    Field names are camelCase on purpose, so they follow what the API returns.
    Fields without a default must always be given, all other fields default to None.
    """

    # Fields that must always be given.
    allVersions: list[int]
    bcifUrl: URL
    cifUrl: URL
    entityType: str
    fractionPlddtConfident: float
    fractionPlddtLow: float
    fractionPlddtVeryHigh: float
    fractionPlddtVeryLow: float
    globalMetricValue: float
    isUniProt: bool
    latestVersion: int
    modelCreatedDate: str
    modelEntityId: str
    paeDocUrl: URL
    pdbUrl: URL
    providerId: str
    sequence: str
    sequenceChecksum: str
    sequenceEnd: int
    sequenceStart: int
    sequenceVersionDate: str
    toolUsed: str

    # Optional fields.
    alternativeNames: list[str] | None = None
    amAnnotationsHg19Url: URL | None = None
    amAnnotationsHg38Url: URL | None = None
    amAnnotationsUrl: URL | None = None
    catalyticActivities: list[str] | None = None
    complexName: str | None = None
    functions: list[str] | None = None
    gene: str | None = None
    geneSynonyms: list[str] | None = None
    ipSAE: float | None = None
    ipTM: float | None = None
    isUniProtReferenceProteome: bool | None = None
    isUniProtReviewed: bool | None = None
    keywords: list[str] | None = None
    msaUrl: URL | None = None
    organismCommonNames: list[str] | None = None
    organismScientificName: str | None = None
    organismSynonyms: list[str] | None = None
    plddtDocUrl: URL | None = None
    proteinFullNames: list[str] | None = None
    proteinShortNames: list[str] | None = None
    stoichiometry: int | None = None
    taxId: int | None = None
    taxonomyLineage: list[str] | None = None
    # uniprotAccession is isoform id (<uniprot_accession>-<isoform number>) when entry has multiple isoforms.
    uniprotAccession: str | None = None
    uniprotDescription: str | None = None
    uniprotId: str | None = None
|