protein-quest 0.3.1.tar.gz → 0.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {protein_quest-0.3.1 → protein_quest-0.3.2}/PKG-INFO +18 -4
- {protein_quest-0.3.1 → protein_quest-0.3.2}/README.md +17 -3
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/alphafold.ipynb +3 -3
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/__version__.py +1 -1
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/confidence.py +42 -15
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/fetch.py +2 -4
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/cli.py +153 -13
- protein_quest-0.3.2/src/protein_quest/converter.py +45 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/filters.py +39 -7
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/go.py +1 -4
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/mcp_server.py +4 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/pdbe/io.py +122 -41
- protein_quest-0.3.2/src/protein_quest/ss.py +264 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/taxonomy.py +1 -3
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/utils.py +28 -1
- protein_quest-0.3.2/tests/alphafold/test_confidence.py +155 -0
- protein_quest-0.3.2/tests/fixtures/3JRS_B2A.cif.gz +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/test_io.py +39 -4
- protein_quest-0.3.2/tests/test_ss.py +227 -0
- protein_quest-0.3.2/tests/test_utils.py +31 -0
- protein_quest-0.3.1/tests/alphafold/test_confidence.py +0 -63
- {protein_quest-0.3.1 → protein_quest-0.3.2}/.github/workflows/ci.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/.github/workflows/pages.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/.github/workflows/pypi-publish.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/.gitignore +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/.vscode/extensions.json +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/CITATION.cff +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/CODE_OF_CONDUCT.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/CONTRIBUTING.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/LICENSE +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/CONTRIBUTING.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/cli_doc_hook.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/index.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/.gitignore +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/index.md +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/pdbe.ipynb +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/notebooks/uniprot.ipynb +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/docs/protein-quest-mcp.png +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/mkdocs.yml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/pyproject.toml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/__init__.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/__init__.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/alphafold/entry_summary.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/emdb.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/parallel.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/pdbe/__init__.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/pdbe/fetch.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/py.typed +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/src/protein_quest/uniprot.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/test_entry_summary.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/alphafold/test_fetch.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/fixtures/2y29.cif +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/pdbe/test_fetch.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_cli.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_emdb.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_go.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_mcp.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_taxonomy.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/tests/test_uniprot.py +0 -0
- {protein_quest-0.3.1 → protein_quest-0.3.2}/uv.lock +0 -0

````diff
--- protein_quest-0.3.1/PKG-INFO
+++ protein_quest-0.3.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.3.1
+Version: 0.3.2
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -59,9 +59,11 @@ graph TB;
 searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
 searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
 searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
-fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
-chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
-fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
+fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
+chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
+fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
+confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
+residuefilter --> |mmcif_files| ssfilter
 classDef dashedBorder stroke-dasharray: 5 5;
 goterm:::dashedBorder
 taxonomy:::dashedBorder
@@ -175,6 +177,18 @@ protein-quest filter residue \
 ./filtered-chains ./filtered
 ```

+### To filter on secondary structure
+
+To filter on structure being mostly alpha helices and have no beta sheets.
+
+```shell
+protein-quest filter secondary-structure \
+--ratio-min-helix-residues 0.5 \
+--ratio-max-sheet-residues 0.0 \
+--write-stats filtered-ss/stats.csv \
+./filtered-chains ./filtered-ss
+```
+
 ### Search Taxonomy

 ```shell
````

````diff
--- protein_quest-0.3.1/README.md
+++ protein_quest-0.3.2/README.md
@@ -29,9 +29,11 @@ graph TB;
 searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
 searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
 searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
-fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
-chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
-fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
+fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
+chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
+fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
+confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
+residuefilter --> |mmcif_files| ssfilter
 classDef dashedBorder stroke-dasharray: 5 5;
 goterm:::dashedBorder
 taxonomy:::dashedBorder
@@ -145,6 +147,18 @@ protein-quest filter residue \
 ./filtered-chains ./filtered
 ```

+### To filter on secondary structure
+
+To filter on structure being mostly alpha helices and have no beta sheets.
+
+```shell
+protein-quest filter secondary-structure \
+--ratio-min-helix-residues 0.5 \
+--ratio-max-sheet-residues 0.0 \
+--write-stats filtered-ss/stats.csv \
+./filtered-chains ./filtered-ss
+```
+
 ### Search Taxonomy

 ```shell
````

```diff
--- protein_quest-0.3.1/docs/notebooks/alphafold.ipynb
+++ protein_quest-0.3.2/docs/notebooks/alphafold.ipynb
@@ -301,7 +301,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"query = ConfidenceFilterQuery(confidence=80,
+"query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)"
 ]
 },
 {
@@ -318,7 +318,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "6a6f8e3f",
 "metadata": {},
 "outputs": [
@@ -338,7 +338,7 @@
 "source": [
 "list(\n",
 " filter_files_on_confidence(\n",
-" input_files, ConfidenceFilterQuery(confidence=80,
+" input_files, ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000), output_dir\n",
 " )\n",
 ")"
 ]
```

```diff
--- protein_quest-0.3.1/src/protein_quest/__version__.py
+++ protein_quest-0.3.2/src/protein_quest/__version__.py
@@ -1,2 +1,2 @@
-__version__ = "0.3.1"
+__version__ = "0.3.2"
 """The version of the package."""
```

```diff
--- protein_quest-0.3.1/src/protein_quest/alphafold/confidence.py
+++ protein_quest-0.3.2/src/protein_quest/alphafold/confidence.py
@@ -7,7 +7,10 @@ from pathlib import Path

 import gemmi

+from protein_quest.converter import Percentage, PositiveInt, converter
 from protein_quest.pdbe.io import write_structure
+from protein_quest.ss import nr_of_residues_in_total
+from protein_quest.utils import CopyMethod, copyfile

 """
 Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
     Parameters:
         confidence: The confidence threshold for filtering residues.
             Residues with a pLDDT (b-factor) above this value are considered high confidence.
-
-
+        min_residues: The minimum number of high-confidence residues required to keep the structure.
+        max_residues: The maximum number of high-confidence residues required to keep the structure.
     """

-    confidence:
-
-
+    confidence: Percentage
+    min_residues: PositiveInt
+    max_residues: PositiveInt
+
+
+base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+@converter.register_structure_hook
+def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+    result: ConfidenceFilterQuery = base_query_hook(val, _type)
+    if result.min_residues > result.max_residues:
+        msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+        raise ValueError(msg)
+    return result


 @dataclass
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
     """

     input_file: str
-    count:
+    count: PositiveInt
     filtered_file: Path | None = None


-def filter_file_on_residues(
+def filter_file_on_residues(
+    file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+) -> ConfidenceFilterResult:
     """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

     Args:
         file: The path to the PDB file to filter.
         query: The confidence filter query.
         filtered_dir: The directory to save the filtered PDB file.
+        copy_method: How to copy when no residues have to be removed.

     Returns:
         result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
     structure = gemmi.read_structure(str(file))
     residues = set(find_high_confidence_residues(structure, query.confidence))
     count = len(residues)
-    if count < query.
+    if count < query.min_residues or count > query.max_residues:
         # Skip structure that is outside the min and max threshold
         # just return number of high confidence residues
         return ConfidenceFilterResult(
             input_file=file.name,
             count=count,
         )
+    total_residues = nr_of_residues_in_total(structure)
     filtered_file = filtered_dir / file.name
-
-
-
-
-
+    if count == total_residues:
+        # if no residues have to be removed then copy instead of slower gemmi writing
+        copyfile(file, filtered_file, copy_method)
+    else:
+        new_structure = filter_out_low_confidence_residues(
+            structure,
+            residues,
+        )
+        write_structure(new_structure, filtered_file)
     return ConfidenceFilterResult(
         input_file=file.name,
         count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d


 def filter_files_on_confidence(
-    alphafold_pdb_files: list[Path],
+    alphafold_pdb_files: list[Path],
+    query: ConfidenceFilterQuery,
+    filtered_dir: Path,
+    copy_method: CopyMethod = "copy",
 ) -> Generator[ConfidenceFilterResult]:
     """Filter AlphaFoldDB structures based on confidence.

@@ -141,6 +167,7 @@ def filter_files_on_confidence(
         alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
         query: The confidence filter query containing the confidence thresholds.
         filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+        copy_method: How to copy when a direct copy is possible.

     Yields:
         For each mmcif/PDB files yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
     # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
     # here we filter on file level and inside file remove low confidence residues
     for pdb_file in alphafold_pdb_files:
-        yield filter_file_on_residues(pdb_file, query, filtered_dir)
+        yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
```
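
The net effect of the confidence.py changes above: `ConfidenceFilterQuery` now carries explicit `min_residues`/`max_residues` bounds that are validated when a query is structured through the shared cattrs converter, and `filter_file_on_residues` copies (or symlinks) a file untouched when no residues need removing. A rough usage sketch, not taken from the package itself (directory names below are made up; the calls mirror what `cli.py` and the alphafold notebook do in 0.3.2):

```python
# Rough usage sketch (not from the package): directory names are hypothetical;
# the calls mirror what cli.py and the alphafold notebook do in 0.3.2.
from pathlib import Path

from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
from protein_quest.converter import converter

# Structuring through the shared converter runs the validation hooks:
# an out-of-range confidence or min_residues > max_residues raises ValueError.
query = converter.structure(
    {"confidence": 80, "min_residues": 100, "max_residues": 1000},
    ConfidenceFilterQuery,
)

input_files = sorted(Path("./af-models").glob("*.cif"))  # hypothetical input directory
output_dir = Path("./filtered-confidence")  # hypothetical output directory
output_dir.mkdir(parents=True, exist_ok=True)

for result in filter_files_on_confidence(input_files, query, output_dir, copy_method="symlink"):
    # filtered_file stays None for structures outside the residue-count window
    print(result.input_file, result.count, result.filtered_file)
```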

```diff
--- protein_quest-0.3.1/src/protein_quest/alphafold/fetch.py
+++ protein_quest-0.3.2/src/protein_quest/alphafold/fetch.py
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args

 from aiohttp_retry import RetryClient
 from aiopath import AsyncPath
-from cattrs.preconf.orjson import make_converter
 from tqdm.asyncio import tqdm
 from yarl import URL

 from protein_quest.alphafold.entry_summary import EntrySummary
+from protein_quest.converter import converter
 from protein_quest.utils import friendly_session, retrieve_files, run_async

 logger = logging.getLogger(__name__)
-
-"""cattrs converter to read AlphaFold summary JSON document."""
-converter.register_structure_hook(URL, lambda v, _: URL(v))
+

 DownloadableFormat = Literal[
     "summary",
```

```diff
--- protein_quest-0.3.1/src/protein_quest/cli.py
+++ protein_quest-0.3.2/src/protein_quest/cli.py
@@ -23,13 +23,16 @@ from protein_quest.__version__ import __version__
 from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
 from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
 from protein_quest.alphafold.fetch import fetch_many as af_fetch
+from protein_quest.converter import converter
 from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
 from protein_quest.pdbe import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
+from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
 from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
+from protein_quest.utils import CopyMethod, copy_methods, copyfile

 logger = logging.getLogger(__name__)

@@ -282,6 +285,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")


+def _add_copy_method_argument(parser: argparse.ArgumentParser):
+    """Add copy method argument to parser."""
+    default_copy_method = "symlink"
+    if os.name == "nt":
+        # On Windows you need developer mode or admin privileges to create symlinks
+        # so we default to copying files instead of symlinking
+        default_copy_method = "copy"
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default=default_copy_method,
+        help="How to copy files when no changes are needed to output file.",
+    )
+
+
 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
     """Add filter confidence subcommand parser."""
     parser = subparsers.add_parser(
@@ -312,6 +331,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
+    _add_copy_method_argument(parser)


 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -347,8 +367,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument(
         "--scheduler-address",
-        help="Address of the Dask scheduler to connect to.
+        help=dedent("""Address of the Dask scheduler to connect to.
            If not provided, will create a local cluster.
            If set to `sequential` will run tasks sequentially."""),
     )
+    _add_copy_method_argument(parser)


 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -371,6 +394,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
+    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -381,6 +405,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )


+def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
+    """Add filter secondary structure subcommand parser."""
+    parser = subparsers.add_parser(
+        "secondary-structure",
+        help="Filter PDB/mmCIF files by secondary structure",
+        description="Filter PDB/mmCIF files by secondary structure",
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+    parser.add_argument(
+        "output_dir",
+        type=Path,
+        help=dedent("""\
+            Directory to write filtered PDB/mmCIF files. Files are copied without modification.
+            """),
+    )
+    parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
+    parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
+    parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
+    parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
+    parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
+    parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
+    parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
+    parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
+    _add_copy_method_argument(parser)
+    parser.add_argument(
+        "--write-stats",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help=dedent("""
+            Write filter statistics to file. In CSV format with columns:
+            `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
+            <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
+            Use `-` for stdout.
+            """),
+    )
+
+
 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     """Add search command and its subcommands."""
     parser = subparsers.add_parser(
@@ -422,6 +483,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
     _add_filter_confidence_parser(subsubparsers)
     _add_filter_chain_parser(subsubparsers)
     _add_filter_residue_parser(subsubparsers)
+    _add_filter_ss_parser(subsubparsers)


 def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -620,21 +682,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
     # to get rid of duplication
     input_dir = structure(args.input_dir, Path)
     output_dir = structure(args.output_dir, Path)
-
-
-    min_residues =
-    max_residues =
+
+    confidence_threshold = args.confidence_threshold
+    min_residues = args.min_residues
+    max_residues = args.max_residues
     stats_file: TextIOWrapper | None = args.write_stats
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

     output_dir.mkdir(parents=True, exist_ok=True)
     input_files = sorted(glob_structure_files(input_dir))
     nr_input_files = len(input_files)
     rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
-    query = structure(
+    query = converter.structure(
         {
             "confidence": confidence_threshold,
-            "
-            "
+            "min_residues": min_residues,
+            "max_residues": max_residues,
         },
         ConfidenceFilterQuery,
     )
@@ -643,7 +706,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
         writer.writerow(["input_file", "residue_count", "passed", "output_file"])

     passed_count = 0
-    for r in tqdm(
+    for r in tqdm(
+        filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
+        total=len(input_files),
+        unit="file",
+    ):
         if r.filtered_file:
             passed_count += 1
             if stats_file:
@@ -656,9 +723,10 @@ def _handle_filter_confidence(args: argparse.Namespace):

 def _handle_filter_chain(args):
     input_dir = args.input_dir
-    output_dir = args.output_dir
+    output_dir = structure(args.output_dir, Path)
     pdb_id2chain_mapping_file = args.chains
-    scheduler_address = args.scheduler_address
+    scheduler_address = structure(args.scheduler_address, str | None)  # pyright: ignore[reportArgumentType]
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]

     # make sure files in input dir with entries in mapping file are the same
     # complain when files from mapping file are missing on disk
@@ -683,18 +751,25 @@ def _handle_filter_chain(args):
         rprint("[red]No valid structure files found. Exiting.")
         sys.exit(1)

-    results = filter_files_on_chain(
+    results = filter_files_on_chain(
+        file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
+    )

     nr_written = len([r for r in results if r.passed])

     rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

+    for result in results:
+        if result.discard_reason:
+            rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
+

 def _handle_filter_residue(args):
     input_dir = structure(args.input_dir, Path)
     output_dir = structure(args.output_dir, Path)
     min_residues = structure(args.min_residues, int)
     max_residues = structure(args.max_residues, int)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
     stats_file: TextIOWrapper | None = args.write_stats

     if stats_file:
@@ -705,7 +780,9 @@ def _handle_filter_residue(args):
     input_files = sorted(glob_structure_files(input_dir))
     nr_total = len(input_files)
     rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
-    for r in filter_files_on_residues(
+    for r in filter_files_on_residues(
+        input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
+    ):
         if stats_file:
             writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
         if r.passed:
@@ -716,6 +793,68 @@ def _handle_filter_residue(args):
         rprint(f"Statistics written to {stats_file.name}")


+def _handle_filter_ss(args):
+    input_dir = structure(args.input_dir, Path)
+    output_dir = structure(args.output_dir, Path)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+    stats_file: TextIOWrapper | None = args.write_stats
+
+    raw_query = {
+        "abs_min_helix_residues": args.abs_min_helix_residues,
+        "abs_max_helix_residues": args.abs_max_helix_residues,
+        "abs_min_sheet_residues": args.abs_min_sheet_residues,
+        "abs_max_sheet_residues": args.abs_max_sheet_residues,
+        "ratio_min_helix_residues": args.ratio_min_helix_residues,
+        "ratio_max_helix_residues": args.ratio_max_helix_residues,
+        "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
+        "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
+    }
+    query = converter.structure(raw_query, SecondaryStructureFilterQuery)
+    input_files = sorted(glob_structure_files(input_dir))
+    nr_total = len(input_files)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    if stats_file:
+        writer = csv.writer(stats_file)
+        writer.writerow(
+            [
+                "input_file",
+                "nr_residues",
+                "nr_helix_residues",
+                "nr_sheet_residues",
+                "helix_ratio",
+                "sheet_ratio",
+                "passed",
+                "output_file",
+            ]
+        )
+
+    rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
+    nr_passed = 0
+    for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
+        output_file: Path | None = None
+        if result.passed:
+            output_file = output_dir / input_file.name
+            copyfile(input_file, output_file, copy_method)
+            nr_passed += 1
+        if stats_file:
+            writer.writerow(
+                [
+                    input_file,
+                    result.stats.nr_residues,
+                    result.stats.nr_helix_residues,
+                    result.stats.nr_sheet_residues,
+                    round(result.stats.helix_ratio, 3),
+                    round(result.stats.sheet_ratio, 3),
+                    result.passed,
+                    output_file,
+                ]
+            )
+    rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
+    if stats_file:
+        rprint(f"Statistics written to {stats_file.name}")
+
+
 def _handle_mcp(args):
     if find_spec("fastmcp") is None:
         msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
@@ -742,6 +881,7 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
     ("filter", "confidence"): _handle_filter_confidence,
     ("filter", "chain"): _handle_filter_chain,
     ("filter", "residue"): _handle_filter_residue,
+    ("filter", "secondary-structure"): _handle_filter_ss,
     ("mcp", None): _handle_mcp,
 }

```
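
For reference, the new `filter secondary-structure` subcommand wired up above drives `protein_quest.ss`. Below is a hedged sketch of the equivalent programmatic path, mirroring `_handle_filter_ss`; the directory names are invented, and the `protein_quest.ss` API is assumed to behave exactly as the CLI handler uses it:

```python
# Hedged sketch mirroring _handle_filter_ss; directory names are hypothetical.
from pathlib import Path

from protein_quest.converter import converter
from protein_quest.pdbe.io import glob_structure_files
from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
from protein_quest.utils import copyfile

# Same thresholds as the README example: mostly helix, no sheets.
raw_query = {
    "abs_min_helix_residues": None,
    "abs_max_helix_residues": None,
    "abs_min_sheet_residues": None,
    "abs_max_sheet_residues": None,
    "ratio_min_helix_residues": 0.5,
    "ratio_max_helix_residues": None,
    "ratio_min_sheet_residues": None,
    "ratio_max_sheet_residues": 0.0,
}
query = converter.structure(raw_query, SecondaryStructureFilterQuery)

input_dir = Path("./filtered-chains")  # hypothetical
output_dir = Path("./filtered-ss")  # hypothetical
output_dir.mkdir(parents=True, exist_ok=True)

input_files = sorted(glob_structure_files(input_dir))
for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
    if result.passed:
        # Passing structures are copied (or symlinked) unchanged, as the CLI handler does.
        copyfile(input_file, output_dir / input_file.name, "copy")
```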

```diff
--- /dev/null
+++ protein_quest-0.3.2/src/protein_quest/converter.py
@@ -0,0 +1,45 @@
+"""Convert json or dict to Python objects."""
+
+from cattrs.preconf.orjson import make_converter
+from yarl import URL
+
+type Percentage = float
+"""Type alias for percentage values (0.0-100.0)."""
+type Ratio = float
+"""Type alias for ratio values (0.0-1.0)."""
+type PositiveInt = int
+"""Type alias for positive integer values (>= 0)."""
+
+converter = make_converter()
+"""cattrs converter to read JSON document or dict to Python objects."""
+converter.register_structure_hook(URL, lambda v, _: URL(v))
+
+
+@converter.register_structure_hook
+def percentage_hook(val, _) -> Percentage:
+    value = float(val)
+    """Cattrs hook to validate percentage values."""
+    if not 0.0 <= value <= 100.0:
+        msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+        raise ValueError(msg)
+    return value
+
+
+@converter.register_structure_hook
+def ratio_hook(val, _) -> Ratio:
+    """Cattrs hook to validate ratio values."""
+    value = float(val)
+    if not 0.0 <= value <= 1.0:
+        msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+        raise ValueError(msg)
+    return value
+
+
+@converter.register_structure_hook
+def positive_int_hook(val, _) -> PositiveInt:
+    """Cattrs hook to validate positive integer values."""
+    value = int(val)
+    if value < 0:
+        msg = f"Value {value} is not a valid positive integer (>= 0)"
+        raise ValueError(msg)
+    return value
```