protein-quest 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/confidence.py +42 -15
- protein_quest/alphafold/fetch.py +2 -4
- protein_quest/cli.py +292 -14
- protein_quest/converter.py +46 -0
- protein_quest/filters.py +39 -7
- protein_quest/go.py +1 -4
- protein_quest/mcp_server.py +14 -1
- protein_quest/pdbe/io.py +122 -41
- protein_quest/ss.py +284 -0
- protein_quest/taxonomy.py +1 -3
- protein_quest/uniprot.py +157 -4
- protein_quest/utils.py +28 -1
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/METADATA +48 -4
- protein_quest-0.4.0.dist-info/RECORD +26 -0
- protein_quest-0.3.1.dist-info/RECORD +0 -24
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.1.dist-info → protein_quest-0.4.0.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
@@ -1,2 +1,2 @@
-__version__ = "0.3.1"
+__version__ = "0.4.0"
 """The version of the package."""
protein_quest/alphafold/confidence.py
CHANGED
@@ -7,7 +7,10 @@ from pathlib import Path
 
 import gemmi
 
+from protein_quest.converter import Percentage, PositiveInt, converter
 from protein_quest.pdbe.io import write_structure
+from protein_quest.ss import nr_of_residues_in_total
+from protein_quest.utils import CopyMethod, copyfile
 
 """
 Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
     Parameters:
         confidence: The confidence threshold for filtering residues.
             Residues with a pLDDT (b-factor) above this value are considered high confidence.
-        …
-        …
+        min_residues: The minimum number of high-confidence residues required to keep the structure.
+        max_residues: The maximum number of high-confidence residues required to keep the structure.
     """
 
-    confidence: …
-    …
-    …
+    confidence: Percentage
+    min_residues: PositiveInt
+    max_residues: PositiveInt
+
+
+base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
+
+
+@converter.register_structure_hook
+def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
+    result: ConfidenceFilterQuery = base_query_hook(val, _type)
+    if result.min_residues > result.max_residues:
+        msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
+        raise ValueError(msg)
+    return result
 
 
 @dataclass
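The new hook gives ConfidenceFilterQuery cross-field validation at construction time. A minimal sketch of the behaviour, with made-up values (assumes protein-quest 0.4.0 is installed):

    from protein_quest.alphafold.confidence import ConfidenceFilterQuery
    from protein_quest.converter import converter

    # Structures and validates the dict in one step via the hook above.
    query = converter.structure(
        {"confidence": 70.0, "min_residues": 100, "max_residues": 1000},
        ConfidenceFilterQuery,
    )

    # Swapped bounds now fail fast instead of silently matching nothing:
    # converter.structure({"confidence": 70.0, "min_residues": 1000, "max_residues": 100}, ConfidenceFilterQuery)
    # raises ValueError: min_residues 1000 cannot be larger than max_residues 100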
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
     """
 
     input_file: str
-    count: …
+    count: PositiveInt
     filtered_file: Path | None = None
 
 
-def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
+def filter_file_on_residues(
+    file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
+) -> ConfidenceFilterResult:
     """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
 
     Args:
         file: The path to the PDB file to filter.
         query: The confidence filter query.
         filtered_dir: The directory to save the filtered PDB file.
+        copy_method: How to copy when no residues have to be removed.
 
     Returns:
         result with filtered_file property set to Path where filtered PDB file is saved.
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
     structure = gemmi.read_structure(str(file))
     residues = set(find_high_confidence_residues(structure, query.confidence))
     count = len(residues)
-    if count < query.…
+    if count < query.min_residues or count > query.max_residues:
         # Skip structure that is outside the min and max threshold
         # just return number of high confidence residues
         return ConfidenceFilterResult(
             input_file=file.name,
             count=count,
         )
+    total_residues = nr_of_residues_in_total(structure)
     filtered_file = filtered_dir / file.name
-    new_structure = filter_out_low_confidence_residues(
-        structure,
-        residues,
-    )
-    write_structure(new_structure, filtered_file)
+    if count == total_residues:
+        # if no residues have to be removed then copy instead of slower gemmi writing
+        copyfile(file, filtered_file, copy_method)
+    else:
+        new_structure = filter_out_low_confidence_residues(
+            structure,
+            residues,
+        )
+        write_structure(new_structure, filtered_file)
     return ConfidenceFilterResult(
         input_file=file.name,
         count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
 
 
 def filter_files_on_confidence(
-    alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
+    alphafold_pdb_files: list[Path],
+    query: ConfidenceFilterQuery,
+    filtered_dir: Path,
+    copy_method: CopyMethod = "copy",
 ) -> Generator[ConfidenceFilterResult]:
     """Filter AlphaFoldDB structures based on confidence.
 
@@ -141,6 +167,7 @@ def filter_files_on_confidence(
         alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
         query: The confidence filter query containing the confidence thresholds.
         filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
+        copy_method: How to copy when a direct copy is possible.
 
     Yields:
         For each mmcif/PDB files yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
     # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
     # here we filter on file level and inside file remove low confidence residues
     for pdb_file in alphafold_pdb_files:
-        yield filter_file_on_residues(pdb_file, query, filtered_dir)
+        yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
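With the extra parameter, callers can pick the copy strategy for structures that need no rewriting. A hedged usage sketch (directory names are placeholders):

    from pathlib import Path

    from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
    from protein_quest.converter import converter

    query = converter.structure(
        {"confidence": 70.0, "min_residues": 100, "max_residues": 1000},
        ConfidenceFilterQuery,
    )
    filtered_dir = Path("filtered")
    filtered_dir.mkdir(exist_ok=True)
    files = sorted(Path("downloads").glob("*.cif"))
    # "symlink" avoids rewriting structures whose residues are all high confidence
    for result in filter_files_on_confidence(files, query, filtered_dir, copy_method="symlink"):
        print(result.input_file, result.count, result.filtered_file)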
protein_quest/alphafold/fetch.py
CHANGED
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args
 
 from aiohttp_retry import RetryClient
 from aiopath import AsyncPath
-from cattrs.preconf.orjson import make_converter
 from tqdm.asyncio import tqdm
 from yarl import URL
 
 from protein_quest.alphafold.entry_summary import EntrySummary
+from protein_quest.converter import converter
 from protein_quest.utils import friendly_session, retrieve_files, run_async
 
 logger = logging.getLogger(__name__)
-converter = make_converter()
-"""cattrs converter to read AlphaFold summary JSON document."""
-converter.register_structure_hook(URL, lambda v, _: URL(v))
+
 
 DownloadableFormat = Literal[
     "summary",
protein_quest/cli.py
CHANGED
@@ -15,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.markdown import Markdown
 from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm
@@ -23,13 +24,26 @@ from protein_quest.__version__ import __version__
 from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
 from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
 from protein_quest.alphafold.fetch import fetch_many as af_fetch
+from protein_quest.converter import converter
 from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
 from protein_quest.pdbe import fetch as pdbe_fetch
 from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
+from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
-from protein_quest.uniprot import …
+from protein_quest.uniprot import (
+    ComplexPortalEntry,
+    PdbResult,
+    Query,
+    search4af,
+    search4emdb,
+    search4interaction_partners,
+    search4macromolecular_complexes,
+    search4pdb,
+    search4uniprot,
+)
+from protein_quest.utils import CopyMethod, copy_methods, copyfile
 
 logger = logging.getLogger(__name__)
 
@@ -208,6 +222,73 @@ def _add_search_taxonomy_parser(subparser: argparse._SubParsersAction):
     parser.add_argument("--limit", type=int, default=100, help="Maximum number of results to return")
 
 
+def _add_search_interaction_partners_parser(subparsers: argparse._SubParsersAction):
+    """Add search interaction partners subcommand parser."""
+    parser = subparsers.add_parser(
+        "interaction-partners",
+        help="Search for interaction partners of given UniProt accession",
+        description=dedent("""\
+            Search for interaction partners of given UniProt accession
+            in the Uniprot SPARQL endpoint and Complex Portal.
+        """),
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_acc",
+        type=str,
+        help="UniProt accession (for example P12345).",
+    )
+    parser.add_argument(
+        "--exclude",
+        type=str,
+        action="append",
+        help="UniProt accessions to exclude from the results. For example already known interaction partners.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV with interaction partners per UniProt accession. Use `-` for stdout.",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=10_000, help="Maximum number of interaction partner uniprot accessions to return"
+    )
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
+def _add_search_complexes_parser(subparsers: argparse._SubParsersAction):
+    """Add search complexes subcommand parser."""
+    description = dedent("""\
+        Search for complexes in the Complex Portal.
+        https://www.ebi.ac.uk/complexportal/
+
+        The output CSV file has the following columns:
+
+        - query_protein: UniProt accession used as query
+        - complex_id: Complex Portal identifier
+        - complex_url: URL to the Complex Portal entry
+        - complex_title: Title of the complex
+        - members: Semicolon-separated list of UniProt accessions of complex members
+    """)
+    parser = subparsers.add_parser(
+        "complexes",
+        help="Search for complexes in the Complex Portal",
+        description=Markdown(description, style="argparse.text"),  # type: ignore using rich formatter makes this OK
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "uniprot_accs",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="Text file with UniProt accessions (one per line) as query for searching complexes. Use `-` for stdin.",
+    )
+    parser.add_argument(
+        "output_csv",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help="Output CSV file with complex results. Use `-` for stdout.",
+    )
+    parser.add_argument("--limit", type=int, default=100, help="Maximum number of complex results to return")
+    parser.add_argument("--timeout", type=int, default=1_800, help="Maximum seconds to wait for query to complete")
+
+
 def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
     """Add retrieve pdbe subcommand parser."""
     parser = subparsers.add_parser(
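Assuming the console script is installed as protein-quest (the entry point name itself is not shown in this diff), the new search subcommands can be invoked like:

    protein-quest search interaction-partners P12345 partners.csv --exclude P00001 --limit 500
    protein-quest search complexes accessions.txt complexes.csv --limit 50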
@@ -282,6 +363,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
 
 
+def _add_copy_method_argument(parser: argparse.ArgumentParser):
+    """Add copy method argument to parser."""
+    default_copy_method = "symlink"
+    if os.name == "nt":
+        # On Windows you need developer mode or admin privileges to create symlinks
+        # so we default to copying files instead of symlinking
+        default_copy_method = "copy"
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default=default_copy_method,
+        help="How to copy files when no changes are needed to output file.",
+    )
+
+
 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
     """Add filter confidence subcommand parser."""
     parser = subparsers.add_parser(
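The shared helper behaves like any other argparse option. A small illustration; the helper is private to protein_quest.cli, so this is exposition rather than supported API:

    import argparse

    from protein_quest.cli import _add_copy_method_argument

    parser = argparse.ArgumentParser()
    _add_copy_method_argument(parser)
    # Defaults to "symlink" ("copy" on Windows, where symlinks need extra privileges).
    print(parser.parse_args([]).copy_method)
    print(parser.parse_args(["--copy-method", "copy"]).copy_method)  # -> "copy"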
@@ -312,6 +409,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
+    _add_copy_method_argument(parser)
 
 
 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -347,8 +445,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument(
         "--scheduler-address",
-        help="Address of the Dask scheduler to connect to. …",
+        help=dedent("""Address of the Dask scheduler to connect to.
+            If not provided, will create a local cluster.
+            If set to `sequential` will run tasks sequentially."""),
     )
+    _add_copy_method_argument(parser)
 
 
 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -371,6 +472,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
+    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -381,6 +483,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
 
 
+def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
+    """Add filter secondary structure subcommand parser."""
+    parser = subparsers.add_parser(
+        "secondary-structure",
+        help="Filter PDB/mmCIF files by secondary structure",
+        description="Filter PDB/mmCIF files by secondary structure",
+        formatter_class=ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
+    parser.add_argument(
+        "output_dir",
+        type=Path,
+        help=dedent("""\
+            Directory to write filtered PDB/mmCIF files. Files are copied without modification.
+        """),
+    )
+    parser.add_argument("--abs-min-helix-residues", type=int, help="Min residues in helices")
+    parser.add_argument("--abs-max-helix-residues", type=int, help="Max residues in helices")
+    parser.add_argument("--abs-min-sheet-residues", type=int, help="Min residues in sheets")
+    parser.add_argument("--abs-max-sheet-residues", type=int, help="Max residues in sheets")
+    parser.add_argument("--ratio-min-helix-residues", type=float, help="Min residues in helices (relative)")
+    parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
+    parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
+    parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
+    _add_copy_method_argument(parser)
+    parser.add_argument(
+        "--write-stats",
+        type=argparse.FileType("w", encoding="UTF-8"),
+        help=dedent("""
+            Write filter statistics to file. In CSV format with columns:
+            `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
+            <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
+            Use `-` for stdout.
+        """),
+    )
+
+
 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     """Add search command and its subcommands."""
     parser = subparsers.add_parser(
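An illustrative invocation, again assuming a protein-quest console script and made-up thresholds:

    protein-quest filter secondary-structure input/ output/ --ratio-min-helix-residues 0.5 --abs-min-sheet-residues 10 --write-stats stats.csv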
@@ -397,6 +536,8 @@ def _add_search_subcommands(subparsers: argparse._SubParsersAction):
     _add_search_emdb_parser(subsubparsers)
     _add_search_go_parser(subsubparsers)
     _add_search_taxonomy_parser(subsubparsers)
+    _add_search_interaction_partners_parser(subsubparsers)
+    _add_search_complexes_parser(subsubparsers)
 
 
 def _add_retrieve_subcommands(subparsers: argparse._SubParsersAction):
@@ -422,6 +563,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
     _add_filter_confidence_parser(subsubparsers)
     _add_filter_chain_parser(subsubparsers)
     _add_filter_residue_parser(subsubparsers)
+    _add_filter_ss_parser(subsubparsers)
 
 
 def _add_mcp_command(subparsers: argparse._SubParsersAction):
@@ -574,6 +716,32 @@ def _handle_search_taxonomy(args):
     _write_taxonomy_csv(results, output_csv)
 
 
+def _handle_search_interaction_partners(args: argparse.Namespace):
+    uniprot_acc: str = args.uniprot_acc
+    excludes: set[str] = set(args.exclude) if args.exclude else set()
+    limit: int = args.limit
+    timeout: int = args.timeout
+    output_csv: TextIOWrapper = args.output_csv
+
+    rprint(f"Searching for interaction partners of '{uniprot_acc}'")
+    results = search4interaction_partners(uniprot_acc, excludes=excludes, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} interaction partners, written to {output_csv.name}")
+    _write_lines(output_csv, results.keys())
+
+
+def _handle_search_complexes(args: argparse.Namespace):
+    uniprot_accs = args.uniprot_accs
+    limit = args.limit
+    timeout = args.timeout
+    output_csv = args.output_csv
+
+    accs = _read_lines(uniprot_accs)
+    rprint(f"Finding complexes for {len(accs)} uniprot accessions")
+    results = search4macromolecular_complexes(accs, limit=limit, timeout=timeout)
+    rprint(f"Found {len(results)} complexes, written to {output_csv.name}")
+    _write_complexes_csv(results, output_csv)
+
+
 def _handle_retrieve_pdbe(args):
     pdbe_csv = args.pdbe_csv
     output_dir = args.output_dir
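Both handlers are thin wrappers over protein_quest.uniprot, so the same lookups can be scripted directly. A sketch using an illustrative accession and the CLI defaults shown above:

    from protein_quest.uniprot import search4interaction_partners, search4macromolecular_complexes

    partners = search4interaction_partners("P12345", excludes=set(), limit=10_000, timeout=1_800)
    print(len(partners), "partners, e.g.", sorted(partners)[:5])

    complexes = search4macromolecular_complexes(["P12345"], limit=100, timeout=1_800)
    for entry in complexes:
        print(entry.complex_id, entry.complex_title, ";".join(sorted(entry.members)))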
@@ -620,21 +788,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
     # to get rid of duplication
     input_dir = structure(args.input_dir, Path)
     output_dir = structure(args.output_dir, Path)
-    …
-    …
-    min_residues = …
-    max_residues = …
+
+    confidence_threshold = args.confidence_threshold
+    min_residues = args.min_residues
+    max_residues = args.max_residues
     stats_file: TextIOWrapper | None = args.write_stats
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
 
     output_dir.mkdir(parents=True, exist_ok=True)
     input_files = sorted(glob_structure_files(input_dir))
     nr_input_files = len(input_files)
     rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
-    query = structure(
+    query = converter.structure(
         {
             "confidence": confidence_threshold,
-            "…
-            "…
+            "min_residues": min_residues,
+            "max_residues": max_residues,
         },
         ConfidenceFilterQuery,
     )
@@ -643,7 +812,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
         writer.writerow(["input_file", "residue_count", "passed", "output_file"])
 
     passed_count = 0
-    for r in tqdm(filter_files_on_confidence(input_files, query, output_dir), total=len(input_files), unit="file"):
+    for r in tqdm(
+        filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
+        total=len(input_files),
+        unit="file",
+    ):
         if r.filtered_file:
             passed_count += 1
         if stats_file:
@@ -656,9 +829,10 @@
 
 
 def _handle_filter_chain(args):
     input_dir = args.input_dir
-    output_dir = args.output_dir
+    output_dir = structure(args.output_dir, Path)
     pdb_id2chain_mapping_file = args.chains
-    scheduler_address = args.scheduler_address
+    scheduler_address = structure(args.scheduler_address, str | None)  # pyright: ignore[reportArgumentType]
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
 
     # make sure files in input dir with entries in mapping file are the same
     # complain when files from mapping file are missing on disk
@@ -683,18 +857,25 @@ def _handle_filter_chain(args):
         rprint("[red]No valid structure files found. Exiting.")
         sys.exit(1)
 
-    results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
+    results = filter_files_on_chain(
+        file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
+    )
 
     nr_written = len([r for r in results if r.passed])
 
     rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")
 
+    for result in results:
+        if result.discard_reason:
+            rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
+
 
 def _handle_filter_residue(args):
     input_dir = structure(args.input_dir, Path)
     output_dir = structure(args.output_dir, Path)
     min_residues = structure(args.min_residues, int)
     max_residues = structure(args.max_residues, int)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
     stats_file: TextIOWrapper | None = args.write_stats
 
     if stats_file:
@@ -705,7 +886,9 @@ def _handle_filter_residue(args):
     input_files = sorted(glob_structure_files(input_dir))
     nr_total = len(input_files)
     rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
-    for r in filter_files_on_residues(input_files, output_dir, min_residues=min_residues, max_residues=max_residues):
+    for r in filter_files_on_residues(
+        input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
+    ):
         if stats_file:
             writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
         if r.passed:
@@ -716,6 +899,68 @@ def _handle_filter_residue(args):
         rprint(f"Statistics written to {stats_file.name}")
 
 
+def _handle_filter_ss(args):
+    input_dir = structure(args.input_dir, Path)
+    output_dir = structure(args.output_dir, Path)
+    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
+    stats_file: TextIOWrapper | None = args.write_stats
+
+    raw_query = {
+        "abs_min_helix_residues": args.abs_min_helix_residues,
+        "abs_max_helix_residues": args.abs_max_helix_residues,
+        "abs_min_sheet_residues": args.abs_min_sheet_residues,
+        "abs_max_sheet_residues": args.abs_max_sheet_residues,
+        "ratio_min_helix_residues": args.ratio_min_helix_residues,
+        "ratio_max_helix_residues": args.ratio_max_helix_residues,
+        "ratio_min_sheet_residues": args.ratio_min_sheet_residues,
+        "ratio_max_sheet_residues": args.ratio_max_sheet_residues,
+    }
+    query = converter.structure(raw_query, SecondaryStructureFilterQuery)
+    input_files = sorted(glob_structure_files(input_dir))
+    nr_total = len(input_files)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    if stats_file:
+        writer = csv.writer(stats_file)
+        writer.writerow(
+            [
+                "input_file",
+                "nr_residues",
+                "nr_helix_residues",
+                "nr_sheet_residues",
+                "helix_ratio",
+                "sheet_ratio",
+                "passed",
+                "output_file",
+            ]
+        )
+
+    rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
+    nr_passed = 0
+    for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
+        output_file: Path | None = None
+        if result.passed:
+            output_file = output_dir / input_file.name
+            copyfile(input_file, output_file, copy_method)
+            nr_passed += 1
+        if stats_file:
+            writer.writerow(
+                [
+                    input_file,
+                    result.stats.nr_residues,
+                    result.stats.nr_helix_residues,
+                    result.stats.nr_sheet_residues,
+                    round(result.stats.helix_ratio, 3),
+                    round(result.stats.sheet_ratio, 3),
+                    result.passed,
+                    output_file,
+                ]
+            )
+    rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
+    if stats_file:
+        rprint(f"Statistics written to {stats_file.name}")
+
+
 def _handle_mcp(args):
     if find_spec("fastmcp") is None:
         msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
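The handler passes every bound, leaving unset ones as None. A library-level sketch that sets a single bound (paths and the threshold are placeholders):

    from pathlib import Path

    from protein_quest.converter import converter
    from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure

    raw_query = {
        "abs_min_helix_residues": None,
        "abs_max_helix_residues": None,
        "abs_min_sheet_residues": None,
        "abs_max_sheet_residues": None,
        "ratio_min_helix_residues": 0.5,  # keep mostly-helical structures
        "ratio_max_helix_residues": None,
        "ratio_min_sheet_residues": None,
        "ratio_max_sheet_residues": None,
    }
    query = converter.structure(raw_query, SecondaryStructureFilterQuery)
    for input_file, result in filter_files_on_secondary_structure(
        sorted(Path("input").glob("*.cif")), query=query
    ):
        print(input_file.name, result.passed, round(result.stats.helix_ratio, 3))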
@@ -736,12 +981,15 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
     ("search", "emdb"): _handle_search_emdb,
     ("search", "go"): _handle_search_go,
     ("search", "taxonomy"): _handle_search_taxonomy,
+    ("search", "interaction-partners"): _handle_search_interaction_partners,
+    ("search", "complexes"): _handle_search_complexes,
     ("retrieve", "pdbe"): _handle_retrieve_pdbe,
     ("retrieve", "alphafold"): _handle_retrieve_alphafold,
     ("retrieve", "emdb"): _handle_retrieve_emdb,
     ("filter", "confidence"): _handle_filter_confidence,
     ("filter", "chain"): _handle_filter_chain,
     ("filter", "residue"): _handle_filter_residue,
+    ("filter", "secondary-structure"): _handle_filter_ss,
     ("mcp", None): _handle_mcp,
 }
 
@@ -797,3 +1045,33 @@ def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
 
 def _read_column_from_csv(file: TextIOWrapper, column: str) -> set[str]:
     return {row[column] for row in _iter_csv_rows(file)}
+
+
+def _write_complexes_csv(complexes: list[ComplexPortalEntry], output_csv: TextIOWrapper) -> None:
+    """Write ComplexPortal information to a CSV file.
+
+    Args:
+        complexes: List of ComplexPortalEntry objects.
+        output_csv: TextIOWrapper to write the CSV data to.
+    """
+    writer = csv.writer(output_csv)
+    writer.writerow(
+        [
+            "query_protein",
+            "complex_id",
+            "complex_url",
+            "complex_title",
+            "members",
+        ]
+    )
+    for entry in complexes:
+        members_str = ";".join(sorted(entry.members))
+        writer.writerow(
+            [
+                entry.query_protein,
+                entry.complex_id,
+                entry.complex_url,
+                entry.complex_title,
+                members_str,
+            ]
+        )
protein_quest/converter.py
ADDED
@@ -0,0 +1,46 @@
+"""Convert json or dict to Python objects."""
+
+from cattrs.preconf.orjson import make_converter
+from yarl import URL
+
+type Percentage = float
+"""Type alias for percentage values (0.0-100.0)."""
+type Ratio = float
+"""Type alias for ratio values (0.0-1.0)."""
+type PositiveInt = int
+"""Type alias for positive integer values (>= 0)."""
+
+converter = make_converter()
+"""cattrs converter to read JSON document or dict to Python objects."""
+converter.register_structure_hook(URL, lambda v, _: URL(v))
+converter.register_unstructure_hook(URL, lambda u: str(u))
+
+
+@converter.register_structure_hook
+def percentage_hook(val, _) -> Percentage:
+    """Cattrs hook to validate percentage values."""
+    value = float(val)
+    if not 0.0 <= value <= 100.0:
+        msg = f"Value {value} is not a valid percentage (0.0-100.0)"
+        raise ValueError(msg)
+    return value
+
+
+@converter.register_structure_hook
+def ratio_hook(val, _) -> Ratio:
+    """Cattrs hook to validate ratio values."""
+    value = float(val)
+    if not 0.0 <= value <= 1.0:
+        msg = f"Value {value} is not a valid ratio (0.0-1.0)"
+        raise ValueError(msg)
+    return value
+
+
+@converter.register_structure_hook
+def positive_int_hook(val, _) -> PositiveInt:
+    """Cattrs hook to validate positive integer values."""
+    value = int(val)
+    if value < 0:
+        msg = f"Value {value} is not a valid positive integer (>= 0)"
+        raise ValueError(msg)
+    return value