protein-quest 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of protein-quest might be problematic. Click here for more details.
- protein_quest/__version__.py +1 -1
- protein_quest/alphafold/confidence.py +42 -15
- protein_quest/alphafold/fetch.py +2 -4
- protein_quest/cli.py +153 -13
- protein_quest/converter.py +45 -0
- protein_quest/filters.py +39 -7
- protein_quest/go.py +1 -4
- protein_quest/mcp_server.py +4 -0
- protein_quest/pdbe/io.py +122 -41
- protein_quest/ss.py +264 -0
- protein_quest/taxonomy.py +1 -3
- protein_quest/utils.py +28 -1
- {protein_quest-0.3.1.dist-info → protein_quest-0.3.2.dist-info}/METADATA +18 -4
- protein_quest-0.3.2.dist-info/RECORD +26 -0
- protein_quest-0.3.1.dist-info/RECORD +0 -24
- {protein_quest-0.3.1.dist-info → protein_quest-0.3.2.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.1.dist-info → protein_quest-0.3.2.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.1.dist-info → protein_quest-0.3.2.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
__version__ = "0.3.1"
|
|
1
|
+
__version__ = "0.3.2"
|
|
2
2
|
"""The version of the package."""
|
protein_quest/alphafold/confidence.py
CHANGED
|
@@ -7,7 +7,10 @@ from pathlib import Path
|
|
|
7
7
|
|
|
8
8
|
import gemmi
|
|
9
9
|
|
|
10
|
+
from protein_quest.converter import Percentage, PositiveInt, converter
|
|
10
11
|
from protein_quest.pdbe.io import write_structure
|
|
12
|
+
from protein_quest.ss import nr_of_residues_in_total
|
|
13
|
+
from protein_quest.utils import CopyMethod, copyfile
|
|
11
14
|
|
|
12
15
|
"""
|
|
13
16
|
Methods to filter AlphaFoldDB structures on confidence scores.
|
|
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
|
|
|
73
76
|
Parameters:
|
|
74
77
|
confidence: The confidence threshold for filtering residues.
|
|
75
78
|
Residues with a pLDDT (b-factor) above this value are considered high confidence.
|
|
76
|
-
|
|
77
|
-
|
|
79
|
+
min_residues: The minimum number of high-confidence residues required to keep the structure.
|
|
80
|
+
max_residues: The maximum number of high-confidence residues required to keep the structure.
|
|
78
81
|
"""
|
|
79
82
|
|
|
80
|
-
confidence:
|
|
81
|
-
|
|
82
|
-
|
|
83
|
+
confidence: Percentage
|
|
84
|
+
min_residues: PositiveInt
|
|
85
|
+
max_residues: PositiveInt
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@converter.register_structure_hook
def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
    """Structure a confidence filter query and reject inverted residue bounds.

    The query itself is built by ``base_query_hook``; this hook only adds a
    check of the two residue bounds against each other.

    Args:
        val: Raw value to structure into a query.
        _type: Target type handed over by the converter.

    Returns:
        The structured query.

    Raises:
        ValueError: If ``min_residues`` is larger than ``max_residues``.
    """
    query: ConfidenceFilterQuery = base_query_hook(val, _type)
    if query.min_residues <= query.max_residues:
        return query
    msg = f"min_residues {query.min_residues} cannot be larger than max_residues {query.max_residues}"
    raise ValueError(msg)
|
|
83
98
|
|
|
84
99
|
|
|
85
100
|
@dataclass
|
|
@@ -93,17 +108,20 @@ class ConfidenceFilterResult:
|
|
|
93
108
|
"""
|
|
94
109
|
|
|
95
110
|
input_file: str
|
|
96
|
-
count:
|
|
111
|
+
count: PositiveInt
|
|
97
112
|
filtered_file: Path | None = None
|
|
98
113
|
|
|
99
114
|
|
|
100
|
-
def filter_file_on_residues(
|
|
115
|
+
def filter_file_on_residues(
|
|
116
|
+
file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
|
|
117
|
+
) -> ConfidenceFilterResult:
|
|
101
118
|
"""Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
|
|
102
119
|
|
|
103
120
|
Args:
|
|
104
121
|
file: The path to the PDB file to filter.
|
|
105
122
|
query: The confidence filter query.
|
|
106
123
|
filtered_dir: The directory to save the filtered PDB file.
|
|
124
|
+
copy_method: How to copy when no residues have to be removed.
|
|
107
125
|
|
|
108
126
|
Returns:
|
|
109
127
|
result with filtered_file property set to Path where filtered PDB file is saved.
|
|
@@ -112,19 +130,24 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
|
|
|
112
130
|
structure = gemmi.read_structure(str(file))
|
|
113
131
|
residues = set(find_high_confidence_residues(structure, query.confidence))
|
|
114
132
|
count = len(residues)
|
|
115
|
-
if count < query.
|
|
133
|
+
if count < query.min_residues or count > query.max_residues:
|
|
116
134
|
# Skip structure that is outside the min and max threshold
|
|
117
135
|
# just return number of high confidence residues
|
|
118
136
|
return ConfidenceFilterResult(
|
|
119
137
|
input_file=file.name,
|
|
120
138
|
count=count,
|
|
121
139
|
)
|
|
140
|
+
total_residues = nr_of_residues_in_total(structure)
|
|
122
141
|
filtered_file = filtered_dir / file.name
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
142
|
+
if count == total_residues:
|
|
143
|
+
# if no residues have to be removed then copy instead of slower gemmi writing
|
|
144
|
+
copyfile(file, filtered_file, copy_method)
|
|
145
|
+
else:
|
|
146
|
+
new_structure = filter_out_low_confidence_residues(
|
|
147
|
+
structure,
|
|
148
|
+
residues,
|
|
149
|
+
)
|
|
150
|
+
write_structure(new_structure, filtered_file)
|
|
128
151
|
return ConfidenceFilterResult(
|
|
129
152
|
input_file=file.name,
|
|
130
153
|
count=count,
|
|
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
|
|
|
133
156
|
|
|
134
157
|
|
|
135
158
|
def filter_files_on_confidence(
|
|
136
|
-
alphafold_pdb_files: list[Path],
|
|
159
|
+
alphafold_pdb_files: list[Path],
|
|
160
|
+
query: ConfidenceFilterQuery,
|
|
161
|
+
filtered_dir: Path,
|
|
162
|
+
copy_method: CopyMethod = "copy",
|
|
137
163
|
) -> Generator[ConfidenceFilterResult]:
|
|
138
164
|
"""Filter AlphaFoldDB structures based on confidence.
|
|
139
165
|
|
|
@@ -141,6 +167,7 @@ def filter_files_on_confidence(
|
|
|
141
167
|
alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
|
|
142
168
|
query: The confidence filter query containing the confidence thresholds.
|
|
143
169
|
filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
|
|
170
|
+
copy_method: How to copy when a direct copy is possible.
|
|
144
171
|
|
|
145
172
|
Yields:
|
|
146
173
|
For each mmcif/PDB files yields whether it was filtered or not,
|
|
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
|
|
|
150
177
|
# In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
|
|
151
178
|
# here we filter on file level and inside file remove low confidence residues
|
|
152
179
|
for pdb_file in alphafold_pdb_files:
|
|
153
|
-
yield filter_file_on_residues(pdb_file, query, filtered_dir)
|
|
180
|
+
yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
|
protein_quest/alphafold/fetch.py
CHANGED
|
@@ -9,17 +9,15 @@ from typing import Literal, cast, get_args
|
|
|
9
9
|
|
|
10
10
|
from aiohttp_retry import RetryClient
|
|
11
11
|
from aiopath import AsyncPath
|
|
12
|
-
from cattrs.preconf.orjson import make_converter
|
|
13
12
|
from tqdm.asyncio import tqdm
|
|
14
13
|
from yarl import URL
|
|
15
14
|
|
|
16
15
|
from protein_quest.alphafold.entry_summary import EntrySummary
|
|
16
|
+
from protein_quest.converter import converter
|
|
17
17
|
from protein_quest.utils import friendly_session, retrieve_files, run_async
|
|
18
18
|
|
|
19
19
|
logger = logging.getLogger(__name__)
|
|
20
|
-
|
|
21
|
-
"""cattrs converter to read AlphaFold summary JSON document."""
|
|
22
|
-
converter.register_structure_hook(URL, lambda v, _: URL(v))
|
|
20
|
+
|
|
23
21
|
|
|
24
22
|
DownloadableFormat = Literal[
|
|
25
23
|
"summary",
|
protein_quest/cli.py
CHANGED
|
@@ -23,13 +23,16 @@ from protein_quest.__version__ import __version__
|
|
|
23
23
|
from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
|
|
24
24
|
from protein_quest.alphafold.fetch import DownloadableFormat, downloadable_formats
|
|
25
25
|
from protein_quest.alphafold.fetch import fetch_many as af_fetch
|
|
26
|
+
from protein_quest.converter import converter
|
|
26
27
|
from protein_quest.emdb import fetch as emdb_fetch
|
|
27
28
|
from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
|
|
28
29
|
from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
|
|
29
30
|
from protein_quest.pdbe import fetch as pdbe_fetch
|
|
30
31
|
from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
|
|
32
|
+
from protein_quest.ss import SecondaryStructureFilterQuery, filter_files_on_secondary_structure
|
|
31
33
|
from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
|
|
32
34
|
from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
|
|
35
|
+
from protein_quest.utils import CopyMethod, copy_methods, copyfile
|
|
33
36
|
|
|
34
37
|
logger = logging.getLogger(__name__)
|
|
35
38
|
|
|
@@ -282,6 +285,22 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
|
|
|
282
285
|
parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
|
|
283
286
|
|
|
284
287
|
|
|
288
|
+
def _add_copy_method_argument(parser: argparse.ArgumentParser):
    """Register the ``--copy-method`` option on the given parser.

    The default is ``symlink``, except on Windows where it is ``copy``.

    Args:
        parser: The parser that receives the option.
    """
    # On Windows you need developer mode or admin privileges to create symlinks
    # so we default to copying files instead of symlinking
    on_windows = os.name == "nt"
    fallback = "copy" if on_windows else "symlink"
    parser.add_argument(
        "--copy-method",
        type=str,
        choices=copy_methods,
        default=fallback,
        help="How to copy files when no changes are needed to output file.",
    )
|
|
302
|
+
|
|
303
|
+
|
|
285
304
|
def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
|
|
286
305
|
"""Add filter confidence subcommand parser."""
|
|
287
306
|
parser = subparsers.add_parser(
|
|
@@ -312,6 +331,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
|
|
|
312
331
|
In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
|
|
313
332
|
Use `-` for stdout."""),
|
|
314
333
|
)
|
|
334
|
+
_add_copy_method_argument(parser)
|
|
315
335
|
|
|
316
336
|
|
|
317
337
|
def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
|
|
@@ -347,8 +367,11 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
|
|
|
347
367
|
)
|
|
348
368
|
parser.add_argument(
|
|
349
369
|
"--scheduler-address",
|
|
350
|
-
help="Address of the Dask scheduler to connect to.
|
|
370
|
+
help=dedent("""Address of the Dask scheduler to connect to.
|
|
371
|
+
If not provided, will create a local cluster.
|
|
372
|
+
If set to `sequential` will run tasks sequentially."""),
|
|
351
373
|
)
|
|
374
|
+
_add_copy_method_argument(parser)
|
|
352
375
|
|
|
353
376
|
|
|
354
377
|
def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
|
|
@@ -371,6 +394,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
|
|
|
371
394
|
)
|
|
372
395
|
parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
|
|
373
396
|
parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
|
|
397
|
+
_add_copy_method_argument(parser)
|
|
374
398
|
parser.add_argument(
|
|
375
399
|
"--write-stats",
|
|
376
400
|
type=argparse.FileType("w", encoding="UTF-8"),
|
|
@@ -381,6 +405,43 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
|
|
|
381
405
|
)
|
|
382
406
|
|
|
383
407
|
|
|
408
|
+
def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
    """Register the ``secondary-structure`` filter subcommand.

    Adds the input and output directory positionals, eight optional
    helix/sheet thresholds (absolute counts as ``int``, ratios as ``float``),
    the shared ``--copy-method`` option and ``--write-stats``.

    Args:
        subparsers: The subparsers action the new parser is attached to.
    """
    summary = "Filter PDB/mmCIF files by secondary structure"
    parser = subparsers.add_parser(
        "secondary-structure",
        help=summary,
        description=summary,
        formatter_class=ArgumentDefaultsRichHelpFormatter,
    )
    parser.add_argument("input_dir", type=Path, help="Directory with PDB/mmCIF files (e.g., from 'filter chain')")
    parser.add_argument(
        "output_dir",
        type=Path,
        help=dedent("""\
            Directory to write filtered PDB/mmCIF files. Files are copied without modification.
        """),
    )

    # Generates, in this order: abs min/max helix, abs min/max sheet,
    # ratio min/max helix, ratio min/max sheet.
    threshold_kinds = (("abs", int, ""), ("ratio", float, " (relative)"))
    structure_kinds = (("helix", "helices"), ("sheet", "sheets"))
    for prefix, value_type, suffix in threshold_kinds:
        for singular, plural in structure_kinds:
            for bound in ("min", "max"):
                parser.add_argument(
                    f"--{prefix}-{bound}-{singular}-residues",
                    type=value_type,
                    help=f"{bound.capitalize()} residues in {plural}{suffix}",
                )

    _add_copy_method_argument(parser)
    parser.add_argument(
        "--write-stats",
        type=argparse.FileType("w", encoding="UTF-8"),
        help=dedent("""
            Write filter statistics to file. In CSV format with columns:
            `<input_file>,<nr_residues>,<nr_helix_residues>,<nr_sheet_residues>,
            <helix_ratio>,<sheet_ratio>,<passed>,<output_file>`.
            Use `-` for stdout.
        """),
    )
|
|
443
|
+
|
|
444
|
+
|
|
384
445
|
def _add_search_subcommands(subparsers: argparse._SubParsersAction):
|
|
385
446
|
"""Add search command and its subcommands."""
|
|
386
447
|
parser = subparsers.add_parser(
|
|
@@ -422,6 +483,7 @@ def _add_filter_subcommands(subparsers: argparse._SubParsersAction):
|
|
|
422
483
|
_add_filter_confidence_parser(subsubparsers)
|
|
423
484
|
_add_filter_chain_parser(subsubparsers)
|
|
424
485
|
_add_filter_residue_parser(subsubparsers)
|
|
486
|
+
_add_filter_ss_parser(subsubparsers)
|
|
425
487
|
|
|
426
488
|
|
|
427
489
|
def _add_mcp_command(subparsers: argparse._SubParsersAction):
|
|
@@ -620,21 +682,22 @@ def _handle_filter_confidence(args: argparse.Namespace):
|
|
|
620
682
|
# to get rid of duplication
|
|
621
683
|
input_dir = structure(args.input_dir, Path)
|
|
622
684
|
output_dir = structure(args.output_dir, Path)
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
min_residues =
|
|
626
|
-
max_residues =
|
|
685
|
+
|
|
686
|
+
confidence_threshold = args.confidence_threshold
|
|
687
|
+
min_residues = args.min_residues
|
|
688
|
+
max_residues = args.max_residues
|
|
627
689
|
stats_file: TextIOWrapper | None = args.write_stats
|
|
690
|
+
copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
|
|
628
691
|
|
|
629
692
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
630
693
|
input_files = sorted(glob_structure_files(input_dir))
|
|
631
694
|
nr_input_files = len(input_files)
|
|
632
695
|
rprint(f"Starting confidence filtering of {nr_input_files} mmcif/PDB files in {input_dir} directory.")
|
|
633
|
-
query = structure(
|
|
696
|
+
query = converter.structure(
|
|
634
697
|
{
|
|
635
698
|
"confidence": confidence_threshold,
|
|
636
|
-
"
|
|
637
|
-
"
|
|
699
|
+
"min_residues": min_residues,
|
|
700
|
+
"max_residues": max_residues,
|
|
638
701
|
},
|
|
639
702
|
ConfidenceFilterQuery,
|
|
640
703
|
)
|
|
@@ -643,7 +706,11 @@ def _handle_filter_confidence(args: argparse.Namespace):
|
|
|
643
706
|
writer.writerow(["input_file", "residue_count", "passed", "output_file"])
|
|
644
707
|
|
|
645
708
|
passed_count = 0
|
|
646
|
-
for r in tqdm(
|
|
709
|
+
for r in tqdm(
|
|
710
|
+
filter_files_on_confidence(input_files, query, output_dir, copy_method=copy_method),
|
|
711
|
+
total=len(input_files),
|
|
712
|
+
unit="file",
|
|
713
|
+
):
|
|
647
714
|
if r.filtered_file:
|
|
648
715
|
passed_count += 1
|
|
649
716
|
if stats_file:
|
|
@@ -656,9 +723,10 @@ def _handle_filter_confidence(args: argparse.Namespace):
|
|
|
656
723
|
|
|
657
724
|
def _handle_filter_chain(args):
|
|
658
725
|
input_dir = args.input_dir
|
|
659
|
-
output_dir = args.output_dir
|
|
726
|
+
output_dir = structure(args.output_dir, Path)
|
|
660
727
|
pdb_id2chain_mapping_file = args.chains
|
|
661
|
-
scheduler_address = args.scheduler_address
|
|
728
|
+
scheduler_address = structure(args.scheduler_address, str | None) # pyright: ignore[reportArgumentType]
|
|
729
|
+
copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
|
|
662
730
|
|
|
663
731
|
# make sure files in input dir with entries in mapping file are the same
|
|
664
732
|
# complain when files from mapping file are missing on disk
|
|
@@ -683,18 +751,25 @@ def _handle_filter_chain(args):
|
|
|
683
751
|
rprint("[red]No valid structure files found. Exiting.")
|
|
684
752
|
sys.exit(1)
|
|
685
753
|
|
|
686
|
-
results = filter_files_on_chain(
|
|
754
|
+
results = filter_files_on_chain(
|
|
755
|
+
file2chain, output_dir, scheduler_address=scheduler_address, copy_method=copy_method
|
|
756
|
+
)
|
|
687
757
|
|
|
688
758
|
nr_written = len([r for r in results if r.passed])
|
|
689
759
|
|
|
690
760
|
rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")
|
|
691
761
|
|
|
762
|
+
for result in results:
|
|
763
|
+
if result.discard_reason:
|
|
764
|
+
rprint(f"[red]Discarding {result.input_file} ({result.discard_reason})[/red]")
|
|
765
|
+
|
|
692
766
|
|
|
693
767
|
def _handle_filter_residue(args):
|
|
694
768
|
input_dir = structure(args.input_dir, Path)
|
|
695
769
|
output_dir = structure(args.output_dir, Path)
|
|
696
770
|
min_residues = structure(args.min_residues, int)
|
|
697
771
|
max_residues = structure(args.max_residues, int)
|
|
772
|
+
copy_method: CopyMethod = structure(args.copy_method, CopyMethod) # pyright: ignore[reportArgumentType]
|
|
698
773
|
stats_file: TextIOWrapper | None = args.write_stats
|
|
699
774
|
|
|
700
775
|
if stats_file:
|
|
@@ -705,7 +780,9 @@ def _handle_filter_residue(args):
|
|
|
705
780
|
input_files = sorted(glob_structure_files(input_dir))
|
|
706
781
|
nr_total = len(input_files)
|
|
707
782
|
rprint(f"Filtering {nr_total} files in {input_dir} directory by number of residues in chain A.")
|
|
708
|
-
for r in filter_files_on_residues(
|
|
783
|
+
for r in filter_files_on_residues(
|
|
784
|
+
input_files, output_dir, min_residues=min_residues, max_residues=max_residues, copy_method=copy_method
|
|
785
|
+
):
|
|
709
786
|
if stats_file:
|
|
710
787
|
writer.writerow([r.input_file, r.residue_count, r.passed, r.output_file])
|
|
711
788
|
if r.passed:
|
|
@@ -716,6 +793,68 @@ def _handle_filter_residue(args):
|
|
|
716
793
|
rprint(f"Statistics written to {stats_file.name}")
|
|
717
794
|
|
|
718
795
|
|
|
796
|
+
def _handle_filter_ss(args):
    """Handle the ``filter secondary-structure`` subcommand.

    Structures the eight threshold arguments into a
    ``SecondaryStructureFilterQuery``, runs the filter over every structure
    file found in the input directory, copies each passing file into the
    output directory and, when requested, writes one CSV row per input file.

    Args:
        args: Parsed command line arguments of the subcommand.
    """
    input_dir = structure(args.input_dir, Path)
    output_dir = structure(args.output_dir, Path)
    copy_method: CopyMethod = structure(args.copy_method, CopyMethod)  # pyright: ignore[reportArgumentType]
    stats_file: TextIOWrapper | None = args.write_stats

    threshold_names = (
        "abs_min_helix_residues",
        "abs_max_helix_residues",
        "abs_min_sheet_residues",
        "abs_max_sheet_residues",
        "ratio_min_helix_residues",
        "ratio_max_helix_residues",
        "ratio_min_sheet_residues",
        "ratio_max_sheet_residues",
    )
    raw_query = {name: getattr(args, name) for name in threshold_names}
    query = converter.structure(raw_query, SecondaryStructureFilterQuery)
    input_files = sorted(glob_structure_files(input_dir))
    nr_total = len(input_files)
    output_dir.mkdir(parents=True, exist_ok=True)

    # The CSV writer only exists when statistics were requested.
    writer = None
    if stats_file:
        writer = csv.writer(stats_file)
        header = [
            "input_file",
            "nr_residues",
            "nr_helix_residues",
            "nr_sheet_residues",
            "helix_ratio",
            "sheet_ratio",
            "passed",
            "output_file",
        ]
        writer.writerow(header)

    rprint(f"Filtering {nr_total} files in {input_dir} directory by secondary structure.")
    nr_passed = 0
    for input_file, result in filter_files_on_secondary_structure(input_files, query=query):
        output_file: Path | None = output_dir / input_file.name if result.passed else None
        if output_file is not None:
            copyfile(input_file, output_file, copy_method)
            nr_passed += 1
        if writer is not None:
            stats = result.stats
            row = [
                input_file,
                stats.nr_residues,
                stats.nr_helix_residues,
                stats.nr_sheet_residues,
                round(stats.helix_ratio, 3),
                round(stats.sheet_ratio, 3),
                result.passed,
                output_file,
            ]
            writer.writerow(row)
    rprint(f"Wrote {nr_passed} files to {output_dir} directory.")
    if stats_file:
        rprint(f"Statistics written to {stats_file.name}")
|
|
856
|
+
|
|
857
|
+
|
|
719
858
|
def _handle_mcp(args):
|
|
720
859
|
if find_spec("fastmcp") is None:
|
|
721
860
|
msg = "Unable to start MCP server, please install `protein-quest[mcp]`."
|
|
@@ -742,6 +881,7 @@ HANDLERS: dict[tuple[str, str | None], Callable] = {
|
|
|
742
881
|
("filter", "confidence"): _handle_filter_confidence,
|
|
743
882
|
("filter", "chain"): _handle_filter_chain,
|
|
744
883
|
("filter", "residue"): _handle_filter_residue,
|
|
884
|
+
("filter", "secondary-structure"): _handle_filter_ss,
|
|
745
885
|
("mcp", None): _handle_mcp,
|
|
746
886
|
}
|
|
747
887
|
|
protein_quest/converter.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Convert json or dict to Python objects."""
|
|
2
|
+
|
|
3
|
+
from cattrs.preconf.orjson import make_converter
|
|
4
|
+
from yarl import URL
|
|
5
|
+
|
|
6
|
+
type Percentage = float
|
|
7
|
+
"""Type alias for percentage values (0.0-100.0)."""
|
|
8
|
+
type Ratio = float
|
|
9
|
+
"""Type alias for ratio values (0.0-1.0)."""
|
|
10
|
+
type PositiveInt = int
|
|
11
|
+
"""Type alias for positive integer values (>= 0)."""
|
|
12
|
+
|
|
13
|
+
converter = make_converter()
|
|
14
|
+
"""cattrs converter to read JSON document or dict to Python objects."""
|
|
15
|
+
converter.register_structure_hook(URL, lambda v, _: URL(v))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@converter.register_structure_hook
def percentage_hook(val, _) -> Percentage:
    """Cattrs hook to validate percentage values.

    Args:
        val: Raw value; anything ``float()`` accepts.
        _: Target type supplied by the converter (unused).

    Returns:
        The value converted to ``float``.

    Raises:
        ValueError: If the converted value lies outside 0.0-100.0.
    """
    # The docstring must be the first statement; placed after the conversion
    # it was a discarded string expression and ``__doc__`` stayed empty.
    value = float(val)
    if not 0.0 <= value <= 100.0:
        msg = f"Value {value} is not a valid percentage (0.0-100.0)"
        raise ValueError(msg)
    return value
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@converter.register_structure_hook
def ratio_hook(val, _) -> Ratio:
    """Convert a raw value to a ratio and check that it lies in 0.0-1.0.

    Args:
        val: Raw value; anything ``float()`` accepts.
        _: Target type supplied by the converter (unused).

    Returns:
        The value converted to ``float``.

    Raises:
        ValueError: If the converted value lies outside 0.0-1.0.
    """
    ratio = float(val)
    if 0.0 <= ratio <= 1.0:
        return ratio
    msg = f"Value {ratio} is not a valid ratio (0.0-1.0)"
    raise ValueError(msg)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@converter.register_structure_hook
def positive_int_hook(val, _) -> PositiveInt:
    """Convert a raw value to an integer and reject negative results.

    Zero is accepted; only values below zero are refused.

    Args:
        val: Raw value; anything ``int()`` accepts.
        _: Target type supplied by the converter (unused).

    Returns:
        The value converted to ``int``.

    Raises:
        ValueError: If the converted value is below zero.
    """
    number = int(val)
    if number >= 0:
        return number
    msg = f"Value {number} is not a valid positive integer (>= 0)"
    raise ValueError(msg)
|
protein_quest/filters.py
CHANGED
|
@@ -4,7 +4,7 @@ import logging
|
|
|
4
4
|
from collections.abc import Collection, Generator
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from
|
|
7
|
+
from typing import Literal
|
|
8
8
|
|
|
9
9
|
from dask.distributed import Client
|
|
10
10
|
from distributed.deploy.cluster import Cluster
|
|
@@ -15,6 +15,7 @@ from protein_quest.pdbe.io import (
|
|
|
15
15
|
nr_residues_in_chain,
|
|
16
16
|
write_single_chain_pdb_file,
|
|
17
17
|
)
|
|
18
|
+
from protein_quest.utils import CopyMethod, copyfile
|
|
18
19
|
|
|
19
20
|
logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
@@ -29,11 +30,17 @@ class ChainFilterStatistics:
|
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
def filter_file_on_chain(
|
|
32
|
-
file_and_chain: tuple[Path, str],
|
|
33
|
+
file_and_chain: tuple[Path, str],
|
|
34
|
+
output_dir: Path,
|
|
35
|
+
out_chain: str = "A",
|
|
36
|
+
copy_method: CopyMethod = "copy",
|
|
33
37
|
) -> ChainFilterStatistics:
|
|
34
38
|
input_file, chain_id = file_and_chain
|
|
39
|
+
logger.debug("Filtering %s on chain %s", input_file, chain_id)
|
|
35
40
|
try:
|
|
36
|
-
output_file = write_single_chain_pdb_file(
|
|
41
|
+
output_file = write_single_chain_pdb_file(
|
|
42
|
+
input_file, chain_id, output_dir, out_chain=out_chain, copy_method=copy_method
|
|
43
|
+
)
|
|
37
44
|
return ChainFilterStatistics(
|
|
38
45
|
input_file=input_file,
|
|
39
46
|
chain_id=chain_id,
|
|
@@ -48,7 +55,8 @@ def filter_files_on_chain(
|
|
|
48
55
|
file2chains: Collection[tuple[Path, str]],
|
|
49
56
|
output_dir: Path,
|
|
50
57
|
out_chain: str = "A",
|
|
51
|
-
scheduler_address: str | Cluster | None = None,
|
|
58
|
+
scheduler_address: str | Cluster | Literal["sequential"] | None = None,
|
|
59
|
+
copy_method: CopyMethod = "copy",
|
|
52
60
|
) -> list[ChainFilterStatistics]:
|
|
53
61
|
"""Filter mmcif/PDB files by chain.
|
|
54
62
|
|
|
@@ -58,19 +66,37 @@ def filter_files_on_chain(
|
|
|
58
66
|
output_dir: The directory where the filtered files will be written.
|
|
59
67
|
out_chain: Under what name to write the kept chain.
|
|
60
68
|
scheduler_address: The address of the Dask scheduler.
|
|
69
|
+
If not provided, will create a local cluster.
|
|
70
|
+
If set to `sequential` will run tasks sequentially.
|
|
71
|
+
copy_method: How to copy when a direct copy is possible.
|
|
61
72
|
|
|
62
73
|
Returns:
|
|
63
74
|
Result of the filtering process.
|
|
64
75
|
"""
|
|
65
76
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
77
|
+
if scheduler_address == "sequential":
|
|
78
|
+
|
|
79
|
+
def task(file_and_chain: tuple[Path, str]) -> ChainFilterStatistics:
|
|
80
|
+
return filter_file_on_chain(file_and_chain, output_dir, out_chain=out_chain, copy_method=copy_method)
|
|
81
|
+
|
|
82
|
+
return list(map(task, file2chains))
|
|
83
|
+
|
|
84
|
+
# TODO make logger.debug in filter_file_on_chain show to user when --log
|
|
85
|
+
# GPT-5 generated a fairly difficult setup with a WorkerPlugin, need to find a simpler approach
|
|
66
86
|
scheduler_address = configure_dask_scheduler(
|
|
67
87
|
scheduler_address,
|
|
68
88
|
name="filter-chain",
|
|
69
89
|
)
|
|
70
90
|
|
|
71
91
|
with Client(scheduler_address) as client:
|
|
92
|
+
client.forward_logging()
|
|
72
93
|
return dask_map_with_progress(
|
|
73
|
-
client,
|
|
94
|
+
client,
|
|
95
|
+
filter_file_on_chain,
|
|
96
|
+
file2chains,
|
|
97
|
+
output_dir=output_dir,
|
|
98
|
+
out_chain=out_chain,
|
|
99
|
+
copy_method=copy_method,
|
|
74
100
|
)
|
|
75
101
|
|
|
76
102
|
|
|
@@ -92,7 +118,12 @@ class ResidueFilterStatistics:
|
|
|
92
118
|
|
|
93
119
|
|
|
94
120
|
def filter_files_on_residues(
|
|
95
|
-
input_files: list[Path],
|
|
121
|
+
input_files: list[Path],
|
|
122
|
+
output_dir: Path,
|
|
123
|
+
min_residues: int,
|
|
124
|
+
max_residues: int,
|
|
125
|
+
chain: str = "A",
|
|
126
|
+
copy_method: CopyMethod = "copy",
|
|
96
127
|
) -> Generator[ResidueFilterStatistics]:
|
|
97
128
|
"""Filter PDB/mmCIF files by number of residues in given chain.
|
|
98
129
|
|
|
@@ -102,6 +133,7 @@ def filter_files_on_residues(
|
|
|
102
133
|
min_residues: The minimum number of residues in chain.
|
|
103
134
|
max_residues: The maximum number of residues in chain.
|
|
104
135
|
chain: The chain to count residues of.
|
|
136
|
+
copy_method: How to copy passed files to output directory:
|
|
105
137
|
|
|
106
138
|
Yields:
|
|
107
139
|
Objects containing information about the filtering process for each input file.
|
|
@@ -112,7 +144,7 @@ def filter_files_on_residues(
|
|
|
112
144
|
passed = min_residues <= residue_count <= max_residues
|
|
113
145
|
if passed:
|
|
114
146
|
output_file = output_dir / input_file.name
|
|
115
|
-
copyfile(input_file, output_file)
|
|
147
|
+
copyfile(input_file, output_file, copy_method)
|
|
116
148
|
yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
|
|
117
149
|
else:
|
|
118
150
|
yield ResidueFilterStatistics(input_file, residue_count, False, None)
|
protein_quest/go.py
CHANGED
|
@@ -8,8 +8,8 @@ from io import TextIOWrapper
|
|
|
8
8
|
from typing import Literal, get_args
|
|
9
9
|
|
|
10
10
|
from cattrs.gen import make_dict_structure_fn, override
|
|
11
|
-
from cattrs.preconf.orjson import make_converter
|
|
12
11
|
|
|
12
|
+
from protein_quest.converter import converter
|
|
13
13
|
from protein_quest.utils import friendly_session
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
@@ -52,9 +52,6 @@ class SearchResponse:
|
|
|
52
52
|
page_info: PageInfo
|
|
53
53
|
|
|
54
54
|
|
|
55
|
-
converter = make_converter()
|
|
56
|
-
|
|
57
|
-
|
|
58
55
|
def flatten_definition(definition, _context) -> str:
|
|
59
56
|
return definition["text"]
|
|
60
57
|
|
protein_quest/mcp_server.py
CHANGED
|
@@ -46,6 +46,7 @@ from protein_quest.emdb import fetch as emdb_fetch
|
|
|
46
46
|
from protein_quest.go import search_gene_ontology_term
|
|
47
47
|
from protein_quest.pdbe.fetch import fetch as pdbe_fetch
|
|
48
48
|
from protein_quest.pdbe.io import glob_structure_files, nr_residues_in_chain, write_single_chain_pdb_file
|
|
49
|
+
from protein_quest.ss import filter_file_on_secondary_structure
|
|
49
50
|
from protein_quest.taxonomy import search_taxon
|
|
50
51
|
from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot
|
|
51
52
|
|
|
@@ -165,6 +166,9 @@ def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filter
|
|
|
165
166
|
return filter_file_on_residues(file, query, filtered_dir)
|
|
166
167
|
|
|
167
168
|
|
|
169
|
+
mcp.tool(filter_file_on_secondary_structure)
|
|
170
|
+
|
|
171
|
+
|
|
168
172
|
@mcp.prompt
|
|
169
173
|
def candidate_structures(
|
|
170
174
|
species: str = "Human",
|
protein_quest/pdbe/io.py
CHANGED
|
@@ -2,12 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
import gzip
|
|
4
4
|
import logging
|
|
5
|
-
from collections.abc import Generator
|
|
5
|
+
from collections.abc import Generator, Iterable
|
|
6
|
+
from datetime import UTC, datetime
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
|
|
8
9
|
import gemmi
|
|
9
10
|
|
|
10
|
-
from protein_quest import __version__
|
|
11
|
+
from protein_quest.__version__ import __version__
|
|
12
|
+
from protein_quest.utils import CopyMethod, copyfile
|
|
11
13
|
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
@@ -28,14 +30,21 @@ def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
|
|
|
28
30
|
The number of residues in the specified chain.
|
|
29
31
|
"""
|
|
30
32
|
structure = gemmi.read_structure(str(file))
|
|
31
|
-
|
|
32
|
-
gchain = find_chain_in_model(model, chain)
|
|
33
|
+
gchain = find_chain_in_structure(structure, chain)
|
|
33
34
|
if gchain is None:
|
|
34
35
|
logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
|
|
35
36
|
return 0
|
|
36
37
|
return len(gchain)
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
def find_chain_in_structure(structure: gemmi.Structure, wanted_chain: str) -> gemmi.Chain | None:
    """Return the first chain matching `wanted_chain` in any model of the structure.

    Models are searched in iteration order with `find_chain_in_model`;
    the search stops at the first model that yields a chain.

    Args:
        structure: The structure whose models are searched.
        wanted_chain: The chain identifier to look for.

    Returns:
        The first chain found, or None if no model contains it.
    """
    # Lazy generator: later models are not inspected once a hit is found.
    hits = (find_chain_in_model(model, wanted_chain) for model in structure)
    return next((hit for hit in hits if hit is not None), None)
|
|
46
|
+
|
|
47
|
+
|
|
39
48
|
def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
|
|
40
49
|
chain = model.find_chain(wanted_chain)
|
|
41
50
|
if chain is None:
|
|
@@ -68,10 +77,12 @@ def write_structure(structure: gemmi.Structure, path: Path):
|
|
|
68
77
|
with gzip.open(path, "wt") as f:
|
|
69
78
|
f.write(body)
|
|
70
79
|
elif path.name.endswith(".cif"):
|
|
71
|
-
|
|
80
|
+
# do not write chem_comp so it is viewable by molstar
|
|
81
|
+
# see https://github.com/project-gemmi/gemmi/discussions/362
|
|
82
|
+
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
72
83
|
doc.write_file(str(path))
|
|
73
84
|
elif path.name.endswith(".cif.gz"):
|
|
74
|
-
doc = structure.make_mmcif_document()
|
|
85
|
+
doc = structure.make_mmcif_document(gemmi.MmcifOutputGroups(True, chem_comp=False))
|
|
75
86
|
cif_str = doc.as_string()
|
|
76
87
|
with gzip.open(path, "wt") as f:
|
|
77
88
|
f.write(cif_str)
|
|
@@ -111,14 +122,17 @@ def locate_structure_file(root: Path, pdb_id: str) -> Path:
|
|
|
111
122
|
Raises:
|
|
112
123
|
FileNotFoundError: If no structure file is found for the given PDB ID.
|
|
113
124
|
"""
|
|
114
|
-
exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb"]
|
|
115
|
-
# files downloaded from https://www.ebi.ac.uk/pdbe/ website
|
|
116
|
-
# have file names like pdb6t5y.ent or pdb6t5y.ent.gz for a PDB formatted file.
|
|
117
|
-
# TODO support pdb6t5y.ent or pdb6t5y.ent.gz file names
|
|
125
|
+
exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb", ".ent", ".ent.gz"]
|
|
118
126
|
for ext in exts:
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
127
|
+
candidates = (
|
|
128
|
+
root / f"{pdb_id}{ext}",
|
|
129
|
+
root / f"{pdb_id.lower()}{ext}",
|
|
130
|
+
root / f"{pdb_id.upper()}{ext}",
|
|
131
|
+
root / f"pdb{pdb_id.lower()}{ext}",
|
|
132
|
+
)
|
|
133
|
+
for candidate in candidates:
|
|
134
|
+
if candidate.exists():
|
|
135
|
+
return candidate
|
|
122
136
|
msg = f"No structure file found for {pdb_id} in {root}"
|
|
123
137
|
raise FileNotFoundError(msg)
|
|
124
138
|
|
|
@@ -139,20 +153,84 @@ def glob_structure_files(input_dir: Path) -> Generator[Path]:
|
|
|
139
153
|
class ChainNotFoundError(IndexError):
|
|
140
154
|
"""Exception raised when a chain is not found in a structure."""
|
|
141
155
|
|
|
142
|
-
def __init__(self, chain: str, file: Path | str):
|
|
143
|
-
super().__init__(f"Chain {chain} not found in {file}")
|
|
156
|
+
def __init__(self, chain: str, file: Path | str, available_chains: Iterable[str]):
|
|
157
|
+
super().__init__(f"Chain {chain} not found in {file}. Available chains are: {available_chains}")
|
|
144
158
|
self.chain_id = chain
|
|
145
159
|
self.file = file
|
|
146
160
|
|
|
147
161
|
|
|
148
|
-
def
|
|
162
|
+
def _dedup_helices(structure: gemmi.Structure):
    """Remove helices whose start address was already seen, in place.

    The string form of `helix.start` is the identity key; the first helix
    with a given start is kept and every later one is dropped.

    Args:
        structure: The structure whose `helices` list is pruned.
    """
    seen_starts: set[str] = set()
    doomed: list[int] = []
    for position, helix in enumerate(structure.helices):
        key = str(helix.start)
        if key not in seen_starts:
            seen_starts.add(key)
            continue
        logger.debug(f"Duplicate start helix found: {position} {helix.start}, removing")
        doomed.append(position)
    # Pop from the back so the remaining indexes stay valid.
    while doomed:
        structure.helices.pop(doomed.pop())
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _dedup_sheets(structure: gemmi.Structure, chain2keep: str):
    """Drop every sheet whose name differs from `chain2keep`, in place.

    Args:
        structure: The structure whose `sheets` list is pruned.
        chain2keep: Sheets with exactly this name are kept.
    """
    # NOTE(review): this compares the sheet's own name with a chain id;
    # confirm that sheet names equal chain ids for the inputs handled here.
    unwanted = [position for position, sheet in enumerate(structure.sheets) if sheet.name != chain2keep]
    # Remove back-to-front so earlier indexes are not shifted.
    for position in reversed(unwanted):
        structure.sheets.pop(position)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _add_provenance_info(structure: gemmi.Structure, chain2keep: str, out_chain: str):
    """Stamp the structure with a new id, title and a software record, in place.

    The new id is the old structure name followed by
    `<chain2keep>2<out_chain>`; the title records the original name and
    both chain identifiers. A software item naming this package, its
    version and today's UTC date is appended to the existing software list.

    Args:
        structure: The structure to annotate.
        chain2keep: Chain identifier that was extracted from the input.
        out_chain: Chain identifier used in the output.
    """
    source_id = structure.name
    derived_id = source_id + f"{chain2keep}2{out_chain}"
    title = f"From {source_id} chain {chain2keep} to {out_chain}"

    structure.name = derived_id
    structure.info["_entry.id"] = derived_id
    structure.info["_struct.title"] = title
    structure.info["_struct_keywords.pdbx_keywords"] = title.upper()

    item = gemmi.SoftwareItem()
    item.classification = gemmi.SoftwareItem.Classification.DataExtraction
    item.name = "protein-quest.pdbe.io.write_single_chain_pdb_file"
    item.version = str(__version__)
    item.date = str(datetime.now(tz=UTC).date())
    # Assign a fresh list rather than appending to the existing one.
    structure.meta.software = list(structure.meta.software) + [item]
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def chains_in_structure(structure: gemmi.Structure) -> set[gemmi.Chain]:
    """Collect the chains of all models in a structure into a set.

    Args:
        structure: The structure to walk through, model by model.

    Returns:
        A set holding every chain yielded by every model.
    """
    found = set()
    for model in structure:
        found.update(model)
    return found
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def write_single_chain_pdb_file(
|
|
206
|
+
input_file: Path,
|
|
207
|
+
chain2keep: str,
|
|
208
|
+
output_dir: Path,
|
|
209
|
+
out_chain: str = "A",
|
|
210
|
+
copy_method: CopyMethod = "copy",
|
|
211
|
+
) -> Path:
|
|
149
212
|
"""Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.
|
|
150
213
|
|
|
214
|
+
Also
|
|
215
|
+
|
|
216
|
+
- removes ligands and waters
|
|
217
|
+
- renumbers atoms ids
|
|
218
|
+
- removes chem_comp section from cif files
|
|
219
|
+
- adds provenance information to the header like software and input file+chain
|
|
220
|
+
|
|
221
|
+
This function is equivalent to the following gemmi commands:
|
|
222
|
+
|
|
223
|
+
```shell
|
|
224
|
+
gemmi convert --remove-lig-wat --select=B --to=cif chain-in/3JRS.cif - | \\
|
|
225
|
+
gemmi convert --from=cif --rename-chain=B:A - chain-out/3JRS_B2A.gemmi.cif
|
|
226
|
+
```
|
|
227
|
+
|
|
151
228
|
Args:
|
|
152
229
|
input_file: Path to the input mmCIF/pdb file.
|
|
153
230
|
chain2keep: The chain to keep.
|
|
154
231
|
output_dir: Directory to save the output file.
|
|
155
232
|
out_chain: The chain identifier for the output file.
|
|
233
|
+
copy_method: How to copy when no changes are needed to output file.
|
|
156
234
|
|
|
157
235
|
Returns:
|
|
158
236
|
Path to the output mmCIF/pdb file
|
|
@@ -162,39 +240,42 @@ def write_single_chain_pdb_file(input_file: Path, chain2keep: str, output_dir: P
|
|
|
162
240
|
ChainNotFoundError: If the specified chain is not found in the input file.
|
|
163
241
|
"""
|
|
164
242
|
|
|
243
|
+
logger.debug(f"chain2keep: {chain2keep}, out_chain: {out_chain}")
|
|
165
244
|
structure = gemmi.read_structure(str(input_file))
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
# Only count residues of polymer
|
|
169
|
-
model.remove_ligands_and_waters()
|
|
245
|
+
structure.setup_entities()
|
|
170
246
|
|
|
171
|
-
chain =
|
|
247
|
+
chain = find_chain_in_structure(structure, chain2keep)
|
|
248
|
+
chainnames_in_structure = {c.name for c in chains_in_structure(structure)}
|
|
172
249
|
if chain is None:
|
|
173
|
-
raise ChainNotFoundError(chain2keep, input_file)
|
|
250
|
+
raise ChainNotFoundError(chain2keep, input_file, chainnames_in_structure)
|
|
251
|
+
chain_name = chain.name
|
|
174
252
|
name, extension = _split_name_and_extension(input_file.name)
|
|
175
|
-
output_file = output_dir / f"{name}_{
|
|
253
|
+
output_file = output_dir / f"{name}_{chain_name}2{out_chain}{extension}"
|
|
176
254
|
|
|
177
255
|
if output_file.exists():
|
|
178
256
|
logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
|
|
179
257
|
return output_file
|
|
180
258
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
259
|
+
if chain_name == out_chain and len(chainnames_in_structure) == 1:
|
|
260
|
+
logger.info(
|
|
261
|
+
"%s only has chain %s and out_chain is also %s. Copying file to %s.",
|
|
262
|
+
input_file,
|
|
263
|
+
chain_name,
|
|
264
|
+
out_chain,
|
|
265
|
+
output_file,
|
|
266
|
+
)
|
|
267
|
+
copyfile(input_file, output_file, copy_method)
|
|
268
|
+
return output_file
|
|
269
|
+
|
|
270
|
+
gemmi.Selection(chain_name).remove_not_selected(structure)
|
|
271
|
+
for m in structure:
|
|
272
|
+
m.remove_ligands_and_waters()
|
|
273
|
+
structure.setup_entities()
|
|
274
|
+
structure.rename_chain(chain_name, out_chain)
|
|
275
|
+
_dedup_helices(structure)
|
|
276
|
+
_dedup_sheets(structure, out_chain)
|
|
277
|
+
_add_provenance_info(structure, chain_name, out_chain)
|
|
278
|
+
|
|
279
|
+
write_structure(structure, output_file)
|
|
199
280
|
|
|
200
281
|
return output_file
|
protein_quest/ss.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Module for dealing with secondary structure."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Generator, Iterable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from gemmi import Structure, read_structure, set_leak_warnings
|
|
9
|
+
|
|
10
|
+
from protein_quest.converter import PositiveInt, Ratio, converter
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# TODO remove once v0.7.4 of gemmi is released,
|
|
15
|
+
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
|
|
16
|
+
# Swallow gemmi leaked function warnings
|
|
17
|
+
set_leak_warnings(False)
|
|
18
|
+
|
|
19
|
+
# TODO if a structure has no secondary structure information, calculate it with `gemmi ss`.
|
|
20
|
+
# https://github.com/MonomerLibrary/monomers/wiki/Installation as --monomers dir
|
|
21
|
+
# gemmi executable is in https://pypi.org/project/gemmi-program/
|
|
22
|
+
# `gemmi ss` only prints secondary structure to stdout with `-v` flag.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def nr_of_residues_in_total(structure: Structure) -> int:
    """Sum the lengths of all chains over all models of the structure.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The total number of residues, counted across every model.
    """
    return sum(len(chain) for model in structure for chain in model)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def nr_of_residues_in_helix(structure: Structure) -> int:
    """Add up the residue spans of all helices annotated on the structure.

    Requires structure to have secondary structure information.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The summed span (end - start + 1) of every helix with valid numbering.
    """
    # For cif files from AlphaFold the helix.length is set to -1,
    # so the span is derived from the residue sequence numbers instead.
    total = 0
    for helix in structure.helices:
        last = helix.end.res_id.seqid.num
        first = helix.start.res_id.seqid.num
        if last is not None and first is not None:
            total += last - first + 1
        else:
            logger.warning(f"Invalid helix coordinates: {helix.end} or {helix.start}")
    return total
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def nr_of_residues_in_sheet(structure: Structure) -> int:
    """Add up the residue spans of all strands in all sheets of the structure.

    Requires structure to have secondary structure information.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The summed span (end - start + 1) of every strand with valid numbering.
    """
    total = 0
    all_strands = (strand for sheet in structure.sheets for strand in sheet.strands)
    for strand in all_strands:
        last = strand.end.res_id.seqid.num
        first = strand.start.res_id.seqid.num
        if last is not None and first is not None:
            total += last - first + 1
        else:
            logger.warning(f"Invalid strand coordinates: {strand.end} or {strand.start}")
    return total
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class SecondaryStructureFilterQuery:
    """Query object to filter on secondary structure.

    Every bound is optional; a bound left at None is not applied by
    `filter_on_secondary_structure`. Bounds that are set are inclusive
    (compared with `>=` / `<=`). When structured through the module's
    converter hook, each min/max pair is validated so that min is strictly
    smaller than max.

    Parameters:
        abs_min_helix_residues: Minimum number of residues in helices (absolute).
        abs_max_helix_residues: Maximum number of residues in helices (absolute).
        abs_min_sheet_residues: Minimum number of residues in sheets (absolute).
        abs_max_sheet_residues: Maximum number of residues in sheets (absolute).
        ratio_min_helix_residues: Minimum number of residues in helices
            (relative to the total number of residues).
        ratio_max_helix_residues: Maximum number of residues in helices
            (relative to the total number of residues).
        ratio_min_sheet_residues: Minimum number of residues in sheets
            (relative to the total number of residues).
        ratio_max_sheet_residues: Maximum number of residues in sheets
            (relative to the total number of residues).
    """

    abs_min_helix_residues: PositiveInt | None = None
    abs_max_helix_residues: PositiveInt | None = None
    abs_min_sheet_residues: PositiveInt | None = None
    abs_max_sheet_residues: PositiveInt | None = None
    ratio_min_helix_residues: Ratio | None = None
    ratio_max_helix_residues: Ratio | None = None
    ratio_min_sheet_residues: Ratio | None = None
    ratio_max_sheet_residues: Ratio | None = None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _check_range(min_val, max_val, label):
    """Raise if both bounds are given and min is not strictly below max.

    Args:
        min_val: Lower bound, or None when unset.
        max_val: Upper bound, or None when unset.
        label: Human readable name of the range, used in the error message.

    Raises:
        ValueError: If both bounds are set and `min_val >= max_val`.
    """
    # A half-open or fully unset range is always acceptable.
    if min_val is None or max_val is None:
        return
    if min_val >= max_val:
        msg = f"Invalid {label} range: min {min_val} must be smaller than max {max_val}"
        raise ValueError(msg)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Fetch the converter's current hook for the query class before the
# validating hook below is registered, so that hook can delegate to it.
base_query_hook = converter.get_structure_hook(SecondaryStructureFilterQuery)


@converter.register_structure_hook
def secondary_structure_filter_query_hook(value, _type) -> SecondaryStructureFilterQuery:
    """Structure a query with the base hook, then validate its min/max pairs.

    Args:
        value: The unstructured input handed over by the converter.
        _type: The target type handed over by the converter.

    Returns:
        The structured query.

    Raises:
        ValueError: If any min bound is not strictly smaller than its max bound.
    """
    result: SecondaryStructureFilterQuery = base_query_hook(value, _type)
    bound_pairs = (
        (result.abs_min_helix_residues, result.abs_max_helix_residues, "absolute helix residue"),
        (result.abs_min_sheet_residues, result.abs_max_sheet_residues, "absolute sheet residue"),
        (result.ratio_min_helix_residues, result.ratio_max_helix_residues, "ratio helix residue"),
        (result.ratio_min_sheet_residues, result.ratio_max_sheet_residues, "ratio sheet residue"),
    )
    for lower, upper, label in bound_pairs:
        _check_range(lower, upper, label)
    return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class SecondaryStructureStats:
    """Statistics about the secondary structure of a protein.

    Instances are produced by `_gather_stats`, which divides the helix and
    sheet residue counts by the total residue count to obtain the ratios.

    Parameters:
        nr_residues: Total number of residues in the structure.
        nr_helix_residues: Number of residues in helices.
        nr_sheet_residues: Number of residues in sheets.
        helix_ratio: Ratio of residues in helices.
        sheet_ratio: Ratio of residues in sheets.
    """

    # NOTE(review): the helix and sheet counters return 0 when nothing is
    # annotated — confirm PositiveInt accepts 0 for those two fields.
    nr_residues: PositiveInt
    nr_helix_residues: PositiveInt
    nr_sheet_residues: PositiveInt
    helix_ratio: Ratio
    sheet_ratio: Ratio
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class SecondaryStructureFilterResult:
    """Result of filtering on secondary structure.

    Parameters:
        stats: The secondary structure statistics.
        passed: Whether the structure passed the filtering criteria.
            Defaults to False when not given.
    """

    stats: SecondaryStructureStats
    passed: bool = False
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _gather_stats(structure: Structure) -> SecondaryStructureStats:
    """Count total, helix and sheet residues and derive the two ratios.

    Args:
        structure: The gemmi Structure object to analyze.

    Returns:
        The residue counts together with helix and sheet ratios.

    Raises:
        ValueError: If the structure contains no residues at all.
    """
    total = nr_of_residues_in_total(structure)
    in_helix = nr_of_residues_in_helix(structure)
    in_sheet = nr_of_residues_in_sheet(structure)
    # Guard the divisions below.
    if total == 0:
        msg = "Structure has zero residues; cannot compute secondary structure ratios."
        raise ValueError(msg)
    return SecondaryStructureStats(
        nr_residues=total,
        nr_helix_residues=in_helix,
        nr_sheet_residues=in_sheet,
        helix_ratio=in_helix / total,
        sheet_ratio=in_sheet / total,
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def filter_on_secondary_structure(
    structure: Structure,
    query: SecondaryStructureFilterQuery,
) -> SecondaryStructureFilterResult:
    """Check a structure against the secondary structure bounds of a query.

    Only bounds that are not None take part; all of them must hold
    (inclusive comparisons) for the structure to pass.

    Args:
        structure: The gemmi Structure object to analyze.
        query: The filtering criteria to apply.

    Returns:
        Filtering statistics and whether structure passed.

    Raises:
        ValueError: If the query has no bound set.
    """
    stats = _gather_stats(structure)

    # Each row: (bound from the query, observed value, True when the bound is a minimum).
    checks = (
        (query.abs_min_helix_residues, stats.nr_helix_residues, True),
        (query.abs_max_helix_residues, stats.nr_helix_residues, False),
        (query.ratio_min_helix_residues, stats.helix_ratio, True),
        (query.ratio_max_helix_residues, stats.helix_ratio, False),
        (query.abs_min_sheet_residues, stats.nr_sheet_residues, True),
        (query.abs_max_sheet_residues, stats.nr_sheet_residues, False),
        (query.ratio_min_sheet_residues, stats.sheet_ratio, True),
        (query.ratio_max_sheet_residues, stats.sheet_ratio, False),
    )
    conditions: list[bool] = [
        (observed >= bound) if is_minimum else (observed <= bound)
        for bound, observed, is_minimum in checks
        if bound is not None
    ]

    if not conditions:
        msg = "No filtering conditions provided. Please specify at least one condition."
        raise ValueError(msg)
    return SecondaryStructureFilterResult(stats=stats, passed=all(conditions))
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def filter_file_on_secondary_structure(
    file_path: Path,
    query: SecondaryStructureFilterQuery,
) -> SecondaryStructureFilterResult:
    """Read a structure file and check it against the query.

    Args:
        file_path: The path to the structure file to analyze.
        query: The filtering criteria to apply.

    Returns:
        Filtering statistics and whether file passed.
    """
    return filter_on_secondary_structure(read_structure(str(file_path)), query)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def filter_files_on_secondary_structure(
    file_paths: Iterable[Path],
    query: SecondaryStructureFilterQuery,
) -> Generator[tuple[Path, SecondaryStructureFilterResult]]:
    """Lazily check several structure files against the same query.

    Args:
        file_paths: The paths of the structure files to analyze.
        query: The filtering criteria to apply.

    Yields:
        For each file, its path paired with the filtering statistics and
        whether the structure passed.
    """
    # TODO check if quick enough in serial mode, if not switch to dask map
    yield from ((path, filter_file_on_secondary_structure(path, query)) for path in file_paths)
|
protein_quest/taxonomy.py
CHANGED
|
@@ -9,9 +9,9 @@ from typing import Literal, get_args
|
|
|
9
9
|
from aiohttp.client import ClientResponse
|
|
10
10
|
from aiohttp_retry import RetryClient
|
|
11
11
|
from cattrs.gen import make_dict_structure_fn, override
|
|
12
|
-
from cattrs.preconf.orjson import make_converter
|
|
13
12
|
from yarl import URL
|
|
14
13
|
|
|
14
|
+
from protein_quest.converter import converter
|
|
15
15
|
from protein_quest.go import TextIOWrapper
|
|
16
16
|
from protein_quest.utils import friendly_session
|
|
17
17
|
|
|
@@ -42,8 +42,6 @@ class SearchTaxonResponse:
|
|
|
42
42
|
results: list[Taxon]
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
converter = make_converter()
|
|
46
|
-
|
|
47
45
|
converter.register_structure_hook(
|
|
48
46
|
Taxon,
|
|
49
47
|
make_dict_structure_fn(
|
protein_quest/utils.py
CHANGED
|
@@ -2,11 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import logging
|
|
5
|
+
import shutil
|
|
5
6
|
from collections.abc import Coroutine, Iterable
|
|
6
7
|
from contextlib import asynccontextmanager
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from textwrap import dedent
|
|
9
|
-
from typing import Any
|
|
10
|
+
from typing import Any, Literal, get_args
|
|
10
11
|
|
|
11
12
|
import aiofiles
|
|
12
13
|
import aiohttp
|
|
@@ -138,3 +139,29 @@ def run_async[R](coroutine: Coroutine[Any, Any, R]) -> R:
|
|
|
138
139
|
return asyncio.run(coroutine)
|
|
139
140
|
except RuntimeError as e:
|
|
140
141
|
raise NestedAsyncIOLoopError from e
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
CopyMethod = Literal["copy", "symlink"]
|
|
145
|
+
copy_methods = set(get_args(CopyMethod))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def copyfile(source: Path, target: Path, copy_method: CopyMethod = "copy"):
|
|
149
|
+
"""Make target path be same file as source by either copying or symlinking.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
source: The source file to copy or symlink.
|
|
153
|
+
target: The target file to create.
|
|
154
|
+
copy_method: The method to use for copying.
|
|
155
|
+
|
|
156
|
+
Raises:
|
|
157
|
+
FileNotFoundError: If the source file or parent of target does not exist.
|
|
158
|
+
ValueError: If the method is not "copy" or "symlink".
|
|
159
|
+
"""
|
|
160
|
+
if copy_method == "copy":
|
|
161
|
+
shutil.copyfile(source, target)
|
|
162
|
+
elif copy_method == "symlink":
|
|
163
|
+
rel_source = source.relative_to(target.parent, walk_up=True)
|
|
164
|
+
target.symlink_to(rel_source)
|
|
165
|
+
else:
|
|
166
|
+
msg = f"Unknown method: {copy_method}"
|
|
167
|
+
raise ValueError(msg)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: protein_quest
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Search/retrieve/filter proteins and protein structures
|
|
5
5
|
Project-URL: Homepage, https://github.com/haddocking/protein-quest
|
|
6
6
|
Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
|
|
@@ -59,9 +59,11 @@ graph TB;
|
|
|
59
59
|
searchpdbe -->|pdb_ids|fetchpdbe[Retrieve PDBe]
|
|
60
60
|
searchaf --> |uniprot_accessions|fetchad(Retrieve AlphaFold)
|
|
61
61
|
searchemdb -. emdb_ids .->fetchemdb[Retrieve EMDB]
|
|
62
|
-
fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{Filter on chain of uniprot}
|
|
63
|
-
chainfilter --> |mmcif_files| residuefilter{Filter on chain length}
|
|
64
|
-
fetchad -->|pdb_files| confidencefilter{Filter out low confidence}
|
|
62
|
+
fetchpdbe -->|mmcif_files_with_uniprot_acc| chainfilter{{Filter on chain of uniprot}}
|
|
63
|
+
chainfilter --> |mmcif_files| residuefilter{{Filter on chain length}}
|
|
64
|
+
fetchad -->|pdb_files| confidencefilter{{Filter out low confidence}}
|
|
65
|
+
confidencefilter --> |mmcif_files| ssfilter{{Filter on secondary structure}}
|
|
66
|
+
residuefilter --> |mmcif_files| ssfilter
|
|
65
67
|
classDef dashedBorder stroke-dasharray: 5 5;
|
|
66
68
|
goterm:::dashedBorder
|
|
67
69
|
taxonomy:::dashedBorder
|
|
@@ -175,6 +177,18 @@ protein-quest filter residue \
|
|
|
175
177
|
./filtered-chains ./filtered
|
|
176
178
|
```
|
|
177
179
|
|
|
180
|
+
### To filter on secondary structure
|
|
181
|
+
|
|
182
|
+
To keep only structures that are mostly alpha helices and have no beta sheets:
|
|
183
|
+
|
|
184
|
+
```shell
|
|
185
|
+
protein-quest filter secondary-structure \
|
|
186
|
+
--ratio-min-helix-residues 0.5 \
|
|
187
|
+
--ratio-max-sheet-residues 0.0 \
|
|
188
|
+
--write-stats filtered-ss/stats.csv \
|
|
189
|
+
./filtered-chains ./filtered-ss
|
|
190
|
+
```
|
|
191
|
+
|
|
178
192
|
### Search Taxonomy
|
|
179
193
|
|
|
180
194
|
```shell
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
protein_quest/__version__.py,sha256=tDIN8WjNdFKRoXsf6tArV0_n6nbcPEBWNv1zuhaRbKo,56
|
|
3
|
+
protein_quest/cli.py,sha256=k4HC282QkbAAIk614vIJgaKfkS3XD9hYj7E5hEuiDxA,37893
|
|
4
|
+
protein_quest/converter.py,sha256=tSDw7HOlC7UoWryr_G-sHGzGq8nwflzSq8o7Gv1hWuQ,1382
|
|
5
|
+
protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
|
|
6
|
+
protein_quest/filters.py,sha256=-gasSXR4g5SzYSYbkfcDwR-tm2KCAhCMdpIVJrUPR1w,5224
|
|
7
|
+
protein_quest/go.py,sha256=lZNEcw8nTc9wpV3cl4y2FG9Lsj8wsXQ6zemmAQs_DWE,5650
|
|
8
|
+
protein_quest/mcp_server.py,sha256=auftrx4aBZp1P-pBcunkPiSmXLtOIZ6MTuhUuW7yrGY,7241
|
|
9
|
+
protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
|
|
10
|
+
protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
protein_quest/ss.py,sha256=MMHgqKPxjYpjyExiqslWjmyG7aeForeAeJorCYdh75g,9663
|
|
12
|
+
protein_quest/taxonomy.py,sha256=4mKv8zll4mX02Ow8CTvyqMJE2KJZvcq3QlTjjjLOJJk,5072
|
|
13
|
+
protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
|
|
14
|
+
protein_quest/utils.py,sha256=z4PPPcog6nvPhA93DWVf7stv5uJ4h_2BP5owdhoO5mo,5626
|
|
15
|
+
protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
|
|
16
|
+
protein_quest/alphafold/confidence.py,sha256=pYIuwYdkuPuHLagcX1dSvSyZ_84xboRLfHUxkEoc4MY,6766
|
|
17
|
+
protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
|
|
18
|
+
protein_quest/alphafold/fetch.py,sha256=iFHORaO-2NvPwmpm33tfOFUcSJx8mBGwMXxwc4bRuk8,11336
|
|
19
|
+
protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
|
|
20
|
+
protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
|
|
21
|
+
protein_quest/pdbe/io.py,sha256=iGLvmsD-eEYnrgZDYfkGWIDCzwDRRD5dwqB480talCs,10037
|
|
22
|
+
protein_quest-0.3.2.dist-info/METADATA,sha256=wcURSjBlmkCt-ddhZX7xRYrL-7tT1VuBpJ36_mP0Iuk,7760
|
|
23
|
+
protein_quest-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
24
|
+
protein_quest-0.3.2.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
|
|
25
|
+
protein_quest-0.3.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
26
|
+
protein_quest-0.3.2.dist-info/RECORD,,
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
protein_quest/__version__.py,sha256=Bu2gp24I4eIxc1qgY2e0PnF8N-szjUpFQwVAe10IRAo,56
|
|
3
|
-
protein_quest/cli.py,sha256=xjiWtRDqv-Ruv1fpvXq4dmDSuuyewxw81akDs1ktVbI,31772
|
|
4
|
-
protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
|
|
5
|
-
protein_quest/filters.py,sha256=3vqfFH87Lz7r9uYiSvwMxzShMfRNv1Zv_freJtDljrU,4051
|
|
6
|
-
protein_quest/go.py,sha256=ycV3-grxuIKFt28bFgH6iRKmt5AEGi7txoTbaAnBxQE,5684
|
|
7
|
-
protein_quest/mcp_server.py,sha256=1_CGC0peqoNUFBvgFWupKwIWjmHsKxN5Vxy1K7dt5Dw,7130
|
|
8
|
-
protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
|
|
9
|
-
protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
protein_quest/taxonomy.py,sha256=wPzLjum5n_SEkL2rHUKvyRnjL1pG7bhEnE2vMmXixEc,5105
|
|
11
|
-
protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
|
|
12
|
-
protein_quest/utils.py,sha256=YhlTJreIr1bExbh1M514l6sz4GmLVa3RN57mI1kjjuw,4730
|
|
13
|
-
protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
|
|
14
|
-
protein_quest/alphafold/confidence.py,sha256=GGd_vYsqVvs9InvFKtqHdGKB_61GHllPmDyIztvzG7E,5625
|
|
15
|
-
protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
|
|
16
|
-
protein_quest/alphafold/fetch.py,sha256=1mDbQNm01cxlwFNDsKHBWD7MEwzB3PaheskdaLN7XJs,11491
|
|
17
|
-
protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
|
|
18
|
-
protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
|
|
19
|
-
protein_quest/pdbe/io.py,sha256=J6fHlRLHLALnpxDgSUUnFCNFV9Hr3u6eJDO6j81ftT4,6936
|
|
20
|
-
protein_quest-0.3.1.dist-info/METADATA,sha256=fWvmMbm5aEMb3WbWgPAqwEOWeYJSY47iuZLaRIgBuuk,7305
|
|
21
|
-
protein_quest-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
22
|
-
protein_quest-0.3.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
|
|
23
|
-
protein_quest-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
24
|
-
protein_quest-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|