protein-quest 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of protein-quest might be problematic.
- protein_quest/__version__.py +2 -1
- protein_quest/alphafold/confidence.py +2 -2
- protein_quest/alphafold/entry_summary.py +11 -9
- protein_quest/alphafold/fetch.py +37 -61
- protein_quest/cli.py +35 -18
- protein_quest/filters.py +43 -32
- protein_quest/mcp_server.py +4 -5
- protein_quest/parallel.py +37 -1
- protein_quest/pdbe/fetch.py +15 -1
- protein_quest/pdbe/io.py +25 -10
- protein_quest/taxonomy.py +12 -0
- protein_quest/utils.py +38 -3
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/METADATA +4 -8
- protein_quest-0.3.1.dist-info/RECORD +24 -0
- protein_quest-0.3.0.dist-info/RECORD +0 -24
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/WHEEL +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/entry_points.txt +0 -0
- {protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/licenses/LICENSE +0 -0
protein_quest/__version__.py
CHANGED
@@ -1 +1,2 @@
-__version__ = "0.3.0"
+__version__ = "0.3.1"
+"""The version of the package."""
protein_quest/alphafold/confidence.py
CHANGED
@@ -98,7 +98,7 @@ class ConfidenceFilterResult:
 def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
-    """Filter a single AlphaFoldDB structure file based on confidence.
+    """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.

     Args:
         file: The path to the PDB file to filter.
@@ -107,7 +107,7 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d

     Returns:
         result with filtered_file property set to Path where filtered PDB file is saved.
-
+            or None if structure was filtered out.
     """
     structure = gemmi.read_structure(str(file))
     residues = set(find_high_confidence_residues(structure, query.confidence))
protein_quest/alphafold/entry_summary.py
CHANGED
@@ -1,12 +1,14 @@
 # ruff: noqa: N815 allow camelCase follow what api returns
 from dataclasses import dataclass

+from yarl import URL
+

 @dataclass
 class EntrySummary:
     """Dataclass representing a summary of an AlphaFold entry.

-    Modelled after EntrySummary in https://alphafold.ebi.ac.uk/api/openapi.json
+    Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
     """

     entryId: str
@@ -21,17 +23,17 @@ class EntrySummary:
     modelCreatedDate: str
     latestVersion: int
     allVersions: list[int]
-    bcifUrl:
-    cifUrl:
-    pdbUrl:
-    paeImageUrl:
-    paeDocUrl:
+    bcifUrl: URL
+    cifUrl: URL
+    pdbUrl: URL
+    paeImageUrl: URL
+    paeDocUrl: URL
     gene: str | None = None
     sequenceChecksum: str | None = None
     sequenceVersionDate: str | None = None
-    amAnnotationsUrl:
-    amAnnotationsHg19Url:
-    amAnnotationsHg38Url:
+    amAnnotationsUrl: URL | None = None
+    amAnnotationsHg19Url: URL | None = None
+    amAnnotationsHg38Url: URL | None = None
     isReviewed: bool | None = None
     isReferenceProteome: bool | None = None
     # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
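EntrySummary now types its download links as `yarl.URL`, so any cattrs converter that structures the prediction JSON needs a structure hook for `URL` (the fetch module registers exactly this hook, as the next diff shows). A minimal sketch of that pattern, using a trimmed-down, hypothetical `Summary` stand-in rather than the real `EntrySummary` and a made-up entry:

```python
# Sketch only: `Summary` is a hypothetical stand-in for EntrySummary.
from dataclasses import dataclass

from cattrs.preconf.orjson import make_converter
from yarl import URL


@dataclass
class Summary:
    entryId: str
    cifUrl: URL


converter = make_converter()
# Without this hook cattrs cannot build a yarl.URL from the JSON string.
converter.register_structure_hook(URL, lambda v, _: URL(v))

doc = b'[{"entryId": "AF-P12345-F1", "cifUrl": "https://alphafold.ebi.ac.uk/files/AF-P12345-F1-model_v4.cif"}]'
summaries = converter.loads(doc, list[Summary])
print(summaries[0].cifUrl.name)  # AF-P12345-F1-model_v4.cif
```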
protein_quest/alphafold/fetch.py
CHANGED
@@ -1,26 +1,28 @@
 """Module for fetch Alphafold data."""

-import asyncio
 import logging
 from asyncio import Semaphore
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from pathlib import Path
-from
-from typing import Literal
+from typing import Literal, cast, get_args

 from aiohttp_retry import RetryClient
 from aiopath import AsyncPath
 from cattrs.preconf.orjson import make_converter
 from tqdm.asyncio import tqdm
+from yarl import URL

 from protein_quest.alphafold.entry_summary import EntrySummary
-from protein_quest.utils import friendly_session, retrieve_files
+from protein_quest.utils import friendly_session, retrieve_files, run_async

 logger = logging.getLogger(__name__)
 converter = make_converter()
+"""cattrs converter to read AlphaFold summary JSON document."""
+converter.register_structure_hook(URL, lambda v, _: URL(v))

 DownloadableFormat = Literal[
+    "summary",
     "bcif",
     "cif",
     "pdb",
@@ -32,16 +34,7 @@ DownloadableFormat = Literal[
 ]
 """Types of formats that can be downloaded from the AlphaFold web service."""

-downloadable_formats: set[DownloadableFormat] =
-    "bcif",
-    "cif",
-    "pdb",
-    "paeImage",
-    "paeDoc",
-    "amAnnotations",
-    "amAnnotationsHg19",
-    "amAnnotationsHg38",
-}
+downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
 """Set of formats that can be downloaded from the AlphaFold web service."""

@@ -59,6 +52,7 @@ class AlphaFoldEntry:

     uniprot_acc: str
     summary: EntrySummary | None
+    summary_file: Path | None = None
     bcif_file: Path | None = None
     cif_file: Path | None = None
     pdb_file: Path | None = None
@@ -127,10 +121,6 @@ async def fetch_summary(

     Returns:
         A list of EntrySummary objects representing the fetched summary.
-
-    Raises:
-        HTTPError: If the HTTP request returns an error status code.
-        Exception: If there is an error during file reading/writing or data conversion.
     """
     url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
     fn: AsyncPath | None = None
@@ -144,6 +134,7 @@ async def fetch_summary(
         response.raise_for_status()
         raw_data = await response.content.read()
         if fn is not None:
+            # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
             await fn.write_bytes(raw_data)
         return converter.loads(raw_data, list[EntrySummary])

@@ -164,19 +155,14 @@ async def fetch_summaries(
             yield summary


-def url2name(url: str) -> str:
-    """Given a URL, return the final path component as the name of the file."""
-    return url.split("/")[-1]
-
-
 async def fetch_many_async(
-
+    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> AsyncGenerator[AlphaFoldEntry]:
-    """Asynchronously fetches summaries and
+    """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).

     Args:
-
+        uniprot_accessions: A set of Uniprot acessions to fetch.
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
@@ -184,7 +170,13 @@ async def fetch_many_async(
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
-
+    save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+    summaries = [
+        s
+        async for s in fetch_summaries(
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+        )
+    ]

     files = files_to_download(what, summaries)

@@ -198,30 +190,31 @@ async def fetch_many_async(
         yield AlphaFoldEntry(
             uniprot_acc=summary.uniprotAccession,
             summary=summary,
-
-
-
-
-
+            summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
+            bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
+            cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
+            pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
+            pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
+            pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
             am_annotations_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsUrl.name
                 if "amAnnotations" in what and summary.amAnnotationsUrl
                 else None
             ),
             am_annotations_hg19_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg19Url.name
                 if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
                 else None
             ),
             am_annotations_hg38_file=(
-                save_dir /
+                save_dir / summary.amAnnotationsHg38Url.name
                 if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
                 else None
             ),
         )


-def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[
+def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
     if not (set(what) <= downloadable_formats):
         msg = (
             f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -229,24 +222,21 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
         )
         raise ValueError(msg)

-    files: set[tuple[
+    files: set[tuple[URL, str]] = set()
     for summary in summaries:
         for fmt in what:
-
+            if fmt == "summary":
+                # summary is handled already in fetch_summary
+                continue
+            url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
             if url is None:
                 logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
                 continue
-            file = (url,
+            file = (url, url.name)
             files.add(file)
     return files


-class NestedAsyncIOLoopError(RuntimeError):
-    """Custom error for nested async I/O loops."""
-
-    pass
-
-
 def fetch_many(
     ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
 ) -> list[AlphaFoldEntry]:
@@ -260,9 +250,6 @@ def fetch_many(

     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
-
-    Raises:
-        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
     """

     async def gather_entries():
@@ -271,19 +258,7 @@ def fetch_many(
             async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
         ]

-
-        return asyncio.run(gather_entries())
-    except RuntimeError as e:
-        msg = dedent("""\
-            Can not run async method from an environment where the asyncio event loop is already running.
-            Like a Jupyter notebook.
-
-            Please use the `fetch_many_async` function directly or before call
-
-            import nest_asyncio
-            nest_asyncio.apply()
-            """)
-        raise NestedAsyncIOLoopError(msg) from e
+    return run_async(gather_entries())


 def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
@@ -299,6 +274,7 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
     return AlphaFoldEntry(
         uniprot_acc=entry.uniprot_acc,
         summary=entry.summary,
+        summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
         bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
         cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
         pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,
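With `summary` added as a downloadable format and the nest-asyncio handling moved into `protein_quest.utils.run_async`, a synchronous caller can use `fetch_many` roughly as below. This is a usage sketch based only on the signatures in the diff; the accession and directory are placeholders. Inside an already-running event loop (for example a Jupyter cell) `run_async` raises `NestedAsyncIOLoopError`, so use `fetch_many_async` there instead.

```python
# Sketch: download summary JSON and mmCIF files for one UniProt accession.
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many

entries = fetch_many(
    ["P12345"],               # placeholder UniProt accession
    Path("af_downloads"),
    what={"summary", "cif"},  # "summary" also keeps the prediction JSON on disk
    max_parallel_downloads=5,
)
for entry in entries:
    # summary_file/cif_file are only set for the formats that were requested.
    print(entry.uniprot_acc, entry.summary_file, entry.cif_file)
```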
protein_quest/cli.py
CHANGED
@@ -5,7 +5,8 @@ import asyncio
 import csv
 import logging
 import os
-
+import sys
+from collections.abc import Callable, Generator, Iterable
 from importlib.util import find_spec
 from io import TextIOWrapper
 from pathlib import Path
@@ -14,6 +15,7 @@ from textwrap import dedent
 from cattrs import structure
 from rich import print as rprint
 from rich.logging import RichHandler
+from rich.panel import Panel
 from rich_argparse import ArgumentDefaultsRichHelpFormatter
 from tqdm.rich import tqdm

@@ -25,7 +27,7 @@ from protein_quest.emdb import fetch as emdb_fetch
 from protein_quest.filters import filter_files_on_chain, filter_files_on_residues
 from protein_quest.go import Aspect, allowed_aspects, search_gene_ontology_term, write_go_terms_to_csv
 from protein_quest.pdbe import fetch as pdbe_fetch
-from protein_quest.pdbe.io import glob_structure_files
+from protein_quest.pdbe.io import glob_structure_files, locate_structure_file
 from protein_quest.taxonomy import SearchField, _write_taxonomy_csv, search_fields, search_taxon
 from protein_quest.uniprot import PdbResult, Query, search4af, search4emdb, search4pdb, search4uniprot

@@ -246,12 +248,12 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded AlphaFold files")
     parser.add_argument(
-        "--what-
+        "--what-formats",
         type=str,
         action="append",
         choices=sorted(downloadable_formats),
         help=dedent("""AlphaFold formats to retrieve. Can be specified multiple times.
-            Default is '
+            Default is 'summary' and 'cif'."""),
     )
     parser.add_argument(
         "--max-parallel-downloads",
@@ -585,17 +587,17 @@ def _handle_retrieve_pdbe(args):

 def _handle_retrieve_alphafold(args):
     download_dir = args.output_dir
-
+    what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads

-    if
-
+    if what_formats is None:
+        what_formats = {"summary", "cif"}

     # TODO besides `uniprot_acc,af_id\n` csv also allow headless single column format
     #
-    af_ids =
-    validated_what: set[DownloadableFormat] = structure(
+    af_ids = _read_column_from_csv(alphafold_csv, "af_id")
+    validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
     afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
     total_nr_files = sum(af.nr_of_files() for af in afs)
@@ -658,12 +660,32 @@ def _handle_filter_chain(args):
     pdb_id2chain_mapping_file = args.chains
     scheduler_address = args.scheduler_address

+    # make sure files in input dir with entries in mapping file are the same
+    # complain when files from mapping file are missing on disk
     rows = list(_iter_csv_rows(pdb_id2chain_mapping_file))
-
+    file2chain: set[tuple[Path, str]] = set()
+    errors: list[FileNotFoundError] = []

-
+    for row in rows:
+        pdb_id = row["pdb_id"]
+        chain = row["chain"]
+        try:
+            f = locate_structure_file(input_dir, pdb_id)
+            file2chain.add((f, chain))
+        except FileNotFoundError as e:
+            errors.append(e)

-
+    if errors:
+        msg = f"Some structure files could not be found ({len(errors)} missing), skipping them"
+        rprint(Panel(os.linesep.join(map(str, errors)), title=msg, style="red"))
+
+    if not file2chain:
+        rprint("[red]No valid structure files found. Exiting.")
+        sys.exit(1)
+
+    results = filter_files_on_chain(file2chain, output_dir, scheduler_address=scheduler_address)
+
+    nr_written = len([r for r in results if r.passed])

     rprint(f"Wrote {nr_written} single-chain PDB/mmCIF files to {output_dir}.")

@@ -768,12 +790,7 @@ def _write_dict_of_sets2csv(file: TextIOWrapper, data: dict[str, set[str]], ref_
         writer.writerow({"uniprot_acc": uniprot_acc, ref_id_field: ref_id})


-def
-    reader = csv.DictReader(file)
-    yield from reader
-
-
-def _iter_csv_rows(file: TextIOWrapper):
+def _iter_csv_rows(file: TextIOWrapper) -> Generator[dict[str, str]]:
     reader = csv.DictReader(file)
     yield from reader

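The handler now validates the requested formats by structuring them into `set[DownloadableFormat]` with cattrs, so an unknown format fails before any download starts. A small sketch of that validation step (the invalid value is deliberate):

```python
# Sketch of the cattrs-based validation used by _handle_retrieve_alphafold.
from cattrs import structure

from protein_quest.alphafold.fetch import DownloadableFormat

what_formats = {"summary", "cif"}  # the new default when --what-formats is omitted
validated = structure(what_formats, set[DownloadableFormat])
print(validated)

try:
    structure({"cif", "bogus"}, set[DownloadableFormat])
except Exception as exc:
    print(f"rejected invalid format: {exc}")
```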
protein_quest/filters.py
CHANGED
@@ -1,19 +1,17 @@
 """Module for filtering structure files and their contents."""

 import logging
-from collections.abc import Generator
+from collections.abc import Collection, Generator
 from dataclasses import dataclass
 from pathlib import Path
 from shutil import copyfile
-from typing import cast

-from dask.distributed import Client
+from dask.distributed import Client
 from distributed.deploy.cluster import Cluster
 from tqdm.auto import tqdm

-from protein_quest.parallel import configure_dask_scheduler
+from protein_quest.parallel import configure_dask_scheduler, dask_map_with_progress
 from protein_quest.pdbe.io import (
-    locate_structure_file,
     nr_residues_in_chain,
     write_single_chain_pdb_file,
 )
@@ -21,25 +19,48 @@ from protein_quest.pdbe.io import (
 logger = logging.getLogger(__name__)


+@dataclass
+class ChainFilterStatistics:
+    input_file: Path
+    chain_id: str
+    passed: bool = False
+    output_file: Path | None = None
+    discard_reason: Exception | None = None
+
+
+def filter_file_on_chain(
+    file_and_chain: tuple[Path, str], output_dir: Path, out_chain: str = "A"
+) -> ChainFilterStatistics:
+    input_file, chain_id = file_and_chain
+    try:
+        output_file = write_single_chain_pdb_file(input_file, chain_id, output_dir, out_chain=out_chain)
+        return ChainFilterStatistics(
+            input_file=input_file,
+            chain_id=chain_id,
+            output_file=output_file,
+            passed=True,
+        )
+    except Exception as e:  # noqa: BLE001 - error is handled downstream
+        return ChainFilterStatistics(input_file=input_file, chain_id=chain_id, discard_reason=e)
+
+
 def filter_files_on_chain(
-
-    id2chains: dict[str, str],
+    file2chains: Collection[tuple[Path, str]],
     output_dir: Path,
-    scheduler_address: str | Cluster | None = None,
     out_chain: str = "A",
-
+    scheduler_address: str | Cluster | None = None,
+) -> list[ChainFilterStatistics]:
     """Filter mmcif/PDB files by chain.

     Args:
-
-
+        file2chains: Which chain to keep for each PDB file.
+            First item is the PDB file path, second item is the chain ID.
         output_dir: The directory where the filtered files will be written.
-        scheduler_address: The address of the Dask scheduler.
         out_chain: Under what name to write the kept chain.
+        scheduler_address: The address of the Dask scheduler.

     Returns:
-
-        Last tuple item is None if something went wrong like chain not present.
+        Result of the filtering process.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     scheduler_address = configure_dask_scheduler(
@@ -47,24 +68,14 @@ def filter_files_on_chain(
         name="filter-chain",
     )

-    def task(id2chain: tuple[str, str]) -> tuple[str, str, Path | None]:
-        pdb_id, chain = id2chain
-        input_file = locate_structure_file(input_dir, pdb_id)
-        return pdb_id, chain, write_single_chain_pdb_file(input_file, chain, output_dir, out_chain=out_chain)
-
     with Client(scheduler_address) as client:
-
-
-
-
-        progress(futures)
-
-        results = client.gather(futures)
-        return cast("list[tuple[str,str, Path | None]]", results)
+        return dask_map_with_progress(
+            client, filter_file_on_chain, file2chains, output_dir=output_dir, out_chain=out_chain
+        )


 @dataclass
-class
+class ResidueFilterStatistics:
     """Statistics for filtering files based on residue count in a specific chain.

     Parameters:
@@ -82,7 +93,7 @@ class FilterStat:

 def filter_files_on_residues(
     input_files: list[Path], output_dir: Path, min_residues: int, max_residues: int, chain: str = "A"
-) -> Generator[
+) -> Generator[ResidueFilterStatistics]:
     """Filter PDB/mmCIF files by number of residues in given chain.

     Args:
@@ -93,7 +104,7 @@ def filter_files_on_residues(
         chain: The chain to count residues of.

     Yields:
-
+        Objects containing information about the filtering process for each input file.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     for input_file in tqdm(input_files, unit="file"):
@@ -102,6 +113,6 @@ def filter_files_on_residues(
         if passed:
             output_file = output_dir / input_file.name
             copyfile(input_file, output_file)
-            yield
+            yield ResidueFilterStatistics(input_file, residue_count, True, output_file)
         else:
-            yield
+            yield ResidueFilterStatistics(input_file, residue_count, False, None)
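`filter_files_on_chain` now takes (file, chain) pairs instead of a PDB-id-to-chain mapping plus an input directory, and it reports one `ChainFilterStatistics` per pair instead of bare tuples. A hedged usage sketch with placeholder paths:

```python
# Sketch: keep chain B of one file and chain C of another, writing each as chain A.
from pathlib import Path

from protein_quest.filters import filter_files_on_chain

file2chains = {
    (Path("downloads/1abc.cif"), "B"),  # placeholder files and chains
    (Path("downloads/2xyz.cif"), "C"),
}
results = filter_files_on_chain(file2chains, Path("single_chain"))
for r in results:
    if r.passed:
        print(f"{r.input_file} chain {r.chain_id} -> {r.output_file}")
    else:
        print(f"{r.input_file} skipped: {r.discard_reason}")
```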
protein_quest/mcp_server.py
CHANGED
@@ -24,12 +24,11 @@ npx @modelcontextprotocol/inspector
 # Choose STDIO
 # command: uv run protein-quest mcp
 # id: protein-quest
-# Prompt: What are the PDBe structures for `A8MT69` uniprot accession?
 ```

 Examples:

-
+- What are the PDBe structures for `A8MT69` uniprot accession?

 """

@@ -90,7 +89,7 @@ def extract_single_chain_from_structure(
     chain2keep: str,
     output_dir: Path,
     out_chain: str = "A",
-) -> Path
+) -> Path:
     """
     Extract a single chain from a mmCIF/pdb file and write to a new file.

@@ -101,7 +100,7 @@ def extract_single_chain_from_structure(
         out_chain: The chain identifier for the output file.

     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output mmCIF/pdb file
     """
     return write_single_chain_pdb_file(input_file, chain2keep, output_dir, out_chain)

@@ -150,7 +149,7 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     Returns:
         A list of AlphaFold entries.
     """
-    what: set[DownloadableFormat] = {"cif"}
+    what: set[DownloadableFormat] = {"summary", "cif"}
     return alphafold_fetch(uniprot_accs, save_dir, what)

protein_quest/parallel.py
CHANGED
@@ -2,8 +2,10 @@

 import logging
 import os
+from collections.abc import Callable, Collection
+from typing import Concatenate, ParamSpec, cast

-from dask.distributed import LocalCluster
+from dask.distributed import Client, LocalCluster, progress
 from distributed.deploy.cluster import Cluster
 from psutil import cpu_count

@@ -66,3 +68,37 @@ def _configure_cpu_dask_scheduler(nproc: int, name: str) -> LocalCluster:
     n_workers = total_cpus // nproc
     # Use single thread per worker to prevent GIL slowing down the computations
     return LocalCluster(name=name, threads_per_worker=1, n_workers=n_workers)
+
+
+# Generic type parameters used across helpers
+P = ParamSpec("P")
+
+
+def dask_map_with_progress[T, R, **P](
+    client: Client,
+    func: Callable[Concatenate[T, P], R],
+    iterable: Collection[T],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> list[R]:
+    """
+    Wrapper for map, progress, and gather of Dask that returns a correctly typed list.
+
+    Args:
+        client: Dask client.
+        func: Function to map; first parameter comes from ``iterable`` and any
+            additional parameters can be provided positionally via ``*args`` or
+            as keyword arguments via ``**kwargs``.
+        iterable: Collection of arguments to map over.
+        *args: Additional positional arguments to pass to client.map().
+        **kwargs: Additional keyword arguments to pass to client.map().
+
+    Returns:
+        List of results of type returned by `func` function.
+    """
+    if client.dashboard_link:
+        logger.info(f"Follow progress on dask dashboard at: {client.dashboard_link}")
+    futures = client.map(func, iterable, *args, **kwargs)
+    progress(futures)
+    results = client.gather(futures)
+    return cast("list[R]", results)
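`dask_map_with_progress` factors out the map/progress/gather pattern that `filter_files_on_chain` previously inlined; extra keyword arguments are forwarded through `client.map()` to the mapped function. A minimal sketch with a throwaway local cluster and a trivial function (cluster size and the `add` function are arbitrary):

```python
# Sketch: map a simple function over a list with a local Dask cluster.
from dask.distributed import Client, LocalCluster

from protein_quest.parallel import dask_map_with_progress


def add(x: int, offset: int = 0) -> int:
    return x + offset


if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        # offset=10 is forwarded to every add() call via client.map().
        results = dask_map_with_progress(client, add, [1, 2, 3], offset=10)
        print(results)  # [11, 12, 13]
```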
protein_quest/pdbe/fetch.py
CHANGED
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path

-from protein_quest.utils import retrieve_files
+from protein_quest.utils import retrieve_files, run_async


 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -49,3 +49,17 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int

     await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
     return id2paths
+
+
+def sync_fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+    """Synchronously fetches mmCIF files from the PDBe database.
+
+    Args:
+        ids: A set of PDB IDs to fetch.
+        save_dir: The directory to save the fetched mmCIF files to.
+        max_parallel_downloads: The maximum number of parallel downloads.
+
+    Returns:
+        A dict of id and paths to the downloaded mmCIF files.
+    """
+    return run_async(fetch(ids, save_dir, max_parallel_downloads))
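`sync_fetch` wraps the existing async `fetch` in `run_async`, so scripts can download PDBe mmCIF files without touching asyncio. A sketch with example PDB IDs:

```python
# Sketch: fetch two PDBe entries synchronously.
from pathlib import Path

from protein_quest.pdbe.fetch import sync_fetch

id2path = sync_fetch(["1ubq", "4hhb"], Path("mmcif"), max_parallel_downloads=2)
for pdb_id, path in id2path.items():
    print(pdb_id, path)
```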
protein_quest/pdbe/io.py
CHANGED
@@ -11,6 +11,11 @@ from protein_quest import __version__

 logger = logging.getLogger(__name__)

+# TODO remove once v0.7.4 of gemmi is released,
+# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
+# Swallow gemmi leaked function warnings
+gemmi.set_leak_warnings(False)
+

 def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
     """Returns the number of residues in a specific chain from a mmCIF/pdb file.
@@ -131,9 +136,16 @@ def glob_structure_files(input_dir: Path) -> Generator[Path]:
         yield from input_dir.glob(f"*{ext}")


-
-
-
+class ChainNotFoundError(IndexError):
+    """Exception raised when a chain is not found in a structure."""
+
+    def __init__(self, chain: str, file: Path | str):
+        super().__init__(f"Chain {chain} not found in {file}")
+        self.chain_id = chain
+        self.file = file
+
+
+def write_single_chain_pdb_file(input_file: Path, chain2keep: str, output_dir: Path, out_chain: str = "A") -> Path:
     """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.

     Args:
@@ -143,7 +155,11 @@ def write_single_chain_pdb_file(
         out_chain: The chain identifier for the output file.

     Returns:
-        Path to the output mmCIF/pdb file
+        Path to the output mmCIF/pdb file
+
+    Raises:
+        FileNotFoundError: If the input file does not exist.
+        ChainNotFoundError: If the specified chain is not found in the input file.
     """

     structure = gemmi.read_structure(str(input_file))
@@ -154,15 +170,14 @@ def write_single_chain_pdb_file(

     chain = find_chain_in_model(model, chain2keep)
     if chain is None:
-
-            "Chain %s not found in %s. Skipping.",
-            chain2keep,
-            input_file,
-        )
-        return None
+        raise ChainNotFoundError(chain2keep, input_file)
     name, extension = _split_name_and_extension(input_file.name)
     output_file = output_dir / f"{name}_{chain.name}2{out_chain}{extension}"

+    if output_file.exists():
+        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
+        return output_file
+
     new_structure = gemmi.Structure()
     new_structure.resolution = structure.resolution
     new_id = structure.name + f"{chain2keep}2{out_chain}"
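`write_single_chain_pdb_file` no longer returns `None` when the chain is missing; it raises `ChainNotFoundError`, and it returns the already-existing output path instead of rewriting it. A sketch of the new error-based contract, with a placeholder input file:

```python
# Sketch: extract chain B from a structure file, handling the new exceptions.
from pathlib import Path

from protein_quest.pdbe.io import ChainNotFoundError, write_single_chain_pdb_file

try:
    out = write_single_chain_pdb_file(Path("downloads/1abc.cif"), "B", Path("single_chain"))
    print(f"wrote {out}")
except FileNotFoundError as e:
    print(f"input file missing: {e}")
except ChainNotFoundError as e:
    # The exception carries chain_id and file for reporting.
    print(f"chain {e.chain_id} not found in {e.file}")
```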
protein_quest/taxonomy.py
CHANGED
@@ -20,6 +20,16 @@ logger = logging.getLogger(__name__)

 @dataclass(frozen=True, slots=True)
 class Taxon:
+    """Dataclass representing a taxon.
+
+    Arguments:
+        taxon_id: The unique identifier for the taxon.
+        scientific_name: The scientific name of the taxon.
+        rank: The taxonomic rank of the taxon (e.g., species, genus).
+        common_name: The common name of the taxon (if available).
+        other_names: A set of other names for the taxon (if available).
+    """
+
     taxon_id: str
     scientific_name: str
     rank: str
@@ -47,7 +57,9 @@ converter.register_structure_hook(
 )

 SearchField = Literal["tax_id", "scientific", "common", "parent"]
+"""Type of search field"""
 search_fields: set[SearchField | None] = set(get_args(SearchField)) | {None}
+"""Set of valid search fields"""


 def _get_next_page(response: ClientResponse) -> URL | str | None:
protein_quest/utils.py
CHANGED
@@ -2,20 +2,23 @@

 import asyncio
 import logging
-from collections.abc import Iterable
+from collections.abc import Coroutine, Iterable
 from contextlib import asynccontextmanager
 from pathlib import Path
+from textwrap import dedent
+from typing import Any

 import aiofiles
 import aiohttp
 from aiohttp_retry import ExponentialRetry, RetryClient
 from tqdm.asyncio import tqdm
+from yarl import URL

 logger = logging.getLogger(__name__)


 async def retrieve_files(
-    urls: Iterable[tuple[str, str]],
+    urls: Iterable[tuple[URL | str, str]],
     save_dir: Path,
     max_parallel_downloads: int = 5,
     retries: int = 3,
@@ -45,7 +48,7 @@ async def retrieve_files(

 async def _retrieve_file(
     session: RetryClient,
-    url: str,
+    url: URL | str,
     save_path: Path,
     semaphore: asyncio.Semaphore,
     ovewrite: bool = False,
@@ -103,3 +106,35 @@ async def friendly_session(retries: int = 3, total_timeout: int = 300):
     async with aiohttp.ClientSession(timeout=timeout) as session:
         client = RetryClient(client_session=session, retry_options=retry_options)
         yield client
+
+
+class NestedAsyncIOLoopError(RuntimeError):
+    """Custom error for nested async I/O loops."""
+
+    def __init__(self) -> None:
+        msg = dedent("""\
+            Can not run async method from an environment where the asyncio event loop is already running.
+            Like a Jupyter notebook.
+
+            Please use the async function directly or
+            call `import nest_asyncio; nest_asyncio.apply()` and try again.
+            """)
+        super().__init__(msg)
+
+
+def run_async[R](coroutine: Coroutine[Any, Any, R]) -> R:
+    """Run an async coroutine with nicer error.
+
+    Args:
+        coroutine: The async coroutine to run.
+
+    Returns:
+        The result of the coroutine.
+
+    Raises:
+        NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
+    """
+    try:
+        return asyncio.run(coroutine)
+    except RuntimeError as e:
+        raise NestedAsyncIOLoopError from e
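`run_async` is a thin wrapper around `asyncio.run` that turns the "event loop already running" failure into `NestedAsyncIOLoopError`; `fetch_many` and `sync_fetch` are built on it. A minimal sketch with an arbitrary coroutine:

```python
# Sketch: run a coroutine synchronously with the friendlier error.
import asyncio

from protein_quest.utils import NestedAsyncIOLoopError, run_async


async def double(x: int) -> int:
    await asyncio.sleep(0)
    return 2 * x


try:
    print(run_async(double(21)))  # 42
except NestedAsyncIOLoopError:
    # Raised inside an already-running event loop (e.g. a Jupyter cell);
    # there, await the coroutine directly or apply nest_asyncio first.
    print("already inside an event loop")
```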
{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.3.0
+Version: 0.3.1
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -13,19 +13,16 @@ Requires-Dist: aiohttp-retry>=2.9.1
 Requires-Dist: aiohttp[speedups]>=3.11.18
 Requires-Dist: aiopath>=0.7.7
 Requires-Dist: attrs>=25.3.0
-Requires-Dist: bokeh>=3.7.3
 Requires-Dist: cattrs[orjson]>=24.1.3
 Requires-Dist: dask>=2025.5.1
 Requires-Dist: distributed>=2025.5.1
 Requires-Dist: gemmi>=0.7.3
-Requires-Dist: molviewspec>=1.6.0
-Requires-Dist: pandas>=2.3.0
-Requires-Dist: platformdirs>=4.3.8
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: rich-argparse>=1.7.1
 Requires-Dist: rich>=14.0.0
 Requires-Dist: sparqlwrapper>=2.0.0
 Requires-Dist: tqdm>=4.67.1
+Requires-Dist: yarl>=1.20.1
 Provides-Extra: mcp
 Requires-Dist: fastmcp>=2.11.3; extra == 'mcp'
 Requires-Dist: pydantic>=2.11.7; extra == 'mcp'
@@ -37,8 +34,7 @@ Description-Content-Type: text/markdown
 [](https://github.com/haddocking/protein-quest/actions/workflows/ci.yml)
 [](https://www.research-software.nl/software/protein-quest)
 [](https://pypi.org/project/protein-quest/)
-
-[](https://doi.org/10.5281/zenodo.15632658)
+[](https://doi.org/10.5281/zenodo.16941288)
 [](https://app.codacy.com/gh/haddocking/protein-quest/coverage?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage)

 Python package to search/retrieve/filter proteins and protein structures.
@@ -90,7 +86,7 @@ pip install git+https://github.com/haddocking/protein-quest.git

 The main entry point is the `protein-quest` command line tool which has multiple subcommands to perform actions.

-To use programmaticly, see [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
+To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).

 ### Search Uniprot accessions

protein_quest-0.3.1.dist-info/RECORD
ADDED
@@ -0,0 +1,24 @@
+protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/__version__.py,sha256=Bu2gp24I4eIxc1qgY2e0PnF8N-szjUpFQwVAe10IRAo,56
+protein_quest/cli.py,sha256=xjiWtRDqv-Ruv1fpvXq4dmDSuuyewxw81akDs1ktVbI,31772
+protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
+protein_quest/filters.py,sha256=3vqfFH87Lz7r9uYiSvwMxzShMfRNv1Zv_freJtDljrU,4051
+protein_quest/go.py,sha256=ycV3-grxuIKFt28bFgH6iRKmt5AEGi7txoTbaAnBxQE,5684
+protein_quest/mcp_server.py,sha256=1_CGC0peqoNUFBvgFWupKwIWjmHsKxN5Vxy1K7dt5Dw,7130
+protein_quest/parallel.py,sha256=ZJrLO1t2HXs4EeNctytvBTyROPBq-4-gLf35PiolHf0,3468
+protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+protein_quest/taxonomy.py,sha256=wPzLjum5n_SEkL2rHUKvyRnjL1pG7bhEnE2vMmXixEc,5105
+protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
+protein_quest/utils.py,sha256=YhlTJreIr1bExbh1M514l6sz4GmLVa3RN57mI1kjjuw,4730
+protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
+protein_quest/alphafold/confidence.py,sha256=GGd_vYsqVvs9InvFKtqHdGKB_61GHllPmDyIztvzG7E,5625
+protein_quest/alphafold/entry_summary.py,sha256=GtE3rT7wH3vIOOeiXY2s80Fo6EzdoqlcvakW8K591Yk,1257
+protein_quest/alphafold/fetch.py,sha256=1mDbQNm01cxlwFNDsKHBWD7MEwzB3PaheskdaLN7XJs,11491
+protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
+protein_quest/pdbe/fetch.py,sha256=tlCrWoaOrwxnQFrf-PnimUUa6lmtHwwysS51efYsBcA,2379
+protein_quest/pdbe/io.py,sha256=J6fHlRLHLALnpxDgSUUnFCNFV9Hr3u6eJDO6j81ftT4,6936
+protein_quest-0.3.1.dist-info/METADATA,sha256=fWvmMbm5aEMb3WbWgPAqwEOWeYJSY47iuZLaRIgBuuk,7305
+protein_quest-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+protein_quest-0.3.1.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
+protein_quest-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+protein_quest-0.3.1.dist-info/RECORD,,

protein_quest-0.3.0.dist-info/RECORD
DELETED
@@ -1,24 +0,0 @@
-protein_quest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-protein_quest/__version__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
-protein_quest/cli.py,sha256=oyDin6Z92Q17mUmTCasKgju3YUJbPu298gniNakQUwY,31121
-protein_quest/emdb.py,sha256=QEeU0VJQ4lLM-o5yAU3QZlrtzDZNgnC5fCjlqPtTyAY,1370
-protein_quest/filters.py,sha256=GNtM1N1S1mNUqAvX7OvyhOvnUWo4qx2hMneORbc-Qz8,3797
-protein_quest/go.py,sha256=ycV3-grxuIKFt28bFgH6iRKmt5AEGi7txoTbaAnBxQE,5684
-protein_quest/mcp_server.py,sha256=xIaOy6sY_gW5R_oMImI2yBmbBGtZZICOxXLzOkFmm-w,7197
-protein_quest/parallel.py,sha256=kCH6KCJYJZVoq0_Qz8ZLbHnf2OJG-h4uxd9oH2rLNKc,2201
-protein_quest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-protein_quest/taxonomy.py,sha256=kAKKZT_mOtmX8ZWNIE9i7emE23VEewkj12X7d_t3p2Y,4659
-protein_quest/uniprot.py,sha256=8qWV4GWqHTRfed0bE_TdgsLYcnDT_vzKu-6JxIgapJQ,18680
-protein_quest/utils.py,sha256=HUvqfsuMBIFOVFlb_QC2to_UQkiZ0_fwHLlckifuXss,3700
-protein_quest/alphafold/__init__.py,sha256=Ktasi5BRp71wO7-PpOGDpIRRtBEefs8knIdlKQeLQpk,51
-protein_quest/alphafold/confidence.py,sha256=-lbwijzVMhRd98bxwFDbSi7idiUKJ5BpOsGFrvuTEnQ,5596
-protein_quest/alphafold/entry_summary.py,sha256=P-S8qrXkU-wwIccA1nGol1lfDkUW0Sg0th_3EU-WjN8,1187
-protein_quest/alphafold/fetch.py,sha256=eq__PfqisuUIQBUM8KVghpiEOBGF-zXWNC6Ll_Hlz2E,11828
-protein_quest/pdbe/__init__.py,sha256=eNNHtN60NAGea7gvRkIzkoTXsYPK99s-ldIcKWYO6So,61
-protein_quest/pdbe/fetch.py,sha256=iTyS4ucV2KZl4jTgrUFOZhsXs3cRUIuvmTbXNm_pY8U,1850
-protein_quest/pdbe/io.py,sha256=0ldsrIHKaaurrM2FfWXbqm1iRj3q6xw8-lptfYU1yEw,6231
-protein_quest-0.3.0.dist-info/METADATA,sha256=yiHZn4gDdwilbCoxrF0pCjVk04v_O5pwpwrtr6oPLrE,7369
-protein_quest-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-protein_quest-0.3.0.dist-info/entry_points.txt,sha256=f1RtOxv9TFBO3w01EMEuFXBTMsqKsQcKlkxmj9zE-0g,57
-protein_quest-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-protein_quest-0.3.0.dist-info/RECORD,,

{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/WHEEL
File without changes
{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/entry_points.txt
File without changes
{protein_quest-0.3.0.dist-info → protein_quest-0.3.1.dist-info}/licenses/LICENSE
File without changes