protein-quest 0.4.0.tar.gz → 0.5.0.tar.gz
This diff represents the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- {protein_quest-0.4.0 → protein_quest-0.5.0}/.github/workflows/ci.yml +0 -9
- {protein_quest-0.4.0 → protein_quest-0.5.0}/.github/workflows/pages.yml +9 -1
- {protein_quest-0.4.0 → protein_quest-0.5.0}/.gitignore +14 -1
- {protein_quest-0.4.0 → protein_quest-0.5.0}/PKG-INFO +9 -1
- {protein_quest-0.4.0 → protein_quest-0.5.0}/README.md +7 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/mkdocs.yml +3 -4
- {protein_quest-0.4.0 → protein_quest-0.5.0}/pyproject.toml +2 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/__version__.py +1 -1
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/alphafold/fetch.py +34 -9
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/cli.py +68 -25
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/emdb.py +6 -3
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/mcp_server.py +24 -2
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/pdbe/fetch.py +6 -3
- protein_quest-0.5.0/src/protein_quest/utils.py +511 -0
- protein_quest-0.5.0/tests/test_utils.py +326 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/uv.lock +19 -1
- protein_quest-0.4.0/docs/cli_doc_hook.py +0 -113
- protein_quest-0.4.0/src/protein_quest/utils.py +0 -167
- protein_quest-0.4.0/tests/test_utils.py +0 -31
- {protein_quest-0.4.0 → protein_quest-0.5.0}/.github/workflows/pypi-publish.yml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/.vscode/extensions.json +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/CITATION.cff +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/CODE_OF_CONDUCT.md +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/CONTRIBUTING.md +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/LICENSE +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/CONTRIBUTING.md +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/index.md +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/notebooks/.gitignore +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/notebooks/alphafold.ipynb +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/notebooks/index.md +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/notebooks/pdbe.ipynb +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/notebooks/uniprot.ipynb +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/docs/protein-quest-mcp.png +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/__init__.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/alphafold/__init__.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/alphafold/confidence.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/alphafold/entry_summary.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/converter.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/filters.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/go.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/parallel.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/pdbe/__init__.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/pdbe/io.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/py.typed +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/ss.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/taxonomy.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/src/protein_quest/uniprot.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/alphafold/test_confidence.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/alphafold/test_entry_summary.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/alphafold/test_fetch.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/fixtures/3JRS_B2A.cif.gz +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/pdbe/fixtures/2y29.cif +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/pdbe/test_fetch.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/pdbe/test_io.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_cli.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_converter.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_emdb.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_go.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_mcp.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_ss.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_taxonomy.py +0 -0
- {protein_quest-0.4.0 → protein_quest-0.5.0}/tests/test_uniprot.py +0 -0
.github/workflows/ci.yml:

```diff
@@ -27,20 +27,11 @@ jobs:
       - name: Run tests
         run: |
           uv run pytest --cov --cov-report=xml
-          echo $? > pytest-exitcode
-        continue-on-error: true
-      # Always upload coverage, even if tests fail
       - name: Run codacy-coverage-reporter
         uses: codacy/codacy-coverage-reporter-action@v1.3.0
         with:
           project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
           coverage-reports: coverage.xml
-      - name: Fail job if pytest failed
-        run: |
-          if [ -f pytest-exitcode ] && [ "$(cat pytest-exitcode)" -ne 0 ]; then
-            echo "Pytest failed, failing job."
-            exit 1
-          fi
   build:
     name: build
     runs-on: ubuntu-latest
```
.github/workflows/pages.yml:

```diff
@@ -5,6 +5,7 @@ on:
     branches:
       - main
   workflow_dispatch:
+  pull_request:
 
 permissions:
   contents: read
@@ -13,7 +14,7 @@ permissions:
 
 # Only have one deployment in progress at a time
 concurrency:
-  group:
+  group: pages
   cancel-in-progress: true
 
 jobs:
@@ -32,6 +33,10 @@ jobs:
       - name: Build MkDocs site
         run: |
           uv run mkdocs build
+        env:
+          # Force colored output from rich library
+          TTY_COMPATIBLE: '1'
+          TTY_INTERACTIVE: '0'
 
       - name: Upload artifact
         uses: actions/upload-pages-artifact@v3
@@ -42,6 +47,9 @@
     # Add a dependency to the build job
     needs: build
 
+    # Only deploy on pushes to main or manual trigger of main branch
+    if: github.ref == 'refs/heads/main'
+
     # Grant GITHUB_TOKEN the permissions required to make a Pages deployment
     permissions:
       pages: write # to deploy to Pages
```
.gitignore:

```diff
@@ -73,4 +73,17 @@ venv.bak/
 /docs/pdb_files/
 /docs/density_filtered/
 /site
-/mysession/
+/mysession/
+# Paths generated in README.md examples
+uniprot_accs.txt
+pdbe.csv
+alphafold.csv
+emdbs.csv
+interaction-partners-of-Q05471.txt
+complexes.csv
+downloads-af/
+downloads-emdb/
+downloads-pdbe/
+filtered/
+filtered-chains/
+filtered-ss/
```
PKG-INFO:

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.4.0
+Version: 0.5.0
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -17,6 +17,7 @@ Requires-Dist: cattrs[orjson]>=24.1.3
 Requires-Dist: dask>=2025.5.1
 Requires-Dist: distributed>=2025.5.1
 Requires-Dist: gemmi>=0.7.3
+Requires-Dist: platformdirs>=4.3.8
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: rich-argparse>=1.7.1
 Requires-Dist: rich>=14.0.0
@@ -47,6 +48,10 @@ It uses
 - [gemmi](https://project-gemmi.github.io/) to work with macromolecular models.
 - [dask-distributed](https://docs.dask.org/en/latest/) to compute in parallel.
 
+The package is used by
+
+- [protein-detective](https://github.com/haddocking/protein-detective)
+
 An example workflow:
 
 ```mermaid
@@ -94,6 +99,9 @@ The main entry point is the `protein-quest` command line tool which has multiple
 
 To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
 
+While downloading or copying files it uses a global cache (located at `~/.cache/protein-quest`) and hardlinks to save disk space and improve speed.
+This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--copy-method` command line arguments.
+
 ### Search Uniprot accessions
 
 ```shell
````
README.md:

````diff
@@ -17,6 +17,10 @@ It uses
 - [gemmi](https://project-gemmi.github.io/) to work with macromolecular models.
 - [dask-distributed](https://docs.dask.org/en/latest/) to compute in parallel.
 
+The package is used by
+
+- [protein-detective](https://github.com/haddocking/protein-detective)
+
 An example workflow:
 
 ```mermaid
@@ -64,6 +68,9 @@ The main entry point is the `protein-quest` command line tool which has multiple
 
 To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).
 
+While downloading or copying files it uses a global cache (located at `~/.cache/protein-quest`) and hardlinks to save disk space and improve speed.
+This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--copy-method` command line arguments.
+
 ### Search Uniprot accessions
 
 ```shell
````
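The new README paragraph above describes the 0.5.0 caching behavior. As a minimal sketch of what those CLI defaults correspond to in code, using only names visible elsewhere in this diff (`DirectoryCacher`, `PassthroughCacher`, and `user_cache_root_dir` from the new `protein_quest.utils`); the new module's body is not part of this view, so treat the exact signatures as assumptions:

```python
from protein_quest.utils import DirectoryCacher, PassthroughCacher, user_cache_root_dir

# Matches the CLI defaults described above: a global cache directory
# (~/.cache/protein-quest on Linux) with hardlinked copies.
cacher = DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="hardlink")

# Matches --no-cache: files are written directly and nothing is kept centrally.
no_cache = PassthroughCacher()
```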
mkdocs.yml:

```diff
@@ -3,10 +3,6 @@ site_url: https://bonvinlab.org/protein_quest
 repo_name: haddocking/protein-quest
 repo_url: https://github.com/haddocking/protein-quest
 watch: [mkdocs.yml, README.md, src/protein_quest]
-exclude_docs: |
-  cli_doc_hook.py
-hooks:
-  - docs/cli_doc_hook.py
 use_directory_urls: false
 theme:
   name: material
@@ -61,6 +57,9 @@ plugins:
       remove_tag_config:
         remove_input_tags:
           - hide_code
+  - mkdocs-rich-argparse:
+      module: protein_quest.cli
+      factory: make_parser
 
 markdown_extensions:
   # Use to render part of README as home
```
pyproject.toml:

```diff
@@ -20,6 +20,7 @@ dependencies = [
     "sparqlwrapper>=2.0.0",
     "tqdm>=4.67.1",
     "yarl>=1.20.1",
+    "platformdirs>=4.3.8",
 ]
 
 [project.urls]
@@ -57,6 +58,7 @@ docs = [
     "mkdocs-autoapi>=0.4.1",
     "mkdocs-jupyter>=0.25.1",
     "mkdocs-material>=9.6.14",
+    "mkdocs-rich-argparse>=0.1.2",
     "mkdocstrings[python]>=0.29.1",
 ]
 docs-type = [
```
src/protein_quest/__version__.py:

```diff
@@ -1,2 +1,2 @@
-__version__ = "0.4.0"
+__version__ = "0.5.0"
 """The version of the package."""
```
src/protein_quest/alphafold/fetch.py:

```diff
@@ -14,7 +14,7 @@ from yarl import URL
 
 from protein_quest.alphafold.entry_summary import EntrySummary
 from protein_quest.converter import converter
-from protein_quest.utils import friendly_session, retrieve_files, run_async
+from protein_quest.utils import Cacher, PassthroughCacher, friendly_session, retrieve_files, run_async
 
 logger = logging.getLogger(__name__)
 
@@ -104,7 +104,7 @@ class AlphaFoldEntry:
 
 
 async def fetch_summary(
-    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
+    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
 ) -> list[EntrySummary]:
     """Fetches a summary from the AlphaFold database for a given qualifier.
 
@@ -116,6 +116,7 @@ async def fetch_summary(
         save_dir: An optional directory to save the fetched summary as a JSON file.
             If set and summary exists then summary will be loaded from disk instead of being fetched from the API.
             If not set then the summary will not be saved to disk and will always be fetched from the API.
+        cacher: A cacher to use for caching the fetched summary. Only used if save_dir is not None.
 
     Returns:
         A list of EntrySummary objects representing the fetched summary.
@@ -124,6 +125,11 @@
     fn: AsyncPath | None = None
     if save_dir is not None:
         fn = AsyncPath(save_dir / f"{qualifier}.json")
+        cached_file = await cacher.copy_from_cache(Path(fn))
+        if cached_file is not None:
+            logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
+            raw_data = await AsyncPath(cached_file).read_bytes()
+            return converter.loads(raw_data, list[EntrySummary])
         if await fn.exists():
             logger.debug(f"File {fn} already exists. Skipping download from {url}.")
             raw_data = await fn.read_bytes()
@@ -133,18 +139,23 @@
             raw_data = await response.content.read()
     if fn is not None:
         # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
-        await
+        await cacher.write_bytes(Path(fn), raw_data)
     return converter.loads(raw_data, list[EntrySummary])
 
 
 async def fetch_summaries(
-    qualifiers: Iterable[str],
+    qualifiers: Iterable[str],
+    save_dir: Path | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[EntrySummary]:
     semaphore = Semaphore(max_parallel_downloads)
     if save_dir is not None:
         save_dir.mkdir(parents=True, exist_ok=True)
+    if cacher is None:
+        cacher = PassthroughCacher()
     async with friendly_session() as session:
-        tasks = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
+        tasks = [fetch_summary(qualifier, session, semaphore, save_dir, cacher) for qualifier in qualifiers]
         summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
             *tasks, desc="Fetching Alphafold summaries"
         )
@@ -154,7 +165,11 @@
 
 
 async def fetch_many_async(
-    uniprot_accessions: Iterable[str],
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[AlphaFoldEntry]:
     """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -164,15 +179,17 @@
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
 
     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
     save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+
     summaries = [
         s
         async for s in fetch_summaries(
-            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
         )
     ]
 
@@ -183,6 +200,7 @@
         save_dir,
         desc="Downloading AlphaFold files",
         max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
     )
     for summary in summaries:
         yield AlphaFoldEntry(
@@ -236,7 +254,11 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
 
 
 def fetch_many(
-    ids: Iterable[str],
+    ids: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> list[AlphaFoldEntry]:
     """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.
 
@@ -245,6 +267,7 @@ def fetch_many(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.
 
     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -253,7 +276,9 @@
     async def gather_entries():
         return [
             entry
-            async for entry in fetch_many_async(
+            async for entry in fetch_many_async(
+                ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+            )
         ]
 
     return run_async(gather_entries())
```
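`fetch_many` (and `fetch_many_async`) now accept an optional `cacher`. A hedged usage sketch follows: the accession comes from this package's test fixtures and the output directory from the new `.gitignore` entries, while the `DirectoryCacher` arguments mirror `_initialize_cacher` in the `cli.py` hunks below and are otherwise an assumption:

```python
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many
from protein_quest.utils import DirectoryCacher, user_cache_root_dir

# Share downloads across runs via the global cache, using hardlinks.
cacher = DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="hardlink")
entries = fetch_many(
    ["A1YPR0"],  # UniProt accessions / AlphaFold qualifiers
    Path("downloads-af"),
    what={"summary", "cif"},  # the same default set the CLI handler uses
    cacher=cacher,
)
print(sum(entry.nr_of_files() for entry in entries))
```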
src/protein_quest/cli.py:

```diff
@@ -43,7 +43,15 @@ from protein_quest.uniprot import (
     search4pdb,
     search4uniprot,
 )
-from protein_quest.utils import
+from protein_quest.utils import (
+    Cacher,
+    CopyMethod,
+    DirectoryCacher,
+    PassthroughCacher,
+    copy_methods,
+    copyfile,
+    user_cache_root_dir,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -312,6 +320,7 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)
 
 
 def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
@@ -342,6 +351,7 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)
 
 
 def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
@@ -361,22 +371,7 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
         help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
-
-
-def _add_copy_method_argument(parser: argparse.ArgumentParser):
-    """Add copy method argument to parser."""
-    default_copy_method = "symlink"
-    if os.name == "nt":
-        # On Windows you need developer mode or admin privileges to create symlinks
-        # so we default to copying files instead of symlinking
-        default_copy_method = "copy"
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default=default_copy_method,
-        help="How to copy files when no changes are needed to output file.",
-    )
+    _add_cacher_arguments(parser)
 
 
 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
@@ -409,7 +404,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
-
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -449,7 +444,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
             If not provided, will create a local cluster.
             If set to `sequential` will run tasks sequentially."""),
     )
-
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -472,7 +467,6 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -481,6 +475,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
+    _add_copy_method_arguments(parser)
 
 
 def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
@@ -507,7 +502,6 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -518,6 +512,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
             Use `-` for stdout.
             """),
     )
+    _add_copy_method_arguments(parser)
 
 
 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
@@ -585,6 +580,38 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")
 
 
+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+            """),
+    )
+
+
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
+
+
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -742,14 +769,26 @@ def _handle_search_complexes(args: argparse.Namespace):
     _write_complexes_csv(results, output_csv)
 
 
-def _handle_retrieve_pdbe(args: argparse.Namespace):
+def _initialize_cacher(args: argparse.Namespace) -> Cacher:
+    if args.no_cache:
+        return PassthroughCacher()
+    return DirectoryCacher(
+        cache_dir=args.cache_dir,
+        copy_method=args.copy_method,
+    )
+
+
+def _handle_retrieve_pdbe(args: argparse.Namespace):
     pdbe_csv = args.pdbe_csv
     output_dir = args.output_dir
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)
 
     pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
     rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
-    result = asyncio.run(
+    result = asyncio.run(
+        pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
+    )
     rprint(f"Retrieved {len(result)} PDBe entries")
 
 
@@ -758,6 +797,7 @@ def _handle_retrieve_alphafold(args):
    what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)
 
     if what_formats is None:
         what_formats = {"summary", "cif"}
@@ -767,7 +807,9 @@
     af_ids = _read_column_from_csv(alphafold_csv, "af_id")
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
-    afs = af_fetch(
+    afs = af_fetch(
+        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+    )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")
 
@@ -775,10 +817,11 @@
 def _handle_retrieve_emdb(args):
     emdb_csv = args.emdb_csv
     output_dir = args.output_dir
+    cacher = _initialize_cacher(args)
 
     emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
     rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
-    result = asyncio.run(emdb_fetch(emdb_ids, output_dir))
+    result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
     rprint(f"Retrieved {len(result)} EMDB entries")
 
 
```
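The `Cacher` type returned by `_initialize_cacher` is defined in the rewritten `src/protein_quest/utils.py` (a new 511-line file whose body is not shown in this diff). The interface below is inferred purely from the call sites visible above (`await cacher.copy_from_cache(...)`, `await cacher.write_bytes(...)`) and is a sketch, not the actual definition:

```python
from pathlib import Path
from typing import Protocol


class Cacher(Protocol):
    """Inferred protocol; method names come from call sites, return types are assumptions."""

    async def copy_from_cache(self, target: Path) -> Path | None:
        """Materialize a cached copy at target and return it, or None on a cache miss."""
        ...

    async def write_bytes(self, target: Path, data: bytes) -> Path:
        """Write data to target and record a copy in the cache."""
        ...
```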
src/protein_quest/emdb.py:

```diff
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path
 
-from protein_quest.utils import retrieve_files
+from protein_quest.utils import Cacher, retrieve_files
 
 
 def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
@@ -13,13 +13,16 @@ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
     return url, fn
 
 
-async def fetch(
+async def fetch(
+    emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches volume files from the EMDB database.
 
     Args:
         emdb_ids: A list of EMDB IDs to fetch.
         save_dir: The directory to save the downloaded files.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.
 
     Returns:
         A mapping of EMDB IDs to their downloaded files.
@@ -30,5 +33,5 @@ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads:
 
     # TODO show progress of each item
     # TODO handle failed downloads, by skipping them instead of raising an error
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
     return id2paths
```
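A usage sketch for the updated EMDB fetch, assuming the signature shown above; the EMDB ID here is hypothetical, while `downloads-emdb/` matches the new `.gitignore` entry:

```python
import asyncio
from pathlib import Path

from protein_quest.emdb import fetch
from protein_quest.utils import DirectoryCacher, user_cache_root_dir

cacher = DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="hardlink")
# "EMD-1234" is an illustrative ID, not taken from the package.
id2path = asyncio.run(fetch(["EMD-1234"], Path("downloads-emdb"), cacher=cacher))
print(id2path)
```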
src/protein_quest/mcp_server.py:

```diff
@@ -32,6 +32,7 @@ Examples:
 
 """
 
+from collections.abc import Mapping
 from pathlib import Path
 from textwrap import dedent
 from typing import Annotated
@@ -89,7 +90,18 @@ def search_pdb(
     return search4pdb(uniprot_accs, limit=limit)
 
 
-mcp.tool
+@mcp.tool
+async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch the PDBe structures for given PDB IDs.
+
+    Args:
+        pdb_ids: A set of PDB IDs.
+        save_dir: The directory to save the fetched files.
+
+    Returns:
+        A mapping of PDB ID to the path of the fetched structure file.
+    """
+    return await pdbe_fetch(pdb_ids, save_dir)
 
 
 @mcp.tool
@@ -163,7 +175,17 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     return alphafold_fetch(uniprot_accs, save_dir, what)
 
 
-mcp.tool
+@mcp.tool
+async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch EMDB volumes for given EMDB IDs.
+
+    Args:
+        emdb_ids: A set of EMDB IDs.
+        save_dir: The directory to save the fetched files.
+    Returns:
+        A mapping of EMDB ID to the path of the fetched volume file.
+    """
+    return await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)
 
 
 @mcp.tool
```
src/protein_quest/pdbe/fetch.py:

```diff
@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path
 
-from protein_quest.utils import retrieve_files, run_async
+from protein_quest.utils import Cacher, retrieve_files, run_async
 
 
 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -28,13 +28,16 @@ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
     return url, fn
 
 
-async def fetch(
+async def fetch(
+    ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches mmCIF files from the PDBe database.
 
     Args:
         ids: A set of PDB IDs to fetch.
         save_dir: The directory to save the fetched mmCIF files to.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.
 
     Returns:
         A dict of id and paths to the downloaded mmCIF files.
@@ -47,7 +50,7 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int
     urls = list(id2urls.values())
     id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}
 
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files", cacher=cacher)
     return id2paths
```
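And the analogous sketch for the PDBe fetch above, again assuming the signature shown in the hunks; `2y29` comes from this package's test fixtures (`tests/pdbe/fixtures/2y29.cif`) and `downloads-pdbe/` from the new `.gitignore` entries:

```python
import asyncio
from pathlib import Path

from protein_quest.pdbe.fetch import fetch
from protein_quest.utils import DirectoryCacher, user_cache_root_dir

cacher = DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="hardlink")
id2path = asyncio.run(fetch(["2y29"], Path("downloads-pdbe"), cacher=cacher))
print(id2path["2y29"])
```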