protein-quest 0.4.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (75)
  1. {protein_quest-0.4.0 → protein_quest-0.5.1}/.github/workflows/ci.yml +0 -9
  2. {protein_quest-0.4.0 → protein_quest-0.5.1}/.github/workflows/pages.yml +9 -1
  3. {protein_quest-0.4.0 → protein_quest-0.5.1}/.gitignore +14 -1
  4. {protein_quest-0.4.0 → protein_quest-0.5.1}/PKG-INFO +9 -1
  5. {protein_quest-0.4.0 → protein_quest-0.5.1}/README.md +7 -0
  6. {protein_quest-0.4.0 → protein_quest-0.5.1}/mkdocs.yml +3 -4
  7. {protein_quest-0.4.0 → protein_quest-0.5.1}/pyproject.toml +2 -0
  8. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/__version__.py +1 -1
  9. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/alphafold/fetch.py +34 -9
  10. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/cli.py +68 -25
  11. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/emdb.py +6 -3
  12. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/mcp_server.py +24 -2
  13. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/pdbe/fetch.py +6 -3
  14. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/uniprot.py +7 -3
  15. protein_quest-0.5.1/src/protein_quest/utils.py +511 -0
  16. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_uniprot/test_search4interaction_partners.yaml +47 -49
  17. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_uniprot/test_search4macromolecular_complexes.yaml +46 -46
  18. protein_quest-0.5.1/tests/test_utils.py +326 -0
  19. {protein_quest-0.4.0 → protein_quest-0.5.1}/uv.lock +19 -1
  20. protein_quest-0.4.0/docs/cli_doc_hook.py +0 -113
  21. protein_quest-0.4.0/src/protein_quest/utils.py +0 -167
  22. protein_quest-0.4.0/tests/test_utils.py +0 -31
  23. {protein_quest-0.4.0 → protein_quest-0.5.1}/.github/workflows/pypi-publish.yml +0 -0
  24. {protein_quest-0.4.0 → protein_quest-0.5.1}/.vscode/extensions.json +0 -0
  25. {protein_quest-0.4.0 → protein_quest-0.5.1}/CITATION.cff +0 -0
  26. {protein_quest-0.4.0 → protein_quest-0.5.1}/CODE_OF_CONDUCT.md +0 -0
  27. {protein_quest-0.4.0 → protein_quest-0.5.1}/CONTRIBUTING.md +0 -0
  28. {protein_quest-0.4.0 → protein_quest-0.5.1}/LICENSE +0 -0
  29. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/CONTRIBUTING.md +0 -0
  30. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/index.md +0 -0
  31. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/notebooks/.gitignore +0 -0
  32. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/notebooks/alphafold.ipynb +0 -0
  33. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/notebooks/index.md +0 -0
  34. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/notebooks/pdbe.ipynb +0 -0
  35. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/notebooks/uniprot.ipynb +0 -0
  36. {protein_quest-0.4.0 → protein_quest-0.5.1}/docs/protein-quest-mcp.png +0 -0
  37. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/__init__.py +0 -0
  38. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/alphafold/__init__.py +0 -0
  39. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/alphafold/confidence.py +0 -0
  40. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/alphafold/entry_summary.py +0 -0
  41. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/converter.py +0 -0
  42. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/filters.py +0 -0
  43. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/go.py +0 -0
  44. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/parallel.py +0 -0
  45. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/pdbe/__init__.py +0 -0
  46. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/pdbe/io.py +0 -0
  47. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/py.typed +0 -0
  48. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/ss.py +0 -0
  49. {protein_quest-0.4.0 → protein_quest-0.5.1}/src/protein_quest/taxonomy.py +0 -0
  50. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/alphafold/AF-A1YPR0-F1-model_v4.pdb +0 -0
  51. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/alphafold/cassettes/test_fetch/test_fetch_many.yaml +0 -0
  52. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/alphafold/test_confidence.py +0 -0
  53. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/alphafold/test_entry_summary.py +0 -0
  54. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/alphafold/test_fetch.py +0 -0
  55. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_emdb/test_fetch.yaml +0 -0
  56. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_go/test_search_gene_ontology_term.yaml +0 -0
  57. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_taxonomy/test_search_taxon.yaml +0 -0
  58. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_taxonomy/test_search_taxon_by_id.yaml +0 -0
  59. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_uniprot/test_search4af.yaml +0 -0
  60. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_uniprot/test_search4emdb.yaml +0 -0
  61. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_uniprot/test_search4pdb.yaml +0 -0
  62. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/cassettes/test_uniprot/test_search4uniprot.yaml +0 -0
  63. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/fixtures/3JRS_B2A.cif.gz +0 -0
  64. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/pdbe/cassettes/test_fetch/test_fetch.yaml +0 -0
  65. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/pdbe/fixtures/2y29.cif +0 -0
  66. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/pdbe/test_fetch.py +0 -0
  67. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/pdbe/test_io.py +0 -0
  68. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_cli.py +0 -0
  69. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_converter.py +0 -0
  70. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_emdb.py +0 -0
  71. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_go.py +0 -0
  72. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_mcp.py +0 -0
  73. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_ss.py +0 -0
  74. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_taxonomy.py +0 -0
  75. {protein_quest-0.4.0 → protein_quest-0.5.1}/tests/test_uniprot.py +0 -0

.github/workflows/ci.yml

@@ -27,20 +27,11 @@ jobs:
       - name: Run tests
         run: |
           uv run pytest --cov --cov-report=xml
-          echo $? > pytest-exitcode
-        continue-on-error: true
-        # Always upload coverage, even if tests fail
       - name: Run codacy-coverage-reporter
         uses: codacy/codacy-coverage-reporter-action@v1.3.0
         with:
           project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
           coverage-reports: coverage.xml
-      - name: Fail job if pytest failed
-        run: |
-          if [ -f pytest-exitcode ] && [ "$(cat pytest-exitcode)" -ne 0 ]; then
-            echo "Pytest failed, failing job."
-            exit 1
-          fi
   build:
     name: build
     runs-on: ubuntu-latest

.github/workflows/pages.yml

@@ -5,6 +5,7 @@ on:
     branches:
       - main
   workflow_dispatch:
+  pull_request:

 permissions:
   contents: read
@@ -13,7 +14,7 @@ permissions:

 # Only have one deployment in progress at a time
 concurrency:
-  group: "pages"
+  group: pages
   cancel-in-progress: true

 jobs:
@@ -32,6 +33,10 @@ jobs:
       - name: Build MkDocs site
         run: |
           uv run mkdocs build
+        env:
+          # Force colored output from rich library
+          TTY_COMPATIBLE: '1'
+          TTY_INTERACTIVE: '0'

       - name: Upload artifact
         uses: actions/upload-pages-artifact@v3
@@ -42,6 +47,9 @@ jobs:
     # Add a dependency to the build job
     needs: build

+    # Only deploy on pushes to main or manual trigger of main branch
+    if: github.ref == 'refs/heads/main'
+
     # Grant GITHUB_TOKEN the permissions required to make a Pages deployment
     permissions:
       pages: write # to deploy to Pages

.gitignore

@@ -73,4 +73,17 @@ venv.bak/
 /docs/pdb_files/
 /docs/density_filtered/
 /site
-/mysession/
+/mysession/
+# Paths generated in README.md examples
+uniprot_accs.txt
+pdbe.csv
+alphafold.csv
+emdbs.csv
+interaction-partners-of-Q05471.txt
+complexes.csv
+downloads-af/
+downloads-emdb/
+downloads-pdbe/
+filtered/
+filtered-chains/
+filtered-ss/

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: protein_quest
-Version: 0.4.0
+Version: 0.5.1
 Summary: Search/retrieve/filter proteins and protein structures
 Project-URL: Homepage, https://github.com/haddocking/protein-quest
 Project-URL: Issues, https://github.com/haddocking/protein-quest/issues
@@ -17,6 +17,7 @@ Requires-Dist: cattrs[orjson]>=24.1.3
 Requires-Dist: dask>=2025.5.1
 Requires-Dist: distributed>=2025.5.1
 Requires-Dist: gemmi>=0.7.3
+Requires-Dist: platformdirs>=4.3.8
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: rich-argparse>=1.7.1
 Requires-Dist: rich>=14.0.0
@@ -47,6 +48,10 @@ It uses
 - [gemmi](https://project-gemmi.github.io/) to work with macromolecular models.
 - [dask-distributed](https://docs.dask.org/en/latest/) to compute in parallel.

+The package is used by
+
+- [protein-detective](https://github.com/haddocking/protein-detective)
+
 An example workflow:

 ```mermaid
@@ -94,6 +99,9 @@ The main entry point is the `protein-quest` command line tool which has multiple

 To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).

+While downloading or copying files it uses a global cache (located at `~/.cache/protein-quest`) and hardlinks to save disk space and improve speed.
+This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--copy-method` command line arguments.
+
 ### Search Uniprot accessions

 ```shell

README.md

@@ -17,6 +17,10 @@ It uses
 - [gemmi](https://project-gemmi.github.io/) to work with macromolecular models.
 - [dask-distributed](https://docs.dask.org/en/latest/) to compute in parallel.

+The package is used by
+
+- [protein-detective](https://github.com/haddocking/protein-detective)
+
 An example workflow:

 ```mermaid
@@ -64,6 +68,9 @@ The main entry point is the `protein-quest` command line tool which has multiple

 To use programmaticly, see the [Jupyter notebooks](https://www.bonvinlab.org/protein-quest/notebooks) and [API documentation](https://www.bonvinlab.org/protein-quest/autoapi/summary/).

+While downloading or copying files it uses a global cache (located at `~/.cache/protein-quest`) and hardlinks to save disk space and improve speed.
+This behavior can be customized with the `--no-cache`, `--cache-dir`, and `--copy-method` command line arguments.
+
 ### Search Uniprot accessions

 ```shell
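
The cache described in this README addition is implemented by the new src/protein_quest/utils.py (+511 lines), which this diff only shows through its call sites. Below is a minimal sketch of how those pieces appear to fit together; the class and method names (DirectoryCacher, PassthroughCacher, user_cache_root_dir, copy_from_cache, write_bytes) are taken from the hunks further down, while the exact defaults and return values are assumptions rather than documented API.

```python
# Sketch only: names come from the call sites in this diff; defaults and
# return values are assumptions, not documented API.
import asyncio
from pathlib import Path

from protein_quest.utils import DirectoryCacher, PassthroughCacher, user_cache_root_dir


async def main(use_cache: bool = True) -> None:
    # Mirrors _initialize_cacher() in cli.py: a real cache dir, or a no-op fallback.
    cacher = (
        DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="hardlink")
        if use_cache
        else PassthroughCacher()  # CLI equivalent: --no-cache
    )

    target = Path("downloads-af/Q05471.json")
    target.parent.mkdir(parents=True, exist_ok=True)

    # If a file with this name was cached by an earlier run, reuse it...
    cached = await cacher.copy_from_cache(target)
    if cached is None:
        # ...otherwise write through the cacher so later runs can reuse the bytes.
        await cacher.write_bytes(target, b"...freshly downloaded bytes...")


asyncio.run(main())
```

On the command line the same choice is exposed through the new `--no-cache`, `--cache-dir`, and `--copy-method` options added by `_add_cacher_arguments` in cli.py below.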

mkdocs.yml

@@ -3,10 +3,6 @@ site_url: https://bonvinlab.org/protein_quest
 repo_name: haddocking/protein-quest
 repo_url: https://github.com/haddocking/protein-quest
 watch: [mkdocs.yml, README.md, src/protein_quest]
-exclude_docs: |
-  cli_doc_hook.py
-hooks:
-  - docs/cli_doc_hook.py
 use_directory_urls: false
 theme:
   name: material
@@ -61,6 +57,9 @@ plugins:
       remove_tag_config:
         remove_input_tags:
           - hide_code
+  - mkdocs-rich-argparse:
+      module: protein_quest.cli
+      factory: make_parser

 markdown_extensions:
   # Use to render part of README as home

pyproject.toml

@@ -20,6 +20,7 @@ dependencies = [
     "sparqlwrapper>=2.0.0",
     "tqdm>=4.67.1",
     "yarl>=1.20.1",
+    "platformdirs>=4.3.8",
 ]

 [project.urls]
@@ -57,6 +58,7 @@ docs = [
     "mkdocs-autoapi>=0.4.1",
     "mkdocs-jupyter>=0.25.1",
     "mkdocs-material>=9.6.14",
+    "mkdocs-rich-argparse>=0.1.2",
     "mkdocstrings[python]>=0.29.1",
 ]
 docs-type = [

src/protein_quest/__version__.py

@@ -1,2 +1,2 @@
-__version__ = "0.4.0"
+__version__ = "0.5.1"
 """The version of the package."""

src/protein_quest/alphafold/fetch.py

@@ -14,7 +14,7 @@ from yarl import URL

 from protein_quest.alphafold.entry_summary import EntrySummary
 from protein_quest.converter import converter
-from protein_quest.utils import friendly_session, retrieve_files, run_async
+from protein_quest.utils import Cacher, PassthroughCacher, friendly_session, retrieve_files, run_async

 logger = logging.getLogger(__name__)

@@ -104,7 +104,7 @@ class AlphaFoldEntry:


 async def fetch_summary(
-    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None
+    qualifier: str, session: RetryClient, semaphore: Semaphore, save_dir: Path | None, cacher: Cacher
 ) -> list[EntrySummary]:
     """Fetches a summary from the AlphaFold database for a given qualifier.

@@ -116,6 +116,7 @@ async def fetch_summary(
         save_dir: An optional directory to save the fetched summary as a JSON file.
             If set and summary exists then summary will be loaded from disk instead of being fetched from the API.
             If not set then the summary will not be saved to disk and will always be fetched from the API.
+        cacher: A cacher to use for caching the fetched summary. Only used if save_dir is not None.

     Returns:
         A list of EntrySummary objects representing the fetched summary.
@@ -124,6 +125,11 @@ async def fetch_summary(
     fn: AsyncPath | None = None
     if save_dir is not None:
         fn = AsyncPath(save_dir / f"{qualifier}.json")
+        cached_file = await cacher.copy_from_cache(Path(fn))
+        if cached_file is not None:
+            logger.debug(f"Using cached file {cached_file} for summary of {qualifier}.")
+            raw_data = await AsyncPath(cached_file).read_bytes()
+            return converter.loads(raw_data, list[EntrySummary])
         if await fn.exists():
             logger.debug(f"File {fn} already exists. Skipping download from {url}.")
             raw_data = await fn.read_bytes()
@@ -133,18 +139,23 @@
         raw_data = await response.content.read()
         if fn is not None:
             # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
-            await fn.write_bytes(raw_data)
+            await cacher.write_bytes(Path(fn), raw_data)
         return converter.loads(raw_data, list[EntrySummary])


 async def fetch_summaries(
-    qualifiers: Iterable[str], save_dir: Path | None = None, max_parallel_downloads: int = 5
+    qualifiers: Iterable[str],
+    save_dir: Path | None = None,
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[EntrySummary]:
     semaphore = Semaphore(max_parallel_downloads)
     if save_dir is not None:
         save_dir.mkdir(parents=True, exist_ok=True)
+    if cacher is None:
+        cacher = PassthroughCacher()
     async with friendly_session() as session:
-        tasks = [fetch_summary(qualifier, session, semaphore, save_dir) for qualifier in qualifiers]
+        tasks = [fetch_summary(qualifier, session, semaphore, save_dir, cacher) for qualifier in qualifiers]
         summaries_per_qualifier: list[list[EntrySummary]] = await tqdm.gather(
             *tasks, desc="Fetching Alphafold summaries"
         )
@@ -154,7 +165,11 @@ async def fetch_summaries(


 async def fetch_many_async(
-    uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+    uniprot_accessions: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> AsyncGenerator[AlphaFoldEntry]:
     """Asynchronously fetches summaries and files from
     [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
@@ -164,15 +179,17 @@ async def fetch_many_async(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.

     Yields:
         A dataclass containing the summary, pdb file, and pae file.
     """
     save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
+
     summaries = [
         s
         async for s in fetch_summaries(
-            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
+            uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads, cacher=cacher
         )
     ]

@@ -183,6 +200,7 @@ async def fetch_many_async(
         save_dir,
         desc="Downloading AlphaFold files",
         max_parallel_downloads=max_parallel_downloads,
+        cacher=cacher,
     )
     for summary in summaries:
         yield AlphaFoldEntry(
@@ -236,7 +254,11 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu


 def fetch_many(
-    ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
+    ids: Iterable[str],
+    save_dir: Path,
+    what: set[DownloadableFormat],
+    max_parallel_downloads: int = 5,
+    cacher: Cacher | None = None,
 ) -> list[AlphaFoldEntry]:
     """Synchronously fetches summaries and pdb and pae files from AlphaFold Protein Structure Database.

@@ -245,6 +267,7 @@ def fetch_many(
         save_dir: The directory to save the fetched files to.
         what: A set of formats to download.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: A cacher to use for caching the fetched files. Only used if summary is in what set.

     Returns:
         A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
@@ -253,7 +276,9 @@ def fetch_many(
     async def gather_entries():
         return [
            entry
-           async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
+           async for entry in fetch_many_async(
+               ids, save_dir, what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+           )
        ]

     return run_async(gather_entries())
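
A hedged usage sketch of the widened fetch_many() signature shown above; the accessions, directory names, and format strings are taken from elsewhere in this diff, while the surrounding script is illustrative rather than documented behaviour.

```python
# Illustrative only: the fetch_many and DirectoryCacher signatures come from the
# hunks in this diff; accessions, paths, and printed output are examples.
from pathlib import Path

from protein_quest.alphafold.fetch import fetch_many
from protein_quest.utils import DirectoryCacher, user_cache_root_dir

save_dir = Path("downloads-af")
save_dir.mkdir(parents=True, exist_ok=True)

cacher = DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="hardlink")
entries = fetch_many(
    ["A1YPR0", "Q05471"],     # UniProt accessions
    save_dir,
    what={"summary", "cif"},  # same formats _handle_retrieve_alphafold defaults to
    max_parallel_downloads=5,
    cacher=cacher,            # leave as None to skip the shared cache
)
print(f"{sum(entry.nr_of_files() for entry in entries)} files for {len(entries)} entries")
```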

src/protein_quest/cli.py

@@ -43,7 +43,15 @@ from protein_quest.uniprot import (
     search4pdb,
     search4uniprot,
 )
-from protein_quest.utils import CopyMethod, copy_methods, copyfile
+from protein_quest.utils import (
+    Cacher,
+    CopyMethod,
+    DirectoryCacher,
+    PassthroughCacher,
+    copy_methods,
+    copyfile,
+    user_cache_root_dir,
+)

 logger = logging.getLogger(__name__)

@@ -312,6 +320,7 @@ def _add_retrieve_pdbe_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)


 def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
@@ -342,6 +351,7 @@ def _add_retrieve_alphafold_parser(subparsers: argparse._SubParsersAction):
         default=5,
         help="Maximum number of parallel downloads",
     )
+    _add_cacher_arguments(parser)


 def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
@@ -361,22 +371,7 @@ def _add_retrieve_emdb_parser(subparsers: argparse._SubParsersAction):
         help="CSV file with `emdb_id` column. Other columns are ignored. Use `-` for stdin.",
     )
     parser.add_argument("output_dir", type=Path, help="Directory to store downloaded EMDB volume files")
-
-
-def _add_copy_method_argument(parser: argparse.ArgumentParser):
-    """Add copy method argument to parser."""
-    default_copy_method = "symlink"
-    if os.name == "nt":
-        # On Windows you need developer mode or admin privileges to create symlinks
-        # so we default to copying files instead of symlinking
-        default_copy_method = "copy"
-    parser.add_argument(
-        "--copy-method",
-        type=str,
-        choices=copy_methods,
-        default=default_copy_method,
-        help="How to copy files when no changes are needed to output file.",
-    )
+    _add_cacher_arguments(parser)


 def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
@@ -409,7 +404,7 @@ def _add_filter_confidence_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
-    _add_copy_method_argument(parser)
+    _add_copy_method_arguments(parser)


 def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
@@ -449,7 +444,7 @@ def _add_filter_chain_parser(subparsers: argparse._SubParsersAction):
             If not provided, will create a local cluster.
             If set to `sequential` will run tasks sequentially."""),
     )
-    _add_copy_method_argument(parser)
+    _add_copy_method_arguments(parser)


 def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
@@ -472,7 +467,6 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
     )
     parser.add_argument("--min-residues", type=int, default=0, help="Min residues in chain A")
     parser.add_argument("--max-residues", type=int, default=10_000_000, help="Max residues in chain A")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -481,6 +475,7 @@ def _add_filter_residue_parser(subparsers: argparse._SubParsersAction):
             In CSV format with `<input_file>,<residue_count>,<passed>,<output_file>` columns.
             Use `-` for stdout."""),
     )
+    _add_copy_method_arguments(parser)


 def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
@@ -507,7 +502,6 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
     parser.add_argument("--ratio-max-helix-residues", type=float, help="Max residues in helices (relative)")
     parser.add_argument("--ratio-min-sheet-residues", type=float, help="Min residues in sheets (relative)")
     parser.add_argument("--ratio-max-sheet-residues", type=float, help="Max residues in sheets (relative)")
-    _add_copy_method_argument(parser)
     parser.add_argument(
         "--write-stats",
         type=argparse.FileType("w", encoding="UTF-8"),
@@ -518,6 +512,7 @@ def _add_filter_ss_parser(subparsers: argparse._SubParsersAction):
             Use `-` for stdout.
             """),
     )
+    _add_copy_method_arguments(parser)


 def _add_search_subcommands(subparsers: argparse._SubParsersAction):
@@ -585,6 +580,38 @@ def _add_mcp_command(subparsers: argparse._SubParsersAction):
     parser.add_argument("--port", default=8000, type=int, help="Port to bind the server to")


+def _add_copy_method_arguments(parser):
+    parser.add_argument(
+        "--copy-method",
+        type=str,
+        choices=copy_methods,
+        default="hardlink",
+        help=dedent("""\
+            How to make target file be same file as source file.
+            By default uses hardlinks to save disk space.
+            Note that hardlinks only work within the same filesystem and are harder to track.
+            If you want to track cached files easily then use 'symlink'.
+            On Windows you need developer mode or admin privileges to create symlinks.
+            """),
+    )
+
+
+def _add_cacher_arguments(parser: argparse.ArgumentParser):
+    """Add cacher arguments to parser."""
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Disable caching of files to central location.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=user_cache_root_dir(),
+        help="Directory to use as cache for files.",
+    )
+    _add_copy_method_arguments(parser)
+
+
 def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Protein Quest CLI", prog="protein-quest", formatter_class=ArgumentDefaultsRichHelpFormatter
@@ -742,14 +769,26 @@ def _handle_search_complexes(args: argparse.Namespace):
     _write_complexes_csv(results, output_csv)


-def _handle_retrieve_pdbe(args):
+def _initialize_cacher(args: argparse.Namespace) -> Cacher:
+    if args.no_cache:
+        return PassthroughCacher()
+    return DirectoryCacher(
+        cache_dir=args.cache_dir,
+        copy_method=args.copy_method,
+    )
+
+
+def _handle_retrieve_pdbe(args: argparse.Namespace):
     pdbe_csv = args.pdbe_csv
     output_dir = args.output_dir
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)

     pdb_ids = _read_column_from_csv(pdbe_csv, "pdb_id")
     rprint(f"Retrieving {len(pdb_ids)} PDBe entries")
-    result = asyncio.run(pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads))
+    result = asyncio.run(
+        pdbe_fetch.fetch(pdb_ids, output_dir, max_parallel_downloads=max_parallel_downloads, cacher=cacher)
+    )
     rprint(f"Retrieved {len(result)} PDBe entries")


@@ -758,6 +797,7 @@ def _handle_retrieve_alphafold(args):
     what_formats = args.what_formats
     alphafold_csv = args.alphafold_csv
     max_parallel_downloads = args.max_parallel_downloads
+    cacher = _initialize_cacher(args)

     if what_formats is None:
         what_formats = {"summary", "cif"}
@@ -767,7 +807,9 @@ def _handle_retrieve_alphafold(args):
     af_ids = _read_column_from_csv(alphafold_csv, "af_id")
     validated_what: set[DownloadableFormat] = structure(what_formats, set[DownloadableFormat])
     rprint(f"Retrieving {len(af_ids)} AlphaFold entries with formats {validated_what}")
-    afs = af_fetch(af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads)
+    afs = af_fetch(
+        af_ids, download_dir, what=validated_what, max_parallel_downloads=max_parallel_downloads, cacher=cacher
+    )
     total_nr_files = sum(af.nr_of_files() for af in afs)
     rprint(f"Retrieved {total_nr_files} AlphaFold files and {len(afs)} summaries, written to {download_dir}")

@@ -775,10 +817,11 @@
 def _handle_retrieve_emdb(args):
     emdb_csv = args.emdb_csv
     output_dir = args.output_dir
+    cacher = _initialize_cacher(args)

     emdb_ids = _read_column_from_csv(emdb_csv, "emdb_id")
     rprint(f"Retrieving {len(emdb_ids)} EMDB entries")
-    result = asyncio.run(emdb_fetch(emdb_ids, output_dir))
+    result = asyncio.run(emdb_fetch(emdb_ids, output_dir, cacher=cacher))
     rprint(f"Retrieved {len(result)} EMDB entries")


src/protein_quest/emdb.py

@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path

-from protein_quest.utils import retrieve_files
+from protein_quest.utils import Cacher, retrieve_files


 def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
@@ -13,13 +13,16 @@ def _map_id2volume_url(emdb_id: str) -> tuple[str, str]:
     return url, fn


-async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1) -> Mapping[str, Path]:
+async def fetch(
+    emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 1, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches volume files from the EMDB database.

     Args:
         emdb_ids: A list of EMDB IDs to fetch.
         save_dir: The directory to save the downloaded files.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.

     Returns:
         A mapping of EMDB IDs to their downloaded files.
@@ -30,5 +33,5 @@ async def fetch(emdb_ids: Iterable[str], save_dir: Path, max_parallel_downloads:

     # TODO show progress of each item
     # TODO handle failed downloads, by skipping them instead of raising an error
-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading EMDB volume files", cacher=cacher)
     return id2paths

src/protein_quest/mcp_server.py

@@ -32,6 +32,7 @@ Examples:

 """

+from collections.abc import Mapping
 from pathlib import Path
 from textwrap import dedent
 from typing import Annotated
@@ -89,7 +90,18 @@ def search_pdb(
     return search4pdb(uniprot_accs, limit=limit)


-mcp.tool(pdbe_fetch, name="fetch_pdbe_structures")
+@mcp.tool
+async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch the PDBe structures for given PDB IDs.
+
+    Args:
+        pdb_ids: A set of PDB IDs.
+        save_dir: The directory to save the fetched files.
+
+    Returns:
+        A mapping of PDB ID to the path of the fetched structure file.
+    """
+    return await pdbe_fetch(pdb_ids, save_dir)


 @mcp.tool
@@ -163,7 +175,17 @@ def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[A
     return alphafold_fetch(uniprot_accs, save_dir, what)


-mcp.tool(emdb_fetch, name="fetch_emdb_volumes")
+@mcp.tool
+async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
+    """Fetch EMDB volumes for given EMDB IDs.
+
+    Args:
+        emdb_ids: A set of EMDB IDs.
+        save_dir: The directory to save the fetched files.
+    Returns:
+        A mapping of EMDB ID to the path of the fetched volume file.
+    """
+    return await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)


 @mcp.tool

src/protein_quest/pdbe/fetch.py

@@ -3,7 +3,7 @@
 from collections.abc import Iterable, Mapping
 from pathlib import Path

-from protein_quest.utils import retrieve_files, run_async
+from protein_quest.utils import Cacher, retrieve_files, run_async


 def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
@@ -28,13 +28,16 @@ def _map_id_mmcif(pdb_id: str) -> tuple[str, str]:
     return url, fn


-async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5) -> Mapping[str, Path]:
+async def fetch(
+    ids: Iterable[str], save_dir: Path, max_parallel_downloads: int = 5, cacher: Cacher | None = None
+) -> Mapping[str, Path]:
     """Fetches mmCIF files from the PDBe database.

     Args:
         ids: A set of PDB IDs to fetch.
         save_dir: The directory to save the fetched mmCIF files to.
         max_parallel_downloads: The maximum number of parallel downloads.
+        cacher: An optional cacher to use for caching downloaded files.

     Returns:
         A dict of id and paths to the downloaded mmCIF files.
@@ -47,7 +50,7 @@ async def fetch(ids: Iterable[str], save_dir: Path, max_parallel_downloads: int
     urls = list(id2urls.values())
     id2paths = {pdb_id: save_dir / fn for pdb_id, (_, fn) in id2urls.items()}

-    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files")
+    await retrieve_files(urls, save_dir, max_parallel_downloads, desc="Downloading PDBe mmCIF files", cacher=cacher)
     return id2paths

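
Both the EMDB and PDBe fetchers now follow the same pattern: an optional cacher that is threaded through to retrieve_files. A sketch of calling the PDBe fetcher directly, assuming only the signature shown above; the PDB IDs come from this release's test fixtures and the output directory is an example.

```python
# Sketch only: the fetch() signature comes from the hunk above; whether the
# library creates save_dir itself is not visible here, so it is created first.
import asyncio
from pathlib import Path

from protein_quest.pdbe.fetch import fetch as pdbe_fetch
from protein_quest.utils import DirectoryCacher, user_cache_root_dir


async def main() -> None:
    save_dir = Path("downloads-pdbe")
    save_dir.mkdir(parents=True, exist_ok=True)
    cacher = DirectoryCacher(cache_dir=user_cache_root_dir(), copy_method="symlink")
    id2path = await pdbe_fetch(
        ["2y29", "3jrs"],
        save_dir,
        max_parallel_downloads=5,
        cacher=cacher,  # cacher=None (the default) bypasses the shared cache
    )
    for pdb_id, path in id2path.items():
        print(pdb_id, "->", path)


asyncio.run(main())
```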

src/protein_quest/uniprot.py

@@ -525,7 +525,9 @@ def _build_complex_sparql_query(uniprot_accs: Iterable[str], limit: int) -> str:
         ?protein
         ?cp_db
         ?cp_comment
-        (GROUP_CONCAT(DISTINCT ?member; separator=",") AS ?complex_members)
+        (GROUP_CONCAT(
+            DISTINCT STRAFTER(STR(?member), "http://purl.uniprot.org/uniprot/"); separator=","
+        ) AS ?complex_members)
         (COUNT(DISTINCT ?member) AS ?member_count)
     WHERE {
         # Input UniProt accessions
@@ -550,7 +552,9 @@ def _build_complex_sparql_query(uniprot_accs: Iterable[str], limit: int) -> str:
     """
     select_clause = dedent("""\
         ?protein ?cp_db ?cp_comment
-        (GROUP_CONCAT(DISTINCT ?member; separator=",") AS ?complex_members)
+        (GROUP_CONCAT(
+            DISTINCT STRAFTER(STR(?member), "http://purl.uniprot.org/uniprot/"); separator=","
+        ) AS ?complex_members)
     """)
     where_clause = dedent("""
         # --- Complex Info ---
@@ -596,7 +600,7 @@ def _flatten_results_complex(raw_results) -> list[ComplexPortalEntry]:
         complex_id = raw_result["cp_db"]["value"].split("/")[-1]
         complex_url = f"https://www.ebi.ac.uk/complexportal/complex/{complex_id}"
         complex_title = raw_result.get("cp_comment", {}).get("value", "")
-        members = {m.split("/")[-1] for m in raw_result["complex_members"]["value"].split(",")}
+        members = set(raw_result["complex_members"]["value"].split(","))
         results.append(
             ComplexPortalEntry(
                 query_protein=query_protein,