protein-quest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic. Click here for more details.

@@ -1 +1,2 @@
1
- __version__ = "0.3.0"
1
+ __version__ = "0.3.2"
2
+ """The version of the package."""
@@ -7,7 +7,10 @@ from pathlib import Path
7
7
 
8
8
  import gemmi
9
9
 
10
+ from protein_quest.converter import Percentage, PositiveInt, converter
10
11
  from protein_quest.pdbe.io import write_structure
12
+ from protein_quest.ss import nr_of_residues_in_total
13
+ from protein_quest.utils import CopyMethod, copyfile
11
14
 
12
15
  """
13
16
  Methods to filter AlphaFoldDB structures on confidence scores.
@@ -73,13 +76,25 @@ class ConfidenceFilterQuery:
73
76
  Parameters:
74
77
  confidence: The confidence threshold for filtering residues.
75
78
  Residues with a pLDDT (b-factor) above this value are considered high confidence.
76
- min_threshold: The minimum number of high-confidence residues required to keep the structure.
77
- max_threshold: The maximum number of high-confidence residues required to keep the structure.
79
+ min_residues: The minimum number of high-confidence residues required to keep the structure.
80
+ max_residues: The maximum number of high-confidence residues required to keep the structure.
78
81
  """
79
82
 
80
- confidence: float
81
- min_threshold: int
82
- max_threshold: int
83
+ confidence: Percentage
84
+ min_residues: PositiveInt
85
+ max_residues: PositiveInt
86
+
87
+
88
+ base_query_hook = converter.get_structure_hook(ConfidenceFilterQuery)
89
+
90
+
91
+ @converter.register_structure_hook
92
+ def confidence_filter_query_hook(val, _type) -> ConfidenceFilterQuery:
93
+ result: ConfidenceFilterQuery = base_query_hook(val, _type)
94
+ if result.min_residues > result.max_residues:
95
+ msg = f"min_residues {result.min_residues} cannot be larger than max_residues {result.max_residues}"
96
+ raise ValueError(msg)
97
+ return result
83
98
 
84
99
 
85
100
  @dataclass
@@ -93,38 +108,46 @@ class ConfidenceFilterResult:
93
108
  """
94
109
 
95
110
  input_file: str
96
- count: int
111
+ count: PositiveInt
97
112
  filtered_file: Path | None = None
98
113
 
99
114
 
100
- def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
101
- """Filter a single AlphaFoldDB structure file based on confidence.
115
+ def filter_file_on_residues(
116
+ file: Path, query: ConfidenceFilterQuery, filtered_dir: Path, copy_method: CopyMethod = "copy"
117
+ ) -> ConfidenceFilterResult:
118
+ """Filter a single AlphaFoldDB structure file (*.pdb[.gz], *.cif[.gz]) based on confidence.
102
119
 
103
120
  Args:
104
121
  file: The path to the PDB file to filter.
105
122
  query: The confidence filter query.
106
123
  filtered_dir: The directory to save the filtered PDB file.
124
+ copy_method: How to copy when no residues have to be removed.
107
125
 
108
126
  Returns:
109
127
  result with filtered_file property set to Path where filtered PDB file is saved.
110
- or None if structure was filtered out.
128
+ or None if structure was filtered out.
111
129
  """
112
130
  structure = gemmi.read_structure(str(file))
113
131
  residues = set(find_high_confidence_residues(structure, query.confidence))
114
132
  count = len(residues)
115
- if count < query.min_threshold or count > query.max_threshold:
133
+ if count < query.min_residues or count > query.max_residues:
116
134
  # Skip structure that is outside the min and max threshold
117
135
  # just return number of high confidence residues
118
136
  return ConfidenceFilterResult(
119
137
  input_file=file.name,
120
138
  count=count,
121
139
  )
140
+ total_residues = nr_of_residues_in_total(structure)
122
141
  filtered_file = filtered_dir / file.name
123
- new_structure = filter_out_low_confidence_residues(
124
- structure,
125
- residues,
126
- )
127
- write_structure(new_structure, filtered_file)
142
+ if count == total_residues:
143
+ # if no residues have to be removed then copy instead of slower gemmi writing
144
+ copyfile(file, filtered_file, copy_method)
145
+ else:
146
+ new_structure = filter_out_low_confidence_residues(
147
+ structure,
148
+ residues,
149
+ )
150
+ write_structure(new_structure, filtered_file)
128
151
  return ConfidenceFilterResult(
129
152
  input_file=file.name,
130
153
  count=count,
@@ -133,7 +156,10 @@ def filter_file_on_residues(file: Path, query: ConfidenceFilterQuery, filtered_d
133
156
 
134
157
 
135
158
  def filter_files_on_confidence(
136
- alphafold_pdb_files: list[Path], query: ConfidenceFilterQuery, filtered_dir: Path
159
+ alphafold_pdb_files: list[Path],
160
+ query: ConfidenceFilterQuery,
161
+ filtered_dir: Path,
162
+ copy_method: CopyMethod = "copy",
137
163
  ) -> Generator[ConfidenceFilterResult]:
138
164
  """Filter AlphaFoldDB structures based on confidence.
139
165
 
@@ -141,6 +167,7 @@ def filter_files_on_confidence(
141
167
  alphafold_pdb_files: List of mmcif/PDB files from AlphaFoldDB to filter.
142
168
  query: The confidence filter query containing the confidence thresholds.
143
169
  filtered_dir: Directory where the filtered mmcif/PDB files will be saved.
170
+ copy_method: How to copy when a direct copy is possible.
144
171
 
145
172
  Yields:
146
173
  For each mmcif/PDB files yields whether it was filtered or not,
@@ -150,4 +177,4 @@ def filter_files_on_confidence(
150
177
  # In ../filter.py:filter_files_on_residues() we filter on number of residues on a file level
151
178
  # here we filter on file level and inside file remove low confidence residues
152
179
  for pdb_file in alphafold_pdb_files:
153
- yield filter_file_on_residues(pdb_file, query, filtered_dir)
180
+ yield filter_file_on_residues(pdb_file, query, filtered_dir, copy_method)
@@ -1,12 +1,14 @@
1
1
  # ruff: noqa: N815 allow camelCase follow what api returns
2
2
  from dataclasses import dataclass
3
3
 
4
+ from yarl import URL
5
+
4
6
 
5
7
  @dataclass
6
8
  class EntrySummary:
7
9
  """Dataclass representing a summary of an AlphaFold entry.
8
10
 
9
- Modelled after EntrySummary in https://alphafold.ebi.ac.uk/api/openapi.json
11
+ Modelled after EntrySummary in [https://alphafold.ebi.ac.uk/api/openapi.json](https://alphafold.ebi.ac.uk/api/openapi.json)
10
12
  """
11
13
 
12
14
  entryId: str
@@ -21,17 +23,17 @@ class EntrySummary:
21
23
  modelCreatedDate: str
22
24
  latestVersion: int
23
25
  allVersions: list[int]
24
- bcifUrl: str
25
- cifUrl: str
26
- pdbUrl: str
27
- paeImageUrl: str
28
- paeDocUrl: str
26
+ bcifUrl: URL
27
+ cifUrl: URL
28
+ pdbUrl: URL
29
+ paeImageUrl: URL
30
+ paeDocUrl: URL
29
31
  gene: str | None = None
30
32
  sequenceChecksum: str | None = None
31
33
  sequenceVersionDate: str | None = None
32
- amAnnotationsUrl: str | None = None
33
- amAnnotationsHg19Url: str | None = None
34
- amAnnotationsHg38Url: str | None = None
34
+ amAnnotationsUrl: URL | None = None
35
+ amAnnotationsHg19Url: URL | None = None
36
+ amAnnotationsHg38Url: URL | None = None
35
37
  isReviewed: bool | None = None
36
38
  isReferenceProteome: bool | None = None
37
39
  # TODO add new fields from https://alphafold.ebi.ac.uk/#/public-api/get_uniprot_summary_api_uniprot_summary__qualifier__json_get
@@ -1,26 +1,26 @@
1
1
  """Module for fetch Alphafold data."""
2
2
 
3
- import asyncio
4
3
  import logging
5
4
  from asyncio import Semaphore
6
5
  from collections.abc import AsyncGenerator, Iterable
7
6
  from dataclasses import dataclass
8
7
  from pathlib import Path
9
- from textwrap import dedent
10
- from typing import Literal
8
+ from typing import Literal, cast, get_args
11
9
 
12
10
  from aiohttp_retry import RetryClient
13
11
  from aiopath import AsyncPath
14
- from cattrs.preconf.orjson import make_converter
15
12
  from tqdm.asyncio import tqdm
13
+ from yarl import URL
16
14
 
17
15
  from protein_quest.alphafold.entry_summary import EntrySummary
18
- from protein_quest.utils import friendly_session, retrieve_files
16
+ from protein_quest.converter import converter
17
+ from protein_quest.utils import friendly_session, retrieve_files, run_async
19
18
 
20
19
  logger = logging.getLogger(__name__)
21
- converter = make_converter()
20
+
22
21
 
23
22
  DownloadableFormat = Literal[
23
+ "summary",
24
24
  "bcif",
25
25
  "cif",
26
26
  "pdb",
@@ -32,16 +32,7 @@ DownloadableFormat = Literal[
32
32
  ]
33
33
  """Types of formats that can be downloaded from the AlphaFold web service."""
34
34
 
35
- downloadable_formats: set[DownloadableFormat] = {
36
- "bcif",
37
- "cif",
38
- "pdb",
39
- "paeImage",
40
- "paeDoc",
41
- "amAnnotations",
42
- "amAnnotationsHg19",
43
- "amAnnotationsHg38",
44
- }
35
+ downloadable_formats: set[DownloadableFormat] = set(get_args(DownloadableFormat))
45
36
  """Set of formats that can be downloaded from the AlphaFold web service."""
46
37
 
47
38
 
@@ -59,6 +50,7 @@ class AlphaFoldEntry:
59
50
 
60
51
  uniprot_acc: str
61
52
  summary: EntrySummary | None
53
+ summary_file: Path | None = None
62
54
  bcif_file: Path | None = None
63
55
  cif_file: Path | None = None
64
56
  pdb_file: Path | None = None
@@ -127,10 +119,6 @@ async def fetch_summary(
127
119
 
128
120
  Returns:
129
121
  A list of EntrySummary objects representing the fetched summary.
130
-
131
- Raises:
132
- HTTPError: If the HTTP request returns an error status code.
133
- Exception: If there is an error during file reading/writing or data conversion.
134
122
  """
135
123
  url = f"https://alphafold.ebi.ac.uk/api/prediction/{qualifier}"
136
124
  fn: AsyncPath | None = None
@@ -144,6 +132,7 @@ async def fetch_summary(
144
132
  response.raise_for_status()
145
133
  raw_data = await response.content.read()
146
134
  if fn is not None:
135
+ # TODO return fn and make it part of AlphaFoldEntry as summary_file prop
147
136
  await fn.write_bytes(raw_data)
148
137
  return converter.loads(raw_data, list[EntrySummary])
149
138
 
@@ -164,19 +153,14 @@ async def fetch_summaries(
164
153
  yield summary
165
154
 
166
155
 
167
- def url2name(url: str) -> str:
168
- """Given a URL, return the final path component as the name of the file."""
169
- return url.split("/")[-1]
170
-
171
-
172
156
  async def fetch_many_async(
173
- ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
157
+ uniprot_accessions: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
174
158
  ) -> AsyncGenerator[AlphaFoldEntry]:
175
- """Asynchronously fetches summaries and pdb and pae (predicted alignment error) files from
159
+ """Asynchronously fetches summaries and files from
176
160
  [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).
177
161
 
178
162
  Args:
179
- ids: A set of Uniprot IDs to fetch.
163
+ uniprot_accessions: A set of Uniprot acessions to fetch.
180
164
  save_dir: The directory to save the fetched files to.
181
165
  what: A set of formats to download.
182
166
  max_parallel_downloads: The maximum number of parallel downloads.
@@ -184,7 +168,13 @@ async def fetch_many_async(
184
168
  Yields:
185
169
  A dataclass containing the summary, pdb file, and pae file.
186
170
  """
187
- summaries = [s async for s in fetch_summaries(ids, save_dir, max_parallel_downloads=max_parallel_downloads)]
171
+ save_dir_for_summaries = save_dir if "summary" in what and save_dir is not None else None
172
+ summaries = [
173
+ s
174
+ async for s in fetch_summaries(
175
+ uniprot_accessions, save_dir_for_summaries, max_parallel_downloads=max_parallel_downloads
176
+ )
177
+ ]
188
178
 
189
179
  files = files_to_download(what, summaries)
190
180
 
@@ -198,30 +188,31 @@ async def fetch_many_async(
198
188
  yield AlphaFoldEntry(
199
189
  uniprot_acc=summary.uniprotAccession,
200
190
  summary=summary,
201
- bcif_file=save_dir / url2name(summary.bcifUrl) if "bcif" in what else None,
202
- cif_file=save_dir / url2name(summary.cifUrl) if "cif" in what else None,
203
- pdb_file=save_dir / url2name(summary.pdbUrl) if "pdb" in what else None,
204
- pae_image_file=save_dir / url2name(summary.paeImageUrl) if "paeImage" in what else None,
205
- pae_doc_file=save_dir / url2name(summary.paeDocUrl) if "paeDoc" in what else None,
191
+ summary_file=save_dir / f"{summary.uniprotAccession}.json" if save_dir_for_summaries is not None else None,
192
+ bcif_file=save_dir / summary.bcifUrl.name if "bcif" in what else None,
193
+ cif_file=save_dir / summary.cifUrl.name if "cif" in what else None,
194
+ pdb_file=save_dir / summary.pdbUrl.name if "pdb" in what else None,
195
+ pae_image_file=save_dir / summary.paeImageUrl.name if "paeImage" in what else None,
196
+ pae_doc_file=save_dir / summary.paeDocUrl.name if "paeDoc" in what else None,
206
197
  am_annotations_file=(
207
- save_dir / url2name(summary.amAnnotationsUrl)
198
+ save_dir / summary.amAnnotationsUrl.name
208
199
  if "amAnnotations" in what and summary.amAnnotationsUrl
209
200
  else None
210
201
  ),
211
202
  am_annotations_hg19_file=(
212
- save_dir / url2name(summary.amAnnotationsHg19Url)
203
+ save_dir / summary.amAnnotationsHg19Url.name
213
204
  if "amAnnotationsHg19" in what and summary.amAnnotationsHg19Url
214
205
  else None
215
206
  ),
216
207
  am_annotations_hg38_file=(
217
- save_dir / url2name(summary.amAnnotationsHg38Url)
208
+ save_dir / summary.amAnnotationsHg38Url.name
218
209
  if "amAnnotationsHg38" in what and summary.amAnnotationsHg38Url
219
210
  else None
220
211
  ),
221
212
  )
222
213
 
223
214
 
224
- def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[str, str]]:
215
+ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySummary]) -> set[tuple[URL, str]]:
225
216
  if not (set(what) <= downloadable_formats):
226
217
  msg = (
227
218
  f"Invalid format(s) specified: {set(what) - downloadable_formats}. "
@@ -229,24 +220,21 @@ def files_to_download(what: set[DownloadableFormat], summaries: Iterable[EntrySu
229
220
  )
230
221
  raise ValueError(msg)
231
222
 
232
- files: set[tuple[str, str]] = set()
223
+ files: set[tuple[URL, str]] = set()
233
224
  for summary in summaries:
234
225
  for fmt in what:
235
- url = getattr(summary, f"{fmt}Url", None)
226
+ if fmt == "summary":
227
+ # summary is handled already in fetch_summary
228
+ continue
229
+ url = cast("URL | None", getattr(summary, f"{fmt}Url", None))
236
230
  if url is None:
237
231
  logger.warning(f"Summary {summary.uniprotAccession} does not have a URL for format '{fmt}'. Skipping.")
238
232
  continue
239
- file = (url, url2name(url))
233
+ file = (url, url.name)
240
234
  files.add(file)
241
235
  return files
242
236
 
243
237
 
244
- class NestedAsyncIOLoopError(RuntimeError):
245
- """Custom error for nested async I/O loops."""
246
-
247
- pass
248
-
249
-
250
238
  def fetch_many(
251
239
  ids: Iterable[str], save_dir: Path, what: set[DownloadableFormat], max_parallel_downloads: int = 5
252
240
  ) -> list[AlphaFoldEntry]:
@@ -260,9 +248,6 @@ def fetch_many(
260
248
 
261
249
  Returns:
262
250
  A list of AlphaFoldEntry dataclasses containing the summary, pdb file, and pae file.
263
-
264
- Raises:
265
- NestedAsyncIOLoopError: If called from a nested async I/O loop like in a Jupyter notebook.
266
251
  """
267
252
 
268
253
  async def gather_entries():
@@ -271,19 +256,7 @@ def fetch_many(
271
256
  async for entry in fetch_many_async(ids, save_dir, what, max_parallel_downloads=max_parallel_downloads)
272
257
  ]
273
258
 
274
- try:
275
- return asyncio.run(gather_entries())
276
- except RuntimeError as e:
277
- msg = dedent("""\
278
- Can not run async method from an environment where the asyncio event loop is already running.
279
- Like a Jupyter notebook.
280
-
281
- Please use the `fetch_many_async` function directly or before call
282
-
283
- import nest_asyncio
284
- nest_asyncio.apply()
285
- """)
286
- raise NestedAsyncIOLoopError(msg) from e
259
+ return run_async(gather_entries())
287
260
 
288
261
 
289
262
  def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
@@ -299,6 +272,7 @@ def relative_to(entry: AlphaFoldEntry, session_dir: Path) -> AlphaFoldEntry:
299
272
  return AlphaFoldEntry(
300
273
  uniprot_acc=entry.uniprot_acc,
301
274
  summary=entry.summary,
275
+ summary_file=entry.summary_file.relative_to(session_dir) if entry.summary_file else None,
302
276
  bcif_file=entry.bcif_file.relative_to(session_dir) if entry.bcif_file else None,
303
277
  cif_file=entry.cif_file.relative_to(session_dir) if entry.cif_file else None,
304
278
  pdb_file=entry.pdb_file.relative_to(session_dir) if entry.pdb_file else None,