gentroutils 1.6.0.dev2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,12 +32,24 @@ class FTPtoGCPTransferableObject(TransferableObject):
32
32
  async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp: # noqa: S106
33
33
  bucket = storage.Client().bucket(gcs_obj.bucket)
34
34
  blob = bucket.blob(gcs_obj.object)
35
- logger.info(f"Changing directory to {ftp_obj.base_dir}.")
36
- await ftp.change_directory(ftp_obj.base_dir)
37
- pwd = await ftp.get_current_directory()
38
- dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(pwd))
35
+ logger.info(f"Searching for the release date in the provided ftp path: {ftp_obj.base_dir}.")
36
+ dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(ftp_obj.base_dir))
37
+
39
38
  if dir_match:
40
- logger.info(f"Found release date!: {dir_match.group('release_date')}")
39
+ logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
40
+ release_date = dir_match.group("release_date")
41
+ try:
42
+ await ftp.change_directory(ftp_obj.base_dir)
43
+ except aioftp.StatusCodeError as e:
44
+ logger.error(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
45
+ logger.warning("Attempting to load the `latest` release.")
46
+ ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
47
+ try:
48
+ await ftp.change_directory(ftp_obj.base_dir)
49
+ except aioftp.StatusCodeError as e:
50
+ logger.error(f"Failed to find the latest release under {ftp_obj}")
51
+ raise
52
+
41
53
  buffer = io.BytesIO()
42
54
  stream = await ftp.download_stream(ftp_obj.filename)
43
55
  async with stream:
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ from collections import defaultdict
6
7
  from dataclasses import dataclass
7
8
  from datetime import date
8
9
 
@@ -13,7 +14,12 @@ from pydantic import AliasPath, BaseModel, Field
13
14
  from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
14
15
 
15
16
 
16
- def _requires_release_date_template(path: str) -> str:
17
+ class KeepMissing(defaultdict[str, str]):
18
+ def __missing__(self, key):
19
+ return "{" + key + "}"
20
+
21
+
22
+ def destination_validator(path: str) -> str:
17
23
  """Ensure that the destination path contains a template for the release date."""
18
24
  if "{release_date}" not in path:
19
25
  raise GentroutilsError(GentroutilsErrorMessage.MISSING_RELEASE_DATE_TEMPLATE, release_date="{release_date}")
@@ -34,7 +40,7 @@ class TemplateDestination:
34
40
 
35
41
  This method returns a new TemplateDestination object (not a copy of the current one) with the formatted destination.
36
42
  """
37
- return TemplateDestination(self.destination.format(**substitutions), True)
43
+ return TemplateDestination(self.destination.format_map(KeepMissing(**substitutions)), True)
38
44
 
39
45
 
40
46
  class GwasCatalogReleaseInfo(BaseModel):
@@ -83,6 +89,7 @@ class GwasCatalogReleaseInfo(BaseModel):
83
89
  @classmethod
84
90
  def from_uri(cls, uri: str) -> GwasCatalogReleaseInfo:
85
91
  """Fetch the release information from the specified URI."""
92
+ logger.debug(f"Fetching release info from {uri}")
86
93
  try:
87
94
  return asyncio.run(cls._get_release_info(uri))
88
95
  except aiohttp.ClientError as e:
@@ -1,7 +1,6 @@
1
1
  """Module to handle the crawling of GWAS Catalog release information."""
2
2
 
3
3
  import tempfile
4
- from functools import cached_property
5
4
  from pathlib import Path
6
5
  from typing import Annotated, Any, Self
7
6
 
@@ -9,9 +8,10 @@ from loguru import logger
9
8
  from otter.storage import get_remote_storage
10
9
  from otter.task.model import Spec, Task, TaskContext
11
10
  from otter.task.task_reporter import report
12
- from pydantic import AfterValidator, computed_field
11
+ from pydantic import AfterValidator
13
12
 
14
- from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
13
+ from gentroutils.io.path import GCSPath
14
+ from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, destination_validator
15
15
 
16
16
 
17
17
  class CrawlSpec(Spec):
@@ -68,7 +68,7 @@ class CrawlSpec(Spec):
68
68
  stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
69
69
  """The URI to crawl the release statistics information from."""
70
70
 
71
- destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
71
+ destination_template: Annotated[str, AfterValidator(destination_validator)]
72
72
  """The destination path to save the release information.
73
73
  This path should always be a template string that includes `{release_date}`.
74
74
  For example, `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`.
@@ -91,8 +91,6 @@ class CrawlSpec(Spec):
91
91
  promoting the release as the latest release.
92
92
  """
93
93
 
94
- @computed_field # type: ignore[prop-decorator]
95
- @cached_property
96
94
  def destinations(self) -> list[TemplateDestination]:
97
95
  """Get the list of destinations templates where the release information will be saved.
98
96
 
@@ -105,17 +103,17 @@ class CrawlSpec(Spec):
105
103
  1. The destination template with the release date substituted.
106
104
  2. The destination with the release date substituted to `latest`.
107
105
  """
108
- d1 = self.destination_template
106
+ d1 = TemplateDestination(self.destination_template, False)
109
107
  if self.promote:
110
- d2 = self.destination_template.format(release_date="latest")
111
- return [TemplateDestination(d1, False), TemplateDestination(d2, True)]
112
- return [TemplateDestination(d1, False)]
108
+ d2 = d1.format({"release_date": "latest"})
109
+ return [d1, d2]
110
+ return [d1]
113
111
 
114
112
  def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
115
113
  """Safely parse the destination name to ensure it is valid."""
116
114
  substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
117
115
  return [
118
- d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
116
+ d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
119
117
  ]
120
118
 
121
119
  def model_post_init(self, __context: Any) -> None:
@@ -141,9 +139,10 @@ class Crawl(Task):
141
139
  logger.info(f"Destinations for release information: {destinations}")
142
140
  for destination in destinations:
143
141
  storage = get_remote_storage(destination)
142
+ assert "gs://" in destination, f"Invalid GCS path in destination template: {destination}"
144
143
  storage.upload(Path(source.name), destination)
145
144
  logger.info(f"Release information written to {destination}")
146
- return self
145
+ return self
147
146
 
148
147
  @report
149
148
  def run(self) -> Self:
@@ -3,16 +3,16 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from datetime import date
6
- from functools import cached_property
7
6
  from typing import Annotated, Any, Self
8
7
 
8
+ from loguru import logger
9
9
  from otter.task.model import Spec, Task, TaskContext
10
10
  from otter.task.task_reporter import report
11
- from pydantic import AfterValidator, computed_field
11
+ from pydantic import AfterValidator
12
12
 
13
13
  from gentroutils.io.transfer.polars_to_gcs import PolarsDataFrameToGCSTransferableObject
14
14
  from gentroutils.parsers.curation import GWASCatalogCuration
15
- from gentroutils.tasks import TemplateDestination, _requires_release_date_template
15
+ from gentroutils.tasks import TemplateDestination, destination_validator
16
16
  from gentroutils.transfer import TransferManager
17
17
 
18
18
 
@@ -50,14 +50,12 @@ class CurationSpec(Spec):
50
50
  studies: str
51
51
  """The path to the studies data."""
52
52
 
53
- destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
53
+ destination_template: Annotated[str, AfterValidator(destination_validator)]
54
54
  """The destination path for the curation data."""
55
55
 
56
56
  promote: bool = False
57
57
  """Whether to promote the curation data to the latest version."""
58
58
 
59
- @computed_field # type: ignore[prop-decorator]
60
- @cached_property
61
59
  def destinations(self) -> list[TemplateDestination]:
62
60
  """Get the list of destinations templates where the release information will be saved.
63
61
 
@@ -80,7 +78,7 @@ class CurationSpec(Spec):
80
78
  """Safely parse the destination name to ensure it is valid."""
81
79
  substitutions = {"release_date": release_date.strftime("%Y%m%d")}
82
80
  return [
83
- d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
81
+ d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
84
82
  ]
85
83
 
86
84
  def model_post_init(self, __context: Any) -> None:
@@ -99,9 +97,13 @@ class Curation(Task):
99
97
  @report
100
98
  def run(self) -> Self:
101
99
  """Run the curation task."""
100
+ logger.info("Starting curation task.")
102
101
  release_date = date.today()
102
+ logger.debug(f"Using release date: {release_date}")
103
103
  destinations = self.spec.substituted_destinations(release_date)
104
+ logger.debug(f"Destinations for curation data: {destinations}")
104
105
  curation = GWASCatalogCuration.from_prev_curation(self.spec.previous_curation, self.spec.studies)
106
+ logger.debug(f"Curation result preview:\n{curation.result.head()}")
105
107
  transfer_objects = [
106
108
  PolarsDataFrameToGCSTransferableObject(source=curation.result, destination=d) for d in destinations
107
109
  ]
@@ -1,15 +1,14 @@
1
1
  """Module to handle the fetching of GWAS Catalog release files."""
2
2
 
3
- from functools import cached_property
4
3
  from typing import Annotated, Any, Self
5
4
 
6
5
  from loguru import logger
7
6
  from otter.task.model import Spec, Task, TaskContext
8
7
  from otter.task.task_reporter import report
9
- from pydantic import AfterValidator, computed_field
8
+ from pydantic import AfterValidator
10
9
 
11
10
  from gentroutils.io.transfer import FTPtoGCPTransferableObject
12
- from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
11
+ from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, destination_validator
13
12
  from gentroutils.transfer import TransferManager
14
13
 
15
14
  MAX_CONCURRENT_CONNECTIONS = 10
@@ -57,10 +56,10 @@ class FetchSpec(Spec):
57
56
  stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
58
57
  """The URI to crawl the release statistics information from."""
59
58
 
60
- source_template: Annotated[str, AfterValidator(_requires_release_date_template)]
59
+ source_template: Annotated[str, AfterValidator(destination_validator)]
61
60
  """The template URI of the file to download."""
62
61
 
63
- destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
62
+ destination_template: Annotated[str, AfterValidator(destination_validator)]
64
63
  """The template URI to upload the file to."""
65
64
 
66
65
  promote: bool = False
@@ -78,8 +77,6 @@ class FetchSpec(Spec):
78
77
  promoting the release as the latest release.
79
78
  """
80
79
 
81
- @computed_field # type: ignore[prop-decorator]
82
- @cached_property
83
80
  def destinations(self) -> list[TemplateDestination]:
84
81
  """Get the list of destinations templates where the release information will be saved.
85
82
 
@@ -102,7 +99,7 @@ class FetchSpec(Spec):
102
99
  """Safely parse the destination name to ensure it is valid."""
103
100
  substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
104
101
  return [
105
- d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
102
+ d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
106
103
  ]
107
104
 
108
105
  def substituted_sources(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gentroutils
3
- Version: 1.6.0.dev2
3
+ Version: 3.0.0
4
4
  Summary: Open Targets python genetics utility CLI tools
5
5
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
6
6
  License-Expression: Apache-2.0
@@ -15,10 +15,11 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Requires-Python: >=3.13
16
16
  Requires-Dist: aioftp>=0.25.1
17
17
  Requires-Dist: aiohttp>=3.11.18
18
+ Requires-Dist: gcsfs>=2025.7.0
18
19
  Requires-Dist: google-cloud-storage>=3.1.1
19
20
  Requires-Dist: loguru>=0.7.3
20
21
  Requires-Dist: opentargets-otter>=25.0.2
21
- Requires-Dist: polars>=1.31.0
22
+ Requires-Dist: polars[fsspec,gcs]>=1.31.0
22
23
  Requires-Dist: pydantic>=2.10.6
23
24
  Requires-Dist: tqdm>=4.67.1
24
25
  Description-Content-Type: text/markdown
@@ -48,6 +49,7 @@ gentroutils --help
48
49
  ## Usage
49
50
 
50
51
  To run a single step run
52
+
51
53
  ```{bash}
52
54
  uv run gentroutils -s gwas_catalog_release # After cloning the repository
53
55
  gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
@@ -60,6 +62,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
60
62
 
61
63
  For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
62
64
 
65
+ > [!NOTE]
66
+ > Every `destination_template` must point to a Google Cloud Storage (GCS) bucket object.
67
+ > All `source_template` must point to the FTP server paths.
68
+ > If these requirements are not met, tasks may fail silently.
69
+
63
70
  ```yaml
64
71
  ---
65
72
  work_path: ./work
@@ -91,7 +98,7 @@ steps:
91
98
  - fetch studies
92
99
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
93
100
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
94
- destination_template: ./work/curation_{release_date}.tsv
101
+ destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
95
102
  promote: true
96
103
  ```
97
104
 
@@ -114,8 +121,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
114
121
 
115
122
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
116
123
 
117
- > [!NOTE]
118
- > **Task parameters**
124
+ > [!NOTE] > **Task parameters**
119
125
  >
120
126
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
121
127
  > - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
@@ -135,8 +141,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
135
141
 
136
142
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
137
143
 
138
- > [!NOTE]
139
- > **Task parameters**
144
+ > [!NOTE] > **Task parameters**
140
145
  >
141
146
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
142
147
  > - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -157,8 +162,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
157
162
 
158
163
  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
159
164
 
160
- > [!NOTE]
161
- > **Task parameters**
165
+ > [!NOTE] > **Task parameters**
162
166
  >
163
167
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
164
168
  > - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -179,8 +183,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
179
183
 
180
184
  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
181
185
 
182
- > [!NOTE]
183
- > **Task parameters**
186
+ > [!NOTE] > **Task parameters**
184
187
  >
185
188
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
186
189
  > - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -203,7 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
203
206
 
204
207
  This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run, because the curation file is built from the list of studies fetched from the `download studies` file.
205
208
 
206
- > [!NOTE]
209
+ > [!NOTE]
207
210
  > **Task parameters**
208
211
  >
209
212
  > - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
@@ -268,6 +271,7 @@ To check CLI execution manually you need to run
268
271
  ```{bash}
269
272
  uv run gentroutils
270
273
  ```
274
+
271
275
  ---
272
276
 
273
277
  This software was developed as part of the Open Targets project. For more
@@ -6,17 +6,17 @@ gentroutils/io/path/__init__.py,sha256=oBNiw3xr84QAFejVgRIW_PttJ2ZQHZV2uYtIklXle
6
6
  gentroutils/io/path/ftp.py,sha256=kus0GS9Bcm3IzooL8duu7fdtMDr0-Sm7LCtpiUd1Kxs,1585
7
7
  gentroutils/io/path/gcs.py,sha256=itEEyUBQLqCBV-F7JF7ZJHUn5OBObmTCsrEfib_V7p8,1514
8
8
  gentroutils/io/transfer/__init__.py,sha256=2KasKyCMTpnUvU2SVzZds7ggYBS5ReXn8q3K_TE0YYA,310
9
- gentroutils/io/transfer/ftp_to_gcs.py,sha256=SGIpxTaQoiIeW2p9JHIMckNWhFpz1Ne4YqwoiYqdWdo,2107
9
+ gentroutils/io/transfer/ftp_to_gcs.py,sha256=O4wDNVyAt24_1zIkfGFdNQ-er1CtJ7J5rdj0bxrG7SI,2793
10
10
  gentroutils/io/transfer/model.py,sha256=-REhDC8nIT_RoOlRwHpvNfqN_khOm7OnlAi8-U7qX1k,1198
11
11
  gentroutils/io/transfer/polars_to_gcs.py,sha256=l1vWDLVw5OuAjPRWEvA69w5V5cz36ARGqCOA9R9iecM,762
12
12
  gentroutils/parsers/__init__.py,sha256=HtgvopQ3Xx_cjC2lA3Tp81Rd5-k4CUJGJu0GD7W9r0o,59
13
13
  gentroutils/parsers/curation.py,sha256=34GvUTD9SmUpk_CEMoqJbpDyBVW3kKOyMLMiZ4nc9e0,7357
14
- gentroutils/tasks/__init__.py,sha256=dN9Od2_I504AZjLwBixj04M0h6dmmxGRV7FMwHHEdzM,3537
15
- gentroutils/tasks/crawl.py,sha256=njCveYWvJz6CHWQfjbciGp57yZcsKuFiZqVWR57_XeU,6838
16
- gentroutils/tasks/curation.py,sha256=cJZQmaD-44rej-8K6dUW6IlzArZt-YcxtVVTZdKh-Fk,4539
17
- gentroutils/tasks/fetch.py,sha256=jwqIQ49P--63X1EyKXIUf2iBtv3QRsuNgQWEBwlvgCk,6444
18
- gentroutils-1.6.0.dev2.dist-info/METADATA,sha256=uMWLGO-9GX8_DBbMg0hbizSwiizQQ96ayIjItzBBQUI,14941
19
- gentroutils-1.6.0.dev2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
20
- gentroutils-1.6.0.dev2.dist-info/entry_points.txt,sha256=KjODdAGrWKebI3ogqs9r6snAJ_DtLGpm1jREbZ6OXGs,49
21
- gentroutils-1.6.0.dev2.dist-info/licenses/LICENSE,sha256=8rMKYP7K5vL-KA2WmdBkUBz4iFRveUOWUMAFP8uc3P0,10945
22
- gentroutils-1.6.0.dev2.dist-info/RECORD,,
14
+ gentroutils/tasks/__init__.py,sha256=WvMdT_m0F1Kos-FWaVlXLTCuRs-xQD1cRHtHEIZ_U4c,3745
15
+ gentroutils/tasks/crawl.py,sha256=kHLT5EBFUT6yubNKPshHKNrlDMuQ7RM9c22g44VN734,6766
16
+ gentroutils/tasks/curation.py,sha256=qBI0PF6Bby-J-113VM2xgW4a7dq8L99rRq8YMo6dDww,4675
17
+ gentroutils/tasks/fetch.py,sha256=cDP6FCzSCIRq_MZGXZEVQtcLOG6TsOzIg6z_s2U6dkM,6289
18
+ gentroutils-3.0.0.dist-info/METADATA,sha256=-h_nxtgCIubVMGMvzYr0A9_zxmS8JCQ7TCNA2tmHMws,15268
19
+ gentroutils-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
20
+ gentroutils-3.0.0.dist-info/entry_points.txt,sha256=KjODdAGrWKebI3ogqs9r6snAJ_DtLGpm1jREbZ6OXGs,49
21
+ gentroutils-3.0.0.dist-info/licenses/LICENSE,sha256=8rMKYP7K5vL-KA2WmdBkUBz4iFRveUOWUMAFP8uc3P0,10945
22
+ gentroutils-3.0.0.dist-info/RECORD,,