gentroutils 3.1.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gentroutils/io/transfer/ftp_to_gcs.py

@@ -1,11 +1,12 @@
  """Transfer files from FTP to Google Cloud Storage (GCS)."""

+ import asyncio
  import io
  import re
  from typing import Annotated

  import aioftp
- from google.cloud import storage # type: ignore[attr-defined]
+ from google.cloud import storage
  from loguru import logger
  from pydantic import AfterValidator

@@ -24,6 +25,36 @@ class FTPtoGCPTransferableObject(TransferableObject):

          This function fetches the data for the file provided in the local FTP path, collects the
          data asynchronously to buffer, and uploads it to the provided GCP bucket blob.
+
+         Implements retry logic with exponential backoff for handling transient network errors.
+         """
+         max_retries = 3
+         retry_delay = 1 # Initial delay in seconds
+
+         for attempt in range(max_retries):
+             try:
+                 await self._perform_transfer()
+                 return # Success, exit the retry loop
+             except (ConnectionResetError, OSError, aioftp.errors.AIOFTPException) as e:
+                 if attempt < max_retries - 1:
+                     wait_time = retry_delay * (2**attempt) # Exponential backoff
+                     logger.warning(
+                         f"Transfer attempt {attempt + 1}/{max_retries} failed for {self.source}: {e}. "
+                         f"Retrying in {wait_time}s..."
+                     )
+                     await asyncio.sleep(wait_time)
+                 else:
+                     logger.error(f"Transfer failed after {max_retries} attempts for {self.source}: {e}")
+                     raise
+             except Exception as e:
+                 # For non-retryable exceptions, log and raise immediately
+                 logger.error(f"Non-retryable error during transfer from {self.source} to {self.destination}: {e}")
+                 raise
+
+     async def _perform_transfer(self) -> None:
+         """Perform the actual transfer operation.
+
+         This is separated from the transfer method to allow for retry logic.
          """
          logger.info(f"Attempting to transfer data from {self.source} to {self.destination}.")
          gcs_obj = GCSPath(self.destination)
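As an illustration only (not part of the package diff): a self-contained sketch of the retry-with-exponential-backoff pattern that the new `transfer` method applies around `_perform_transfer`. The `do_transfer` coroutine is a hypothetical stand-in; with three attempts and a 1 s base delay, the waits between failed attempts are 1 s and then 2 s.

```python
import asyncio

attempts_made = 0


async def do_transfer() -> None:
    """Hypothetical stand-in for the real FTP-to-GCS transfer: fails twice, then succeeds."""
    global attempts_made
    attempts_made += 1
    if attempts_made < 3:
        raise ConnectionResetError("connection dropped")


async def transfer_with_backoff(max_retries: int = 3, retry_delay: float = 1.0) -> None:
    """Retry a flaky coroutine, waiting retry_delay * 2**attempt between failures (1s, then 2s)."""
    for attempt in range(max_retries):
        try:
            await do_transfer()
            print(f"succeeded on attempt {attempt + 1}")
            return
        except (ConnectionResetError, OSError) as exc:
            if attempt < max_retries - 1:
                wait_time = retry_delay * (2 ** attempt)
                print(f"attempt {attempt + 1}/{max_retries} failed ({exc}); retrying in {wait_time}s")
                await asyncio.sleep(wait_time)
            else:
                raise  # out of attempts, surface the last error


asyncio.run(transfer_with_backoff())
```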
@@ -39,23 +70,74 @@ class FTPtoGCPTransferableObject(TransferableObject):
                  logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
                  release_date = dir_match.group("release_date")
                  try:
+                     logger.debug(f"We are in the directory: {await ftp.get_current_directory()}")
+                     logger.debug(f"Changing directory to: {ftp_obj.base_dir}")
                      await ftp.change_directory(ftp_obj.base_dir)
+                     logger.success(f"Successfully changed directory to: {ftp_obj.base_dir}")
                  except aioftp.StatusCodeError as e:
-                     logger.error(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
-                     logger.warning("Attempting to load the `latest` release.")
-                     ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
+                     logger.warning(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
+                     logger.warning(f"Probably the release date {release_date} is out of sync with the api endpoint.")
                      try:
+                         logger.warning("Attempting to load the `latest` release.")
+                         ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
                          await ftp.change_directory(ftp_obj.base_dir)
+                         logger.success(f"Successfully changed directory to: {ftp_obj.base_dir}")
+
                      except aioftp.StatusCodeError as e:
                          logger.error(f"Failed to find the latest release under {ftp_obj}")
                          raise

-                 buffer = io.BytesIO()
-                 stream = await ftp.download_stream(ftp_obj.filename)
-                 async with stream:
-                     async for block in stream.iter_by_block():
-                         buffer.write(block)
-                 buffer.seek(0)
-                 content = buffer.getvalue().decode("utf-8")
-                 buffer.close()
-                 blob.upload_from_string("".join(content))
+                 logger.debug("Creating in-memory buffer to store downloaded data.")
+                 buffer = io.BytesIO()
+                 logger.debug(f"Downloading data from FTP path: {ftp_obj.filename}")
+                 stream = await ftp.download_stream(ftp_obj.filename)
+                 logger.info("Successfully connected to the FTP stream, beginning data transfer to buffer.")
+                 async with stream:
+                     async for block in stream.iter_by_block():
+                         buffer.write(block)
+                 buffer.seek(0)
+                 if ftp_obj.filename.endswith(".zip"):
+                     logger.info("Uploading zipped content to GCS blob.")
+                     logger.info("Unzipping content before upload.")
+                     content = unzip_buffer(buffer)
+                     blob.upload_from_string(content)
+                 else:
+                     content = buffer.getvalue()
+                     buffer.close()
+                     blob.upload_from_string(content)
+
+             else:
+                 logger.error(f"Failed to extract release date from the provided ftp path: {ftp_obj.base_dir}.")
+                 raise ValueError("Release date could not be extracted from the FTP path.")
+
+
+ def unzip_buffer(buffer: io.BytesIO) -> bytes:
+     """Unzip a BytesIO buffer and return a dictionary of file names to their content.
+
+     Args:
+         buffer (io.BytesIO): The in-memory buffer containing zipped data.
+
+     Returns:
+         bytes: The unzipped content of the single file.
+
+     Raises:
+         ValueError: If multiple files are found in the zipped buffer or if no files are found.
+     """
+     import zipfile
+
+     unzipped_files: dict[str, bytes] = {}
+     with zipfile.ZipFile(buffer) as z:
+         for file_info in z.infolist():
+             with z.open(file_info) as unzipped_file:
+                 unzipped_files[file_info.filename] = unzipped_file.read()
+
+     if len(unzipped_files) == 0:
+         logger.error("No files were found in the zipped buffer.")
+         raise ValueError("No files were found in the zipped buffer.")
+     if len(unzipped_files) != 1:
+         logger.error("Multiple files were found in the zipped buffer.")
+         raise ValueError("Multiple files were found in the zipped buffer.")
+     keys = list(unzipped_files.keys())
+     logger.info(f"Unzipped file: {keys[0]} with size {len(unzipped_files[keys[0]])} bytes.")
+
+     return unzipped_files[keys[0]]
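Again for illustration (not from the package): a minimal round trip showing the single-member contract that `unzip_buffer` enforces — zip exactly one file into an in-memory buffer, unzip it, and get its bytes back. The member name `associations.tsv` is made up; an archive with zero or several members would raise `ValueError`.

```python
import io
import zipfile


def unzip_single_file(buffer: io.BytesIO) -> bytes:
    """Same idea as unzip_buffer above: accept exactly one member, return its bytes."""
    with zipfile.ZipFile(buffer) as archive:
        names = archive.namelist()
        if len(names) != 1:
            raise ValueError(f"Expected exactly one file in the archive, found {len(names)}.")
        return archive.read(names[0])


# Build an in-memory zip with a single (hypothetical) member and read it back.
raw = io.BytesIO()
with zipfile.ZipFile(raw, mode="w") as archive:
    archive.writestr("associations.tsv", "studyId\tpValue\nGCST000001\t1e-8\n")
raw.seek(0)

content = unzip_single_file(raw)
assert content.decode("utf-8").startswith("studyId")
```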
gentroutils/io/transfer/polars_to_gcs.py

@@ -16,5 +16,5 @@ class PolarsDataFrameToGCSTransferableObject(TransferableObject):
          """Transfer the Polars DataFrame to the specified GCS destination."""
          # Convert Polars DataFrame to CSV and upload to GCS
          logger.info(f"Transferring Polars DataFrame to {self.destination}.")
-         self.source.write_csv(self.destination)
+         self.source.write_csv(self.destination, separator="\t", include_header=True)
          logger.info(f"Uploading DataFrame to {self.destination}")
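A hedged sketch (not part of the diff) of what the new `write_csv(..., separator="\t", include_header=True)` call produces: the output is tab-separated despite the method name, so anything reading it back needs the matching separator. The sketch uses a local in-memory buffer instead of a GCS destination.

```python
import io

import polars as pl

df = pl.DataFrame({"studyId": ["GCST000001", "GCST000002"], "status": ["curated", "to_curate"]})

# Write the frame as TSV to an in-memory buffer, with the same options the transfer now uses.
buffer = io.BytesIO()
df.write_csv(buffer, separator="\t", include_header=True)

# Reading it back requires the matching separator.
buffer.seek(0)
round_tripped = pl.read_csv(buffer, separator="\t")
assert round_tripped.equals(df)
```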
gentroutils/parsers/curation.py

@@ -5,6 +5,7 @@ from __future__ import annotations
  from enum import StrEnum

  import polars as pl
+ from google.cloud.storage import Client
  from loguru import logger

  from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
@@ -69,31 +70,102 @@ class DownloadStudiesSchema(StrEnum):
          return [member.value for member in cls]


+ class SyncedSummaryStatisticsSchema(StrEnum):
+     """Enum to define the columns for synced summary statistics."""
+
+     FILE_PATH = "filePath"
+     """The GCS file path of the summary statistics file."""
+     SYNCED = "isSynced"
+     """Flag indicating whether the file has been synced."""
+     STUDY_ID = "studyId"
+     """The unique identifier for a study."""
+
+     @classmethod
+     def columns(cls) -> list[str]:
+         """Get the list of columns defined in the schema."""
+         return [member.value for member in cls]
+
+
  class CuratedStudyStatus(StrEnum):
      """Enum to define the status of a curated study."""

      REMOVED = "removed"
      """The study has been removed from the GWAS Catalog."""
-     NEW = "new"
-     """The study is new in the GWAS Catalog."""
+     TO_CURATE = "to_curate"
+     """The study is new and needs to be curated."""
      CURATED = "curated"
      """The study has been curated and is still in the GWAS Catalog."""
+     NO_SUMSTATS = "no_summary_statistics"
+     """The study has no associated summary statistics."""
+
+
+ class GCSSummaryStatisticsFileCrawler:
+     """Class to crawl GCS for summary statistics files."""
+
+     def __init__(self, gcs_glob: str):
+         """Initialize the GCSSummaryStatisticsFileCrawler with a GCS glob pattern."""
+         self.gcs_glob = gcs_glob
+         logger.debug("Initialized GCSSummaryStatisticsFileCrawler with glob: {}", gcs_glob)
+
+     def _fetch_paths(self) -> list[str]:
+         """Fetch file paths from GCS based on the glob pattern."""
+         # Implementation to fetch file paths from GCS
+         c = Client()
+         bucket_name = self.gcs_glob.split("/")[2]
+         prefix = "/".join(self.gcs_glob.split("/")[3:-1])
+         suffix = self.gcs_glob.split("/")[-1].replace("*", "")
+         logger.debug("Crawling GCS bucket: {}, prefix: {}, suffix: {}", bucket_name, prefix, suffix)
+         bucket = c.bucket(bucket_name)
+         blobs = bucket.list_blobs(prefix=prefix)
+         return [f"gs://{bucket_name}/{blob.name}" for blob in blobs if blob.name.endswith(suffix)]
+
+     def crawl(self) -> pl.DataFrame:
+         """Crawl GCS and return a DataFrame of summary statistics files."""
+         # Implementation to crawl GCS and return a DataFrame
+         file_paths = self._fetch_paths()
+         logger.debug("Found {} summary statistics files.", len(file_paths))
+         data = pl.DataFrame({
+             SyncedSummaryStatisticsSchema.FILE_PATH: file_paths,
+             SyncedSummaryStatisticsSchema.SYNCED: [True] * len(file_paths),
+         }).with_columns(
+             pl.col(SyncedSummaryStatisticsSchema.FILE_PATH)
+             .str.extract(r"\/(GCST\d+)\/", 1)
+             .alias(SyncedSummaryStatisticsSchema.STUDY_ID)
+         )
+         # Post check to find if there are any studies with multiple files.
+         multi_files = data.group_by(SyncedSummaryStatisticsSchema.STUDY_ID).len().filter(pl.col("len") > 1)
+         if not multi_files.is_empty():
+             logger.warning("Studies with multiple summary statistics files found: {}", multi_files)
+             logger.warning("DataFrame shape before deduplication: {}", data.shape)
+             logger.warning("Synced data preview:\n{}", data.head())
+             data = data.unique(subset=SyncedSummaryStatisticsSchema.STUDY_ID)
+             logger.warning("Synced data after deduplication:\n{}", data.shape)
+         return data


  class GWASCatalogCuration:
      """Class to handle the curation of GWAS Catalog data."""

-     def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame):
+     def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame, synced: pl.DataFrame):
          """Initialize the GWASCatalogCuration with previous curation and studies data."""
          logger.debug("Initializing GWASCatalogCuration with previous curation and studies data.")
          self.previous_curation = previous_curation
          logger.debug("Previous curation data loaded with shape: {}", previous_curation.shape)
          self.studies = studies
          logger.debug("Studies data loaded with shape: {}", studies.shape)
+         self.synced = synced
+         logger.debug("Synced summary statistics data loaded with shape: {}", synced.shape)

      @classmethod
-     def from_prev_curation(cls, previous_curation_path: str, download_studies_path: str) -> GWASCatalogCuration:
+     def from_prev_curation(
+         cls,
+         previous_curation_path: str,
+         download_studies_path: str,
+         summary_statistics_glob: str,
+     ) -> GWASCatalogCuration:
          """Create a GWASCatalogCuration instance from previous curation and studies."""
+         crawled_summary_statistics = GCSSummaryStatisticsFileCrawler(summary_statistics_glob).crawl()
+
          previous_curation_df = pl.read_csv(
              previous_curation_path,
              separator="\t",
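For illustration (not from the package): how `_fetch_paths` above decomposes a GCS glob into the bucket, prefix, and suffix it feeds to `list_blobs`, reproduced as pure string handling so it runs without credentials. `split_gcs_glob` is a hypothetical helper that mirrors the three `split("/")` expressions in the crawler.

```python
def split_gcs_glob(gcs_glob: str) -> tuple[str, str, str]:
    """Mirror the bucket/prefix/suffix split used by GCSSummaryStatisticsFileCrawler._fetch_paths."""
    parts = gcs_glob.split("/")
    bucket_name = parts[2]             # "gs:" / "" / "<bucket>" / ...
    prefix = "/".join(parts[3:-1])     # everything between the bucket and the file pattern
    suffix = parts[-1].replace("*", "")  # file pattern with wildcards stripped
    return bucket_name, prefix, suffix


# Using the glob from the README example in this diff.
bucket, prefix, suffix = split_gcs_glob("gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz")
assert bucket == "gwas_catalog_inputs"
assert prefix == "raw_summary_statistics"
assert suffix == ".h.tsv.gz"
```

One consequence of this split is that a pattern with a directory wildcard, such as `raw_summary_statistics/**/*.tsv.gz` in the doctest below, keeps the literal `**` inside the prefix rather than expanding it.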
@@ -112,7 +184,7 @@ class GWASCatalogCuration:
          if studies_df.is_empty():
              raise GentroutilsError(GentroutilsErrorMessage.DOWNLOAD_STUDIES_EMPTY, path=download_studies_path)
          studies_df = studies_df.rename(mapping=DownloadStudiesSchema.mapping())
-         return cls(previous_curation_df, studies_df)
+         return cls(previous_curation_df, studies_df, crawled_summary_statistics)

      @property
      def result(self) -> pl.DataFrame:
@@ -144,7 +216,11 @@ class GWASCatalogCuration:
          assert all(prev_studies.select(CurationSchema.STUDY_ID).is_unique()), "Study IDs must be unique after merging."

          # Studies that are new in the GWAS Catalog
-         new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti").select(
+         new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti")
+         # Annotate new studies with info if they have summary statistics synced to the GCS bucket
+         new_studies_annotated = new_studies.join(self.synced, on=CurationSchema.STUDY_ID, how="left")
+         # Assign status NO_SUMSTATS to new studies without synced summary statistics (left join to drop info about already curated studies)
+         new_studies_annotated = new_studies_annotated.select(
              CurationSchema.STUDY_ID,
              pl.lit(None).alias(CurationSchema.STUDY_TYPE),
              pl.lit(None).alias(CurationSchema.ANALYSIS_FLAG),
@@ -153,12 +229,16 @@ class GWASCatalogCuration:
              CurationSchema.PUBMED_ID,
              CurationSchema.PUBLICATION_TITLE,
              CurationSchema.TRAIT_FROM_SOURCE,
-             pl.lit(CuratedStudyStatus.NEW).alias("status"),
+             pl.when(pl.col(SyncedSummaryStatisticsSchema.SYNCED).is_null())
+             .then(pl.lit(CuratedStudyStatus.NO_SUMSTATS))
+             .otherwise(pl.lit(CuratedStudyStatus.TO_CURATE))
+             .alias("status"),
          )
          logger.debug("New studies identified: {}", new_studies.shape[0])

          # Union of new studies and previously curated studies
-         all_studies = pl.concat([prev_studies, new_studies], how="vertical")
+         all_studies = pl.concat([prev_studies, new_studies_annotated], how="vertical")
+
          logger.debug("All studies after combining new and previous: {}", all_studies.shape[0])

          # Ensure the contract on the output dataframe
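A toy, self-contained run of the status logic introduced above (illustration only; plain column-name literals stand in for the schema enums): the anti-join keeps studies absent from the previous curation, the left join against the synced files marks which of those have summary statistics, and the `when/then/otherwise` expression assigns `to_curate` versus `no_summary_statistics`.

```python
import polars as pl

studies = pl.DataFrame({"studyId": ["GCST1", "GCST2", "GCST3"]})
previous_curation = pl.DataFrame({"studyId": ["GCST1"], "status": ["curated"]})
synced = pl.DataFrame({"studyId": ["GCST2"], "isSynced": [True]})

# Studies present in the catalog but not yet curated.
new_studies = studies.join(previous_curation, on="studyId", how="anti")

# The left join keeps every new study; isSynced is null when no file was found.
annotated = new_studies.join(synced, on="studyId", how="left").select(
    "studyId",
    pl.when(pl.col("isSynced").is_null())
    .then(pl.lit("no_summary_statistics"))
    .otherwise(pl.lit("to_curate"))
    .alias("status"),
)

print(annotated)  # GCST2 -> to_curate, GCST3 -> no_summary_statistics
```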
gentroutils/tasks/curation.py

@@ -29,6 +29,7 @@ class CurationSpec(Spec):
      ... name="curate gwas catalog data",
      ... previous_curation="gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv",
      ... studies="gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv",
+     ... summary_statistics_glob="gs://gwas_catalog_inputs/raw_summary_statistics/**/*.tsv.gz",
      ... destination_template="gs://gwas_catalog_inputs/{release_date}/pending/curation.tsv"
      ... )
      >>> cs.name
@@ -39,6 +40,8 @@ class CurationSpec(Spec):
      'gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv'
      >>> cs.destination_template
      'gs://gwas_catalog_inputs/{release_date}/pending/curation.tsv'
+     >>> cs.summary_statistics_glob
+     'gs://gwas_catalog_inputs/raw_summary_statistics/**/*.tsv.gz'
      """

      name: str = "curate gwas catalog data"
@@ -53,6 +56,9 @@ class CurationSpec(Spec):
      destination_template: Annotated[str, AfterValidator(destination_validator)]
      """The destination path for the curation data."""

+     summary_statistics_glob: str
+     """The glob pattern to locate summary statistics files."""
+
      promote: bool = False
      """Whether to promote the curation data to the latest version."""

@@ -102,7 +108,9 @@ class Curation(Task):
          logger.debug(f"Using release date: {release_date}")
          destinations = self.spec.substituted_destinations(release_date)
          logger.debug(f"Destinations for curation data: {destinations}")
-         curation = GWASCatalogCuration.from_prev_curation(self.spec.previous_curation, self.spec.studies)
+         curation = GWASCatalogCuration.from_prev_curation(
+             self.spec.previous_curation, self.spec.studies, self.spec.summary_statistics_glob
+         )
          logger.debug(f"Curation result preview:\n{curation.result.head()}")
          transfer_objects = [
              PolarsDataFrameToGCSTransferableObject(source=curation.result, destination=d) for d in destinations
gentroutils-*.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: gentroutils
- Version: 3.1.0
+ Version: 4.0.0
  Summary: Open Targets python genetics utility CLI tools
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
  License-Expression: Apache-2.0
@@ -12,13 +12,13 @@ Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: Unix
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
- Requires-Python: >=3.13
+ Requires-Python: <=3.13,>3.11
  Requires-Dist: aioftp>=0.25.1
  Requires-Dist: aiohttp>=3.11.18
  Requires-Dist: gcsfs>=2025.7.0
  Requires-Dist: google-cloud-storage>=3.1.1
  Requires-Dist: loguru>=0.7.3
- Requires-Dist: opentargets-otter>=25.0.2
+ Requires-Dist: opentargets-otter>=25.0.15
  Requires-Dist: polars[fsspec]>=1.31.0
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: tqdm>=4.67.1
@@ -99,6 +99,7 @@ steps:
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
  destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
  promote: true
  ```

@@ -164,7 +165,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv

  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.

- > [!NOTE]
+ > [!NOTE]
  > **Task parameters**
  >
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -186,7 +187,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an

  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.

- > [!NOTE]
+ > [!NOTE]
  > **Task parameters**
  >
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -205,6 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
  destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
  promote: true
  ```

@@ -218,24 +220,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
  > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
  > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
  > - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
+ > - The `summary_statistics_glob` field is used to specify the glob pattern to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.

  ---

  ## Curation process

- The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task autommates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
+ The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses an R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task automates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.

  The automated process includes:

  1. Reading `download studies` file with the list of studies that are currently coming from the latest GWAS Catalog release.
  2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
- 3. Comparing the two datasets with following logic:
+ 3. Listing all synced summary statistics files from the `summary_statistics_glob` parameter to identify which studies have summary statistics available. Note that this can be more than the list of studies in the `download studies` file, as syncing also involves the unpublished studies.
+ 4. Comparing the three datasets with the following logic:
     - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
-    * In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `new`
-    * In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
- 4. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
- 5. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
- 6. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
+    - In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `has_no_sumstats` depending on the presence of summary statistics files
+    - In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
+ 5. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
+ 6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
+ 7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should then be saved to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).

  ---

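Restating the comparison logic from step 4 of the curation-process description above as a lookup (status values as defined by the `CuratedStudyStatus` enum earlier in this diff; the `has_no_sumstats` label in the README text corresponds to the `no_summary_statistics` value):

| In `previous curation` | In `download studies` | Summary statistics synced | Resulting status |
| --- | --- | --- | --- |
| yes | yes | any | `curated` |
| no | yes | yes | `to_curate` |
| no | yes | no | `no_summary_statistics` |
| yes | no | any | `removed` |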
gentroutils-*.dist-info/RECORD

@@ -6,17 +6,17 @@ gentroutils/io/path/__init__.py,sha256=oBNiw3xr84QAFejVgRIW_PttJ2ZQHZV2uYtIklXle
  gentroutils/io/path/ftp.py,sha256=kus0GS9Bcm3IzooL8duu7fdtMDr0-Sm7LCtpiUd1Kxs,1585
  gentroutils/io/path/gcs.py,sha256=itEEyUBQLqCBV-F7JF7ZJHUn5OBObmTCsrEfib_V7p8,1514
  gentroutils/io/transfer/__init__.py,sha256=2KasKyCMTpnUvU2SVzZds7ggYBS5ReXn8q3K_TE0YYA,310
- gentroutils/io/transfer/ftp_to_gcs.py,sha256=O4wDNVyAt24_1zIkfGFdNQ-er1CtJ7J5rdj0bxrG7SI,2793
+ gentroutils/io/transfer/ftp_to_gcs.py,sha256=yWPm_V8VvAERSUF8OEprB14EETh2Ur-nr08bqEYtpSw,6714
  gentroutils/io/transfer/model.py,sha256=-REhDC8nIT_RoOlRwHpvNfqN_khOm7OnlAi8-U7qX1k,1198
- gentroutils/io/transfer/polars_to_gcs.py,sha256=l1vWDLVw5OuAjPRWEvA69w5V5cz36ARGqCOA9R9iecM,762
+ gentroutils/io/transfer/polars_to_gcs.py,sha256=75aw2CEcjy5RoXquyDGp5HaDuc5AaOHJM_IVhP_v1Ig,799
  gentroutils/parsers/__init__.py,sha256=HtgvopQ3Xx_cjC2lA3Tp81Rd5-k4CUJGJu0GD7W9r0o,59
- gentroutils/parsers/curation.py,sha256=34GvUTD9SmUpk_CEMoqJbpDyBVW3kKOyMLMiZ4nc9e0,7357
+ gentroutils/parsers/curation.py,sha256=RWD_BXlzPaVTn25Md7ln-ALHcJC5bzZPwX8YscWnEnI,11252
  gentroutils/tasks/__init__.py,sha256=WvMdT_m0F1Kos-FWaVlXLTCuRs-xQD1cRHtHEIZ_U4c,3745
  gentroutils/tasks/crawl.py,sha256=kHLT5EBFUT6yubNKPshHKNrlDMuQ7RM9c22g44VN734,6766
- gentroutils/tasks/curation.py,sha256=qBI0PF6Bby-J-113VM2xgW4a7dq8L99rRq8YMo6dDww,4675
+ gentroutils/tasks/curation.py,sha256=BQlMO0TUn8B9wlst3nQQg-u72WhWwlNOW6BT1UOMGHA,5029
  gentroutils/tasks/fetch.py,sha256=cDP6FCzSCIRq_MZGXZEVQtcLOG6TsOzIg6z_s2U6dkM,6289
- gentroutils-3.1.0.dist-info/METADATA,sha256=8ERSzEvSQV7HsySDmB6wLc6Qm7IUvVmHnV-OnzwXYjg,15272
- gentroutils-3.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- gentroutils-3.1.0.dist-info/entry_points.txt,sha256=KjODdAGrWKebI3ogqs9r6snAJ_DtLGpm1jREbZ6OXGs,49
- gentroutils-3.1.0.dist-info/licenses/LICENSE,sha256=8rMKYP7K5vL-KA2WmdBkUBz4iFRveUOWUMAFP8uc3P0,10945
- gentroutils-3.1.0.dist-info/RECORD,,
+ gentroutils-4.0.0.dist-info/METADATA,sha256=wki4WFLVTJCFcL4eigS-l6OTroVyCR_pwXuLDZsXyhI,16027
+ gentroutils-4.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ gentroutils-4.0.0.dist-info/entry_points.txt,sha256=KjODdAGrWKebI3ogqs9r6snAJ_DtLGpm1jREbZ6OXGs,49
+ gentroutils-4.0.0.dist-info/licenses/LICENSE,sha256=8rMKYP7K5vL-KA2WmdBkUBz4iFRveUOWUMAFP8uc3P0,10945
+ gentroutils-4.0.0.dist-info/RECORD,,
gentroutils-*.dist-info/WHEEL

@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: hatchling 1.27.0
+ Generator: hatchling 1.28.0
  Root-Is-Purelib: true
  Tag: py3-none-any