genal-python 1.4.7__tar.gz → 1.4.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {genal_python-1.4.7 → genal_python-1.4.9}/PKG-INFO +3 -2
  2. {genal_python-1.4.7 → genal_python-1.4.9}/README.md +2 -1
  3. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/introduction.md +1 -1
  4. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/methods.md +9 -0
  5. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/workflows.md +2 -0
  6. {genal_python-1.4.7 → genal_python-1.4.9}/genal/Geno.py +22 -14
  7. {genal_python-1.4.7 → genal_python-1.4.9}/genal/__init__.py +1 -1
  8. {genal_python-1.4.7 → genal_python-1.4.9}/genal/colocalization.py +2 -2
  9. {genal_python-1.4.7 → genal_python-1.4.9}/genal/extract_prs.py +107 -46
  10. {genal_python-1.4.7 → genal_python-1.4.9}/genal/geno_tools.py +116 -6
  11. {genal_python-1.4.7 → genal_python-1.4.9}/genal/lift.py +2 -0
  12. {genal_python-1.4.7 → genal_python-1.4.9}/genal/snp_query.py +1 -1
  13. {genal_python-1.4.7 → genal_python-1.4.9}/genal/tools.py +5 -1
  14. {genal_python-1.4.7 → genal_python-1.4.9}/pyproject.toml +1 -1
  15. {genal_python-1.4.7 → genal_python-1.4.9}/.DS_Store +0 -0
  16. {genal_python-1.4.7 → genal_python-1.4.9}/.gitignore +0 -0
  17. {genal_python-1.4.7 → genal_python-1.4.9}/.readthedocs.yaml +0 -0
  18. {genal_python-1.4.7 → genal_python-1.4.9}/Genal_flowchart.png +0 -0
  19. {genal_python-1.4.7 → genal_python-1.4.9}/LICENSE +0 -0
  20. {genal_python-1.4.7 → genal_python-1.4.9}/docs/.DS_Store +0 -0
  21. {genal_python-1.4.7 → genal_python-1.4.9}/docs/Makefile +0 -0
  22. {genal_python-1.4.7 → genal_python-1.4.9}/docs/make.bat +0 -0
  23. {genal_python-1.4.7 → genal_python-1.4.9}/docs/requirements.txt +0 -0
  24. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/.DS_Store +0 -0
  25. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/Images/Genal_flowchart.png +0 -0
  26. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
  27. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/Images/genal_logo.png +0 -0
  28. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/api.md +0 -0
  29. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/concepts.md +0 -0
  30. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/conf.py +0 -0
  31. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/faq.md +0 -0
  32. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/index.md +0 -0
  33. {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/setup.md +0 -0
  34. {genal_python-1.4.7 → genal_python-1.4.9}/genal/MR.py +0 -0
  35. {genal_python-1.4.7 → genal_python-1.4.9}/genal/MR_tools.py +0 -0
  36. {genal_python-1.4.7 → genal_python-1.4.9}/genal/MRpresso.py +0 -0
  37. {genal_python-1.4.7 → genal_python-1.4.9}/genal/association.py +0 -0
  38. {genal_python-1.4.7 → genal_python-1.4.9}/genal/clump.py +0 -0
  39. {genal_python-1.4.7 → genal_python-1.4.9}/genal/constants.py +0 -0
  40. {genal_python-1.4.7 → genal_python-1.4.9}/genal/genes.py +0 -0
  41. {genal_python-1.4.7 → genal_python-1.4.9}/genal/proxy.py +0 -0
  42. {genal_python-1.4.7 → genal_python-1.4.9}/genal_logo.png +0 -0
  43. {genal_python-1.4.7 → genal_python-1.4.9}/gitignore +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: genal-python
3
- Version: 1.4.7
3
+ Version: 1.4.9
4
4
  Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
5
5
  Author-email: Cyprien Rivier <riviercyprien@gmail.com>
6
6
  Requires-Python: >=3.8
@@ -218,6 +218,7 @@ What preprocessing typically does (depending on options):
218
218
  - validates types, formats, and values of CHR/POS/EA/NEA/BETA/SE/P/EAF columns
219
219
  - detects OR vs beta columns (and log-transforms OR when needed)
220
220
  - fills missing columns (e.g., rsID from CHR/POS, SE from BETA+P, P from BETA+SE)
221
+ - computes **FSTAT** (F-statistic) from BETA and SE when possible, with a fallback method when only P is present
221
222
  - handles duplicates and invalid rows under `"Fill_delete"`
222
223
 
223
224
  You can inspect the standardized dataset at any time:
@@ -379,7 +380,7 @@ G_adj.association_test(
379
380
  )
380
381
  ```
381
382
 
382
- This updates `SBP_adj.data[["BETA","SE","P"]]` with cohort-specific estimates.
383
+ This updates `G_adj.data[["BETA","SE","P"]]` with cohort-specific estimates and recomputes `FSTAT` to be consistent with the updated values.
383
384
 
384
385
  ### 8) Lift to a different build
385
386
 
@@ -190,6 +190,7 @@ What preprocessing typically does (depending on options):
190
190
  - validates types, formats, and values of CHR/POS/EA/NEA/BETA/SE/P/EAF columns
191
191
  - detects OR vs beta columns (and log-transforms OR when needed)
192
192
  - fills missing columns (e.g., rsID from CHR/POS, SE from BETA+P, P from BETA+SE)
193
+ - computes **FSTAT** (F-statistic) from BETA and SE when possible, with a fallback method when only P is present
193
194
  - handles duplicates and invalid rows under `"Fill_delete"`
194
195
 
195
196
  You can inspect the standardized dataset at any time:
@@ -351,7 +352,7 @@ G_adj.association_test(
351
352
  )
352
353
  ```
353
354
 
354
- This updates `SBP_adj.data[["BETA","SE","P"]]` with cohort-specific estimates.
355
+ This updates `G_adj.data[["BETA","SE","P"]]` with cohort-specific estimates and recomputes `FSTAT` to be consistent with the updated values.
355
356
 
356
357
  ### 8) Lift to a different build
357
358
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  `genal` is a Python toolkit for common GWAS-derived workflows:
4
4
 
5
- - **Preprocess** GWAS summary statistics into a consistent SNP table (column validation, allele checks, optional filling of missing `SNP`/`CHR`/`POS`/`EA`/`NEA`/`SE`/`P` using reference data).
5
+ - **Preprocess** GWAS summary statistics into a consistent SNP table (column validation, allele checks, optional filling of missing `SNP`/`CHR`/`POS`/`EA`/`NEA`/`SE`/`P` using reference data, and computation of per-variant F-statistic `FSTAT`).
6
6
  - **Select instruments** via LD clumping (PLINK 2).
7
7
  - **Compute PRS** on individual-level genotype data (PLINK 2), with optional **proxy SNP** support.
8
8
  - **Run two-sample MR** (multiple estimators + sensitivity analyses), with plotting helpers.
@@ -2,6 +2,15 @@
2
2
 
3
3
  This page documents the main statistical models implemented in `genal`. It is intentionally brief and focuses on what is implemented in the codebase.
4
4
 
5
+ ## Per-variant F-statistic (FSTAT)
6
+
7
+ Implementation: {py:func}`genal.geno_tools.fill_fstatistic`
8
+
9
+ `genal` computes a per-variant F-statistic (`FSTAT`) during preprocessing and after association testing. This statistic measures instrument strength for each variant.
10
+
11
+ - **Primary:** `FSTAT = (BETA / SE)²` when `BETA` and `SE` are available and `SE > 0`.
12
+ - **Fallback:** `FSTAT = χ²_isf(P, df=1)` (equivalent to `Z²` for a two-sided p-value) when `BETA/SE` are unavailable but `P` is present.
13
+
5
14
  ## MR harmonization
6
15
 
7
16
  MR workflows rely on aligning exposure and outcome effects to the same effect allele.
@@ -59,6 +59,7 @@ Key arguments you commonly tune:
59
59
  - `effect_column="OR"` forces log-transforming odds ratios into betas and adjusts SE accordingly.
60
60
  - `fill_snpids` / `fill_coordinates`: override the default logic if you want to force filling rsIDs from `CHR+POS` or vice-versa.
61
61
  - `keep_indel` / `keep_dups`: keep indels or duplicated IDs (generally you keep these `False` unless you have a reason).
62
  - `fill_f=True`: force recomputation of the F-statistic (`FSTAT`) column even if it already exists. By default, the `FSTAT` column is created if missing; if it already exists, only its missing values are filled.
62
63
 
63
64
  ## 3) Select independent instruments via LD clumping
64
65
 
@@ -237,6 +238,7 @@ What you typically tune / watch:
237
238
  - `covar`: covariate names (must be present in `pheno_df` and numeric; constant covariates are dropped).
238
239
  - `standardize=True`: for quantitative traits; set `False` if you want raw-scale effects.
239
240
  - Variant matching: if `CHR+POS` are present in `G.data`, genal will map to cohort SNP IDs before running PLINK (reduces ID mismatch losses).
241
+ - After updating `BETA`, `SE`, and `P`, the F-statistic (`FSTAT`) column is automatically recomputed to remain consistent with the updated estimates.
240
242
 
241
243
  ### Liftover between builds
242
244
 
@@ -34,6 +34,7 @@ from .geno_tools import (
34
34
  check_beta_column,
35
35
  check_p_column,
36
36
  fill_se_p,
37
+ fill_fstatistic,
37
38
  check_allele_column,
38
39
  check_snp_column,
39
40
  remove_na,
@@ -145,15 +146,9 @@ class Geno:
145
146
  # List to keep track of checks performed
146
147
  self.checks = CHECKS_DICT.copy()
147
148
 
148
- # Set the maximal amount of ram/cpu to be used by the methods and dask chunksize
149
- self.cpus = int(os.environ.get("SLURM_CPUS_PER_TASK", default=os.cpu_count())) - 1
150
- non_hpc_ram_per_cpu = psutil.virtual_memory().total / (
151
- 1024**2 * self.cpus
152
- )
153
- ram_per_cpu = int(
154
- os.environ.get("SLURM_MEM_PER_CPU", default=non_hpc_ram_per_cpu)
155
- )
156
- self.ram = int(ram_per_cpu * self.cpus * 0.8)
149
+ # Set the maximal amount of ram/cpu to be used by the methods
150
+ self.cpus = os.cpu_count() - 1
151
+ self.ram = int(psutil.virtual_memory().total / 1024**2 * 0.8)
157
152
 
158
153
  create_tmp()
159
154
 
@@ -168,6 +163,7 @@ class Geno:
168
163
  keep_dups=None,
169
164
  fill_snpids=None,
170
165
  fill_coordinates=None,
166
+ fill_f=False,
171
167
  ):
172
168
  """
173
169
  Clean and preprocess the main dataframe of Single Nucleotide Polymorphisms (SNP) data.
@@ -187,6 +183,9 @@ class Geno:
187
183
  keep_dups (bool, optional): Determines if rows with duplicate SNP IDs should be kept. If None, defers to preprocessing value. Defaults to None.
188
184
  fill_snpids (bool, optional): Decides if the SNP (rsID) column should be created or replaced based on CHR/POS columns and a reference genome. If None, defers to preprocessing value. Defaults to None.
189
185
  fill_coordinates (bool, optional): Decides if CHR and/or POS should be created or replaced based on SNP column and a reference genome. If None, defers to preprocessing value. Defaults to None.
186
+ fill_f (bool, optional): If True, force recomputation/overwrite of the FSTAT column even if it
187
+ already exists. If False (default), FSTAT is created if missing, or only missing values
188
+ are filled if the column already exists.
190
189
 
191
190
  Note:
192
191
  If you pass a standard reference_panel name (e.g. "EUR_37"), it will be converted to "37".
@@ -256,8 +255,8 @@ class Geno:
256
255
  data = fill_ea_nea(data, self.get_reference_panel(reference_panel))
257
256
 
258
257
  # Convert effect column to Beta estimates if present
259
- if "BETA" in data.columns:
260
- check_beta_column(data, effect_column, preprocessing)
258
+ if "BETA" in data.columns and preprocessing in ['Fill', 'Fill_delete']:
259
+ check_beta_column(data, effect_column)
261
260
  self.checks["BETA"] = True
262
261
 
263
262
  # Ensure P column contains valid values
@@ -269,6 +268,12 @@ class Geno:
269
268
  if preprocessing in ['Fill', 'Fill_delete']:
270
269
  fill_se_p(data)
271
270
 
271
+ # Compute or fill the FSTAT column
272
+ # Under normal preprocessing: compute/create/fill FSTAT
273
+ # If preprocessing == "None": only compute FSTAT when fill_f=True
274
+ if preprocessing in ['Fill', 'Fill_delete'] or fill_f:
275
+ fill_fstatistic(data, overwrite=fill_f)
276
+
272
277
  # Process allele columns
273
278
  for allele_col in ["EA", "NEA"]:
274
279
  check_allele_condition = (allele_col in data.columns) and (
@@ -590,7 +595,7 @@ class Geno:
590
595
  if not self.checks.get("EA"):
591
596
  check_allele_column(data_prs, "EA", keep_indel=False)
592
597
  if not self.checks.get("BETA"):
593
- check_beta_column(data_prs, effect_column=None, preprocessing='Fill_delete')
598
+ check_beta_column(data_prs, effect_column=None)
594
599
 
595
600
  initial_rows = data_prs.shape[0]
596
601
  data_prs.dropna(subset=["SNP", "EA", "BETA"], inplace=True)
@@ -740,8 +745,8 @@ class Geno:
740
745
  to make results more interpretable. Default is True.
741
746
 
742
747
  Returns:
743
- None: Updates the BETA, SE, and P columns of the data attribute based on the results
744
- of the association tests.
748
+ None: Updates the BETA, SE, P, and FSTAT columns of the data attribute based on the
749
+ results of the association tests.
745
750
 
746
751
  Note:
747
752
  This method requires the phenotype to be set using the set_phenotype() function.
@@ -796,6 +801,9 @@ class Geno:
796
801
  if n_updated < n_original:
797
802
  print(f"{n_original - n_updated}({(n_original - n_updated)/n_original*100:.3f}%) SNPs have been removed.")
798
803
 
804
+ # Recompute FSTAT to be consistent with the updated BETA/SE/P values
805
+ fill_fstatistic(updated_data, overwrite=True)
806
+
799
807
  # Update the instance data
800
808
  self.data = updated_data
801
809
  return
@@ -5,7 +5,7 @@ from .geno_tools import Combine_Geno
5
5
  from .genes import filter_by_gene_func
6
6
  from .constants import CONFIG_DIR
7
7
 
8
- __version__ = "1.4.7"
8
+ __version__ = "1.4.9"
9
9
 
10
10
  config_path = os.path.join(CONFIG_DIR, "config.json")
11
11
 
@@ -29,8 +29,8 @@ def coloc_abf_func(data1, data2, trait1_type="quant", trait2_type="quant",
29
29
  """
30
30
 
31
31
  # Ensure that the BETA columns are preprocessed
32
- check_beta_column(data1, 'BETA', 'Fill')
33
- check_beta_column(data2, 'BETA', 'Fill')
32
+ check_beta_column(data1, 'BETA')
33
+ check_beta_column(data2, 'BETA')
34
34
 
35
35
  # Adjust EAF column names before merging in case one of the datasets does not have it
36
36
  if 'EAF' in data1.columns:
@@ -6,6 +6,8 @@ from concurrent.futures import ProcessPoolExecutor
6
6
  from .tools import check_bfiles, check_pfiles, setup_genetic_path, get_plink_path
7
7
 
8
8
 
9
+ MIN_RAM_PER_WORKER_MB = 3500 # Minimum RAM per PLINK process (conservative for large genotype files)
10
+
9
11
  ### ____________________
10
12
  ### PRS functions
11
13
  ### ____________________
@@ -126,7 +128,7 @@ def extract_snps_func(snp_list, name=None, path=None, ram=20000, cpus=4):
126
128
  path, filetype = setup_genetic_path(path)
127
129
 
128
130
  # Prepare the SNP list
129
- snp_list = snp_list.dropna()
131
+ snp_list = snp_list.dropna().drop_duplicates()
130
132
  snp_list_name = f"{name}_list.txt"
131
133
  snp_list_path = os.path.join("tmp_GENAL", snp_list_name)
132
134
  snp_list.to_csv(snp_list_path, sep=" ", index=False, header=None)
@@ -136,16 +138,29 @@ def extract_snps_func(snp_list, name=None, path=None, ram=20000, cpus=4):
136
138
  filetype_split = "split" if "$" in path else "combined"
137
139
 
138
140
  output_path = os.path.join("tmp_GENAL", f"{name}_allchr")
141
+
142
+ # Guard against empty SNP list (applies to both split and combined)
143
+ if nrow == 0:
144
+ print("The SNP list is empty after deduplication.")
145
+ return "FAILED"
146
+
139
147
  if filetype_split == "split":
140
- ram_estimate_per_cpu = nrow/(1.5*10**2)
141
- n_cpus = max(1, int(ram // ram_estimate_per_cpu))
142
- workers = min(n_cpus, cpus)
148
+ # Calculate workers based on memory budget (not SNP count)
149
+ max_workers_by_ram = max(1, int(ram // MIN_RAM_PER_WORKER_MB))
150
+ workers = max(1, min(max_workers_by_ram, cpus, 22)) # Cap at 22 chromosomes, min 1
151
+
152
+ # Allocate RAM per worker
153
+ per_worker_ram = int(ram // workers)
154
+
155
+ #print(f"Parallelizing extraction across {workers} workers with {per_worker_ram}MB RAM each")
156
+
143
157
  merge_command, bedlist_path = extract_snps_from_split_data(
144
- name, path, output_path, snp_list_path, filetype, workers=workers
158
+ name, path, output_path, snp_list_path, filetype,
159
+ workers=workers, per_worker_ram=per_worker_ram, ram=ram
145
160
  )
146
161
  handle_multiallelic_variants(name, merge_command, bedlist_path)
147
162
  else:
148
- extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype)
163
+ extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype, ram=ram)
149
164
 
150
165
  #Check that at least 1 variant has been extracted. If not, return "FAILED" to warn downstream functions (prs, association_test)
151
166
  log_path = output_path + ".log"
@@ -164,7 +179,7 @@ def extract_snps_func(snp_list, name=None, path=None, ram=20000, cpus=4):
164
179
  return output_path
165
180
 
166
181
 
167
- def extract_command_parallel(task_id, name, path, snp_list_path, filetype):
182
+ def extract_command_parallel(task_id, name, path, snp_list_path, filetype, per_worker_ram=4000):
168
183
  """
169
184
  Helper function to run SNP extraction in parallel for different chromosomes.
170
185
  Args:
@@ -173,8 +188,11 @@ def extract_command_parallel(task_id, name, path, snp_list_path, filetype):
173
188
  path (str): Path to the data set.
174
189
  snp_list_path (str): Path to the list of SNPs to extract.
175
190
  filetype (str): Type of genetic files ("bed" or "pgen")
191
+ per_worker_ram (int): RAM limit in MB for this PLINK process.
176
192
  Returns:
177
193
  int: Returns the task_id if no valid files are found.
194
+ dict: Returns error dict {'failed': True, 'chr': task_id, ...} if extraction fails.
195
+ None: Returns None on success.
178
196
  """
179
197
  input_path = path.replace("$", str(task_id))
180
198
 
@@ -185,65 +203,108 @@ def extract_command_parallel(task_id, name, path, snp_list_path, filetype):
185
203
  return task_id
186
204
 
187
205
  output_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{task_id}")
188
-
206
+
189
207
  # Build command based on filetype
190
208
  base_cmd = f"{get_plink_path()}"
191
209
  if filetype == "bed":
192
210
  base_cmd += f" --bfile {input_path}"
193
211
  else: # pgen
194
212
  base_cmd += f" --pfile {input_path}"
195
-
196
- command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}"
197
-
198
- subprocess.run(
213
+
214
+ command = f"{base_cmd} --extract {snp_list_path} --memory {per_worker_ram} --threads 1 --rm-dup force-first --make-pgen --out {output_path}"
215
+
216
+ result = subprocess.run(
199
217
  command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
200
218
  )
201
219
 
220
+ # Check for failures and return diagnostic info (diagnostics are in .log file)
221
+ if result.returncode != 0:
222
+ return {'failed': True, 'chr': task_id, 'log': f"{output_path}.log", 'returncode': result.returncode}
202
223
 
203
- def create_bedlist(bedlist_path, output_name, not_found):
204
- """
205
- Creates a bedlist file for SNP extraction.
206
- Args:
207
- bedlist_path (str): Path to save the bedlist file.
208
- output_name (str): Base name for the output files.
209
- not_found (List[int]): List of chromosome numbers for which no files were found.
210
- """
211
- with open(bedlist_path, "w+") as bedlist_file:
212
- found = []
213
- for i in range(1, 23):
214
- if i in not_found:
215
- print(f"bed/bim/fam or pgen/pvar/psam files not found for chr{i}.")
216
- elif check_pfiles(f"{output_name}_chr{i}"):
217
- bedlist_file.write(f"{output_name}_chr{i}\n")
218
- found.append(i)
219
- print(f"SNPs extracted for chr{i}.")
220
- else:
221
- print(f"No SNPs extracted for chr{i}.")
222
- return found
224
+ # Also check if output files were created
225
+ if not check_pfiles(output_path):
226
+ return {'failed': True, 'chr': task_id, 'log': f"{output_path}.log", 'returncode': -1}
227
+
228
+ return None # Success
223
229
 
224
230
 
225
- def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetype, workers=4):
231
+ def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetype, workers=4, per_worker_ram=4000, ram=20000):
226
232
  """Extract SNPs from data split by chromosome."""
227
233
  print("Extracting SNPs for each chromosome...")
228
234
  num_tasks = 22
229
235
  partial_extract_command_parallel = partial(
230
- extract_command_parallel,
231
- name=name,
232
- path=path,
236
+ extract_command_parallel,
237
+ name=name,
238
+ path=path,
233
239
  snp_list_path=snp_list_path,
234
- filetype=filetype
240
+ filetype=filetype,
241
+ per_worker_ram=per_worker_ram
235
242
  ) # Wrapper function
243
+
244
+ # First attempt with calculated workers
245
+ results = []
236
246
  with ProcessPoolExecutor(max_workers=workers) as executor:
237
- not_found = list(
247
+ results = list(
238
248
  executor.map(partial_extract_command_parallel, range(1, num_tasks + 1))
239
249
  )
240
250
 
251
+ # Check for failures (non-None returns indicate errors)
252
+ failed_chrs = [r for r in results if r is not None and isinstance(r, dict) and r.get('failed')]
253
+ not_found = [r for r in results if r is not None and not isinstance(r, dict)]
254
+
255
+ # Retry failed chromosomes with reduced workers if any failures occurred
256
+ if failed_chrs and workers > 1:
257
+ print(f"{len(failed_chrs)} chromosome(s) failed. Retrying with reduced parallelization...")
258
+ retry_workers = max(1, workers // 2)
259
+ # Recalculate RAM per worker based on original total budget
260
+ total_ram_budget = per_worker_ram * workers
261
+ per_worker_ram_retry = int(total_ram_budget // retry_workers)
262
+
263
+ partial_retry = partial(
264
+ extract_command_parallel,
265
+ name=name,
266
+ path=path,
267
+ snp_list_path=snp_list_path,
268
+ filetype=filetype,
269
+ per_worker_ram=per_worker_ram_retry
270
+ )
271
+
272
+ failed_chr_ids = [r['chr'] for r in failed_chrs]
273
+ with ProcessPoolExecutor(max_workers=retry_workers) as executor:
274
+ retry_results = list(executor.map(partial_retry, failed_chr_ids))
275
+
276
+ # Update results - surface errors for persistent failures
277
+ for orig_id, retry_result in zip(failed_chr_ids, retry_results):
278
+ if retry_result is not None and isinstance(retry_result, dict) and retry_result.get('failed'):
279
+ # Still failed - surface the error
280
+ log_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{orig_id}.log")
281
+ if os.path.exists(log_path):
282
+ print(f"Chr{orig_id} failed after retry. Check log: {log_path}")
283
+ try:
284
+ with open(log_path, 'r') as f:
285
+ lines = f.readlines()
286
+ print(f"Last 10 lines of log:\n{''.join(lines[-10:])}")
287
+ except Exception:
288
+ pass
289
+
241
290
  # Merge extracted SNPs from each chromosome
242
291
  bedlist_name = f"{name}_bedlist.txt"
243
292
  bedlist_path = os.path.join("tmp_GENAL", bedlist_name)
244
- found = create_bedlist(
245
- bedlist_path, os.path.join("tmp_GENAL", f"{name}_extract"), not_found
246
- )
293
+
294
+ # Create the bedlist file
295
+ output_name = os.path.join("tmp_GENAL", f"{name}_extract")
296
+ with open(bedlist_path, "w+") as bedlist_file:
297
+ found = []
298
+ for i in range(1, 23):
299
+ if i in not_found:
300
+ print(f"bed/bim/fam or pgen/pvar/psam files not found for chr{i}.")
301
+ elif check_pfiles(f"{output_name}_chr{i}"):
302
+ bedlist_file.write(f"{output_name}_chr{i}\n")
303
+ found.append(i)
304
+ print(f"SNPs extracted for chr{i}.")
305
+ else:
306
+ print(f"No SNPs extracted for chr{i}.")
307
+
247
308
  if len(found) == 0:
248
309
  raise Warning("No SNPs were extracted from any chromosome.")
249
310
 
@@ -255,7 +316,7 @@ def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetyp
255
316
  return None, bedlist_path
256
317
 
257
318
  print("Merging SNPs extracted from each chromosome...")
258
- merge_command = f"{get_plink_path()} --pmerge-list {bedlist_path} pfile --out {output_path}"
319
+ merge_command = f"{get_plink_path()} --memory {ram} --pmerge-list {bedlist_path} pfile --out {output_path}"
259
320
  try:
260
321
  subprocess.run(
261
322
  merge_command, shell=True, capture_output=True, text=True, check=True
@@ -269,18 +330,18 @@ def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetyp
269
330
  return merge_command, bedlist_path
270
331
 
271
332
 
272
- def extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype):
333
+ def extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype, ram=20000):
273
334
  """Extract SNPs from combined data."""
274
335
  print("Extracting SNPs...")
275
-
336
+
276
337
  # Build command based on filetype
277
338
  base_cmd = f"{get_plink_path()}"
278
339
  if filetype == "bed":
279
340
  base_cmd += f" --bfile {path}"
280
341
  else: # pgen
281
342
  base_cmd += f" --pfile {path}"
282
-
283
- extract_command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}"
343
+
344
+ extract_command = f"{base_cmd} --memory {ram} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}"
284
345
 
285
346
  subprocess.run(
286
347
  extract_command,
@@ -68,10 +68,21 @@ def check_allele_column(data, allele_col, keep_indel):
68
68
 
69
69
  def fill_se_p(data):
70
70
  """If either P or SE is missing but the other and BETA are present, fill it."""
71
+ # Ensure SE is numeric and non-negative
72
+ if ("SE" in data.columns):
73
+ data["SE"] = pd.to_numeric(data["SE"], errors="coerce")
74
+ data.loc[data["SE"] < 0, "SE"] = np.nan
75
+ n_missing = data["SE"].isna().sum()
76
+ if n_missing > 0:
77
+ print(
78
+ f"{n_missing}({n_missing/data.shape[0]*100:.3f}%) values in the SE column have been set to nan for being missing, negative or non-numeric."
79
+ )
71
80
  # If SE is missing
72
81
  if ("P" in data.columns) & ("BETA" in data.columns) & ("SE" not in data.columns):
73
- data["SE"] = np.where(
74
- data["P"] < 1, np.abs(data.BETA / st.norm.ppf(data.P / 2)), 0
82
+ data["SE"] = np.select(
83
+ [data["P"] == 0, data["P"] >= 1],
84
+ [0, np.nan],
85
+ default=np.abs(data.BETA / st.norm.ppf(data.P / 2)),
75
86
  )
76
87
  print("The SE (Standard Error) column has been created.")
77
88
  # If P is missing
@@ -83,6 +94,104 @@ def fill_se_p(data):
83
94
  return
84
95
 
85
96
 
97
def fill_fstatistic(data, overwrite=False):
    """
    Compute or fill the per-variant F-statistic (FSTAT) column.

    The F-statistic is computed as:
        - Primary: FSTAT = (BETA / SE)**2 when BETA and SE are available and SE >= 0.
          For SE = 0 (extremely significant variants), FSTAT is inf.
        - Fallback: FSTAT = chi2.isf(P, df=1) when BETA/SE are unavailable but P is
          present (equivalent to Z**2 for a two-sided p-value). P = 0 produces inf.

    Args:
        data (pd.DataFrame): SNP-level DataFrame. Modified in place.
        overwrite (bool): If False (default), only fill missing FSTAT values if the
            column exists; if it doesn't exist, create it. If True, recompute FSTAT
            for all rows where computable, overwriting existing values; rows that are
            not computable become NaN.

    Returns:
        None: Modifies data in place.

    Note:
        FSTAT is NOT added to STANDARD_COLUMNS to avoid row deletion due to missing
        values.
    """
    nrows = data.shape[0]
    column_created = False

    # Determine which rows need computation.
    if "FSTAT" not in data.columns:
        data["FSTAT"] = np.nan
        column_created = True
        rows_to_compute = pd.Series(True, index=data.index)
    elif overwrite:
        # Clear FSTAT first so non-computable rows become NaN (not stale values).
        data["FSTAT"] = np.nan
        rows_to_compute = pd.Series(True, index=data.index)
    else:
        # Only compute for rows with missing FSTAT.
        rows_to_compute = data["FSTAT"].isna()

    if not rows_to_compute.any():
        return

    # Track how many values are assigned and which routes were actually used,
    # so the log message reflects reality (previously, `method` could be unbound
    # when no route applied, or report "P-values" even when only BETA/SE was used).
    n_assigned = 0
    methods_used = []

    # Primary route: FSTAT = (BETA / SE)**2 (SE = 0 produces inf).
    beta_se_computable = pd.Series(False, index=data.index)
    if "BETA" in data.columns and "SE" in data.columns:
        beta_se_computable = (
            rows_to_compute
            & data["BETA"].notna()
            & data["SE"].notna()
            & (data["SE"] >= 0)
        )
        if beta_se_computable.any():
            data.loc[beta_se_computable, "FSTAT"] = (
                data.loc[beta_se_computable, "BETA"]
                / data.loc[beta_se_computable, "SE"]
            ) ** 2
            n_assigned += int(beta_se_computable.sum())
            methods_used.append("BETA/SE")

    # Fallback route: FSTAT = chi2.isf(P, df=1) for remaining rows where P is
    # present. P = 0 is allowed (produces inf for extremely significant variants).
    if "P" in data.columns:
        p_fallback_computable = (
            rows_to_compute
            & ~beta_se_computable
            & data["P"].notna()
            & (data["P"] >= 0)
            & (data["P"] <= 1)
        )
        if p_fallback_computable.any():
            data.loc[p_fallback_computable, "FSTAT"] = st.chi2.isf(
                data.loc[p_fallback_computable, "P"], df=1
            )
            n_assigned += int(p_fallback_computable.sum())
            methods_used.append("P-values")

    # Logging: `method` is always defined, even if no route was applicable.
    method = " and ".join(methods_used) if methods_used else "no available method"
    if column_created:
        print(
            f"The FSTAT (F-statistic) column has been created using {method}. "
            f"{n_assigned}({n_assigned/nrows*100:.3f}%) values computed."
        )
    elif overwrite:
        print(
            f"The FSTAT (F-statistic) column has been re-created using {method}. "
            f"{n_assigned}({n_assigned/nrows*100:.3f}%) values computed."
        )
    else:
        if n_assigned > 0:
            print(
                f"The FSTAT (F-statistic) column: {n_assigned}({n_assigned/nrows*100:.3f}%) "
                f"missing values have been filled using {method}."
            )

    return
86
195
  def check_p_column(data):
87
196
  """Verify that the P column contains numeric values in the range [0,1]. Set inappropriate values to NA."""
88
197
  nrows = data.shape[0]
@@ -96,14 +205,15 @@ def check_p_column(data):
96
205
  return
97
206
 
98
207
 
99
- def check_beta_column(data, effect_column, preprocessing):
208
+ def check_beta_column(data, effect_column):
100
209
  """
101
210
  If the BETA column is a column of odds ratios, log-transform it.
102
211
  If no effect_column argument is specified, determine if the BETA column are beta estimates or odds ratios.
103
212
  """
213
+ # Ensure the BETA column is numeric
214
+ data["BETA"] = pd.to_numeric(data["BETA"], errors="coerce")
215
+
104
216
  if effect_column is None:
105
- if preprocessing == 'None':
106
- return data
107
217
  median = data.BETA.median()
108
218
  has_negative = (data.BETA < 0).any()
109
219
 
@@ -126,7 +236,7 @@ def check_beta_column(data, effect_column, preprocessing):
126
236
  )
127
237
  if effect_column == "OR":
128
238
  data["BETA"] = np.log(data["BETA"].clip(lower=0.01))
129
- data.drop(columns="SE", errors="ignore", inplace=True)
239
+ data.drop(columns=["SE"], errors="ignore", inplace=True)
130
240
  print("The BETA column has been log-transformed to obtain Beta estimates.")
131
241
  return
132
242
 
@@ -51,6 +51,8 @@ def lift_data(
51
51
  # Prepare the data for lifting: handle missing values in CHR, POS columns
52
52
  nrows = data.shape[0]
53
53
  data.dropna(subset=["CHR", "POS"], inplace=True)
54
+ # Remove absurd positions
55
+ data.drop(data[data.POS >= 300_000_000].index, inplace=True)
54
56
  data.reset_index(drop=True, inplace=True)
55
57
  n_na = nrows - data.shape[0]
56
58
  if n_na:
@@ -2,7 +2,7 @@ import aiohttp
2
2
  import asyncio
3
3
  import numpy as np
4
4
  import nest_asyncio
5
- from tqdm.auto import tqdm
5
+ from tqdm import tqdm
6
6
 
7
7
  # Using nest_asyncio to allow execution in notebooks
8
8
  nest_asyncio.apply()
@@ -113,7 +113,11 @@ def create_tmp():
113
113
def delete_tmp():
    """Delete the tmp folder."""
    if not os.path.isdir("tmp_GENAL"):
        print("There is no tmp_GENAL folder to delete in the current directory.")
        return

    def _skip_vanished(func, path, exc_info):
        # Files removed concurrently (e.g. by another process) are harmless;
        # anything else is a real problem and must propagate.
        err = exc_info[1]
        if not isinstance(err, FileNotFoundError):
            raise err

    shutil.rmtree("tmp_GENAL", onerror=_skip_vanished)
    print("The tmp_GENAL folder has been successfully deleted.")
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
4
4
 
5
5
  [project]
6
6
  name = "genal-python" # Updated name for PyPI
7
- version = "1.4.7"
7
+ version = "1.4.9"
8
8
  authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
9
9
  description = "A python toolkit for polygenic risk scoring and mendelian randomization."
10
10
  readme = "README.md"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes