genal-python 1.4.7__tar.gz → 1.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {genal_python-1.4.7 → genal_python-1.4.9}/PKG-INFO +3 -2
- {genal_python-1.4.7 → genal_python-1.4.9}/README.md +2 -1
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/introduction.md +1 -1
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/methods.md +9 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/workflows.md +2 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/Geno.py +22 -14
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/__init__.py +1 -1
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/colocalization.py +2 -2
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/extract_prs.py +107 -46
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/geno_tools.py +116 -6
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/lift.py +2 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/snp_query.py +1 -1
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/tools.py +5 -1
- {genal_python-1.4.7 → genal_python-1.4.9}/pyproject.toml +1 -1
- {genal_python-1.4.7 → genal_python-1.4.9}/.DS_Store +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/.gitignore +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/.readthedocs.yaml +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/Genal_flowchart.png +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/LICENSE +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/.DS_Store +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/Makefile +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/make.bat +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/requirements.txt +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/.DS_Store +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/Images/Genal_flowchart.png +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/Images/genal_logo.png +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/api.md +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/concepts.md +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/conf.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/faq.md +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/index.md +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/docs/source/setup.md +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/MR.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/MR_tools.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/MRpresso.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/association.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/clump.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/constants.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/genes.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal/proxy.py +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/genal_logo.png +0 -0
- {genal_python-1.4.7 → genal_python-1.4.9}/gitignore +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: genal-python
-Version: 1.4.7
+Version: 1.4.9
 Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
 Author-email: Cyprien Rivier <riviercyprien@gmail.com>
 Requires-Python: >=3.8
@@ -218,6 +218,7 @@ What preprocessing typically does (depending on options):
 - validates types, formats, and values of CHR/POS/EA/NEA/BETA/SE/P/EAF columns
 - detects OR vs beta columns (and log-transforms OR when needed)
 - fills missing columns (e.g., rsID from CHR/POS, SE from BETA+P, P from BETA+SE)
+- computes **FSTAT** (F-statistic) from BETA and SE when possible, with a fallback method when only P is present
 - handles duplicates and invalid rows under `"Fill_delete"`
 
 You can inspect the standardized dataset at any time:
@@ -379,7 +380,7 @@ G_adj.association_test(
 )
 ```
 
-This updates `
+This updates `G_adj.data[["BETA","SE","P"]]` with cohort-specific estimates and recomputes `FSTAT` to be consistent with the updated values.
 
 ### 8) Lift to a different build
 

README.md
@@ -190,6 +190,7 @@ What preprocessing typically does (depending on options):
 - validates types, formats, and values of CHR/POS/EA/NEA/BETA/SE/P/EAF columns
 - detects OR vs beta columns (and log-transforms OR when needed)
 - fills missing columns (e.g., rsID from CHR/POS, SE from BETA+P, P from BETA+SE)
+- computes **FSTAT** (F-statistic) from BETA and SE when possible, with a fallback method when only P is present
 - handles duplicates and invalid rows under `"Fill_delete"`
 
 You can inspect the standardized dataset at any time:
@@ -351,7 +352,7 @@ G_adj.association_test(
 )
 ```
 
-This updates `
+This updates `G_adj.data[["BETA","SE","P"]]` with cohort-specific estimates and recomputes `FSTAT` to be consistent with the updated values.
 
 ### 8) Lift to a different build
 

docs/source/introduction.md
@@ -2,7 +2,7 @@
 
 `genal` is a Python toolkit for common GWAS-derived workflows:
 
-- **Preprocess** GWAS summary statistics into a consistent SNP table (column validation, allele checks, optional filling of missing `SNP`/`CHR`/`POS`/`EA`/`NEA`/`SE`/`P` using reference data).
+- **Preprocess** GWAS summary statistics into a consistent SNP table (column validation, allele checks, optional filling of missing `SNP`/`CHR`/`POS`/`EA`/`NEA`/`SE`/`P` using reference data, and computation of per-variant F-statistic `FSTAT`).
 - **Select instruments** via LD clumping (PLINK 2).
 - **Compute PRS** on individual-level genotype data (PLINK 2), with optional **proxy SNP** support.
 - **Run two-sample MR** (multiple estimators + sensitivity analyses), with plotting helpers.

docs/source/methods.md
@@ -2,6 +2,15 @@
 
 This page documents the main statistical models implemented in `genal`. It is intentionally brief and focuses on what is implemented in the codebase.
 
+## Per-variant F-statistic (FSTAT)
+
+Implementation: {py:func}`genal.geno_tools.fill_fstatistic`
+
+`genal` computes a per-variant F-statistic (`FSTAT`) during preprocessing and after association testing. This statistic measures instrument strength for each variant.
+
+- **Primary:** `FSTAT = (BETA / SE)²` when `BETA` and `SE` are available and `SE > 0`.
+- **Fallback:** `FSTAT = χ²_isf(P, df=1)` (equivalent to `Z²` for a two-sided p-value) when `BETA/SE` are unavailable but `P` is present.
+
 ## MR harmonization
 
 MR workflows rely on aligning exposure and outcome effects to the same effect allele.
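For reference, the two routes in the new documentation correspond to the following computations; a minimal sketch using numpy/scipy with made-up numbers, not genal's internal code:

import numpy as np
from scipy import stats

beta, se, p = 0.12, 0.02, 1e-9

fstat_primary = (beta / se) ** 2          # squared Wald ratio: (0.12 / 0.02)^2 = 36.0
fstat_fallback = stats.chi2.isf(p, df=1)  # inverts a two-sided p-value; equals Z^2 for that p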

docs/source/workflows.md
@@ -59,6 +59,7 @@ Key arguments you commonly tune:
 - `effect_column="OR"` forces log-transforming odds ratios into betas and adjusts SE accordingly.
 - `fill_snpids` / `fill_coordinates`: override the default logic if you want to force filling rsIDs from `CHR+POS` or vice-versa.
 - `keep_indel` / `keep_dups`: keep indels or duplicated IDs (generally you keep these `False` unless you have a reason).
+- `fill_f=True`: force recomputation of the F-statistic (`FSTAT`) column even if it already exists. By default, FSTAT is created if missing or only missing values are filled.
 
 ## 3) Select independent instruments via LD clumping
 
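As a usage illustration of the new flag (a sketch only: the bare `Geno(df)` call, the `preprocess_data` method name, and the input file are assumptions, not taken from this diff):

import pandas as pd
import genal

df = pd.read_csv("exposure_gwas.tsv", sep="\t")       # hypothetical summary statistics with standard column names
G = genal.Geno(df)                                     # constructor kept minimal for the sketch
G.preprocess_data(preprocessing="Fill", fill_f=True)   # fill_f=True forces the FSTAT column to be recomputed
print(G.data[["SNP", "BETA", "SE", "P", "FSTAT"]].head())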
@@ -237,6 +238,7 @@ What you typically tune / watch:
 - `covar`: covariate names (must be present in `pheno_df` and numeric; constant covariates are dropped).
 - `standardize=True`: for quantitative traits; set `False` if you want raw-scale effects.
 - Variant matching: if `CHR+POS` are present in `G.data`, genal will map to cohort SNP IDs before running PLINK (reduces ID mismatch losses).
+- After updating `BETA`, `SE`, and `P`, the F-statistic (`FSTAT`) column is automatically recomputed to remain consistent with the updated estimates.
 
 ### Liftover between builds
 
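A hypothetical call sequence tying this together (argument values are illustrative; only `covar`, `standardize`, and the FSTAT recomputation are documented in this diff):

# Assumes G_adj is a genal.Geno object whose phenotype was set beforehand with set_phenotype()
G_adj.association_test(covar=["age", "sex"], standardize=True)

# BETA/SE/P now hold cohort-specific estimates and FSTAT has been recomputed to match
print(G_adj.data[["SNP", "BETA", "SE", "P", "FSTAT"]].head())
weak = G_adj.data[G_adj.data["FSTAT"] < 10]  # conventional weak-instrument screen (F < 10)
print(f"{len(weak)} variants fall below F = 10")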

genal/Geno.py
@@ -34,6 +34,7 @@ from .geno_tools import (
     check_beta_column,
     check_p_column,
     fill_se_p,
+    fill_fstatistic,
     check_allele_column,
     check_snp_column,
     remove_na,
@@ -145,15 +146,9 @@ class Geno:
         # List to keep track of checks performed
         self.checks = CHECKS_DICT.copy()
 
-        # Set the maximal amount of ram/cpu to be used by the methods
-        self.cpus =
-
-            1024**2 * self.cpus
-        )
-        ram_per_cpu = int(
-            os.environ.get("SLURM_MEM_PER_CPU", default=non_hpc_ram_per_cpu)
-        )
-        self.ram = int(ram_per_cpu * self.cpus * 0.8)
+        # Set the maximal amount of ram/cpu to be used by the methods
+        self.cpus = os.cpu_count() - 1
+        self.ram = int(psutil.virtual_memory().total / 1024**2 * 0.8)
 
         create_tmp()
 
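The replacement drops the SLURM-aware allocation in favor of machine-wide defaults; in isolation, the two new expressions amount to the following sketch:

import os
import psutil

cpus = os.cpu_count() - 1                                    # leave one core free
ram_mb = int(psutil.virtual_memory().total / 1024**2 * 0.8)  # 80% of total RAM, in MB
print(f"genal would use up to {cpus} CPUs and {ram_mb} MB of RAM")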
@@ -168,6 +163,7 @@ class Geno:
         keep_dups=None,
         fill_snpids=None,
         fill_coordinates=None,
+        fill_f=False,
     ):
         """
         Clean and preprocess the main dataframe of Single Nucleotide Polymorphisms (SNP) data.
@@ -187,6 +183,9 @@
             keep_dups (bool, optional): Determines if rows with duplicate SNP IDs should be kept. If None, defers to preprocessing value. Defaults to None.
             fill_snpids (bool, optional): Decides if the SNP (rsID) column should be created or replaced based on CHR/POS columns and a reference genome. If None, defers to preprocessing value. Defaults to None.
             fill_coordinates (bool, optional): Decides if CHR and/or POS should be created or replaced based on SNP column and a reference genome. If None, defers to preprocessing value. Defaults to None.
+            fill_f (bool, optional): If True, force recomputation/overwrite of the FSTAT column even if it
+                already exists. If False (default), FSTAT is created if missing, or only missing values
+                are filled if the column already exists.
 
         Note:
             If you pass a standard reference_panel name (e.g. "EUR_37"), it will be converted to "37".
@@ -256,8 +255,8 @@
             data = fill_ea_nea(data, self.get_reference_panel(reference_panel))
 
         # Convert effect column to Beta estimates if present
-        if "BETA" in data.columns:
-            check_beta_column(data, effect_column
+        if "BETA" in data.columns and preprocessing in ['Fill', 'Fill_delete']:
+            check_beta_column(data, effect_column)
             self.checks["BETA"] = True
 
         # Ensure P column contains valid values
@@ -269,6 +268,12 @@
         if preprocessing in ['Fill', 'Fill_delete']:
             fill_se_p(data)
 
+        # Compute or fill the FSTAT column
+        # Under normal preprocessing: compute/create/fill FSTAT
+        # If preprocessing == "None": only compute FSTAT when fill_f=True
+        if preprocessing in ['Fill', 'Fill_delete'] or fill_f:
+            fill_fstatistic(data, overwrite=fill_f)
+
         # Process allele columns
         for allele_col in ["EA", "NEA"]:
             check_allele_condition = (allele_col in data.columns) and (
@@ -590,7 +595,7 @@
         if not self.checks.get("EA"):
             check_allele_column(data_prs, "EA", keep_indel=False)
         if not self.checks.get("BETA"):
-            check_beta_column(data_prs, effect_column=None
+            check_beta_column(data_prs, effect_column=None)
 
         initial_rows = data_prs.shape[0]
         data_prs.dropna(subset=["SNP", "EA", "BETA"], inplace=True)
@@ -740,8 +745,8 @@
             to make results more interpretable. Default is True.
 
         Returns:
-            None: Updates the BETA, SE, and
-                of the association tests.
+            None: Updates the BETA, SE, P, and FSTAT columns of the data attribute based on the
+                results of the association tests.
 
         Note:
             This method requires the phenotype to be set using the set_phenotype() function.
@@ -796,6 +801,9 @@
         if n_updated < n_original:
             print(f"{n_original - n_updated}({(n_original - n_updated)/n_original*100:.3f}%) SNPs have been removed.")
 
+        # Recompute FSTAT to be consistent with the updated BETA/SE/P values
+        fill_fstatistic(updated_data, overwrite=True)
+
         # Update the instance data
         self.data = updated_data
         return

genal/colocalization.py
@@ -29,8 +29,8 @@ def coloc_abf_func(data1, data2, trait1_type="quant", trait2_type="quant",
     """
 
     # Ensure that the BETA columns are preprocessed
-    check_beta_column(data1, 'BETA'
-    check_beta_column(data2, 'BETA'
+    check_beta_column(data1, 'BETA')
+    check_beta_column(data2, 'BETA')
 
     # Adjust EAF column names before merging in case one of the datasets does not have it
     if 'EAF' in data1.columns:

genal/extract_prs.py
@@ -6,6 +6,8 @@ from concurrent.futures import ProcessPoolExecutor
 from .tools import check_bfiles, check_pfiles, setup_genetic_path, get_plink_path
 
 
+MIN_RAM_PER_WORKER_MB = 3500  # Minimum RAM per PLINK process (conservative for large genotype files)
+
 ### ____________________
 ### PRS functions
 ### ____________________
@@ -126,7 +128,7 @@ def extract_snps_func(snp_list, name=None, path=None, ram=20000, cpus=4):
     path, filetype = setup_genetic_path(path)
 
     # Prepare the SNP list
-    snp_list = snp_list.dropna()
+    snp_list = snp_list.dropna().drop_duplicates()
     snp_list_name = f"{name}_list.txt"
     snp_list_path = os.path.join("tmp_GENAL", snp_list_name)
     snp_list.to_csv(snp_list_path, sep=" ", index=False, header=None)
@@ -136,16 +138,29 @@
     filetype_split = "split" if "$" in path else "combined"
 
     output_path = os.path.join("tmp_GENAL", f"{name}_allchr")
+
+    # Guard against empty SNP list (applies to both split and combined)
+    if nrow == 0:
+        print("The SNP list is empty after deduplication.")
+        return "FAILED"
+
     if filetype_split == "split":
-
-
-        workers = min(
+        # Calculate workers based on memory budget (not SNP count)
+        max_workers_by_ram = max(1, int(ram // MIN_RAM_PER_WORKER_MB))
+        workers = max(1, min(max_workers_by_ram, cpus, 22))  # Cap at 22 chromosomes, min 1
+
+        # Allocate RAM per worker
+        per_worker_ram = int(ram // workers)
+
+        #print(f"Parallelizing extraction across {workers} workers with {per_worker_ram}MB RAM each")
+
         merge_command, bedlist_path = extract_snps_from_split_data(
-            name, path, output_path, snp_list_path, filetype,
+            name, path, output_path, snp_list_path, filetype,
+            workers=workers, per_worker_ram=per_worker_ram, ram=ram
         )
         handle_multiallelic_variants(name, merge_command, bedlist_path)
     else:
-        extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype)
+        extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype, ram=ram)
 
     #Check that at least 1 variant has been extracted. If not, return "FAILED" to warn downstream functions (prs, association_test)
     log_path = output_path + ".log"
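To make the new worker allocation concrete, here is the same arithmetic evaluated at the function's defaults (ram=20000, cpus=4, as in the signature above):

MIN_RAM_PER_WORKER_MB = 3500
ram, cpus = 20000, 4

max_workers_by_ram = max(1, int(ram // MIN_RAM_PER_WORKER_MB))  # 20000 // 3500 = 5
workers = max(1, min(max_workers_by_ram, cpus, 22))             # min(5, 4, 22) = 4
per_worker_ram = int(ram // workers)                            # 20000 // 4 = 5000 MB per PLINK process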
@@ -164,7 +179,7 @@
     return output_path
 
 
-def extract_command_parallel(task_id, name, path, snp_list_path, filetype):
+def extract_command_parallel(task_id, name, path, snp_list_path, filetype, per_worker_ram=4000):
     """
     Helper function to run SNP extraction in parallel for different chromosomes.
     Args:
@@ -173,8 +188,11 @@ def extract_command_parallel(task_id, name, path, snp_list_path, filetype):
         path (str): Path to the data set.
         snp_list_path (str): Path to the list of SNPs to extract.
         filetype (str): Type of genetic files ("bed" or "pgen")
+        per_worker_ram (int): RAM limit in MB for this PLINK process.
     Returns:
         int: Returns the task_id if no valid files are found.
+        dict: Returns error dict {'failed': True, 'chr': task_id, ...} if extraction fails.
+        None: Returns None on success.
     """
     input_path = path.replace("$", str(task_id))
 
@@ -185,65 +203,108 @@ def extract_command_parallel(task_id, name, path, snp_list_path, filetype):
         return task_id
 
     output_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{task_id}")
-
+
     # Build command based on filetype
     base_cmd = f"{get_plink_path()}"
     if filetype == "bed":
         base_cmd += f" --bfile {input_path}"
     else:  # pgen
         base_cmd += f" --pfile {input_path}"
-
-    command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}"
-
-    subprocess.run(
+
+    command = f"{base_cmd} --extract {snp_list_path} --memory {per_worker_ram} --threads 1 --rm-dup force-first --make-pgen --out {output_path}"
+
+    result = subprocess.run(
         command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
     )
 
+    # Check for failures and return diagnostic info (diagnostics are in .log file)
+    if result.returncode != 0:
+        return {'failed': True, 'chr': task_id, 'log': f"{output_path}.log", 'returncode': result.returncode}
 
-
-
-
-
-
-        output_name (str): Base name for the output files.
-        not_found (List[int]): List of chromosome numbers for which no files were found.
-    """
-    with open(bedlist_path, "w+") as bedlist_file:
-        found = []
-        for i in range(1, 23):
-            if i in not_found:
-                print(f"bed/bim/fam or pgen/pvar/psam files not found for chr{i}.")
-            elif check_pfiles(f"{output_name}_chr{i}"):
-                bedlist_file.write(f"{output_name}_chr{i}\n")
-                found.append(i)
-                print(f"SNPs extracted for chr{i}.")
-            else:
-                print(f"No SNPs extracted for chr{i}.")
-    return found
+    # Also check if output files were created
+    if not check_pfiles(output_path):
+        return {'failed': True, 'chr': task_id, 'log': f"{output_path}.log", 'returncode': -1}
+
+    return None  # Success
 
 
-def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetype, workers=4):
+def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetype, workers=4, per_worker_ram=4000, ram=20000):
     """Extract SNPs from data split by chromosome."""
     print("Extracting SNPs for each chromosome...")
     num_tasks = 22
     partial_extract_command_parallel = partial(
-        extract_command_parallel,
-        name=name,
-        path=path,
+        extract_command_parallel,
+        name=name,
+        path=path,
         snp_list_path=snp_list_path,
-        filetype=filetype
+        filetype=filetype,
+        per_worker_ram=per_worker_ram
     )  # Wrapper function
+
+    # First attempt with calculated workers
+    results = []
     with ProcessPoolExecutor(max_workers=workers) as executor:
-
+        results = list(
             executor.map(partial_extract_command_parallel, range(1, num_tasks + 1))
         )
 
+    # Check for failures (non-None returns indicate errors)
+    failed_chrs = [r for r in results if r is not None and isinstance(r, dict) and r.get('failed')]
+    not_found = [r for r in results if r is not None and not isinstance(r, dict)]
+
+    # Retry failed chromosomes with reduced workers if any failures occurred
+    if failed_chrs and workers > 1:
+        print(f"{len(failed_chrs)} chromosome(s) failed. Retrying with reduced parallelization...")
+        retry_workers = max(1, workers // 2)
+        # Recalculate RAM per worker based on original total budget
+        total_ram_budget = per_worker_ram * workers
+        per_worker_ram_retry = int(total_ram_budget // retry_workers)
+
+        partial_retry = partial(
+            extract_command_parallel,
+            name=name,
+            path=path,
+            snp_list_path=snp_list_path,
+            filetype=filetype,
+            per_worker_ram=per_worker_ram_retry
+        )
+
+        failed_chr_ids = [r['chr'] for r in failed_chrs]
+        with ProcessPoolExecutor(max_workers=retry_workers) as executor:
+            retry_results = list(executor.map(partial_retry, failed_chr_ids))
+
+        # Update results - surface errors for persistent failures
+        for orig_id, retry_result in zip(failed_chr_ids, retry_results):
+            if retry_result is not None and isinstance(retry_result, dict) and retry_result.get('failed'):
+                # Still failed - surface the error
+                log_path = os.path.join("tmp_GENAL", f"{name}_extract_chr{orig_id}.log")
+                if os.path.exists(log_path):
+                    print(f"Chr{orig_id} failed after retry. Check log: {log_path}")
+                    try:
+                        with open(log_path, 'r') as f:
+                            lines = f.readlines()
+                        print(f"Last 10 lines of log:\n{''.join(lines[-10:])}")
+                    except Exception:
+                        pass
+
     # Merge extracted SNPs from each chromosome
     bedlist_name = f"{name}_bedlist.txt"
     bedlist_path = os.path.join("tmp_GENAL", bedlist_name)
-
-
-    )
+
+    # Create the bedlist file
+    output_name = os.path.join("tmp_GENAL", f"{name}_extract")
+    with open(bedlist_path, "w+") as bedlist_file:
+        found = []
+        for i in range(1, 23):
+            if i in not_found:
+                print(f"bed/bim/fam or pgen/pvar/psam files not found for chr{i}.")
+            elif check_pfiles(f"{output_name}_chr{i}"):
+                bedlist_file.write(f"{output_name}_chr{i}\n")
+                found.append(i)
+                print(f"SNPs extracted for chr{i}.")
+            else:
+                print(f"No SNPs extracted for chr{i}.")
+
     if len(found) == 0:
         raise Warning("No SNPs were extracted from any chromosome.")
 
@@ -255,7 +316,7 @@ def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetyp
         return None, bedlist_path
 
     print("Merging SNPs extracted from each chromosome...")
-    merge_command = f"{get_plink_path()} --pmerge-list {bedlist_path} pfile --out {output_path}"
+    merge_command = f"{get_plink_path()} --memory {ram} --pmerge-list {bedlist_path} pfile --out {output_path}"
     try:
         subprocess.run(
             merge_command, shell=True, capture_output=True, text=True, check=True
@@ -269,18 +330,18 @@ def extract_snps_from_split_data(name, path, output_path, snp_list_path, filetyp
     return merge_command, bedlist_path
 
 
-def extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype):
+def extract_snps_from_combined_data(name, path, output_path, snp_list_path, filetype, ram=20000):
     """Extract SNPs from combined data."""
     print("Extracting SNPs...")
-
+
     # Build command based on filetype
     base_cmd = f"{get_plink_path()}"
     if filetype == "bed":
         base_cmd += f" --bfile {path}"
     else:  # pgen
         base_cmd += f" --pfile {path}"
-
-    extract_command = f"{base_cmd} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}"
+
+    extract_command = f"{base_cmd} --memory {ram} --extract {snp_list_path} --rm-dup force-first --make-pgen --out {output_path}"
 
     subprocess.run(
         extract_command,

genal/geno_tools.py
@@ -68,10 +68,21 @@ def check_allele_column(data, allele_col, keep_indel):
 
 def fill_se_p(data):
     """If either P or SE is missing but the other and BETA are present, fill it."""
+    # Ensure SE is numeric and non-negative
+    if ("SE" in data.columns):
+        data["SE"] = pd.to_numeric(data["SE"], errors="coerce")
+        data.loc[data["SE"] < 0, "SE"] = np.nan
+        n_missing = data["SE"].isna().sum()
+        if n_missing > 0:
+            print(
+                f"{n_missing}({n_missing/data.shape[0]*100:.3f}%) values in the SE column have been set to nan for being missing, negative or non-numeric."
+            )
     # If SE is missing
     if ("P" in data.columns) & ("BETA" in data.columns) & ("SE" not in data.columns):
-        data["SE"] = np.
-            data["P"]
+        data["SE"] = np.select(
+            [data["P"] == 0, data["P"] >= 1],
+            [0, np.nan],
+            default=np.abs(data.BETA / st.norm.ppf(data.P / 2)),
         )
         print("The SE (Standard Error) column has been created.")
     # If P is missing
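For reference, the np.select default branch above implements SE = |BETA / Φ⁻¹(P/2)|, with P=0 mapped to SE=0 and P≥1 treated as invalid; a quick numerical check with made-up values:

import numpy as np
from scipy import stats as st

beta, p = 0.10, 0.05
se = np.abs(beta / st.norm.ppf(p / 2))  # 0.10 / 1.95996... ≈ 0.051
print(round(float(se), 4))              # 0.051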
@@ -83,6 +94,104 @@ def fill_se_p(data):
     return
 
 
+def fill_fstatistic(data, overwrite=False):
+    """
+    Compute or fill the per-variant F-statistic (FSTAT) column.
+
+    The F-statistic is computed as:
+    - Primary: FSTAT = (BETA / SE)² when BETA and SE are available and SE > 0.
+      For SE=0 (extremely significant variants), FSTAT is set to inf.
+    - Fallback: FSTAT = χ²_isf(P, df=1) when BETA/SE are unavailable but P is present.
+      For P=0 (extremely significant variants), this produces inf.
+
+    Args:
+        data (pd.DataFrame): SNP-level DataFrame.
+        overwrite (bool): If False (default), only fill missing FSTAT values if the column
+            exists; if it doesn't exist, create it. If True, recompute FSTAT for all rows
+            where computable, overwriting existing values; non-computable rows become NaN.
+
+    Returns:
+        None: Modifies data in place.
+
+    Note:
+        FSTAT is NOT added to STANDARD_COLUMNS to avoid row deletion due to missing values.
+    """
+    nrows = data.shape[0]
+    column_created = False
+
+    # Determine which rows need computation
+    if "FSTAT" not in data.columns:
+        data["FSTAT"] = np.nan
+        column_created = True
+        rows_to_compute = pd.Series([True] * nrows, index=data.index)
+    elif overwrite:
+        # Clear FSTAT first so non-computable rows become NaN (not stale values)
+        data["FSTAT"] = np.nan
+        rows_to_compute = pd.Series([True] * nrows, index=data.index)
+    else:
+        # Only compute for rows with missing FSTAT
+        rows_to_compute = data["FSTAT"].isna()
+
+    if not rows_to_compute.any():
+        return
+
+    # Track how many values are assigned
+    n_assigned = 0
+
+    # Primary route: FSTAT = (BETA / SE)² when BETA and SE are available (SE=0 produces inf)
+    beta_se_computable = pd.Series([False] * nrows, index=data.index)
+    if "BETA" in data.columns and "SE" in data.columns:
+        method = "BETA/SE"
+        beta_se_computable = (
+            rows_to_compute &
+            data["BETA"].notna() &
+            data["SE"].notna() &
+            (data["SE"] >= 0)
+        )
+        if beta_se_computable.any():
+            data.loc[beta_se_computable, "FSTAT"] = (
+                data.loc[beta_se_computable, "BETA"] / data.loc[beta_se_computable, "SE"]
+            ) ** 2
+            n_assigned += beta_se_computable.sum()
+
+    # Fallback route: FSTAT = χ²_isf(P, df=1) for remaining rows where P is present
+    # Allow P=0 (produces inf for extremely significant variants)
+    if "P" in data.columns:
+        method = "P-values"
+        p_fallback_computable = (
+            rows_to_compute &
+            ~beta_se_computable &
+            data["P"].notna() &
+            (data["P"] >= 0) &
+            (data["P"] <= 1)
+        )
+        if p_fallback_computable.any():
+            data.loc[p_fallback_computable, "FSTAT"] = st.chi2.isf(
+                data.loc[p_fallback_computable, "P"], df=1
+            )
+            n_assigned += p_fallback_computable.sum()
+
+    # Logging
+    if column_created:
+        print(
+            f"The FSTAT (F-statistic) column has been created using {method}. "
+            f"{n_assigned}({n_assigned/nrows*100:.3f}%) values computed."
+        )
+    elif overwrite:
+        print(
+            f"The FSTAT (F-statistic) column has been re-created using {method}. "
+            f"{n_assigned}({n_assigned/nrows*100:.3f}%) values computed."
+        )
+    else:
+        if n_assigned > 0:
+            print(
+                f"The FSTAT (F-statistic) column: {n_assigned}({n_assigned/nrows*100:.3f}%)"
+                f"missing values have been filled using {method}."
+            )
+
+    return
+
+
 def check_p_column(data):
     """Verify that the P column contains numeric values in the range [0,1]. Set inappropriate values to NA."""
     nrows = data.shape[0]
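A toy illustration of the two routes on a three-row frame (the DataFrame values are invented; the import path follows the methods page above):

import numpy as np
import pandas as pd
from genal.geno_tools import fill_fstatistic

toy = pd.DataFrame({
    "BETA": [0.12, np.nan, 0.05],
    "SE":   [0.02, np.nan, np.nan],
    "P":    [np.nan, 1e-9, 0.3],
})
fill_fstatistic(toy)          # row 0 via (BETA/SE)^2, rows 1-2 via chi2.isf(P, df=1)
print(toy["FSTAT"].round(1))  # roughly 36.0, 37.3, 1.1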
@@ -96,14 +205,15 @@ def check_p_column(data):
     return
 
 
-def check_beta_column(data, effect_column, preprocessing):
+def check_beta_column(data, effect_column):
     """
     If the BETA column is a column of odds ratios, log-transform it.
     If no effect_column argument is specified, determine if the BETA column are beta estimates or odds ratios.
     """
+    # Ensure the BETA column is numeric
+    data["BETA"] = pd.to_numeric(data["BETA"], errors="coerce")
+
     if effect_column is None:
-        if preprocessing == 'None':
-            return data
         median = data.BETA.median()
         has_negative = (data.BETA < 0).any()
 
@@ -126,7 +236,7 @@ def check_beta_column(data, effect_column, preprocessing):
         )
     if effect_column == "OR":
         data["BETA"] = np.log(data["BETA"].clip(lower=0.01))
-        data.drop(columns="SE", errors="ignore", inplace=True)
+        data.drop(columns=["SE"], errors="ignore", inplace=True)
         print("The BETA column has been log-transformed to obtain Beta estimates.")
         return
 

genal/lift.py
@@ -51,6 +51,8 @@ def lift_data(
     # Prepare the data for lifting: handle missing values in CHR, POS columns
     nrows = data.shape[0]
    data.dropna(subset=["CHR", "POS"], inplace=True)
+    # Remove absurd positions
+    data.drop(data[data.POS >= 300_000_000].index, inplace=True)
     data.reset_index(drop=True, inplace=True)
     n_na = nrows - data.shape[0]
     if n_na:

genal/tools.py
@@ -113,7 +113,11 @@ def create_tmp():
 def delete_tmp():
     """Delete the tmp folder."""
     if os.path.isdir("tmp_GENAL"):
-
+        def _onerror(func, path, exc_info):
+            if isinstance(exc_info[1], FileNotFoundError):
+                return
+            raise exc_info[1]
+        shutil.rmtree("tmp_GENAL", onerror=_onerror)
         print("The tmp_GENAL folder has been successfully deleted.")
     else:
         print("There is no tmp_GENAL folder to delete in the current directory.")
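The same pattern in isolation, for context; the motivation (temporary files that vanish mid-deletion, e.g. removed by a parallel worker) is an assumption, not stated in the diff:

import shutil

def rmtree_ignore_missing(path):
    """Remove a directory tree, ignoring files that disappear while it is being deleted."""
    def _onerror(func, p, exc_info):
        if isinstance(exc_info[1], FileNotFoundError):
            return           # already gone: nothing to do
        raise exc_info[1]    # any other error (e.g. PermissionError) is re-raised
    shutil.rmtree(path, onerror=_onerror)

rmtree_ignore_missing("tmp_GENAL")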

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "genal-python"  # Updated name for PyPI
-version = "1.4.7"
+version = "1.4.9"
 authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
 description = "A python toolkit for polygenic risk scoring and mendelian randomization."
 readme = "README.md"
All remaining files listed above with +0 -0 are unchanged between 1.4.7 and 1.4.9.