genal-python 1.5.0__tar.gz → 1.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {genal_python-1.5.0 → genal_python-1.5.1}/.gitignore +5 -1
- genal_python-1.5.0/README.md → genal_python-1.5.1/PKG-INFO +82 -6
- genal_python-1.5.0/PKG-INFO → genal_python-1.5.1/README.md +49 -35
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/concepts.md +8 -4
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/methods.md +41 -2
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/workflows.md +69 -2
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/Geno.py +228 -152
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/MR.py +38 -138
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/MR_tools.py +236 -13
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/MRpresso.py +82 -68
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/__init__.py +1 -1
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/association.py +41 -45
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/extract_prs.py +9 -2
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/geno_tools.py +2 -2
- genal_python-1.5.1/genal/plots.py +1088 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/pyproject.toml +8 -1
- genal_python-1.5.1/pytest.ini +12 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/.DS_Store +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/.readthedocs.yaml +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/Genal_flowchart.png +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/LICENSE +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/.DS_Store +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/Makefile +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/make.bat +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/requirements.txt +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/.DS_Store +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/Images/Genal_flowchart.png +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/Images/genal_logo.png +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/api.md +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/conf.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/faq.md +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/index.md +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/introduction.md +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/docs/source/setup.md +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/clump.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/colocalization.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/constants.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/genes.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/lift.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/proxy.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/snp_query.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal/tools.py +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/genal_logo.png +0 -0
- {genal_python-1.5.0 → genal_python-1.5.1}/gitignore +0 -0
|
@@ -1,14 +1,18 @@
|
|
|
1
|
+
.DS_Store
|
|
1
2
|
__pycache__/
|
|
3
|
+
.pytest_cache/
|
|
4
|
+
.coverage
|
|
5
|
+
htmlcov/
|
|
2
6
|
dist/
|
|
3
7
|
.ipynb_checkpoints/
|
|
4
8
|
ipynb_checkpoints/
|
|
5
9
|
genal/.ipynb_checkpoints/
|
|
6
10
|
test_data/
|
|
7
11
|
cursor/
|
|
8
|
-
tests/
|
|
9
12
|
tmp_GENAL/
|
|
10
13
|
docs/build/
|
|
11
14
|
docs/_build/
|
|
12
15
|
REVIEW_REPORT.md
|
|
13
16
|
TASKS.md
|
|
14
17
|
code_concatenated
|
|
18
|
+
tests/
|
|
@@ -1,3 +1,35 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genal-python
|
|
3
|
+
Version: 1.5.1
|
|
4
|
+
Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
|
|
5
|
+
Author-email: Cyprien Rivier <riviercyprien@gmail.com>
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: aiohttp>=3.7
|
|
13
|
+
Requires-Dist: nest_asyncio>=1.5
|
|
14
|
+
Requires-Dist: numpy>=1.17.3
|
|
15
|
+
Requires-Dist: pandas>=1.0
|
|
16
|
+
Requires-Dist: plotnine>=0.9
|
|
17
|
+
Requires-Dist: psutil>=5.0
|
|
18
|
+
Requires-Dist: requests>=2.0
|
|
19
|
+
Requires-Dist: pyliftover>=0.4
|
|
20
|
+
Requires-Dist: scikit_learn>=0.24
|
|
21
|
+
Requires-Dist: scipy>=1.7,<1.13
|
|
22
|
+
Requires-Dist: statsmodels>=0.13,<0.15
|
|
23
|
+
Requires-Dist: tqdm>=4.38
|
|
24
|
+
Requires-Dist: wget>=3.0
|
|
25
|
+
Requires-Dist: fastparquet>=0.4
|
|
26
|
+
Requires-Dist: pyarrow>=3.0
|
|
27
|
+
Requires-Dist: pytest>=7.0 ; extra == "test"
|
|
28
|
+
Requires-Dist: pytest-cov ; extra == "test"
|
|
29
|
+
Requires-Dist: pytest-xdist ; extra == "test"
|
|
30
|
+
Project-URL: Home, https://github.com/CypRiv/genal
|
|
31
|
+
Provides-Extra: test
|
|
32
|
+
|
|
1
33
|
<div align="center">
|
|
2
34
|
<img src="genal_logo.png" height="80" alt="genal logo" />
|
|
3
35
|
<h1>genal</h1>
|
|
@@ -123,7 +155,7 @@ G_instruments.query_outcome(G_outcome, proxy=True, reference_panel="EUR_37")
|
|
|
123
155
|
mr_results = G_instruments.MR(action=2, heterogeneity=True, odds=False)
|
|
124
156
|
|
|
125
157
|
# 5.5) Plot MR results
|
|
126
|
-
G_instruments.MR_plot(filename="mr_scatter")
|
|
158
|
+
G_instruments.MR_plot(filename="mr_scatter", figure_size=(10, 6))
|
|
127
159
|
```
|
|
128
160
|
## Core concept: the `Geno` object
|
|
129
161
|
|
|
@@ -133,6 +165,7 @@ G_instruments.MR_plot(filename="mr_scatter")
|
|
|
133
165
|
- `G.phenotype`: stored after `G.set_phenotype(...)` (phenotype DataFrame + metadata)
|
|
134
166
|
- `G.MR_data`: stored after `G.query_outcome(...)` (exposure/outcome association tables used by MR)
|
|
135
167
|
- `G.MR_results`: stored after `G.MR(...)` (results table + harmonized SNP table; used by plotting)
|
|
168
|
+
- `G.MRpresso_subset_data`: stored after `G.MRpresso(...)` (outlier-removed harmonized table)
|
|
136
169
|
|
|
137
170
|
Most methods either:
|
|
138
171
|
- return a **new `Geno`** object (e.g., `clump()`), or
|
|
@@ -147,6 +180,7 @@ Most methods either:
|
|
|
147
180
|
- **Align rsIDs to a target genotype dataset**: `Geno.update_snpids(path=..., replace=...)`
|
|
148
181
|
- **Extract genotype subset**: `Geno.extract_snps(path=...)` → writes extracted files under `tmp_GENAL/`
|
|
149
182
|
- **Two-sample MR pipeline**: `Geno.query_outcome(...)` → `Geno.MR(...)` (+ `MR_plot`)
|
|
183
|
+
- **Leave-one-out MR**: `Geno.MR_loo(...)` → `Geno.MR_loo_plot(...)` (identify influential variants)
|
|
150
184
|
- **MR-PRESSO**: `Geno.MRpresso(...)` (parallel; outlier + distortion tests)
|
|
151
185
|
- **Colocalization**: `Geno.colocalize(...)` (approx Bayes factors; returns posterior probabilities)
|
|
152
186
|
- **Association testing (individual-level)**: `Geno.set_phenotype(...)` → `Geno.association_test(...)`
|
|
@@ -298,7 +332,17 @@ About `action` (palindromic SNP handling during harmonization):
|
|
|
298
332
|
Plot the MR scatter:
|
|
299
333
|
|
|
300
334
|
```python
|
|
301
|
-
G_clumped.MR_plot(filename="mr_scatter")
|
|
335
|
+
G_clumped.MR_plot(filename="mr_scatter", figure_size=(10, 6))
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
You can also draw a funnel plot of single-SNP ratio estimates (Wald ratios):
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
G_clumped.MR_funnel(
|
|
342
|
+
methods=["IVW", "WM", "Egger"], # vertical reference lines (optional)
|
|
343
|
+
filename="mr_funnel",
|
|
344
|
+
figure_size=(10, 6),
|
|
345
|
+
)
|
|
302
346
|
```
|
|
303
347
|
|
|
304
348
|
### 6) Sensitivity: MR-PRESSO
|
|
@@ -311,7 +355,16 @@ mod_table, GlobalTest, OutlierTest, BiasTest = G_clumped.MRpresso(
|
|
|
311
355
|
cpus=-1, # use all CPU cores
|
|
312
356
|
)
|
|
313
357
|
```
|
|
314
|
-
|
|
358
|
+
|
|
359
|
+
To highlight MR-PRESSO outliers on plots, pass `use_mrpresso_data=True` (outliers are colored in red):
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
G_clumped.MR_plot(filename="mr_scatter_mrpresso", figure_size=(10, 6), use_mrpresso_data=True)
|
|
363
|
+
G_clumped.MR_funnel(filename="mr_funnel_mrpresso", figure_size=(10, 6), use_mrpresso_data=True)
|
|
364
|
+
G_clumped.MR_loo_plot(filename="loo_forest_mrpresso", figure_size=(10, 8), use_mrpresso_data=True)
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
To rerun the MR methods on the outlier-removed data after MR-PRESSO, pass `use_mrpresso_data=True` to `MR()`:
|
|
315
368
|
```python
|
|
316
369
|
res = G_clumped.MR(
|
|
317
370
|
action=2,
|
|
@@ -325,7 +378,29 @@ res = G_clumped.MR(
|
|
|
325
378
|
res
|
|
326
379
|
```
|
|
327
380
|
|
|
328
|
-
### 7)
|
|
381
|
+
### 7) Sensitivity: Leave-One-Out MR
|
|
382
|
+
Leave-one-out MR helps identify influential variants that may be driving the causal estimate.
|
|
383
|
+
|
|
384
|
+
```python
|
|
385
|
+
# Run leave-one-out analysis (default uses IVW)
|
|
386
|
+
loo_results = G_clumped.MR_loo(method="IVW", heterogeneity=False, odds=False)
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
Visualize the results with a forest plot:
|
|
390
|
+
|
|
391
|
+
```python
|
|
392
|
+
# Default: show top influential instruments
|
|
393
|
+
G_clumped.MR_loo_plot(filename="loo_forest", figure_size=(10, 8))
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
Tips:
|
|
397
|
+
- `MR_loo` accepts the same `action`, `use_mrpresso_data`, and method parameters as `MR`.
|
|
398
|
+
- `MR_loo_plot` supports `top_influential=True` (default) for a compact figure showing the most influential SNPs, or `top_influential=False` for paginated output with all instruments.
|
|
399
|
+
- `MR_loo_plot(..., use_mrpresso_data=True)` colors MR-PRESSO outliers in red (requires running `MRpresso()` first). When outliers exist, an extra summary row ("MR-PRESSO corrected") is added using the same MR method as the leave-one-out analysis.
|
|
400
|
+
- `MR_loo_plot(..., methods=["WM", "Egger"])` adds extra overall estimates for the requested methods (computed on all instruments).
|
|
401
|
+
- Set `odds=True` in `MR_loo` if you want odds ratio scaling on the plot.
|
|
402
|
+
|
|
403
|
+
### 8) Single-SNP association tests (individual-level data)
|
|
329
404
|
|
|
330
405
|
Use individual-level data to re-estimate SNP–trait effects in a specific cohort (e.g., different ancestry, different measurement protocol).
|
|
331
406
|
|
|
@@ -354,7 +429,7 @@ G_adj.association_test(
|
|
|
354
429
|
|
|
355
430
|
This updates `G_adj.data[["BETA","SE","P"]]` with cohort-specific estimates and recomputes `FSTAT` to be consistent with the updated values.
|
|
356
431
|
|
|
357
|
-
###
|
|
432
|
+
### 9) Lift to a different build
|
|
358
433
|
|
|
359
434
|
Lift variants between builds (e.g., hg19 → hg38):
|
|
360
435
|
|
|
@@ -365,7 +440,7 @@ lifted.head()
|
|
|
365
440
|
|
|
366
441
|
For large datasets, you can provide a UCSC LiftOver executable via `liftover_path`.
|
|
367
442
|
|
|
368
|
-
###
|
|
443
|
+
### 10) Query the GWAS Catalog
|
|
369
444
|
|
|
370
445
|
Attach a per-SNP list of associated traits using the GWAS Catalog API:
|
|
371
446
|
|
|
@@ -520,3 +595,4 @@ If you use methods derived from other packages (e.g., MR-PRESSO), please also ci
|
|
|
520
595
|
|
|
521
596
|
## License
|
|
522
597
|
GPL-3.0-or-later (see `LICENSE`).
|
|
598
|
+
|
|
@@ -1,31 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: genal-python
|
|
3
|
-
Version: 1.5.0
|
|
4
|
-
Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
|
|
5
|
-
Author-email: Cyprien Rivier <riviercyprien@gmail.com>
|
|
6
|
-
Requires-Python: >=3.8
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
Classifier: Programming Language :: Python :: 3
|
|
9
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
10
|
-
Classifier: Operating System :: OS Independent
|
|
11
|
-
License-File: LICENSE
|
|
12
|
-
Requires-Dist: aiohttp>=3.7
|
|
13
|
-
Requires-Dist: nest_asyncio>=1.5
|
|
14
|
-
Requires-Dist: numpy>=1.17.3
|
|
15
|
-
Requires-Dist: pandas>=1.0
|
|
16
|
-
Requires-Dist: plotnine>=0.9
|
|
17
|
-
Requires-Dist: psutil>=5.0
|
|
18
|
-
Requires-Dist: requests>=2.0
|
|
19
|
-
Requires-Dist: pyliftover>=0.4
|
|
20
|
-
Requires-Dist: scikit_learn>=0.24
|
|
21
|
-
Requires-Dist: scipy>=1.7,<1.13
|
|
22
|
-
Requires-Dist: statsmodels>=0.13,<0.15
|
|
23
|
-
Requires-Dist: tqdm>=4.38
|
|
24
|
-
Requires-Dist: wget>=3.0
|
|
25
|
-
Requires-Dist: fastparquet>=0.4
|
|
26
|
-
Requires-Dist: pyarrow>=3.0
|
|
27
|
-
Project-URL: Home, https://github.com/CypRiv/genal
|
|
28
|
-
|
|
29
1
|
<div align="center">
|
|
30
2
|
<img src="genal_logo.png" height="80" alt="genal logo" />
|
|
31
3
|
<h1>genal</h1>
|
|
@@ -151,7 +123,7 @@ G_instruments.query_outcome(G_outcome, proxy=True, reference_panel="EUR_37")
|
|
|
151
123
|
mr_results = G_instruments.MR(action=2, heterogeneity=True, odds=False)
|
|
152
124
|
|
|
153
125
|
# 5.5) Plot MR results
|
|
154
|
-
G_instruments.MR_plot(filename="mr_scatter")
|
|
126
|
+
G_instruments.MR_plot(filename="mr_scatter", figure_size=(10, 6))
|
|
155
127
|
```
|
|
156
128
|
## Core concept: the `Geno` object
|
|
157
129
|
|
|
@@ -161,6 +133,7 @@ G_instruments.MR_plot(filename="mr_scatter")
|
|
|
161
133
|
- `G.phenotype`: stored after `G.set_phenotype(...)` (phenotype DataFrame + metadata)
|
|
162
134
|
- `G.MR_data`: stored after `G.query_outcome(...)` (exposure/outcome association tables used by MR)
|
|
163
135
|
- `G.MR_results`: stored after `G.MR(...)` (results table + harmonized SNP table; used by plotting)
|
|
136
|
+
- `G.MRpresso_subset_data`: stored after `G.MRpresso(...)` (outlier-removed harmonized table)
|
|
164
137
|
|
|
165
138
|
Most methods either:
|
|
166
139
|
- return a **new `Geno`** object (e.g., `clump()`), or
|
|
@@ -175,6 +148,7 @@ Most methods either:
|
|
|
175
148
|
- **Align rsIDs to a target genotype dataset**: `Geno.update_snpids(path=..., replace=...)`
|
|
176
149
|
- **Extract genotype subset**: `Geno.extract_snps(path=...)` → writes extracted files under `tmp_GENAL/`
|
|
177
150
|
- **Two-sample MR pipeline**: `Geno.query_outcome(...)` → `Geno.MR(...)` (+ `MR_plot`)
|
|
151
|
+
- **Leave-one-out MR**: `Geno.MR_loo(...)` → `Geno.MR_loo_plot(...)` (identify influential variants)
|
|
178
152
|
- **MR-PRESSO**: `Geno.MRpresso(...)` (parallel; outlier + distortion tests)
|
|
179
153
|
- **Colocalization**: `Geno.colocalize(...)` (approx Bayes factors; returns posterior probabilities)
|
|
180
154
|
- **Association testing (individual-level)**: `Geno.set_phenotype(...)` → `Geno.association_test(...)`
|
|
@@ -326,7 +300,17 @@ About `action` (palindromic SNP handling during harmonization):
|
|
|
326
300
|
Plot the MR scatter:
|
|
327
301
|
|
|
328
302
|
```python
|
|
329
|
-
G_clumped.MR_plot(filename="mr_scatter")
|
|
303
|
+
G_clumped.MR_plot(filename="mr_scatter", figure_size=(10, 6))
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
You can also draw a funnel plot of single-SNP ratio estimates (Wald ratios):
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
G_clumped.MR_funnel(
|
|
310
|
+
methods=["IVW", "WM", "Egger"], # vertical reference lines (optional)
|
|
311
|
+
filename="mr_funnel",
|
|
312
|
+
figure_size=(10, 6),
|
|
313
|
+
)
|
|
330
314
|
```
|
|
331
315
|
|
|
332
316
|
### 6) Sensitivity: MR-PRESSO
|
|
@@ -339,7 +323,16 @@ mod_table, GlobalTest, OutlierTest, BiasTest = G_clumped.MRpresso(
|
|
|
339
323
|
cpus=-1, # use all CPU cores
|
|
340
324
|
)
|
|
341
325
|
```
|
|
342
|
-
|
|
326
|
+
|
|
327
|
+
To highlight MR-PRESSO outliers on plots, pass `use_mrpresso_data=True` (outliers are colored in red):
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
G_clumped.MR_plot(filename="mr_scatter_mrpresso", figure_size=(10, 6), use_mrpresso_data=True)
|
|
331
|
+
G_clumped.MR_funnel(filename="mr_funnel_mrpresso", figure_size=(10, 6), use_mrpresso_data=True)
|
|
332
|
+
G_clumped.MR_loo_plot(filename="loo_forest_mrpresso", figure_size=(10, 8), use_mrpresso_data=True)
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
To rerun the MR methods on the outlier-removed data after MR-PRESSO, pass `use_mrpresso_data=True` to `MR()`:
|
|
343
336
|
```python
|
|
344
337
|
res = G_clumped.MR(
|
|
345
338
|
action=2,
|
|
@@ -353,7 +346,29 @@ res = G_clumped.MR(
|
|
|
353
346
|
res
|
|
354
347
|
```
|
|
355
348
|
|
|
356
|
-
### 7)
|
|
349
|
+
### 7) Sensitivity: Leave-One-Out MR
|
|
350
|
+
Leave-one-out MR helps identify influential variants that may be driving the causal estimate.
|
|
351
|
+
|
|
352
|
+
```python
|
|
353
|
+
# Run leave-one-out analysis (default uses IVW)
|
|
354
|
+
loo_results = G_clumped.MR_loo(method="IVW", heterogeneity=False, odds=False)
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
Visualize the results with a forest plot:
|
|
358
|
+
|
|
359
|
+
```python
|
|
360
|
+
# Default: show top influential instruments
|
|
361
|
+
G_clumped.MR_loo_plot(filename="loo_forest", figure_size=(10, 8))
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
Tips:
|
|
365
|
+
- `MR_loo` accepts the same `action`, `use_mrpresso_data`, and method parameters as `MR`.
|
|
366
|
+
- `MR_loo_plot` supports `top_influential=True` (default) for a compact figure showing the most influential SNPs, or `top_influential=False` for paginated output with all instruments.
|
|
367
|
+
- `MR_loo_plot(..., use_mrpresso_data=True)` colors MR-PRESSO outliers in red (requires running `MRpresso()` first). When outliers exist, an extra summary row ("MR-PRESSO corrected") is added using the same MR method as the leave-one-out analysis.
|
|
368
|
+
- `MR_loo_plot(..., methods=["WM", "Egger"])` adds extra overall estimates for the requested methods (computed on all instruments).
|
|
369
|
+
- Set `odds=True` in `MR_loo` if you want odds ratio scaling on the plot.
|
|
370
|
+
|
|
371
|
+
### 8) Single-SNP association tests (individual-level data)
|
|
357
372
|
|
|
358
373
|
Use individual-level data to re-estimate SNP–trait effects in a specific cohort (e.g., different ancestry, different measurement protocol).
|
|
359
374
|
|
|
@@ -382,7 +397,7 @@ G_adj.association_test(
|
|
|
382
397
|
|
|
383
398
|
This updates `G_adj.data[["BETA","SE","P"]]` with cohort-specific estimates and recomputes `FSTAT` to be consistent with the updated values.
|
|
384
399
|
|
|
385
|
-
###
|
|
400
|
+
### 9) Lift to a different build
|
|
386
401
|
|
|
387
402
|
Lift variants between builds (e.g., hg19 → hg38):
|
|
388
403
|
|
|
@@ -393,7 +408,7 @@ lifted.head()
|
|
|
393
408
|
|
|
394
409
|
For large datasets, you can provide a UCSC LiftOver executable via `liftover_path`.
|
|
395
410
|
|
|
396
|
-
###
|
|
411
|
+
### 10) Query the GWAS Catalog
|
|
397
412
|
|
|
398
413
|
Attach a per-SNP list of associated traits using the GWAS Catalog API:
|
|
399
414
|
|
|
@@ -548,4 +563,3 @@ If you use methods derived from other packages (e.g., MR-PRESSO), please also ci
|
|
|
548
563
|
|
|
549
564
|
## License
|
|
550
565
|
GPL-3.0-or-later (see `LICENSE`).
|
|
551
|
-
|
|
@@ -10,6 +10,7 @@ Key attributes (you don't need to manipulate these directly):
|
|
|
10
10
|
- `G.phenotype`: set by {py:meth}`genal.Geno.set_phenotype` (phenotype DataFrame + metadata)
|
|
11
11
|
- `G.MR_data`: set by {py:meth}`genal.Geno.query_outcome` (exposure/outcome tables used by MR)
|
|
12
12
|
- `G.MR_results`: set by {py:meth}`genal.Geno.MR` (results table + harmonized SNP table; used by plotting)
|
|
13
|
+
- `G.MR_loo_results`: set by {py:meth}`genal.Geno.MR_loo` (leave-one-out results tuple; used by `MR_loo_plot`)
|
|
13
14
|
- `G.MRpresso_results` / `G.MRpresso_subset_data`: set by {py:meth}`genal.Geno.MRpresso`
|
|
14
15
|
|
|
15
16
|
## Standard columns
|
|
@@ -38,7 +39,7 @@ This is a *practical guide*, not an exhaustive contract. When a method can work
|
|
|
38
39
|
| {py:meth}`genal.Geno.clump` | `SNP`, `P` | LD clumping via PLINK; returns a new `Geno` (or `None` if nothing passes). |
|
|
39
40
|
| {py:meth}`genal.Geno.prs` | `EA`, `BETA`, plus `SNP` (or `CHR+POS`) | If `CHR+POS` are available, genal will prefer position-based matching to your genotype dataset to reduce ID-mismatch losses. |
|
|
40
41
|
| {py:meth}`genal.Geno.query_outcome` | `SNP`, `EA`, `NEA`, `BETA`, `SE` (exposure and outcome) | Outcome querying is rsID-based; proxy search is optional. If you plan to use `action=2` later, `EAF` in both datasets is strongly recommended. |
|
|
41
|
-
| {py:meth}`genal.Geno.MR` / {py:meth}`genal.Geno.MRpresso` | `MR_data` |
|
|
42
|
+
| {py:meth}`genal.Geno.MR` / {py:meth}`genal.Geno.MR_loo` / {py:meth}`genal.Geno.MRpresso` | `MR_data` | All consume `MR_data` produced by `query_outcome()`. |
|
|
42
43
|
| {py:meth}`genal.Geno.colocalize` | `BETA`, `SE`, plus `CHR+POS` (preferred) **or** `SNP` (in both datasets) | If `EA/NEA` are present in both datasets, effects are allele-aligned; otherwise results assume both GWAS use the same reference allele. For quantitative traits, provide `sdY` or (`EAF` + `n`) to avoid the default `sdY=1` assumption. |
|
|
43
44
|
| {py:meth}`genal.Geno.update_eaf` | `EA`, plus `CHR+POS` **or** `SNP` | Uses PLINK to compute allele frequencies from a reference panel; coordinate-based matching is faster when available. |
|
|
44
45
|
| {py:meth}`genal.Geno.filter_by_gene` / {py:meth}`genal.Geno.lift` | `CHR`, `POS` | Genomic coordinate operations. |
|
|
@@ -65,8 +66,11 @@ A helpful mental framework:
|
|
|
65
66
|
| `association_test()` | `None` | runs PLINK `--glm`; mutates `G.data` (`BETA/SE/P`) |
|
|
66
67
|
| `query_outcome()` | `None` | sets `G.MR_data` (exposure/outcome tables used by MR) |
|
|
67
68
|
| `MR()` | `pd.DataFrame` | sets `G.MR_results` and returns the results table |
|
|
68
|
-
| `MR_plot()` | plot object | requires `G.MR_results`; writes `.png` if `filename
|
|
69
|
-
| `
|
|
69
|
+
| `MR_plot()` | plot object | requires `G.MR_results`; writes `.png` if `filename=...`; supports `use_mrpresso_data=True` for outlier highlighting |
|
|
70
|
+
| `MR_funnel()` | plot object | requires `G.MR_results`; writes `.png` if `filename=...`; supports `use_mrpresso_data=True` for outlier highlighting |
|
|
71
|
+
| `MR_loo()` | `pd.DataFrame` | sets `G.MR_loo_results` and returns the LOO results table |
|
|
72
|
+
| `MR_loo_plot()` | plot object(s) | requires `G.MR_loo_results`; writes `.png` if `filename=...`; may return a list for multi-page output; supports `methods=[...]` overall rows and `use_mrpresso_data=True` for outlier highlighting |
|
|
73
|
+
| `MRpresso()` | tuple | sets `G.MRpresso_results` and `G.MRpresso_subset_data` (outlier-removed harmonized table; SNP-indexed) |
|
|
70
74
|
| `prs()` | `None` | writes `<name>.csv` and uses PLINK temp files |
|
|
71
75
|
| `query_gwas_catalog()` | `pd.DataFrame` | adds an `ASSOC` column (network-bound); `replace=True` overwrites `G.data` |
|
|
72
76
|
| `filter_by_gene(replace=False)` | `Geno` | returns a new `Geno` filtered to a locus |
|
|
@@ -80,7 +84,7 @@ Be aware of these common side effects:
|
|
|
80
84
|
|
|
81
85
|
- `~/.genal/config.json` is created/updated as you configure PLINK, reference folders, or default genotype paths.
|
|
82
86
|
- `tmp_GENAL/` is used as a scratch directory for PLINK commands and is **not** automatically deleted.
|
|
83
|
-
- Some methods generate output files in your current directory (notably `prs()`, and plot saving in `MR_plot()`).
|
|
87
|
+
- Some methods generate output files in your current directory (notably `prs()`, and plot saving in `MR_plot()`, `MR_funnel()`, and `MR_loo_plot()`).
|
|
84
88
|
|
|
85
89
|
## Resource usage (`ram`, `cpus`)
|
|
86
90
|
|
|
@@ -132,6 +132,30 @@ The bandwidth uses a modified Silverman rule multiplied by the user-provided fac
|
|
|
132
132
|
|
|
133
133
|
The sign method tests whether exposure and outcome effects tend to have the same sign across variants. `genal` performs a binomial test against the null of 50% sign agreement.
|
|
134
134
|
|
|
135
|
+
## Leave-one-out MR
|
|
136
|
+
|
|
137
|
+
Implementation: {py:func}`genal.MR_tools.MR_loo_func`, wrapped by {py:meth}`genal.Geno.MR_loo`.
|
|
138
|
+
|
|
139
|
+
Leave-one-out MR iterates over all instruments, sequentially removing each SNP and re-estimating the causal effect using the remaining instruments. This identifies variants that have a disproportionate influence on the overall estimate.
|
|
140
|
+
|
|
141
|
+
For each SNP $i$ in the instrument set:
|
|
142
|
+
|
|
143
|
+
1. Remove SNP $i$ from the harmonized data.
|
|
144
|
+
2. Re-run the selected MR method on the remaining $J-1$ SNPs.
|
|
145
|
+
3. Store the resulting estimate $\hat{\theta}_{-i}$.
|
|
146
|
+
|
|
147
|
+
The "influence" of SNP $i$ is defined as:
|
|
148
|
+
|
|
149
|
+
```{math}
|
|
150
|
+
\text{influence}_i = \left| \hat{\theta}_{-i} - \hat{\theta}_{\text{all}} \right|
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
where $\hat{\theta}_{\text{all}}$ is the estimate using all instruments.
|
|
154
|
+
|
|
155
|
+
Notes:
|
|
156
|
+
- Any single MR method can be used (IVW, Egger, weighted median, mode-based, etc.).
|
|
157
|
+
- At least 3 instruments are required (so that each LOO subset has ≥2 instruments).
|
|
158
|
+
|
|
135
159
|
## MR-PRESSO (summary)
|
|
136
160
|
|
|
137
161
|
Implementation: {py:func}`genal.MRpresso.mr_presso`, wrapped by {py:meth}`genal.Geno.MRpresso`.
|
|
@@ -144,11 +168,26 @@ At a high level, `genal`'s MR-PRESSO implementation:
|
|
|
144
168
|
- an outlier test (per-variant p-values, Bonferroni-corrected),
|
|
145
169
|
- an optional distortion test (whether the causal estimate changes materially after removing outliers).
|
|
146
170
|
|
|
171
|
+
### Distortion test
|
|
172
|
+
|
|
173
|
+
The distortion test assesses whether detected outliers materially bias the causal estimate. `genal` implements the following version:
|
|
174
|
+
|
|
175
|
+
1. **Observed distortion**: $D_\text{obs} = (\hat{\theta}_\text{all} - \hat{\theta}_\text{no outliers}) / |\hat{\theta}_\text{no outliers}|$
|
|
176
|
+
2. **Expected distortion null**: bootstrap resampling is performed *exclusively on the non-outlier subset*. For each iteration:
|
|
177
|
+
- Sample with replacement $J-k$ SNPs from the non-outlier data (where $J$ is the total number of instruments and $k$ is the number of detected outliers).
|
|
178
|
+
- Fit the IVW model on the sampled data and record $\hat{\theta}_\text{exp}$.
|
|
179
|
+
- Compute $D_\text{exp} = (\hat{\theta}_\text{all} - \hat{\theta}_\text{exp}) / |\hat{\theta}_\text{exp}|$.
|
|
180
|
+
3. **P-value**: $p = \text{mean}(|D_\text{exp}| > |D_\text{obs}|)$.
|
|
181
|
+
|
|
182
|
+
This differs from the original MR-PRESSO R implementation, which in some cases samples from the full dataset (including outliers) for the expected-bias regressions, a behavior that is inconsistent with the paper's description.
|
|
183
|
+
|
|
184
|
+
### Output structure
|
|
185
|
+
|
|
147
186
|
`Geno.MRpresso()` returns four objects:
|
|
148
187
|
- `mod_table`: a small results table (`Raw` and `Outlier-corrected` rows; IVW model),
|
|
149
188
|
- `GlobalTest`: RSS and global p-value,
|
|
150
|
-
- `OutlierTest`: per-variant outlier p-values (empty if the global test is not significant),
|
|
151
|
-
- `BiasTest`: distortion test result dictionary (empty if distortion test was not run).
|
|
189
|
+
- `OutlierTest`: per-variant outlier p-values (empty if the global test is not significant); SNP IDs as row labels,
|
|
190
|
+
- `BiasTest`: distortion test result dictionary containing `"outliers_indices"` (SNP IDs), `"distortion_test_coefficient"`, and `"distortion_test_p"` (empty if distortion test was not run).
|
|
152
191
|
|
|
153
192
|
If outliers are found, `genal` stores the outlier-removed harmonized table and allows rerunning MR with `Geno.MR(use_mrpresso_data=True)`.
|
|
154
193
|
|
|
@@ -175,10 +175,20 @@ Key arguments you commonly tune:
|
|
|
175
175
|
After `MR()`, you can generate a scatter plot with method lines:
|
|
176
176
|
|
|
177
177
|
```python
|
|
178
|
-
G_instruments.MR_plot(filename="mr_scatter") # saves mr_scatter.png
|
|
178
|
+
G_instruments.MR_plot(filename="mr_scatter", figure_size=(10, 6)) # saves mr_scatter.png
|
|
179
179
|
```
|
|
180
180
|
|
|
181
|
-
|
|
181
|
+
You can also draw a funnel plot of single-SNP ratio estimates (Wald ratios):
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
G_instruments.MR_funnel(
|
|
185
|
+
methods=["IVW", "WM", "Egger"], # vertical reference lines (optional)
|
|
186
|
+
filename="mr_funnel",
|
|
187
|
+
figure_size=(10, 6),
|
|
188
|
+
)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## 6a) MR-PRESSO (outlier detection and distortion testing)
|
|
182
192
|
|
|
183
193
|
{py:meth}`genal.Geno.MRpresso` runs a parallel MR-PRESSO implementation.
|
|
184
194
|
|
|
@@ -196,14 +206,71 @@ What you typically tune:
|
|
|
196
206
|
- `significance_p`: threshold for global/outlier tests.
|
|
197
207
|
- `outlier_test` / `distortion_test`: disable if you only want the global test.
|
|
198
208
|
|
|
209
|
+
Output structure:
|
|
210
|
+
- `OutlierTest`: DataFrame with per-SNP outlier p-values; SNP identifiers (rsIDs) are used as row labels (not numeric indices).
|
|
211
|
+
- `BiasTest`: dictionary containing `"outliers_indices"` (list of SNP IDs), `"distortion_test_coefficient"`, and `"distortion_test_p"`.
|
|
212
|
+
|
|
199
213
|
If outliers are found, you can rerun MR using the outlier-removed subset:
|
|
200
214
|
|
|
201
215
|
```python
|
|
202
216
|
res_no_outliers = G_instruments.MR(use_mrpresso_data=True)
|
|
203
217
|
```
|
|
204
218
|
|
|
219
|
+
To highlight MR-PRESSO outliers on plots, pass `use_mrpresso_data=True` (outliers are colored in red and shown in the legend):
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
G_instruments.MR_plot(filename="mr_scatter_mrpresso", figure_size=(10, 6), use_mrpresso_data=True)
|
|
223
|
+
G_instruments.MR_funnel(filename="mr_funnel_mrpresso", figure_size=(10, 6), use_mrpresso_data=True)
|
|
224
|
+
G_instruments.MR_loo_plot(filename="loo_forest_mrpresso", figure_size=(10, 8), use_mrpresso_data=True)
|
|
225
|
+
```
|
|
226
|
+
|
|
205
227
|
See {doc}`methods` for algorithm details and outputs.
|
|
206
228
|
|
|
229
|
+
## 6b) Leave-one-out MR (sensitivity analysis)
|
|
230
|
+
|
|
231
|
+
{py:meth}`genal.Geno.MR_loo` iteratively removes each SNP and re-estimates the causal effect. This helps identify influential variants that may be driving the overall result.
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
loo_df = G_instruments.MR_loo(
|
|
235
|
+
method="IVW", # any single MR method key (see MR method map)
|
|
236
|
+
action=2,
|
|
237
|
+
heterogeneity=False, # set True to include Q statistics
|
|
238
|
+
odds=False, # set True for OR-scale output
|
|
239
|
+
)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
Key arguments:
|
|
243
|
+
- `method`: a single MR method key (e.g., `"IVW"`, `"Egger"`, `"WM"`); must not be `"all"`.
|
|
244
|
+
- `use_mrpresso_data=True`: use the outlier-removed dataset from MR-PRESSO instead of all instruments.
|
|
245
|
+
|
|
246
|
+
### Visualizing leave-one-out results
|
|
247
|
+
|
|
248
|
+
{py:meth}`genal.Geno.MR_loo_plot` creates a forest plot from the stored `MR_loo_results`:
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
# Default: show top influential instruments
|
|
252
|
+
G_instruments.MR_loo_plot(filename="loo_forest", figure_size=(10, 8))
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
# Or paginate all instruments
|
|
257
|
+
G_instruments.MR_loo_plot(
|
|
258
|
+
top_influential=False, # show all, not just influential
|
|
259
|
+
snps_per_page=30,
|
|
260
|
+
page=1, # or None for all pages
|
|
261
|
+
filename="loo_forest_all",
|
|
262
|
+
figure_size=(10, 12),
|
|
263
|
+
)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
Key arguments:
|
|
267
|
+
- `top_influential=True` (default): select the `snps_per_page` most influential SNPs (largest change in estimate when removed) and render a single compact figure.
|
|
268
|
+
- `top_influential=False`: paginate all instruments; use `page=N` to select a specific page or `page=None` to render all pages.
|
|
269
|
+
- `snps_per_page`: number of SNPs per page (minimum 5).
|
|
270
|
+
- `use_mrpresso_data=True`: color MR-PRESSO outliers in red (requires `MRpresso()` first). When outliers exist, an extra summary row ("MR-PRESSO corrected") is added using the same MR method as the leave-one-out analysis.
|
|
271
|
+
- `methods=["WM", "Egger"]`: add extra overall estimates for the requested methods (computed on all instruments).
|
|
272
|
+
|
|
273
|
+
|
|
207
274
|
## 7) Additional capabilities (beyond the core pipeline)
|
|
208
275
|
|
|
209
276
|
### Single-SNP association tests (individual-level data)
|