genal-python 1.4.0__tar.gz → 1.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {genal_python-1.4.0 → genal_python-1.4.2}/.gitignore +3 -1
- {genal_python-1.4.0 → genal_python-1.4.2}/PKG-INFO +1 -1
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/api.rst +1 -1
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/Geno.py +53 -2
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/__init__.py +3 -2
- genal_python-1.4.2/genal/genes.py +125 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/geno_tools.py +112 -146
- {genal_python-1.4.0 → genal_python-1.4.2}/pyproject.toml +1 -1
- genal_python-1.4.0/REVIEW_REPORT.md +0 -63
- {genal_python-1.4.0 → genal_python-1.4.2}/.DS_Store +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/.readthedocs.yaml +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/Genal_flowchart.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/LICENSE +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/README.md +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/.DS_Store +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/Makefile +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.DS_Store +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.buildinfo +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/api.doctree +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/environment.pickle +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/genal.doctree +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/index.doctree +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/introduction.doctree +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/modules.doctree +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_images/MR_plot_SBP_AS.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/Geno.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/MR.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/MR_tools.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/MRpresso.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/association.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/clump.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/extract_prs.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/geno_tools.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/lift.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/proxy.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/snp_query.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/tools.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/index.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/api.rst.txt +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/genal.rst.txt +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/index.rst.txt +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/introduction.rst.txt +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/modules.rst.txt +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/basic.css +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/badge_only.css +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.eot +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.svg +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.ttf +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold-italic.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold-italic.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal-italic.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal-italic.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal.woff +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal.woff2 +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/theme.css +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/doctools.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/documentation_options.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/file.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/badge_only.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/html5shiv-printshiv.min.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/html5shiv.min.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/theme.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/language_data.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/minus.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/plus.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/pygments.css +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/searchtools.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/sphinx_highlight.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/api.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/genal.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/genindex.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/index.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/introduction.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/modules.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/objects.inv +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/py-modindex.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/search.html +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/searchindex.js +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/make.bat +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/requirements.txt +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/.DS_Store +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/Images/Genal_flowchart.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/Images/genal_logo.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/conf.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/index.rst +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/introduction.rst +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/modules.rst +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/MR.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/MR_tools.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/MRpresso.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/association.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/clump.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/colocalization.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/constants.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/extract_prs.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/lift.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/proxy.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/snp_query.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal/tools.py +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/genal_logo.png +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/gitignore +0 -0
- {genal_python-1.4.0 → genal_python-1.4.2}/readthedocs.yaml +0 -0
|
@@ -6,7 +6,7 @@ genal.GENO class
|
|
|
6
6
|
-----------------
|
|
7
7
|
|
|
8
8
|
.. autoclass:: genal.Geno
|
|
9
|
-
:members:
|
|
9
|
+
:members: __init__, preprocess_data, get_reference_panel, clump, update_snpids, extract_snps, prs, set_phenotype, association_test, query_outcome, MR, MR_plot, MR_forest, MRpresso, filter_by_gene, colocalize, lift, query_gwas_catalog, standardize_betas, update_eaf, sort_group, copy, save
|
|
10
10
|
:undoc-members:
|
|
11
11
|
:show-inheritance:
|
|
12
12
|
|
|
@@ -15,7 +15,13 @@ from .proxy import find_proxies, apply_proxies
|
|
|
15
15
|
from .MR_tools import query_outcome_func, MR_func, mrpresso_func
|
|
16
16
|
from .clump import clump_data_plink2
|
|
17
17
|
from .lift import lift_data
|
|
18
|
-
from .
|
|
18
|
+
from .genes import filter_by_gene_func
|
|
19
|
+
from .tools import (
|
|
20
|
+
create_tmp,
|
|
21
|
+
load_reference_panel,
|
|
22
|
+
setup_genetic_path,
|
|
23
|
+
check_reference_panel
|
|
24
|
+
)
|
|
19
25
|
from .geno_tools import (
|
|
20
26
|
save_data,
|
|
21
27
|
check_arguments,
|
|
@@ -31,7 +37,7 @@ from .geno_tools import (
|
|
|
31
37
|
check_allele_column,
|
|
32
38
|
check_snp_column,
|
|
33
39
|
remove_na,
|
|
34
|
-
|
|
40
|
+
update_eaf_func,
|
|
35
41
|
)
|
|
36
42
|
from .association import set_phenotype_func, association_test_func_plink2
|
|
37
43
|
from .extract_prs import extract_snps_func, prs_func
|
|
@@ -47,6 +53,7 @@ from .colocalization import coloc_abf_func
|
|
|
47
53
|
# Check stability with variants on sexual chromosomes
|
|
48
54
|
# Check the build of user data (potentially with a list of SNPs with different positions)
|
|
49
55
|
# update_snpids function: take alleles into account during the merge if they are present in the user data
|
|
56
|
+
# Consider how update_snpids could be replaced by the extract range function in plink2 (to gain speed)
|
|
50
57
|
|
|
51
58
|
|
|
52
59
|
class Geno:
|
|
@@ -1596,7 +1603,51 @@ class Geno:
|
|
|
1596
1603
|
data['SE'] = data['SE'] / sd
|
|
1597
1604
|
return data
|
|
1598
1605
|
|
|
1606
|
+
def update_eaf(self, reference_panel="EUR_37", replace=False, fill=True):
    """
    Update or create the EAF (Effect Allele Frequency) column using a reference panel.

    The frequencies are computed by `update_eaf_func`, which runs PLINK on the
    reference panel. SNPs are matched on CHR/POS when both columns are present,
    otherwise on SNP ID. The reference frequency is oriented using the 'EA' and
    'NEA' columns so that the stored value refers to the effect allele.

    Args:
        reference_panel (str, optional): The reference panel to use for deriving EAF.
            Can be a standard name (e.g., "EUR_37", "AFR_38") or a path to a
            custom PLINK fileset (bed/bim/fam or pgen/pvar/psam).
            Defaults to "EUR_37".
        replace (bool, optional): If True, the updated DataFrame is also stored
            in the instance's `data` attribute. If False (default), the instance
            is left untouched and the work is done on a copy.
        fill (bool, optional): If `True` (default), existing `EAF` values for SNPs not
            found in the reference panel will be preserved. If `False`, `EAF` values
            for unmatched SNPs will be set to `NaN`.

    Returns:
        pd.DataFrame: The data with the updated 'EAF' column (returned for both
        values of `replace`).

    Raises:
        ValueError: If the required columns ('EA', 'NEA' and either 'SNP' or
            'CHR'/'POS') are not present in the data.
    """
    columns = self.data.columns

    if 'EA' not in columns:
        raise ValueError("The 'EA' column is required to update EAF.")
    # The allele-orientation step of update_eaf_func compares both EA and NEA
    # with the reference ALT allele; fail here with the documented exception
    # type instead of a KeyError after PLINK has already run.
    if 'NEA' not in columns:
        raise ValueError("The 'NEA' column is required to update EAF.")
    if not ('SNP' in columns or ('CHR' in columns and 'POS' in columns)):
        raise ValueError("Either 'SNP' or both 'CHR' and 'POS' columns are required.")

    data = self.data if replace else self.data.copy()

    data_updated = update_eaf_func(
        data=data,
        reference_panel=reference_panel,
        object_name=self.name,
        ram=self.ram,
        fill=fill
    )

    if replace:
        self.data = data_updated
    return data_updated
|
|
1600
1651
|
|
|
1601
1652
|
|
|
1602
1653
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
3
|
from .tools import default_config, write_config, set_plink, install_plink, delete_tmp, get_reference_panel_path, get_plink_path
|
|
4
|
-
from .geno_tools import Combine_Geno
|
|
4
|
+
from .geno_tools import Combine_Geno
|
|
5
|
+
from .genes import filter_by_gene_func
|
|
5
6
|
from .constants import CONFIG_DIR
|
|
6
7
|
|
|
7
|
-
__version__ = "1.4.
|
|
8
|
+
__version__ = "1.4.2"
|
|
8
9
|
|
|
9
10
|
config_path = os.path.join(CONFIG_DIR, "config.json")
|
|
10
11
|
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import os
|
|
4
|
+
import wget
|
|
5
|
+
|
|
6
|
+
from .constants import BUCKET_URL
|
|
7
|
+
from .tools import read_config
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def filter_by_gene_func(data, gene_identifier, id_type="symbol", window_size=1000000, build="37"):
    """
    Filtering the data to include only variants that are within a specified distance of a specific gene.
    Corresponds to the :meth:`Geno.filter_by_gene` method.

    Args:
        data (pd.DataFrame): Input data with at least 'CHR' and 'POS' columns.
        gene_identifier (str): Identifier for the gene/protein to filter variants around.
        id_type (str, optional): Type of identifier provided. Options are:
            - "symbol": Gene symbol (e.g., "APOE")
            - "HGNC": HGNC ID (e.g., "HGNC:613")
            - "name": Full gene name (e.g., "apolipoprotein E")
            - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203")
            - "NCBI": NCBI gene ID (e.g., "348")
            - "UCSC": UCSC gene ID (e.g., "uc001hbu.2")
            - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505")
            Default is "symbol". The matching column names of the gene table
            ("HGNC_id", "gene_id", "NCBI_id", "UCSC_id", "Vega_id") are accepted as well.
        window_size (int, optional): Total size of the window in base pairs. Half of it is
            added on each side of the gene. Default is 1,000,000 (1Mb).
        build (str or int, optional): Genome build of the data (37 or 38). Default is "37".

    Returns:
        pd.DataFrame: Filtered copy containing only variants within the specified window
            around the gene, with additional Int64 column 'Distance'. The column is
            present even when no variant is found.

    Raises:
        ValueError: If `id_type` or `build` is invalid, if the gene is not found, or if
            the gene lies on a chromosome other than 1-22 or X.
        RuntimeError: If the gene info file cannot be downloaded.

    Notes:
        - Distance is calculated from the nearest gene boundary (start or end position):
          negative before the gene start, positive after the gene end.
        - A distance of 0 indicates the variant is within the gene.
    """

    # Validate id_type: map the documented option names onto the gene-table columns
    id_type_columns = {
        "symbol": "symbol",
        "HGNC": "HGNC_id",
        "name": "name",
        "Ensembl": "gene_id",
        "NCBI": "NCBI_id",
        "UCSC": "UCSC_id",
        "Vega": "Vega_id",
    }
    valid_id_types = list(id_type_columns.values())
    id_type = id_type_columns.get(id_type, id_type)
    if id_type not in valid_id_types:
        # Report the documented option names, not the internal column names
        raise ValueError(f"Invalid id_type. Must be one of: {', '.join(id_type_columns)}")

    # Validate build and normalise it, since it is used below as a column-name suffix
    try:
        build = str(int(build))
    except (TypeError, ValueError):
        build = None
    if build not in ("37", "38"):
        raise ValueError("Invalid build. Must be one of: 37, 38")

    # Download the gene info file if not already present in the reference folder
    config = read_config()
    ref_path = config["paths"]["ref_path"]
    gene_info_file = os.path.join(ref_path, "gene_id_mapping_filtered.parquet")
    if not os.path.exists(gene_info_file):
        print(f"Downloading gene info file to {gene_info_file}...")
        url = BUCKET_URL + "gene_id_mapping_filtered.parquet"
        try:
            wget.download(url, gene_info_file)
            print("\nDownload complete.")
        except Exception as e:
            # Do not leave a partial file behind: it would be mistaken for a valid cache
            if os.path.exists(gene_info_file):
                os.remove(gene_info_file)
            raise RuntimeError(f"Failed to download gene info: {e}") from e

    df_gene_info = pd.read_parquet(gene_info_file, engine="pyarrow")

    # Find gene coordinates
    gene_data = df_gene_info[df_gene_info[id_type] == gene_identifier]

    if gene_data.empty:
        raise ValueError(f"Gene with {id_type}='{gene_identifier}' not found in gene info database.")

    if len(gene_data) > 1:
        print(f"Warning: Multiple entries found for {id_type}='{gene_identifier}'. Using the first entry.")
    # Work with a single row (Series) from here on
    gene_data = gene_data.iloc[0, :]

    print(f"Filtering variants within {window_size}bp window based on genome build {build} around gene: {', '.join(f'{col}: {gene_data[col]}' for col in valid_id_types)}")

    # Extract gene location information
    chrom = gene_data['CHR']
    # Convert to integer if possible; X is encoded as 23
    if str(chrom).isdigit():
        chrom = int(chrom)
    elif chrom == "X":
        chrom = 23
    else:
        raise ValueError(f"Gene {gene_identifier} is located on chromosome {chrom}, which is not supported.")

    gene_start = int(gene_data[f'gene_start_{build}'])
    gene_end = int(gene_data[f'gene_end_{build}'])

    # Define the window boundaries (half of window_size on each side of the gene)
    window_start = max(0, gene_start - window_size / 2)
    window_end = gene_end + window_size / 2

    # Filter variants within the window
    filtered = data[
        (data['CHR'] == chrom) &
        (data['POS'] >= window_start) &
        (data['POS'] <= window_end)
    ].copy()

    # Distance from the gene: 0 inside, negative before the start, positive after the end.
    # Computed on plain arrays so the result does not depend on index alignment
    # (duplicate index labels in `data` would break a label-based assignment), and
    # assigned unconditionally so the column also exists for an empty result.
    positions = filtered['POS'].astype("int64").to_numpy()
    distance = np.where(
        positions < gene_start,
        positions - gene_start,
        np.where(positions > gene_end, positions - gene_end, 0),
    )
    filtered["Distance"] = pd.array(distance, dtype="Int64")

    if not filtered.empty:
        print(f"Found {len(filtered)} variants.")
    else:
        print(f"No variants found in a {window_size}bp window around {gene_identifier}")

    return filtered
|
|
@@ -1,15 +1,12 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import numpy as np
|
|
3
3
|
import scipy.stats as st
|
|
4
|
-
import os
|
|
5
|
-
import shutil
|
|
4
|
+
import os
|
|
6
5
|
import warnings
|
|
7
6
|
from collections import Counter
|
|
8
|
-
import wget
|
|
9
|
-
|
|
10
|
-
from .constants import STANDARD_COLUMNS, BUCKET_URL
|
|
11
|
-
from .tools import read_config
|
|
12
7
|
|
|
8
|
+
from .constants import STANDARD_COLUMNS
|
|
9
|
+
from .tools import get_reference_panel_path, run_plink_command, create_tmp, get_plink_path
|
|
13
10
|
|
|
14
11
|
|
|
15
12
|
def remove_na(data):
|
|
@@ -282,7 +279,7 @@ def fill_snpids_func(data, reference_panel_df, keep_indel):
|
|
|
282
279
|
return data
|
|
283
280
|
|
|
284
281
|
def check_int_column(data, int_col):
|
|
285
|
-
"""Set the type of the int_col column to Int64 and non-numeric values to NA."""
|
|
282
|
+
"""Set the type of the int_col column to Int64 and non-numeric values to NA. This function is used to check the validity of the CHR and POS columns."""
|
|
286
283
|
nrows = data.shape[0]
|
|
287
284
|
# Remove any non-digit characters, convert to numeric, setting non-numeric to NaN
|
|
288
285
|
data[int_col] = pd.to_numeric(data[int_col].astype(str).str.extract('(\d+)', expand=False), errors='coerce')
|
|
@@ -403,53 +400,39 @@ def check_arguments(
|
|
|
403
400
|
|
|
404
401
|
def save_data(data, name, path="", fmt="h5", sep="\t", header=True):
    """
    Save data to a specified file format.

    Supported formats: .h5 (default), .csv, .txt.
    Future supported formats: .vcf, .vcf.gz.

    Args:
        data (pd.DataFrame): DataFrame to be saved.
        name (str): A unique identifier for the data, used as the filename.
        path (str, optional): The directory where the file will be saved. Defaults to current directory.
        fmt (str, optional): The desired file format. Defaults to "h5". The value is
            matched case-insensitively and an optional leading dot is ignored, so
            "csv", ".csv" and "CSV" are equivalent.
        sep (str, optional): Delimiter for text-based formats (.csv, .txt). Defaults to tab.
        header (bool, optional): Whether to include column names in text-based formats. Defaults to True.

    Notes:
        - The file is written to `<path>/<name>.<extension>`; the index is not
          written for text-based formats.
        - The h5 file is written in table format under the key "data" and
          overwrites an existing file (mode "w").
        - An unsupported format only prints a message; nothing is written.
    """
    # Accept the spellings used in the documentation (".csv") as well as the bare extension
    extension = str(fmt).strip().lstrip(".").lower()
    target = os.path.join(path, name)

    if extension == "h5":
        data.to_hdf(f"{target}.h5", key="data", mode="w", format="table")
    elif extension in ("csv", "txt"):
        # Both text formats share the same writer; only the file extension differs
        data.to_csv(f"{target}.{extension}", sep=sep, header=header, index=False)
    else:
        print(f"Format {fmt} is not supported yet.")
|
|
442
425
|
|
|
443
426
|
|
|
444
427
|
def Combine_Geno(Gs):
|
|
445
428
|
"""
|
|
446
|
-
Combine
|
|
429
|
+
Combine multiple Geno instances.
|
|
447
430
|
|
|
448
431
|
Args:
|
|
449
|
-
|
|
432
|
+
Gs (list): A list of Geno instances to combine.
|
|
450
433
|
|
|
451
434
|
Returns:
|
|
452
|
-
|
|
435
|
+
Geno: A new Geno instance containing the combined data.
|
|
453
436
|
"""
|
|
454
437
|
from .Geno import Geno
|
|
455
438
|
|
|
@@ -462,118 +445,101 @@ def Combine_Geno(Gs):
|
|
|
462
445
|
|
|
463
446
|
return Geno(C)
|
|
464
447
|
|
|
465
|
-
|
|
448
|
+
|
|
449
|
+
def update_eaf_func(data, reference_panel, object_name, ram=10000, fill=True):
    """
    Core logic to update or create the EAF (Effect Allele Frequency) column.

    This function calculates EAF from a reference panel. If CHR/POS are available,
    it uses a coordinate-based extraction with PLINK (`--extract range`). Otherwise,
    it falls back to SNP-ID-based extraction (`--extract`).

    The reference ALT allele frequency is assigned as-is when 'EA' equals the
    reference ALT allele, and as `1 - frequency` when 'NEA' equals it. Variants
    matching neither get no new value.

    Args:
        data (pd.DataFrame): Data with 'EA', 'NEA' and either 'CHR'/'POS' or 'SNP' columns.
        reference_panel (str): Reference panel name or path, resolved by
            `get_reference_panel_path`.
        object_name (str): Prefix for the temporary files written to `tmp_GENAL`.
        ram (int, optional): Value passed to PLINK's `--memory`. Defaults to 10000.
        fill (bool, optional): If True (default), rows without a new value keep their
            existing 'EAF'. If False, they are set to NaN.

    Returns:
        pd.DataFrame: A new DataFrame with the same rows, in the same order, as `data`.
            When frequencies were matched it has a default RangeIndex and an 'EAF' column
            (appended if it did not exist). When PLINK produced no usable output, or no
            SNP matched, the data is returned without an EAF update and a warning is issued.

    Raises:
        ValueError: If 'EA' or 'NEA' is missing, or if neither CHR/POS nor SNP is available.
    """
    # Validate before doing any work: the orientation step needs both alleles
    missing = [col for col in ("EA", "NEA") if col not in data.columns]
    if missing:
        raise ValueError(f"Missing required column(s) to update EAF: {', '.join(missing)}.")

    by_coordinate = "CHR" in data.columns and "POS" in data.columns
    if not by_coordinate and "SNP" not in data.columns:
        raise ValueError("SNP column is required when CHR/POS are not available.")

    ref_panel_path, ref_filetype = get_reference_panel_path(reference_panel)
    create_tmp()

    freq_prefix = os.path.join("tmp_GENAL", f"{object_name}_eaf_freqs")
    freq_path = f"{freq_prefix}.afreq"
    # Remove the output of a previous run so a failed PLINK call cannot be
    # mistaken for a successful one
    if os.path.exists(freq_path):
        os.remove(freq_path)

    # --- Match by CHR/POS or SNP ID ---
    if by_coordinate:
        print("CHR/POS columns present. SNPs searched based on genomic positions.")
        keys = ["CHR", "POS"]
        # One range per variant (start == end) for PLINK's --extract range
        list_path = os.path.join("tmp_GENAL", f"{object_name}_coord_list.txt")
        data[['CHR', 'POS', 'POS']].dropna().to_csv(list_path, sep='\t', index=False, header=False)
        # cols=+pos adds the POS column to the frequency output, needed for the merge
        extract_clause = f"--extract range {list_path} --freq cols=+pos "
        failure_message = "No variants from your data were found in the reference panel by coordinate, or PLINK failed."
    else:
        print("Using SNP IDs to extract frequencies.")
        keys = ["SNP"]
        list_path = os.path.join("tmp_GENAL", f"{object_name}_snp_list.txt")
        data[["SNP"]].dropna().to_csv(list_path, index=False, header=False)
        extract_clause = f"--extract {list_path} --freq "
        failure_message = "PLINK did not generate a frequency file. Cannot update EAF."

    plink_command = (
        f"{get_plink_path()} --{'pfile' if ref_filetype == 'pgen' else 'bfile'} {ref_panel_path} "
        f"--memory {ram} "
        f"{extract_clause}"
        f"--out {freq_prefix}"
    )
    run_plink_command(plink_command)

    # Load frequency results (an empty file cannot be parsed, treat it as a failure)
    if not os.path.exists(freq_path) or os.path.getsize(freq_path) == 0:
        warnings.warn(failure_message)
        return data.copy()

    freqs_df = pd.read_csv(freq_path, sep='\t')
    freqs_df = freqs_df.rename(
        columns={"#CHROM": "CHR", "ID": "SNP", "ALT": "ALT_calc", "ALT_FREQS": "EAF_ref"}
    )

    # Merge on a slim frame carrying a row identifier. The reference panel may hold
    # several variants for one key (e.g. several alleles at one position); a plain left
    # merge would then duplicate input rows. The identifier lets us collapse back to
    # exactly one value per input row.
    result = data.reset_index(drop=True)
    row_id = "_genal_row"
    merged = (
        result[keys + ["EA", "NEA"]]
        .assign(**{row_id: np.arange(len(result))})
        .merge(freqs_df[keys + ["ALT_calc", "EAF_ref"]], on=keys, how="left")
    )

    if merged["EAF_ref"].isna().all():
        warnings.warn("No matching SNPs found in the reference panel.")
        # Return the data without any helper column
        return result

    # Handle allele direction to ensure correct EAF is returned
    conditions = [
        merged["EA"] == merged["ALT_calc"],
        merged["NEA"] == merged["ALT_calc"],
    ]
    choices = [
        merged["EAF_ref"],
        1 - merged["EAF_ref"],
    ]
    merged["EAF_new"] = np.select(conditions, choices, default=np.nan)

    # First non-null candidate per input row (groupby.first skips NaN)
    eaf_new = (
        merged.groupby(row_id)["EAF_new"]
        .first()
        .reindex(np.arange(len(result)))
        .to_numpy()
    )

    # Create updated EAF column
    if 'EAF' not in result.columns:
        result['EAF'] = np.nan
    if fill:
        result['EAF'] = np.where(pd.notna(eaf_new), eaf_new, result['EAF'])
    else:
        result['EAF'] = eaf_new

    return result
|
|
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "genal-python" # Updated name for PyPI
|
|
7
|
-
version = "1.4.0"
|
|
7
|
+
version = "1.4.2"
|
|
8
8
|
authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
|
|
9
9
|
description = "A python toolkit for polygenic risk scoring and mendelian randomization."
|
|
10
10
|
readme = "README.md"
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
# Genal Code Review Report
|
|
2
|
-
|
|
3
|
-
This report provides a thorough theoretical, methodological, and mathematical analysis of the `genal` codebase, following the guidelines provided in `CODE_REVIEW_GUIDE.md`.
|
|
4
|
-
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
## **Finding 1: Allele harmonization logic for palindromic SNPs is flawed**
|
|
8
|
-
|
|
9
|
-
**Severity:** Critical
|
|
10
|
-
|
|
11
|
-
**Concern:** The `apply_action_2` function in `MR.py`, which is responsible for harmonizing palindromic SNPs using allele frequencies, contains flawed logic. The decision to flip an allele is based almost exclusively on the exposure allele frequency (`EAF_e`), without correctly considering the outcome allele frequency (`EAF_o`) or the relationship between the two. Specifically, it may incorrectly flip SNPs whose alleles are already concordant, simply because the exposure allele frequency is high. This can lead to incorrectly aligned effect alleles between the exposure and outcome, causing a sign error in the effect estimate (`BETA_e`). Such an error will severely bias the resulting Mendelian Randomization estimates, potentially reversing the direction of the causal effect or introducing spurious null results.
|
|
12
|
-
|
|
13
|
-
**Recommendation:** The harmonization logic for palindromic SNPs should be rewritten to follow established best practices, such as those implemented in the `TwoSampleMR` R package. The corrected logic must properly compare `EAF_e` and `EAF_o` to infer strand orientation. For example, when alleles match, a flip should only be considered if frequencies are complementary (e.g., `EAF_e` > 0.5 and `EAF_o` < 0.5, or vice-versa, within a defined tolerance).
|
|
14
|
-
|
|
15
|
-
**Evidence/Rationale:** The faulty logic is in the `apply_action_2` function in `genal/MR.py`. A correct implementation would compare both `EAF_e` and `EAF_o` to determine if they are on opposite strands. The current implementation's reliance on `EAF_e` alone is methodologically incorrect for inferring strand flips for palindromic SNPs.
|
|
16
|
-
|
|
17
|
-
---
|
|
18
|
-
|
|
19
|
-
## **Finding 2: The MR-PRESSO distortion test implementation is incorrect**
|
|
20
|
-
|
|
21
|
-
**Severity:** High
|
|
22
|
-
|
|
23
|
-
**Concern:** The implementation of the MR-PRESSO distortion test in `MRpresso.py` appears to deviate from the methodology described in the original publication (Verbanck et al., *Nature Genetics*, 2018). The purpose of the distortion test is to assess whether the causal estimate changes significantly after removing outlier SNPs. This is tested by comparing the observed change to a null distribution of changes generated by removing the same number of *random* SNPs. The current implementation generates a null distribution by removing subsets of *non-outlier* SNPs, as indicated by a comment in the code from the original author expressing uncertainty (`## Is there an error in the MRPRESSO code?...`). This approach does not correctly simulate the null hypothesis and will likely produce an inaccurate p-value for the distortion test, leading to incorrect conclusions about the impact of pleiotropy.
|
|
24
|
-
|
|
25
|
-
**Recommendation:** The logic for the distortion test's permutation procedure should be corrected to follow the original MR-PRESSO method. The null distribution should be generated by repeatedly removing a random set of SNPs (equal in number to the identified outliers) from the full dataset and calculating the resulting change in the causal estimate.
|
|
26
|
-
|
|
27
|
-
**Evidence/Rationale:** The implementation in `genal/MRpresso.py` (around line 130 and in the `get_random_bias` function) and the associated comment from the developer strongly suggest the implementation is not faithful to the published MR-PRESSO methodology for the distortion test.
|
|
28
|
-
|
|
29
|
-
---
|
|
30
|
-
|
|
31
|
-
## **Finding 3: The default LD clumping window size is excessively large**
|
|
32
|
-
|
|
33
|
-
**Severity:** High
|
|
34
|
-
|
|
35
|
-
**Concern:** The `clump` method in the `Geno` class uses a default clumping window of 10,000 kb (`kb=10000`). This is substantially larger than the commonly recommended window sizes of 250 kb to 1000 kb. Using such a large window risks incorrectly grouping SNPs that are not in linkage disequilibrium, especially across regions with moderate to high recombination rates. This can lead to the erroneous removal of genuinely independent genetic variants, reducing the resolution of the analysis and potentially biasing downstream results for Polygenic Risk Scores and Mendelian Randomization by discarding valid instruments.
|
|
36
|
-
|
|
37
|
-
**Recommendation:** The default value for the `kb` parameter should be reduced to a more conventional and evidence-based value, such as 250 kb or 500 kb. The docstring should be updated to explain the choice and guide the user on selecting an appropriate window size based on their specific analysis and population.
|
|
38
|
-
|
|
39
|
-
**Evidence/Rationale:** The `Geno.py` file, line 356, defines `def clump(self, kb=10000, ...)`. Peer-reviewed literature and best-practice guides (e.g., Privé et al., *Am J Hum Genet*, 2019; PLINK documentation) use smaller windows (e.g., 250 kb to 1000 kb) as defaults for LD clumping. A 10,000 kb window is typically only used for special cases like checking for long-range LD, not as a general-purpose default. A search of recent literature (query: "plink ld clumping window size best practice GWAS MR") is consistent with this.
|
|
40
|
-
|
|
41
|
-
---
|
|
42
|
-
|
|
43
|
-
## **Finding 4: The default window size for proxy SNP searching is inefficient**
|
|
44
|
-
|
|
45
|
-
**Severity:** Medium
|
|
46
|
-
|
|
47
|
-
**Concern:** The `query_outcome` and `prs` methods, when finding proxies, default to a search window of 5,000 kb. While the `r2` threshold of 0.8 is appropriate, searching for proxies over a 5 Mb window is computationally inefficient and methodologically questionable. Linkage disequilibrium is a local phenomenon, and proxies are almost always found within a much smaller window (e.g., 50-250 kb). Searching a 5 Mb window significantly increases computation time and risks identifying spurious, long-range correlations that do not reflect true local LD structure.
|
|
48
|
-
|
|
49
|
-
**Recommendation:** Reduce the default `kb` parameter for proxy searches to a more standard and efficient value, such as 250 kb or 500 kb.
|
|
50
|
-
|
|
51
|
-
**Evidence/Rationale:** `Geno.py`, line 810: `query_outcome(..., kb=5000, r2=0.8, ...)` and line 536: `prs(..., kb=5000, r2=0.8, ...)`. Standard practice for proxy searching is to use a much smaller window as LD is not expected to extend over several megabases. This large window offers little benefit at a significant computational cost.
|
|
52
|
-
|
|
53
|
-
---
|
|
54
|
-
|
|
55
|
-
## **Finding 5: The default LD clumping r-squared threshold is unusually stringent**
|
|
56
|
-
|
|
57
|
-
**Severity:** Medium
|
|
58
|
-
|
|
59
|
-
**Concern:** The `clump` method defaults to an `r2` threshold of 0.01. While this ensures a set of highly independent SNPs, it is a very stringent cutoff compared to values commonly used in the literature (e.g., 0.1, 0.2, or 0.5 for selecting instruments). This may lead to an overly aggressive clumping procedure that discards multiple weakly correlated signals from the same locus. For methods that assume independent instruments (like standard IVW MR), this could be problematic because fewer instruments remain available. Furthermore, for PRS construction, this stringent threshold might not be optimal for predictive power.
|
|
60
|
-
|
|
61
|
-
**Recommendation:** Consider changing the default `r2` to a more moderate and widely used value, such as 0.2 or 0.1. The rationale for the default should be clearly documented, and the user should be encouraged to select a threshold appropriate for their specific analysis.
|
|
62
|
-
|
|
63
|
-
**Evidence/Rationale:** `Geno.py`, line 356: `def clump(self, kb=10000, r2=0.01, ...)`. The paper "Making the Most of Clumping and Thresholding for Polygenic Scores" by Privé et al. (2019) demonstrates that the optimal `r2` varies widely by trait and that values higher than 0.01 are often optimal. Many tutorials and standard pipelines use less stringent thresholds.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.eot
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.svg
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.ttf
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff2
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold-italic.woff
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold-italic.woff2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal-italic.woff
RENAMED
|
File without changes
|
{genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal-italic.woff2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|