genal-python 1.3.1__tar.gz → 1.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {genal_python-1.3.1 → genal_python-1.3.2}/PKG-INFO +1 -1
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/Geno.py +98 -66
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/__init__.py +2 -2
- genal_python-1.3.2/genal/colocalization.py +249 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/constants.py +3 -2
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/geno_tools.py +120 -3
- {genal_python-1.3.1 → genal_python-1.3.2}/pyproject.toml +1 -1
- genal_python-1.3.1/genal/colocalization.py +0 -159
- {genal_python-1.3.1 → genal_python-1.3.2}/.DS_Store +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/.gitignore +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/.readthedocs.yaml +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/Genal_flowchart.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/LICENSE +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/README.md +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/.DS_Store +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/Makefile +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.DS_Store +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.buildinfo +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.doctrees/api.doctree +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.doctrees/environment.pickle +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.doctrees/genal.doctree +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.doctrees/index.doctree +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.doctrees/introduction.doctree +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/.doctrees/modules.doctree +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_images/MR_plot_SBP_AS.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/Geno.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/MR.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/MR_tools.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/MRpresso.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/association.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/clump.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/extract_prs.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/geno_tools.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/lift.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/proxy.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/snp_query.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/genal/tools.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_modules/index.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_sources/api.rst.txt +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_sources/genal.rst.txt +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_sources/index.rst.txt +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_sources/introduction.rst.txt +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_sources/modules.rst.txt +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/basic.css +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/badge_only.css +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.eot +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.svg +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.ttf +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold-italic.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold-italic.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal-italic.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal-italic.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal.woff +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal.woff2 +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/theme.css +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/doctools.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/documentation_options.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/file.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/js/badge_only.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/js/html5shiv-printshiv.min.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/js/html5shiv.min.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/js/theme.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/language_data.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/minus.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/plus.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/pygments.css +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/searchtools.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/sphinx_highlight.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/api.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/genal.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/genindex.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/index.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/introduction.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/modules.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/objects.inv +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/py-modindex.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/search.html +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/build/searchindex.js +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/make.bat +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/requirements.txt +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/.DS_Store +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/Images/Genal_flowchart.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/Images/genal_logo.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/api.rst +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/conf.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/index.rst +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/introduction.rst +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/docs/source/modules.rst +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/MR.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/MR_tools.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/MRpresso.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/association.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/clump.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/extract_prs.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/lift.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/proxy.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/snp_query.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal/tools.py +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/genal_logo.png +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/gitignore +0 -0
- {genal_python-1.3.1 → genal_python-1.3.2}/readthedocs.yaml +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import numpy as np
|
|
3
3
|
import warnings
|
|
4
|
-
import os
|
|
4
|
+
import os
|
|
5
5
|
import copy
|
|
6
6
|
import psutil
|
|
7
7
|
import uuid
|
|
@@ -30,7 +30,8 @@ from .geno_tools import (
|
|
|
30
30
|
fill_se_p,
|
|
31
31
|
check_allele_column,
|
|
32
32
|
check_snp_column,
|
|
33
|
-
remove_na
|
|
33
|
+
remove_na,
|
|
34
|
+
filter_by_gene_func
|
|
34
35
|
)
|
|
35
36
|
from .association import set_phenotype_func, association_test_func_plink2
|
|
36
37
|
from .extract_prs import extract_snps_func, prs_func
|
|
@@ -117,9 +118,22 @@ class Geno:
|
|
|
117
118
|
|
|
118
119
|
Attributes:
|
|
119
120
|
name (str): Randomly generated ID for the Geno object.
|
|
120
|
-
outcome (list): List of outcomes (initialized as empty).
|
|
121
121
|
cpus (int): Number of CPUs to be used.
|
|
122
122
|
ram (int): Amount of RAM to be used in MBs.
|
|
123
|
+
checks (dict): Dictionary of checks performed on the main DataFrame.
|
|
124
|
+
reference_panel (pd.DataFrame): Reference population SNP data used for SNP info
|
|
125
|
+
adjustments. Initialized when first needed.
|
|
126
|
+
reference_panel_name (str): string to identify the reference_panel (path or population string)
|
|
127
|
+
phenotype (pd.DataFrame, str): Tuple with a DataFrame of individual-level phenotype
|
|
128
|
+
data and a string representing the phenotype trait column. Initialized after
|
|
129
|
+
running the 'set_phenotype' method.
|
|
130
|
+
MR_data (pd.DataFrame, pd.DataFrame, str): Tuple containing DataFrames for associations
|
|
131
|
+
with exposure and outcome, and a string for the outcome name. Initialized after
|
|
132
|
+
running the 'query_outcome' method.
|
|
133
|
+
MR_results (pd.DataFrame, pd.DataFrame, str, str): Contains an MR results dataframe, a dataframe of harmonized SNPs, an exposure name, an outcome name.
|
|
134
|
+
Assigned after calling the MR method and used for plotting with the MR_plot method.
|
|
135
|
+
MRpresso_subset_data (pd.DataFrame, pd.DataFrame, str, str): Contains a dataframe of subsetted harmonized SNPs without outliers.
|
|
136
|
+
Assigned after calling the MRpresso method.
|
|
123
137
|
"""
|
|
124
138
|
|
|
125
139
|
# Validate df type
|
|
@@ -399,10 +413,7 @@ class Geno:
|
|
|
399
413
|
|
|
400
414
|
# If clumped data is successfully generated, assign it to the object's attribute
|
|
401
415
|
if clumped_data is not None:
|
|
402
|
-
Clumped =
|
|
403
|
-
Clumped.checks = self.checks.copy()
|
|
404
|
-
if hasattr(self, "phenotype"):
|
|
405
|
-
Clumped.phenotype = self.phenotype
|
|
416
|
+
Clumped = self.copy(clumped_data)
|
|
406
417
|
return Clumped
|
|
407
418
|
return None
|
|
408
419
|
|
|
@@ -700,6 +711,8 @@ class Geno:
|
|
|
700
711
|
# Assign the processed data and inferred phenotype type to the .phenotype attribute
|
|
701
712
|
self.phenotype = (processed_data, inferred_pheno_type, PHENO)
|
|
702
713
|
|
|
714
|
+
return
|
|
715
|
+
|
|
703
716
|
def association_test(self, path=None, covar=[], standardize=True):
|
|
704
717
|
"""
|
|
705
718
|
Conduct single-SNP association tests against a phenotype.
|
|
@@ -1201,23 +1214,77 @@ class Geno:
|
|
|
1201
1214
|
|
|
1202
1215
|
return mod_table, GlobalTest, OutlierTest, BiasTest
|
|
1203
1216
|
|
|
1217
|
+
def filter_by_gene(self, gene_id, id_type="symbol", window_size=1000000, build="37", replace=False):
|
|
1218
|
+
"""
|
|
1219
|
+
Filter the data to include only variants that are within a specified distance of a specific gene.
|
|
1220
|
+
|
|
1221
|
+
Args:
|
|
1222
|
+
gene_id (str): Identifier for the gene/protein to filter variants around.
|
|
1223
|
+
id_type (str, optional): Type of identifier provided. Options are:
|
|
1224
|
+
- "symbol": Gene symbol (e.g., "APOE")
|
|
1225
|
+
- "HGNC": HGNC ID (e.g., "HGNC:613")
|
|
1226
|
+
- "name": Full gene name (e.g., "apolipoprotein E")
|
|
1227
|
+
- "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203")
|
|
1228
|
+
- "NCBI": NCBI gene ID (e.g., "348")
|
|
1229
|
+
- "UCSC": UCSC gene ID (e.g., "uc001hbu.2")
|
|
1230
|
+
- "Vega": Vega gene ID (e.g., "OTTHUMG00000019505")
|
|
1231
|
+
Default is "symbol".
|
|
1232
|
+
window_size (int, optional): Size of the window around the gene in base pairs. Default is 1,000,000 (1Mb).
|
|
1233
|
+
build (str, optional): Genome build of the data. Default is "37".
|
|
1234
|
+
replace (bool, optional): If True, replace the existing data attribute with the filtered data. Default is True.
|
|
1235
|
+
Returns:
|
|
1236
|
+
if replace is True:
|
|
1237
|
+
pd.DataFrame: Filtered DataFrame containing only variants within the specified window
|
|
1238
|
+
around the gene, with additional column 'Distance'.
|
|
1239
|
+
if replace is False:
|
|
1240
|
+
genal.Geno: A new Geno object with the filtered data.
|
|
1241
|
+
|
|
1242
|
+
Raises:
|
|
1243
|
+
ValueError: If required columns are missing, gene information cannot be found, or invalid id_type is provided.
|
|
1244
|
+
|
|
1245
|
+
Notes:
|
|
1246
|
+
- Distance is calculated from the nearest gene boundary (start or end position)
|
|
1247
|
+
- Null distances indicate the variant is within the gene
|
|
1248
|
+
"""
|
|
1249
|
+
# Check required columns
|
|
1250
|
+
for col in ["CHR", "POS"]:
|
|
1251
|
+
if col not in self.data.columns:
|
|
1252
|
+
raise ValueError(f"Column {col} must be present in the input data!")
|
|
1253
|
+
|
|
1254
|
+
# Do the appropriate preprocessing on CHR and POS columns if not already done
|
|
1255
|
+
if not self.checks.get("CHR"):
|
|
1256
|
+
check_int_column(self.data, "CHR")
|
|
1257
|
+
self.checks["CHR"] = True
|
|
1258
|
+
if not self.checks.get("POS"):
|
|
1259
|
+
check_int_column(self.data, "POS")
|
|
1260
|
+
self.checks["POS"] = True
|
|
1261
|
+
|
|
1262
|
+
filtered = filter_by_gene_func(self.data, gene_id, id_type, window_size, build)
|
|
1263
|
+
|
|
1264
|
+
if replace:
|
|
1265
|
+
self.data = filtered
|
|
1266
|
+
else:
|
|
1267
|
+
Geno_filtered = self.copy(filtered)
|
|
1268
|
+
return Geno_filtered
|
|
1269
|
+
|
|
1204
1270
|
def colocalize(self, outcome, method="abf", trait1_type=None, trait2_type=None,
|
|
1205
|
-
sdY1=None, sdY2=None, n1=None, n2=None, p1=1e-4, p2=1e-4, p12=1e-5):
|
|
1271
|
+
sdY1=None, sdY2=None, n1=None, n2=None, p1=1e-4, p2=1e-4, p12=1e-5, merge_on_snp=False):
|
|
1206
1272
|
"""
|
|
1207
1273
|
Perform colocalization analysis between two GWAS datasets.
|
|
1208
1274
|
|
|
1209
1275
|
Args:
|
|
1210
1276
|
outcome: Another Geno object containing the outcome dataset
|
|
1211
1277
|
method: Method to use for colocalization (default: "abf")
|
|
1212
|
-
trait1_type: Type of exposure trait ("quant" or "cc")
|
|
1213
|
-
trait2_type: Type of outcome trait ("quant" or "cc")
|
|
1214
|
-
sdY1: Standard deviation of exposure trait (required for quantitative traits)
|
|
1215
|
-
sdY2: Standard deviation of outcome trait (required for quantitative traits)
|
|
1216
|
-
n1: Sample size for exposure (used to estimate sdY1 if not provided)
|
|
1217
|
-
n2: Sample size for outcome (used to estimate sdY2 if not provided)
|
|
1278
|
+
trait1_type: Type of exposure trait ("quant" for quantitative traits or "cc" for case-control traits)
|
|
1279
|
+
trait2_type: Type of outcome trait ("quant" for quantitative traits or "cc" for case-control traits)
|
|
1280
|
+
sdY1: Standard deviation of exposure trait (required for quantitative traits, but can be estimated from EAF and sample size)
|
|
1281
|
+
sdY2: Standard deviation of outcome trait (required for quantitative traits, but can be estimated from EAF and sample size)
|
|
1282
|
+
n1: Sample size for exposure (used to estimate sdY1 if sdY1 is not provided)
|
|
1283
|
+
n2: Sample size for outcome (used to estimate sdY2 if sdY2 is not provided)
|
|
1218
1284
|
p1: Prior probability SNP associated with exposure
|
|
1219
1285
|
p2: Prior probability SNP associated with outcome
|
|
1220
1286
|
p12: Prior probability SNP associated with both traits
|
|
1287
|
+
merge_on_snp: If True, merge the datasets on SNP column. If False, first attempt to merge on CHR and POS columns.
|
|
1221
1288
|
"""
|
|
1222
1289
|
# Ensure required columns exist in both datasets
|
|
1223
1290
|
required_cols = ['BETA', 'SE']
|
|
@@ -1237,56 +1304,10 @@ class Geno:
|
|
|
1237
1304
|
# Make copies of the data to avoid modifying the original data
|
|
1238
1305
|
data1 = self.data.copy()
|
|
1239
1306
|
data2 = outcome.data.copy()
|
|
1240
|
-
|
|
1241
|
-
# Ensure that the BETA columns are preprocessed
|
|
1242
|
-
check_beta_column(data1, 'BETA', 'Fill')
|
|
1243
|
-
check_beta_column(data2, 'BETA', 'Fill')
|
|
1244
|
-
|
|
1245
|
-
# Adjust EAF column names before merging in case one of the datasets does not have it
|
|
1246
|
-
if 'EAF' in data1.columns:
|
|
1247
|
-
data1.rename(columns={'EAF': 'EAF_1'}, inplace=True)
|
|
1248
|
-
if 'EAF' in data2.columns:
|
|
1249
|
-
data2.rename(columns={'EAF': 'EAF_2'}, inplace=True)
|
|
1250
|
-
|
|
1251
|
-
# Determine merge strategy based on available columns
|
|
1252
|
-
if all(col in self.data.columns for col in ['CHR', 'POS']) and \
|
|
1253
|
-
all(col in outcome.data.columns for col in ['CHR', 'POS']):
|
|
1254
|
-
print("Merging datasets using CHR and POS")
|
|
1255
|
-
|
|
1256
|
-
#Ensure that the CHR, POS columns are preprocessed
|
|
1257
|
-
check_int_column(data1, "CHR")
|
|
1258
|
-
check_int_column(data1, "POS")
|
|
1259
|
-
check_int_column(data2, "CHR")
|
|
1260
|
-
check_int_column(data2, "POS")
|
|
1261
|
-
|
|
1262
|
-
# Merge the datasets
|
|
1263
|
-
merged_data = pd.merge(data1, data2,
|
|
1264
|
-
on=['CHR', 'POS'],
|
|
1265
|
-
suffixes=('_1', '_2'))
|
|
1266
|
-
|
|
1267
|
-
elif 'SNP' in self.data.columns and 'SNP' in outcome.data.columns:
|
|
1268
|
-
print("Merging datasets using SNP IDs")
|
|
1269
|
-
|
|
1270
|
-
# Ensure that the SNP column is preprocessed
|
|
1271
|
-
check_snp_column(data1)
|
|
1272
|
-
check_snp_column(data2)
|
|
1273
|
-
|
|
1274
|
-
# Merge the datasets
|
|
1275
|
-
merged_data = pd.merge(data1, data2,
|
|
1276
|
-
on='SNP',
|
|
1277
|
-
suffixes=('_1', '_2'))
|
|
1278
|
-
else:
|
|
1279
|
-
raise ValueError("Either CHR/POS or SNP columns must be present in both datasets for merging")
|
|
1280
|
-
|
|
1281
|
-
# Drop any rows with missing values
|
|
1282
|
-
merged_data = merged_data.dropna()
|
|
1283
|
-
if merged_data.empty:
|
|
1284
|
-
raise ValueError("No overlapping variants found between the datasets")
|
|
1285
|
-
|
|
1286
|
-
print(f"Using {len(merged_data)} overlapping variants for colocalization analysis")
|
|
1287
1307
|
|
|
1288
1308
|
# Call the implementation function
|
|
1289
|
-
return coloc_abf_func(
|
|
1309
|
+
return coloc_abf_func(data1,
|
|
1310
|
+
data2,
|
|
1290
1311
|
trait1_type=trait1_type,
|
|
1291
1312
|
trait2_type=trait2_type,
|
|
1292
1313
|
sdY1=sdY1,
|
|
@@ -1295,7 +1316,8 @@ class Geno:
|
|
|
1295
1316
|
n2=n2,
|
|
1296
1317
|
p1=p1,
|
|
1297
1318
|
p2=p2,
|
|
1298
|
-
p12=p12
|
|
1319
|
+
p12=p12,
|
|
1320
|
+
merge_on_snp=merge_on_snp)
|
|
1299
1321
|
|
|
1300
1322
|
|
|
1301
1323
|
def lift(
|
|
@@ -1480,14 +1502,24 @@ class Geno:
|
|
|
1480
1502
|
self.data = self.data.groupby(by=["SNP"]).first().reset_index(drop=False)
|
|
1481
1503
|
return
|
|
1482
1504
|
|
|
1483
|
-
def copy(self):
|
|
1505
|
+
def copy(self, data):
|
|
1484
1506
|
"""
|
|
1485
|
-
Create
|
|
1507
|
+
Create another Geno instance with the updated data attribute.
|
|
1508
|
+
The relevant attributes are copied as well (checks, phenotype, reference_panel, reference_panel_name).
|
|
1509
|
+
Attributes that are not copied are MR_data, MR_results, MRpresso_subset_data, MRpresso_results.
|
|
1486
1510
|
|
|
1487
1511
|
Returns:
|
|
1488
1512
|
Geno: A deep copy of the instance.
|
|
1489
1513
|
"""
|
|
1490
|
-
|
|
1514
|
+
Geno_copy = Geno(data, keep_columns=True)
|
|
1515
|
+
Geno_copy.checks = self.checks.copy()
|
|
1516
|
+
if hasattr(self, "phenotype"):
|
|
1517
|
+
Geno_copy.phenotype = self.phenotype
|
|
1518
|
+
if hasattr(self, "reference_panel"):
|
|
1519
|
+
Geno_copy.reference_panel = self.reference_panel
|
|
1520
|
+
if hasattr(self, "reference_panel_name"):
|
|
1521
|
+
Geno_copy.reference_panel_name = self.reference_panel_name
|
|
1522
|
+
return Geno_copy
|
|
1491
1523
|
|
|
1492
1524
|
def save(self, path="", fmt="h5", sep="\t", header=True):
|
|
1493
1525
|
"""
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
3
|
from .tools import default_config, write_config, set_plink, install_plink, delete_tmp, get_reference_panel_path, get_plink_path
|
|
4
|
-
from .geno_tools import Combine_Geno
|
|
4
|
+
from .geno_tools import Combine_Geno, filter_by_gene_func
|
|
5
5
|
from .constants import CONFIG_DIR
|
|
6
6
|
|
|
7
|
-
__version__ = "1.3.
|
|
7
|
+
__version__ = "1.3.2"
|
|
8
8
|
|
|
9
9
|
config_path = os.path.join(CONFIG_DIR, "config.json")
|
|
10
10
|
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from numpy import exp, log
|
|
4
|
+
from genal.geno_tools import check_beta_column, check_allele_column, check_snp_column, check_int_column
|
|
5
|
+
|
|
6
|
+
# Currently does not support multi-allelic SNPs
|
|
7
|
+
|
|
8
|
+
def coloc_abf_func(data1, data2, trait1_type="quant", trait2_type="quant",
|
|
9
|
+
sdY1=None, sdY2=None, n1=None, n2=None,
|
|
10
|
+
p1=1e-4, p2=1e-4, p12=1e-5, merge_on_snp=False):
|
|
11
|
+
"""
|
|
12
|
+
Perform colocalization analysis between two GWAS datasets using approximate Bayes factors.
|
|
13
|
+
Corresponds to the :meth:`Geno.colocalize` method.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
data1: DataFrame containing GWAS results for trait 1
|
|
17
|
+
data2: DataFrame containing GWAS results for trait 2
|
|
18
|
+
trait1_type: Type of trait 1 ("quant" for quantitative traits or "cc" for case-control traits)
|
|
19
|
+
trait2_type: Type of trait 2 ("quant" for quantitative traits or "cc" for case-control traits)
|
|
20
|
+
sdY1: Standard deviation of trait 1 (required for quantitative traits)
|
|
21
|
+
sdY2: Standard deviation of trait 2 (required for quantitative traits)
|
|
22
|
+
n1: Sample size for trait 1 (used to estimate sdY if not provided)
|
|
23
|
+
n2: Sample size for trait 2 (used to estimate sdY if not provided)
|
|
24
|
+
p1: Prior probability SNP associated with trait 1
|
|
25
|
+
p2: Prior probability SNP associated with trait 2
|
|
26
|
+
p12: Prior probability SNP associated with both traits
|
|
27
|
+
merge_on_snp: If True, merge the datasets on SNP column. If False, first attempt to merge on CHR and POS columns.
|
|
28
|
+
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# Ensure that the BETA columns are preprocessed
|
|
32
|
+
check_beta_column(data1, 'BETA', 'Fill')
|
|
33
|
+
check_beta_column(data2, 'BETA', 'Fill')
|
|
34
|
+
|
|
35
|
+
# Adjust EAF column names before merging in case one of the datasets does not have it
|
|
36
|
+
if 'EAF' in data1.columns:
|
|
37
|
+
data1.rename(columns={'EAF': 'EAF_1'}, inplace=True)
|
|
38
|
+
if 'EAF' in data2.columns:
|
|
39
|
+
data2.rename(columns={'EAF': 'EAF_2'}, inplace=True)
|
|
40
|
+
|
|
41
|
+
# First determine if we can merge on position, otherwise try SNP
|
|
42
|
+
if all(col in data1.columns for col in ['CHR', 'POS']) and \
|
|
43
|
+
all(col in data2.columns for col in ['CHR', 'POS']) and not merge_on_snp:
|
|
44
|
+
|
|
45
|
+
print("Merging datasets using genomic positions (CHR, POS)")
|
|
46
|
+
|
|
47
|
+
# Ensure that the CHR and POS columns are preprocessed
|
|
48
|
+
check_int_column(data1, "CHR")
|
|
49
|
+
check_int_column(data2, "CHR")
|
|
50
|
+
check_int_column(data1, "POS")
|
|
51
|
+
check_int_column(data2, "POS")
|
|
52
|
+
|
|
53
|
+
# Merge using position
|
|
54
|
+
merged_data = pd.merge(
|
|
55
|
+
data1,
|
|
56
|
+
data2,
|
|
57
|
+
on=['CHR', 'POS'],
|
|
58
|
+
how='left',
|
|
59
|
+
suffixes=('_1', '_2')
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
elif 'SNP' in data1.columns and 'SNP' in data2.columns:
|
|
63
|
+
print("Position columns (CHR, POS) not present in both datasets. Merging datasets using SNP IDs.")
|
|
64
|
+
|
|
65
|
+
# Ensure that the SNP column is preprocessed
|
|
66
|
+
check_snp_column(data1)
|
|
67
|
+
check_snp_column(data2)
|
|
68
|
+
|
|
69
|
+
# Merge using SNP
|
|
70
|
+
merged_data = pd.merge(
|
|
71
|
+
data1,
|
|
72
|
+
data2,
|
|
73
|
+
on='SNP',
|
|
74
|
+
suffixes=('_1', '_2')
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
else:
|
|
78
|
+
raise ValueError("At least CHR/POS or SNP columns must be present in both datasets for colocalization analysis")
|
|
79
|
+
|
|
80
|
+
# After merging, check if we can align alleles
|
|
81
|
+
if all(col in merged_data.columns for col in ['EA_1', 'NEA_1', 'EA_2', 'NEA_2']):
|
|
82
|
+
print("Aligning effect alleles between datasets")
|
|
83
|
+
|
|
84
|
+
# Ensure allele columns are preprocessed
|
|
85
|
+
check_allele_column(data1, "EA", keep_indel=False)
|
|
86
|
+
check_allele_column(data1, "NEA", keep_indel=False)
|
|
87
|
+
check_allele_column(data2, "EA", keep_indel=False)
|
|
88
|
+
check_allele_column(data2, "NEA", keep_indel=False)
|
|
89
|
+
|
|
90
|
+
# Adjust BETA from trait 2 to correspond to the same effect allele as trait 1
|
|
91
|
+
conditions = [
|
|
92
|
+
merged_data["EA_1"] == merged_data["EA_2"],
|
|
93
|
+
merged_data["EA_1"] == merged_data["NEA_2"],
|
|
94
|
+
True,
|
|
95
|
+
]
|
|
96
|
+
choices = [
|
|
97
|
+
merged_data["BETA_2"],
|
|
98
|
+
-merged_data["BETA_2"],
|
|
99
|
+
np.nan,
|
|
100
|
+
]
|
|
101
|
+
merged_data["BETA_2"] = np.select(conditions, choices)
|
|
102
|
+
else:
|
|
103
|
+
print("Allele columns (EA, NEA) not present in both datasets. "
|
|
104
|
+
"This might lead to incorrect results if the effect estimates (BETA) were not obtained with the same reference allele in both datasets.")
|
|
105
|
+
|
|
106
|
+
# Clean up columns
|
|
107
|
+
merged_data.drop(columns=["EA_2", "NEA_2", "SNP_2", "CHR_2", "POS_2"], inplace=True, errors='ignore')
|
|
108
|
+
merged_data.rename(columns={"SNP_1": "SNP", "CHR_1": "CHR", "POS_1": "POS"}, inplace=True, errors='ignore')
|
|
109
|
+
|
|
110
|
+
# Drop any rows with duplicate values
|
|
111
|
+
if "SNP" in merged_data.columns:
|
|
112
|
+
merged_data.drop_duplicates(subset=['SNP'], keep='first', inplace=True)
|
|
113
|
+
if "CHR" in merged_data.columns and "POS" in merged_data.columns:
|
|
114
|
+
merged_data.drop_duplicates(subset=["CHR", "POS"], keep='first', inplace=True)
|
|
115
|
+
|
|
116
|
+
# Drop any rows with missing values
|
|
117
|
+
merged_data = merged_data.dropna()
|
|
118
|
+
if merged_data.empty:
|
|
119
|
+
raise ValueError("No overlapping variants found between the datasets")
|
|
120
|
+
|
|
121
|
+
print(f"Using {len(merged_data)} overlapping variants for colocalization analysis")
|
|
122
|
+
|
|
123
|
+
# Estimate sdY if not provided for quantitative traits
|
|
124
|
+
if trait1_type == "quant" and sdY1 is None:
|
|
125
|
+
if 'EAF_1' not in merged_data.columns or n1 is None:
|
|
126
|
+
print("Neither sdY1 nor EAF and n1 are provided for trait 1. Assuming sdY1 = 1.")
|
|
127
|
+
sdY1 = 1
|
|
128
|
+
else:
|
|
129
|
+
sdY1 = sdY_est(merged_data['SE_1']**2, merged_data['EAF_1'], n1)
|
|
130
|
+
print(f"Using EAF and n1 to estimate sdY1: {sdY1:.2f}")
|
|
131
|
+
|
|
132
|
+
if trait2_type == "quant" and sdY2 is None:
|
|
133
|
+
if 'EAF_2' not in merged_data.columns or n2 is None:
|
|
134
|
+
print("Neither sdY2 nor EAF and n2 are provided for trait 2. Assuming sdY2 = 1.")
|
|
135
|
+
sdY2 = 1
|
|
136
|
+
else:
|
|
137
|
+
sdY2 = sdY_est(merged_data['SE_2']**2, merged_data['EAF_2'], n2)
|
|
138
|
+
print(f"Using EAF and n2 to estimate sdY2: {sdY2:.2f}")
|
|
139
|
+
|
|
140
|
+
# Calculate Bayes factors for each dataset
|
|
141
|
+
lABF_1 = approx_bf_estimates(merged_data['BETA_1'], merged_data['SE_1']**2,
|
|
142
|
+
trait_type=trait1_type, sdY=sdY1)
|
|
143
|
+
lABF_2 = approx_bf_estimates(merged_data['BETA_2'], merged_data['SE_2']**2,
|
|
144
|
+
trait_type=trait2_type, sdY=sdY2)
|
|
145
|
+
|
|
146
|
+
# Adjust priors based on number of SNPs
|
|
147
|
+
n_snps = len(merged_data)
|
|
148
|
+
if n_snps * p1 >= 1:
|
|
149
|
+
p1 = 1 / (n_snps + 1)
|
|
150
|
+
if n_snps * p2 >= 1:
|
|
151
|
+
p2 = 1 / (n_snps + 1)
|
|
152
|
+
if n_snps * p12 >= 1:
|
|
153
|
+
p12 = 1 / (n_snps + 1)
|
|
154
|
+
|
|
155
|
+
# Calculate posterior probabilities
|
|
156
|
+
pp = combine_abf(lABF_1, lABF_2, p1, p2, p12)
|
|
157
|
+
|
|
158
|
+
# Add SNP-specific results
|
|
159
|
+
results_df = merged_data.copy()
|
|
160
|
+
results_df['lABF_1'] = lABF_1
|
|
161
|
+
results_df['lABF_2'] = lABF_2
|
|
162
|
+
results_df['internal.sum.lABF'] = lABF_1 + lABF_2
|
|
163
|
+
|
|
164
|
+
# Calculate SNP-specific PP for H4
|
|
165
|
+
my_denom_log_abf = logsum(results_df['internal.sum.lABF'])
|
|
166
|
+
results_df['SNP.PP.H4'] = np.exp(results_df['internal.sum.lABF'] - my_denom_log_abf)
|
|
167
|
+
|
|
168
|
+
return {
|
|
169
|
+
'nsnps': n_snps,
|
|
170
|
+
**pp
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
def approx_bf_estimates(beta, varbeta, trait_type="quant", sdY=1, effect_priors={'quant': 0.15, 'cc': 0.2}):
|
|
174
|
+
"""
|
|
175
|
+
Calculate approximate Bayes factors using regression estimates.
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
beta: effect size estimate
|
|
179
|
+
varbeta: variance of the effect size estimate
|
|
180
|
+
trait_type: either "quant" for quantitative trait or "cc" for case-control
|
|
181
|
+
sdY: standard deviation of the trait (for quantitative traits)
|
|
182
|
+
effect_priors: dictionary with prior effect sizes for quantitative and case-control traits
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
array: log approximate Bayes factors
|
|
186
|
+
"""
|
|
187
|
+
z = beta / np.sqrt(varbeta)
|
|
188
|
+
|
|
189
|
+
# Set prior standard deviation based on trait type
|
|
190
|
+
if trait_type == "quant":
|
|
191
|
+
sd_prior = effect_priors['quant'] * sdY
|
|
192
|
+
else: # case-control
|
|
193
|
+
sd_prior = effect_priors['cc']
|
|
194
|
+
|
|
195
|
+
r = sd_prior**2 / (sd_prior**2 + varbeta)
|
|
196
|
+
lABF = 0.5 * (np.log(1 - r) + (r * z**2))
|
|
197
|
+
return lABF
|
|
198
|
+
|
|
199
|
+
def logsum(x):
|
|
200
|
+
"""Calculate log of sum of exponentials"""
|
|
201
|
+
my_max = np.max(x)
|
|
202
|
+
return my_max + np.log(np.sum(np.exp(x - my_max)))
|
|
203
|
+
|
|
204
|
+
def logdiff(x, y):
|
|
205
|
+
"""Calculate log of difference of exponentials"""
|
|
206
|
+
my_max = max(x, y)
|
|
207
|
+
return my_max + np.log(exp(x - my_max) - np.exp(y - my_max))
|
|
208
|
+
|
|
209
|
+
def combine_abf(l1, l2, p1, p2, p12):
    """
    Combine per-SNP log Bayes factors from two traits into posterior
    probabilities for the five colocalization hypotheses (H0-H4).
    """
    lsum = l1 + l2

    # Log evidence for each hypothesis, weighted by its prior probability.
    log_evidence = np.array([
        0.0,                                            # H0: no association with either trait
        np.log(p1) + logsum(l1),                        # H1: association with trait 1 only
        np.log(p2) + logsum(l2),                        # H2: association with trait 2 only
        np.log(p1) + np.log(p2)
        + logdiff(logsum(l1) + logsum(l2),
                  logsum(lsum)),                        # H3: two distinct causal SNPs
        np.log(p12) + logsum(lsum),                     # H4: one shared causal SNP
    ])

    # Normalize in log space, then exponentiate to obtain probabilities.
    posteriors = np.exp(log_evidence - logsum(log_evidence))

    return {f'PP.H{i}.abf': posteriors[i] for i in range(5)}
|
|
230
|
+
|
|
231
|
+
def sdY_est(vbeta, maf, n):
    """
    Estimate the trait standard deviation from per-SNP coefficient variances,
    minor allele frequencies and the sample size.

    Under a linear model, var(beta_j) is approximately
    sdY^2 / (2 * n * maf_j * (1 - maf_j)), so sdY^2 is recovered as the slope
    of a through-origin regression of 2*n*maf*(1-maf) on 1/var(beta).

    Args:
        vbeta: vector of variances of the effect-size estimates
        maf: vector of MAF (same length as vbeta)
        n: sample size

    Returns:
        float: estimated standard deviation of Y

    Raises:
        ValueError: if the fitted slope is negative, which indicates a very
            small or inconsistent dataset.
    """
    precision = 1 / vbeta
    genotype_var = 2 * n * maf * (1 - maf)
    # Slope of the through-origin least-squares fit of genotype_var ~ precision.
    slope = np.sum(genotype_var * precision) / np.sum(precision**2)
    if slope < 0:
        raise ValueError("Estimated sdY is negative - this can happen with small datasets, or those with errors. A reasonable estimate of sdY is required to continue.")
    return np.sqrt(slope)
|
|
@@ -5,8 +5,9 @@ BUILDS = ["37", "38"]
|
|
|
5
5
|
POPULATIONS = ["EUR", "AFR", "EAS", "AMR", "SAS"]
|
|
6
6
|
REF_PANELS = [f"{pop}_{build}" for pop in POPULATIONS for build in BUILDS]
|
|
7
7
|
REF_PANEL_COLUMNS = ["CHR", "SNP", "POS", "A1", "A2"]
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
BUCKET_URL = "https://storage.googleapis.com/genal_files/"
|
|
9
|
+
REF_PANELS_URL = BUCKET_URL + "{panel}.tar.gz"
|
|
10
|
+
REF_PARQUET_URL = BUCKET_URL + "reference_variants_{build}.parquet"
|
|
10
11
|
CONFIG_DIR = os.path.expanduser("~/.genal/")
|
|
11
12
|
CHECKS_DICT = {
|
|
12
13
|
"CHR": False,
|
|
@@ -5,8 +5,11 @@ import os, subprocess
|
|
|
5
5
|
import shutil
|
|
6
6
|
import warnings
|
|
7
7
|
from collections import Counter
|
|
8
|
+
import wget
|
|
9
|
+
|
|
10
|
+
from .constants import STANDARD_COLUMNS, BUCKET_URL
|
|
11
|
+
from .tools import read_config
|
|
8
12
|
|
|
9
|
-
from .constants import STANDARD_COLUMNS
|
|
10
13
|
|
|
11
14
|
|
|
12
15
|
def remove_na(data):
|
|
@@ -275,7 +278,6 @@ def fill_snpids_func(data, reference_panel_df, keep_indel):
|
|
|
275
278
|
|
|
276
279
|
return data
|
|
277
280
|
|
|
278
|
-
|
|
279
281
|
def check_int_column(data, int_col):
|
|
280
282
|
"""Set the type of the int_col column to Int64 and non-numeric values to NA."""
|
|
281
283
|
nrows = data.shape[0]
|
|
@@ -290,7 +292,6 @@ def check_int_column(data, int_col):
|
|
|
290
292
|
)
|
|
291
293
|
return
|
|
292
294
|
|
|
293
|
-
|
|
294
295
|
def adjust_column_names(data, CHR, POS, SNP, EA, NEA, BETA, SE, P, EAF, keep_columns):
|
|
295
296
|
"""
|
|
296
297
|
Rename columns to the standard names making sure that there are no duplicated names.
|
|
@@ -461,3 +462,119 @@ def Combine_Geno(Gs):
|
|
|
461
462
|
C = C.reset_index(drop=True)
|
|
462
463
|
|
|
463
464
|
return Geno(C)
|
|
465
|
+
|
|
466
|
+
def filter_by_gene_func(data, gene_identifier, id_type="symbol", window_size=1000000, build="37"):
    """
    Filtering the data to include only variants that are within a specified distance of a specific gene.
    Corresponds to the :meth:`Geno.filter_by_gene` method.

    Args:
        data (pd.DataFrame): Input data with at least 'CHR' and 'POS' columns.
        gene_identifier (str): Identifier for the gene/protein to filter variants around.
        id_type (str, optional): Type of identifier provided. Options are:
            - "symbol": Gene symbol (e.g., "APOE")
            - "HGNC": HGNC ID (e.g., "HGNC:613")
            - "name": Full gene name (e.g., "apolipoprotein E")
            - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203")
            - "NCBI": NCBI gene ID (e.g., "348")
            - "UCSC": UCSC gene ID (e.g., "uc001hbu.2")
            - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505")
            Default is "symbol".
        window_size (int, optional): Size of the window around the gene in base pairs.
            Half of this is applied on each side of the gene boundaries.
            Default is 1,000,000 (1Mb).
        build (str, optional): Genome build of the data. Default is "37".

    Returns:
        pd.DataFrame: Filtered DataFrame containing only variants within the specified
            window around the gene, with additional column 'Distance'.

    Raises:
        ValueError: If id_type or build is invalid, the gene is not found, or the
            gene lies on an unsupported chromosome.
        RuntimeError: If the gene info reference file cannot be downloaded.

    Notes:
        - Distance is calculated from the nearest gene boundary (start or end position).
        - Distance is 0 inside the gene, negative upstream of the gene start, and
          positive downstream of the gene end.
    """
    # Normalize public aliases to the internal column names, then validate.
    valid_id_types = ["symbol", "HGNC_id", "name", "gene_id", "NCBI_id", "UCSC_id", "Vega_id"]
    if id_type in ["HGNC", "NCBI", "UCSC", "Vega"]:
        id_type = id_type + "_id"
    if id_type == "Ensembl":
        id_type = "gene_id"
    if id_type not in valid_id_types:
        raise ValueError(f"Invalid id_type. Must be one of: {', '.join(valid_id_types)}")

    # Validate build (bug fix: the original used an f-string with no placeholders).
    if int(build) not in [37, 38]:
        raise ValueError("Invalid build. Must be one of: 37, 38")

    # Download the gene info file if not already present in the reference folder.
    config = read_config()
    ref_path = config["paths"]["ref_path"]
    gene_info_file = os.path.join(ref_path, "gene_id_mapping_filtered.parquet")
    if not os.path.exists(gene_info_file):
        # Download parquet file
        print(f"Downloading gene info file to {gene_info_file}...")
        url = BUCKET_URL + "gene_id_mapping_filtered.parquet"
        try:
            wget.download(url, gene_info_file)
            print("\nDownload complete.")
        except Exception as e:
            # Remove any partially downloaded file so a retry starts clean.
            if os.path.exists(gene_info_file):
                os.remove(gene_info_file)
            raise RuntimeError(f"Failed to download gene info: {e}")

    df_gene_info = pd.read_parquet(gene_info_file, engine="pyarrow")

    # Find gene coordinates
    gene_data = df_gene_info[df_gene_info[id_type] == gene_identifier]

    if gene_data.empty:
        raise ValueError(f"Gene with {id_type}='{gene_identifier}' not found in gene info database.")

    if len(gene_data) > 1:
        print(f"Warning: Multiple entries found for {id_type}='{gene_identifier}'. Using the first entry.")
    # Always reduce to a single row (Series) so the scalar lookups below are
    # unambiguous even when exactly one entry matched.
    gene_data = gene_data.iloc[0, :]

    print(f"Filtering variants within {window_size}bp window based on genome build {build} around gene: {', '.join(f'{col}: {gene_data[col]}' for col in valid_id_types)}")

    # Extract gene location information
    chrom = gene_data['CHR']
    # Convert to integer if possible; X is coded as 23 in the variant data.
    if str(chrom).isdigit():
        chrom = int(chrom)
    elif chrom == "X":
        chrom = 23
    else:
        raise ValueError(f"Gene {gene_identifier} is located on chromosome {chrom}, which is not supported.")

    gene_start = int(gene_data[f'gene_start_{build}'])
    gene_end = int(gene_data[f'gene_end_{build}'])

    # Define the window boundaries (half the window on each side of the gene).
    window_start = max(0, gene_start - window_size / 2)
    window_end = gene_end + window_size / 2

    # Filter variants within the window
    filtered = data[
        (data['CHR'] == chrom) &
        (data['POS'] >= window_start) &
        (data['POS'] <= window_end)
    ].copy()

    if not filtered.empty:
        # Distance from gene: 0 inside the gene, negative before the start,
        # positive after the end.
        filtered.loc[:, 'Distance'] = np.nan

        # Create boolean masks
        mask_inside = filtered['POS'].between(gene_start, gene_end)
        mask_before = filtered['POS'] < gene_start
        mask_after = filtered['POS'] > gene_end

        filtered.loc[mask_inside, 'Distance'] = 0
        filtered.loc[mask_before, 'Distance'] = filtered['POS'] - gene_start
        filtered.loc[mask_after, 'Distance'] = filtered['POS'] - gene_end

        filtered["Distance"] = filtered["Distance"].astype("Int64")

        print(f"Found {len(filtered)} variants.")
    else:
        print(f"No variants found in a {window_size}bp window around {gene_identifier}")

    return filtered
|
|
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "genal-python" # Updated name for PyPI
|
|
7
|
-
version = "1.3.1"
|
|
7
|
+
version = "1.3.2"
|
|
8
8
|
authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
|
|
9
9
|
description = "A python toolkit for polygenic risk scoring and mendelian randomization."
|
|
10
10
|
readme = "README.md"
|
|
@@ -1,159 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from numpy import exp, log
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def coloc_abf_func(data, trait1_type="quant", trait2_type="quant",
|
|
9
|
-
sdY1=None, sdY2=None, n1=None, n2=None,
|
|
10
|
-
p1=1e-4, p2=1e-4, p12=1e-5):
|
|
11
|
-
"""
|
|
12
|
-
Perform colocalization analysis between two GWAS datasets using approximate Bayes factors.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
data: DataFrame containing merged GWAS results
|
|
16
|
-
trait1_type: Type of trait 1 ("quant" or "cc")
|
|
17
|
-
trait2_type: Type of trait 2 ("quant" or "cc")
|
|
18
|
-
sdY1: Standard deviation of trait 1 (required for quantitative traits)
|
|
19
|
-
sdY2: Standard deviation of trait 2 (required for quantitative traits)
|
|
20
|
-
n1: Sample size for trait 1 (used to estimate sdY if not provided)
|
|
21
|
-
n2: Sample size for trait 2 (used to estimate sdY if not provided)
|
|
22
|
-
p1: Prior probability SNP associated with trait 1
|
|
23
|
-
p2: Prior probability SNP associated with trait 2
|
|
24
|
-
p12: Prior probability SNP associated with both traits
|
|
25
|
-
"""
|
|
26
|
-
# Estimate sdY if not provided for quantitative traits
|
|
27
|
-
if trait1_type == "quant" and sdY1 is None:
|
|
28
|
-
if 'EAF_1' not in data.columns or n1 is None:
|
|
29
|
-
print("Neither sdY1 nor EAF and n1 are provided for trait 1. Assuming sdY1 = 1.")
|
|
30
|
-
sdY1 = 1
|
|
31
|
-
else:
|
|
32
|
-
sdY1 = sdY_est(data['SE_1']**2, data['EAF_1'], n1)
|
|
33
|
-
print(f"Using EAF and n1 to estimate sdY1: {sdY1:.2f}")
|
|
34
|
-
|
|
35
|
-
if trait2_type == "quant" and sdY2 is None:
|
|
36
|
-
if 'EAF_2' not in data.columns or n2 is None:
|
|
37
|
-
print("Neither sdY2 nor EAF and n2 are provided for trait 2. Assuming sdY2 = 1.")
|
|
38
|
-
sdY2 = 1
|
|
39
|
-
else:
|
|
40
|
-
sdY2 = sdY_est(data['SE_2']**2, data['EAF_2'], n2)
|
|
41
|
-
print(f"Using EAF and n2 to estimate sdY2: {sdY2:.2f}")
|
|
42
|
-
# Calculate Bayes factors for each dataset
|
|
43
|
-
lABF_1 = approx_bf_estimates(data['BETA_1'], data['SE_1']**2,
|
|
44
|
-
trait_type=trait1_type, sdY=sdY1)
|
|
45
|
-
lABF_2 = approx_bf_estimates(data['BETA_2'], data['SE_2']**2,
|
|
46
|
-
trait_type=trait2_type, sdY=sdY2)
|
|
47
|
-
|
|
48
|
-
# Adjust priors based on number of SNPs
|
|
49
|
-
n_snps = len(data)
|
|
50
|
-
if n_snps * p1 >= 1:
|
|
51
|
-
p1 = 1 / (n_snps + 1)
|
|
52
|
-
if n_snps * p2 >= 1:
|
|
53
|
-
p2 = 1 / (n_snps + 1)
|
|
54
|
-
if n_snps * p12 >= 1:
|
|
55
|
-
p12 = 1 / (n_snps + 1)
|
|
56
|
-
|
|
57
|
-
# Calculate posterior probabilities
|
|
58
|
-
pp = combine_abf(lABF_1, lABF_2, p1, p2, p12)
|
|
59
|
-
|
|
60
|
-
# Add SNP-specific results
|
|
61
|
-
results_df = data.copy()
|
|
62
|
-
results_df['lABF_1'] = lABF_1
|
|
63
|
-
results_df['lABF_2'] = lABF_2
|
|
64
|
-
results_df['internal.sum.lABF'] = lABF_1 + lABF_2
|
|
65
|
-
|
|
66
|
-
# Calculate SNP-specific PP for H4
|
|
67
|
-
my_denom_log_abf = logsum(results_df['internal.sum.lABF'])
|
|
68
|
-
results_df['SNP.PP.H4'] = np.exp(results_df['internal.sum.lABF'] - my_denom_log_abf)
|
|
69
|
-
|
|
70
|
-
return {
|
|
71
|
-
'summary': {
|
|
72
|
-
'nsnps': n_snps,
|
|
73
|
-
**pp
|
|
74
|
-
},
|
|
75
|
-
'results': results_df,
|
|
76
|
-
'priors': {
|
|
77
|
-
'p1': p1,
|
|
78
|
-
'p2': p2,
|
|
79
|
-
'p12': p12
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
def approx_bf_estimates(beta, varbeta, trait_type="quant", sdY=1, effect_priors={'quant': 0.15, 'cc': 0.2}):
|
|
84
|
-
"""
|
|
85
|
-
Calculate approximate Bayes factors using regression estimates.
|
|
86
|
-
|
|
87
|
-
Args:
|
|
88
|
-
beta: effect size estimate
|
|
89
|
-
varbeta: variance of the effect size estimate
|
|
90
|
-
trait_type: either "quant" for quantitative trait or "cc" for case-control
|
|
91
|
-
sdY: standard deviation of the trait (for quantitative traits)
|
|
92
|
-
effect_priors: dictionary with prior effect sizes for quantitative and case-control traits
|
|
93
|
-
|
|
94
|
-
Returns:
|
|
95
|
-
array: log approximate Bayes factors
|
|
96
|
-
"""
|
|
97
|
-
z = beta / np.sqrt(varbeta)
|
|
98
|
-
|
|
99
|
-
# Set prior standard deviation based on trait type
|
|
100
|
-
if trait_type == "quant":
|
|
101
|
-
sd_prior = effect_priors['quant'] * sdY
|
|
102
|
-
else: # case-control
|
|
103
|
-
sd_prior = effect_priors['cc']
|
|
104
|
-
|
|
105
|
-
r = sd_prior**2 / (sd_prior**2 + varbeta)
|
|
106
|
-
lABF = 0.5 * (np.log(1 - r) + (r * z**2))
|
|
107
|
-
return lABF
|
|
108
|
-
|
|
109
|
-
def logsum(x):
|
|
110
|
-
"""Calculate log of sum of exponentials"""
|
|
111
|
-
my_max = np.max(x)
|
|
112
|
-
return my_max + np.log(np.sum(np.exp(x - my_max)))
|
|
113
|
-
|
|
114
|
-
def logdiff(x, y):
|
|
115
|
-
"""Calculate log of difference of exponentials"""
|
|
116
|
-
my_max = max(x, y)
|
|
117
|
-
return my_max + np.log(exp(x - my_max) - np.exp(y - my_max))
|
|
118
|
-
|
|
119
|
-
def combine_abf(l1, l2, p1, p2, p12):
|
|
120
|
-
"""Calculate posterior probabilities for different hypotheses"""
|
|
121
|
-
lsum = l1 + l2
|
|
122
|
-
|
|
123
|
-
lH0_abf = 0
|
|
124
|
-
lH1_abf = np.log(p1) + logsum(l1)
|
|
125
|
-
lH2_abf = np.log(p2) + logsum(l2)
|
|
126
|
-
lH3_abf = np.log(p1) + np.log(p2) + logdiff(logsum(l1) + logsum(l2), logsum(lsum))
|
|
127
|
-
lH4_abf = np.log(p12) + logsum(lsum)
|
|
128
|
-
|
|
129
|
-
all_abf = np.array([lH0_abf, lH1_abf, lH2_abf, lH3_abf, lH4_abf])
|
|
130
|
-
denom_log_abf = logsum(all_abf)
|
|
131
|
-
pp_abf = np.exp(all_abf - denom_log_abf)
|
|
132
|
-
|
|
133
|
-
return {
|
|
134
|
-
'PP.H0.abf': pp_abf[0],
|
|
135
|
-
'PP.H1.abf': pp_abf[1],
|
|
136
|
-
'PP.H2.abf': pp_abf[2],
|
|
137
|
-
'PP.H3.abf': pp_abf[3],
|
|
138
|
-
'PP.H4.abf': pp_abf[4]
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
def sdY_est(vbeta, maf, n):
|
|
142
|
-
"""
|
|
143
|
-
Estimate trait standard deviation given vectors of variance of coefficients, MAF and sample size.
|
|
144
|
-
|
|
145
|
-
Args:
|
|
146
|
-
vbeta: vector of variance of coefficients
|
|
147
|
-
maf: vector of MAF (same length as vbeta)
|
|
148
|
-
n: sample size
|
|
149
|
-
|
|
150
|
-
Returns:
|
|
151
|
-
float: estimated standard deviation of Y
|
|
152
|
-
"""
|
|
153
|
-
oneover = 1/vbeta
|
|
154
|
-
nvx = 2 * n * maf * (1-maf)
|
|
155
|
-
# Fit linear regression through origin
|
|
156
|
-
coef = np.sum(nvx * oneover) / np.sum(oneover**2)
|
|
157
|
-
if coef < 0:
|
|
158
|
-
raise ValueError("Estimated sdY is negative - this can happen with small datasets, or those with errors. A reasonable estimate of sdY is required to continue.")
|
|
159
|
-
return np.sqrt(coef)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.eot
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.svg
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.ttf
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff2
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold-italic.woff
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold-italic.woff2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal-italic.woff
RENAMED
|
File without changes
|
{genal_python-1.3.1 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal-italic.woff2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|