genal-python 1.3.0__tar.gz → 1.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. {genal_python-1.3.0 → genal_python-1.3.2}/PKG-INFO +2 -2
  2. {genal_python-1.3.0 → genal_python-1.3.2}/README.md +1 -1
  3. {genal_python-1.3.0 → genal_python-1.3.2}/genal/Geno.py +103 -69
  4. {genal_python-1.3.0 → genal_python-1.3.2}/genal/__init__.py +2 -2
  5. genal_python-1.3.2/genal/colocalization.py +249 -0
  6. {genal_python-1.3.0 → genal_python-1.3.2}/genal/constants.py +3 -2
  7. {genal_python-1.3.0 → genal_python-1.3.2}/genal/geno_tools.py +120 -3
  8. {genal_python-1.3.0 → genal_python-1.3.2}/pyproject.toml +1 -1
  9. genal_python-1.3.0/genal/colocalization.py +0 -159
  10. {genal_python-1.3.0 → genal_python-1.3.2}/.DS_Store +0 -0
  11. {genal_python-1.3.0 → genal_python-1.3.2}/.gitignore +0 -0
  12. {genal_python-1.3.0 → genal_python-1.3.2}/.readthedocs.yaml +0 -0
  13. {genal_python-1.3.0 → genal_python-1.3.2}/Genal_flowchart.png +0 -0
  14. {genal_python-1.3.0 → genal_python-1.3.2}/LICENSE +0 -0
  15. {genal_python-1.3.0 → genal_python-1.3.2}/docs/.DS_Store +0 -0
  16. {genal_python-1.3.0 → genal_python-1.3.2}/docs/Makefile +0 -0
  17. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.DS_Store +0 -0
  18. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.buildinfo +0 -0
  19. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.doctrees/api.doctree +0 -0
  20. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.doctrees/environment.pickle +0 -0
  21. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.doctrees/genal.doctree +0 -0
  22. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.doctrees/index.doctree +0 -0
  23. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.doctrees/introduction.doctree +0 -0
  24. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/.doctrees/modules.doctree +0 -0
  25. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_images/MR_plot_SBP_AS.png +0 -0
  26. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/Geno.html +0 -0
  27. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/MR.html +0 -0
  28. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/MR_tools.html +0 -0
  29. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/MRpresso.html +0 -0
  30. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/association.html +0 -0
  31. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/clump.html +0 -0
  32. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/extract_prs.html +0 -0
  33. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/geno_tools.html +0 -0
  34. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/lift.html +0 -0
  35. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/proxy.html +0 -0
  36. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/snp_query.html +0 -0
  37. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/genal/tools.html +0 -0
  38. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_modules/index.html +0 -0
  39. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_sources/api.rst.txt +0 -0
  40. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_sources/genal.rst.txt +0 -0
  41. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_sources/index.rst.txt +0 -0
  42. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_sources/introduction.rst.txt +0 -0
  43. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_sources/modules.rst.txt +0 -0
  44. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/basic.css +0 -0
  45. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/badge_only.css +0 -0
  46. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff +0 -0
  47. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2 +0 -0
  48. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff +0 -0
  49. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2 +0 -0
  50. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.eot +0 -0
  51. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.svg +0 -0
  52. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.ttf +0 -0
  53. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff +0 -0
  54. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff2 +0 -0
  55. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold-italic.woff +0 -0
  56. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold-italic.woff2 +0 -0
  57. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold.woff +0 -0
  58. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-bold.woff2 +0 -0
  59. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal-italic.woff +0 -0
  60. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal-italic.woff2 +0 -0
  61. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal.woff +0 -0
  62. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/fonts/lato-normal.woff2 +0 -0
  63. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/css/theme.css +0 -0
  64. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/doctools.js +0 -0
  65. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/documentation_options.js +0 -0
  66. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/file.png +0 -0
  67. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/js/badge_only.js +0 -0
  68. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/js/html5shiv-printshiv.min.js +0 -0
  69. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/js/html5shiv.min.js +0 -0
  70. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/js/theme.js +0 -0
  71. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/language_data.js +0 -0
  72. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/minus.png +0 -0
  73. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/plus.png +0 -0
  74. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/pygments.css +0 -0
  75. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/searchtools.js +0 -0
  76. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/_static/sphinx_highlight.js +0 -0
  77. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/api.html +0 -0
  78. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/genal.html +0 -0
  79. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/genindex.html +0 -0
  80. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/index.html +0 -0
  81. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/introduction.html +0 -0
  82. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/modules.html +0 -0
  83. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/objects.inv +0 -0
  84. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/py-modindex.html +0 -0
  85. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/search.html +0 -0
  86. {genal_python-1.3.0 → genal_python-1.3.2}/docs/build/searchindex.js +0 -0
  87. {genal_python-1.3.0 → genal_python-1.3.2}/docs/make.bat +0 -0
  88. {genal_python-1.3.0 → genal_python-1.3.2}/docs/requirements.txt +0 -0
  89. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/.DS_Store +0 -0
  90. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/Images/Genal_flowchart.png +0 -0
  91. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
  92. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/Images/genal_logo.png +0 -0
  93. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/api.rst +0 -0
  94. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/conf.py +0 -0
  95. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/index.rst +0 -0
  96. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/introduction.rst +0 -0
  97. {genal_python-1.3.0 → genal_python-1.3.2}/docs/source/modules.rst +0 -0
  98. {genal_python-1.3.0 → genal_python-1.3.2}/genal/MR.py +0 -0
  99. {genal_python-1.3.0 → genal_python-1.3.2}/genal/MR_tools.py +0 -0
  100. {genal_python-1.3.0 → genal_python-1.3.2}/genal/MRpresso.py +0 -0
  101. {genal_python-1.3.0 → genal_python-1.3.2}/genal/association.py +0 -0
  102. {genal_python-1.3.0 → genal_python-1.3.2}/genal/clump.py +0 -0
  103. {genal_python-1.3.0 → genal_python-1.3.2}/genal/extract_prs.py +0 -0
  104. {genal_python-1.3.0 → genal_python-1.3.2}/genal/lift.py +0 -0
  105. {genal_python-1.3.0 → genal_python-1.3.2}/genal/proxy.py +0 -0
  106. {genal_python-1.3.0 → genal_python-1.3.2}/genal/snp_query.py +0 -0
  107. {genal_python-1.3.0 → genal_python-1.3.2}/genal/tools.py +0 -0
  108. {genal_python-1.3.0 → genal_python-1.3.2}/genal_logo.png +0 -0
  109. {genal_python-1.3.0 → genal_python-1.3.2}/gitignore +0 -0
  110. {genal_python-1.3.0 → genal_python-1.3.2}/readthedocs.yaml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: genal-python
3
- Version: 1.3.0
3
+ Version: 1.3.2
4
4
  Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
5
5
  Author-email: Cyprien Rivier <riviercyprien@gmail.com>
6
6
  Requires-Python: >=3.8
@@ -72,7 +72,7 @@ Genal is a python module designed to make it easy and intuitive to run genetic r
72
72
  - **Comprehensive MR Pipeline**: From data preprocessing to sensitivity analyses and plotting in a single package
73
73
  - **Reference Panel Support**: Automatically download and use the latest 1000 Genomes reference panels in builds 37 and 38 with the option to use custom reference panels
74
74
  - **Customizable**: Ability to choose all the parameters, but defaults are set to the most common values
75
- - **Proxy SNP Support**: Includes functionality for finding and using proxy SNPs when instruments are missing (for polygenic risk scores, Mendelian Randomization, and association testing)
75
+ - **Proxy SNP Support**: Includes functionality for finding and using proxy SNPs when instruments are missing (for polygenic risk scores and Mendelian Randomization)
76
76
 
77
77
  The objective of genal is to bring the functionalities of well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, in a more user-friendly Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools.
78
78
 
@@ -47,7 +47,7 @@ Genal is a python module designed to make it easy and intuitive to run genetic r
47
47
  - **Comprehensive MR Pipeline**: From data preprocessing to sensitivity analyses and plotting in a single package
48
48
  - **Reference Panel Support**: Automatically download and use the latest 1000 Genomes reference panels in builds 37 and 38 with the option to use custom reference panels
49
49
  - **Customizable**: Ability to choose all the parameters, but defaults are set to the most common values
50
- - **Proxy SNP Support**: Includes functionality for finding and using proxy SNPs when instruments are missing (for polygenic risk scores, Mendelian Randomization, and association testing)
50
+ - **Proxy SNP Support**: Includes functionality for finding and using proxy SNPs when instruments are missing (for polygenic risk scores and Mendelian Randomization)
51
51
 
52
52
  The objective of genal is to bring the functionalities of well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, in a more user-friendly Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools.
53
53
 
@@ -1,7 +1,7 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
  import warnings
4
- import os, subprocess
4
+ import os
5
5
  import copy
6
6
  import psutil
7
7
  import uuid
@@ -30,7 +30,8 @@ from .geno_tools import (
30
30
  fill_se_p,
31
31
  check_allele_column,
32
32
  check_snp_column,
33
- remove_na
33
+ remove_na,
34
+ filter_by_gene_func
34
35
  )
35
36
  from .association import set_phenotype_func, association_test_func_plink2
36
37
  from .extract_prs import extract_snps_func, prs_func
@@ -42,8 +43,10 @@ from .colocalization import coloc_abf_func
42
43
  # Add proxying function (input is df + searchspace (list of SNP or path to .bim, can be separated by chromosomes) and returns proxied df)
43
44
  # Get proxies (simply return a list of proxies)
44
45
  # Include proxying option to association_test
45
- # Multi-MR
46
+ # Multivariable-MR
46
47
  # Check stability with variants on sex chromosomes
48
+ # Check the build of user data (potentially with a list of SNPs with different positions)
49
+ # update_snpids function: take alleles into account during the merge if they are present in the user data
47
50
 
48
51
 
49
52
  class Geno:
@@ -115,9 +118,22 @@ class Geno:
115
118
 
116
119
  Attributes:
117
120
  name (str): Randomly generated ID for the Geno object.
118
- outcome (list): List of outcomes (initialized as empty).
119
121
  cpus (int): Number of CPUs to be used.
120
122
  ram (int): Amount of RAM to be used in MBs.
123
+ checks (dict): Dictionary of checks performed on the main DataFrame.
124
+ reference_panel (pd.DataFrame): Reference population SNP data used for SNP info
125
+ adjustments. Initialized when first needed.
126
+ reference_panel_name (str): string to identify the reference_panel (path or population string)
127
+ phenotype (pd.DataFrame, str): Tuple with a DataFrame of individual-level phenotype
128
+ data and a string representing the phenotype trait column. Initialized after
129
+ running the 'set_phenotype' method.
130
+ MR_data (pd.DataFrame, pd.DataFrame, str): Tuple containing DataFrames for associations
131
+ with exposure and outcome, and a string for the outcome name. Initialized after
132
+ running the 'query_outcome' method.
133
+ MR_results (pd.DataFrame, pd.DataFrame, str, str): Contains an MR results dataframe, a dataframe of harmonized SNPs, an exposure name, an outcome name.
134
+ Assigned after calling the MR method and used for plotting with the MR_plot method.
135
+ MRpresso_subset_data (pd.DataFrame, pd.DataFrame, str, str): Contains a dataframe of subsetted harmonized SNPs without outliers.
136
+ Assigned after calling the MRpresso method.
121
137
  """
122
138
 
123
139
  # Validate df type
@@ -154,7 +170,7 @@ class Geno:
154
170
  def preprocess_data(
155
171
  self,
156
172
  preprocessing='Fill',
157
- reference_panel="38",
173
+ reference_panel="37",
158
174
  effect_column=None,
159
175
  keep_indel=None,
160
176
  keep_dups=None,
@@ -290,7 +306,7 @@ class Geno:
290
306
 
291
307
  self.data = data
292
308
 
293
- def get_reference_panel(self, reference_panel="38"):
309
+ def get_reference_panel(self, reference_panel="37"):
294
310
  """
295
311
  Retrieve or set the reference panel for the Geno object.
296
312
 
@@ -397,10 +413,7 @@ class Geno:
397
413
 
398
414
  # If clumped data is successfully generated, assign it to the object's attribute
399
415
  if clumped_data is not None:
400
- Clumped = Geno(clumped_data, keep_columns=True)
401
- Clumped.checks = self.checks.copy()
402
- if hasattr(self, "phenotype"):
403
- Clumped.phenotype = self.phenotype
416
+ Clumped = self.copy(clumped_data)
404
417
  return Clumped
405
418
  return None
406
419
 
@@ -698,6 +711,8 @@ class Geno:
698
711
  # Assign the processed data and inferred phenotype type to the .phenotype attribute
699
712
  self.phenotype = (processed_data, inferred_pheno_type, PHENO)
700
713
 
714
+ return
715
+
701
716
  def association_test(self, path=None, covar=[], standardize=True):
702
717
  """
703
718
  Conduct single-SNP association tests against a phenotype.
@@ -1199,23 +1214,77 @@ class Geno:
1199
1214
 
1200
1215
  return mod_table, GlobalTest, OutlierTest, BiasTest
1201
1216
 
1217
def filter_by_gene(self, gene_id, id_type="symbol", window_size=1000000, build="37", replace=False):
    """
    Filter the data to include only variants that are within a specified distance of a specific gene.

    The CHR and POS columns of ``self.data`` are preprocessed with
    ``check_int_column`` if the corresponding entries of ``self.checks`` are
    not already set, regardless of the value of ``replace``.

    Args:
        gene_id (str): Identifier for the gene/protein to filter variants around.
        id_type (str, optional): Type of identifier provided. Options are:
            - "symbol": Gene symbol (e.g., "APOE")
            - "HGNC": HGNC ID (e.g., "HGNC:613")
            - "name": Full gene name (e.g., "apolipoprotein E")
            - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203")
            - "NCBI": NCBI gene ID (e.g., "348")
            - "UCSC": UCSC gene ID (e.g., "uc001hbu.2")
            - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505")
            Default is "symbol".
        window_size (int, optional): Size of the window around the gene in base pairs. Default is 1,000,000 (1Mb).
        build (str, optional): Genome build of the data. Default is "37".
        replace (bool, optional): If True, replace the existing data attribute with the filtered data.
            Default is False.

    Returns:
        if replace is True:
            None: the filtered DataFrame is stored in the ``data`` attribute of this object.
        if replace is False:
            genal.Geno: A new Geno object holding the filtered data (created with :meth:`copy`).

    Raises:
        ValueError: If the CHR or POS column is missing from the data. The filtering
            helper may raise further errors (e.g. unknown gene or invalid id_type).

    Notes (carried over from the original documentation of the filtering helper):
        - Distance is calculated from the nearest gene boundary (start or end position)
        - Null distances indicate the variant is within the gene
    """
    # Check required columns
    for col in ["CHR", "POS"]:
        if col not in self.data.columns:
            raise ValueError(f"Column {col} must be present in the input data!")

    # Do the appropriate preprocessing on CHR and POS columns if not already done
    if not self.checks.get("CHR"):
        check_int_column(self.data, "CHR")
        self.checks["CHR"] = True
    if not self.checks.get("POS"):
        check_int_column(self.data, "POS")
        self.checks["POS"] = True

    filtered = filter_by_gene_func(self.data, gene_id, id_type, window_size, build)

    if replace:
        # In-place mode: nothing is returned to the caller
        self.data = filtered
    else:
        Geno_filtered = self.copy(filtered)
        return Geno_filtered
1269
+
1202
1270
  def colocalize(self, outcome, method="abf", trait1_type=None, trait2_type=None,
1203
- sdY1=None, sdY2=None, n1=None, n2=None, p1=1e-4, p2=1e-4, p12=1e-5):
1271
+ sdY1=None, sdY2=None, n1=None, n2=None, p1=1e-4, p2=1e-4, p12=1e-5, merge_on_snp=False):
1204
1272
  """
1205
1273
  Perform colocalization analysis between two GWAS datasets.
1206
1274
 
1207
1275
  Args:
1208
1276
  outcome: Another Geno object containing the outcome dataset
1209
1277
  method: Method to use for colocalization (default: "abf")
1210
- trait1_type: Type of exposure trait ("quant" or "cc")
1211
- trait2_type: Type of outcome trait ("quant" or "cc")
1212
- sdY1: Standard deviation of exposure trait (required for quantitative traits)
1213
- sdY2: Standard deviation of outcome trait (required for quantitative traits)
1214
- n1: Sample size for exposure (used to estimate sdY1 if not provided)
1215
- n2: Sample size for outcome (used to estimate sdY2 if not provided)
1278
+ trait1_type: Type of exposure trait ("quant" for quantitative traits or "cc" for case-control traits)
1279
+ trait2_type: Type of outcome trait ("quant" for quantitative traits or "cc" for case-control traits)
1280
+ sdY1: Standard deviation of exposure trait (required for quantitative traits, but can be estimated from EAF and sample size)
1281
+ sdY2: Standard deviation of outcome trait (required for quantitative traits, but can be estimated from EAF and sample size)
1282
+ n1: Sample size for exposure (used to estimate sdY1 if sdY1 is not provided)
1283
+ n2: Sample size for outcome (used to estimate sdY2 if sdY2 is not provided)
1216
1284
  p1: Prior probability SNP associated with exposure
1217
1285
  p2: Prior probability SNP associated with outcome
1218
1286
  p12: Prior probability SNP associated with both traits
1287
+ merge_on_snp: If True, merge the datasets on SNP column. If False, first attempt to merge on CHR and POS columns.
1219
1288
  """
1220
1289
  # Ensure required columns exist in both datasets
1221
1290
  required_cols = ['BETA', 'SE']
@@ -1235,56 +1304,10 @@ class Geno:
1235
1304
  # Make copies of the data to avoid modifying the original data
1236
1305
  data1 = self.data.copy()
1237
1306
  data2 = outcome.data.copy()
1238
-
1239
- # Ensure that the BETA columns are preprocessed
1240
- check_beta_column(data1, 'BETA', 'Fill')
1241
- check_beta_column(data2, 'BETA', 'Fill')
1242
-
1243
- # Adjust EAF column names before merging in case one of the datasets does not have it
1244
- if 'EAF' in data1.columns:
1245
- data1.rename(columns={'EAF': 'EAF_1'}, inplace=True)
1246
- if 'EAF' in data2.columns:
1247
- data2.rename(columns={'EAF': 'EAF_2'}, inplace=True)
1248
-
1249
- # Determine merge strategy based on available columns
1250
- if all(col in self.data.columns for col in ['CHR', 'POS']) and \
1251
- all(col in outcome.data.columns for col in ['CHR', 'POS']):
1252
- print("Merging datasets using CHR and POS")
1253
-
1254
- #Ensure that the CHR, POS columns are preprocessed
1255
- check_int_column(data1, "CHR")
1256
- check_int_column(data1, "POS")
1257
- check_int_column(data2, "CHR")
1258
- check_int_column(data2, "POS")
1259
-
1260
- # Merge the datasets
1261
- merged_data = pd.merge(data1, data2,
1262
- on=['CHR', 'POS'],
1263
- suffixes=('_1', '_2'))
1264
-
1265
- elif 'SNP' in self.data.columns and 'SNP' in outcome.data.columns:
1266
- print("Merging datasets using SNP IDs")
1267
-
1268
- # Ensure that the SNP column is preprocessed
1269
- check_snp_column(data1)
1270
- check_snp_column(data2)
1271
-
1272
- # Merge the datasets
1273
- merged_data = pd.merge(data1, data2,
1274
- on='SNP',
1275
- suffixes=('_1', '_2'))
1276
- else:
1277
- raise ValueError("Either CHR/POS or SNP columns must be present in both datasets for merging")
1278
-
1279
- # Drop any rows with missing values
1280
- merged_data = merged_data.dropna()
1281
- if merged_data.empty:
1282
- raise ValueError("No overlapping variants found between the datasets")
1283
-
1284
- print(f"Using {len(merged_data)} overlapping variants for colocalization analysis")
1285
1307
 
1286
1308
  # Call the implementation function
1287
- return coloc_abf_func(merged_data,
1309
+ return coloc_abf_func(data1,
1310
+ data2,
1288
1311
  trait1_type=trait1_type,
1289
1312
  trait2_type=trait2_type,
1290
1313
  sdY1=sdY1,
@@ -1293,7 +1316,8 @@ class Geno:
1293
1316
  n2=n2,
1294
1317
  p1=p1,
1295
1318
  p2=p2,
1296
- p12=p12)
1319
+ p12=p12,
1320
+ merge_on_snp=merge_on_snp)
1297
1321
 
1298
1322
 
1299
1323
  def lift(
@@ -1478,14 +1502,24 @@ class Geno:
1478
1502
  self.data = self.data.groupby(by=["SNP"]).first().reset_index(drop=False)
1479
1503
  return
1480
1504
 
1481
def copy(self, data=None):
    """
    Create another Geno instance, optionally with a new data attribute.

    Args:
        data (pd.DataFrame, optional): Data for the new instance. If omitted,
            a full deep copy of this instance is returned, which keeps the
            argument-free ``copy()`` call of earlier versions working.

    When ``data`` is given, the relevant attributes are copied as well
    (checks, phenotype, reference_panel, reference_panel_name).
    Attributes that are not copied are MR_data, MR_results,
    MRpresso_subset_data, MRpresso_results.

    Returns:
        Geno: A deep copy of the instance if ``data`` is None, otherwise a new
        Geno built from ``data`` that shares the attributes listed above.
    """
    if data is None:
        # Local import: inside this method the bare name "copy" is easy to
        # confuse with the method itself, so bind deepcopy explicitly.
        from copy import deepcopy
        return deepcopy(self)

    Geno_copy = Geno(data, keep_columns=True)
    Geno_copy.checks = self.checks.copy()
    # These attributes only exist once the corresponding methods have run
    for attr in ("phenotype", "reference_panel", "reference_panel_name"):
        if hasattr(self, attr):
            setattr(Geno_copy, attr, getattr(self, attr))
    return Geno_copy
1489
1523
 
1490
1524
  def save(self, path="", fmt="h5", sep="\t", header=True):
1491
1525
  """
@@ -1,10 +1,10 @@
1
1
  import os
2
2
  import json
3
3
  from .tools import default_config, write_config, set_plink, install_plink, delete_tmp, get_reference_panel_path, get_plink_path
4
- from .geno_tools import Combine_Geno
4
+ from .geno_tools import Combine_Geno, filter_by_gene_func
5
5
  from .constants import CONFIG_DIR
6
6
 
7
- __version__ = "1.3.0"
7
+ __version__ = "1.3.2"
8
8
 
9
9
  config_path = os.path.join(CONFIG_DIR, "config.json")
10
10
 
@@ -0,0 +1,249 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from numpy import exp, log
4
+ from genal.geno_tools import check_beta_column, check_allele_column, check_snp_column, check_int_column
5
+
6
+ # Currently does not support multi-allelic SNPs
7
+
8
def coloc_abf_func(data1, data2, trait1_type="quant", trait2_type="quant",
                   sdY1=None, sdY2=None, n1=None, n2=None,
                   p1=1e-4, p2=1e-4, p12=1e-5, merge_on_snp=False):
    """
    Perform colocalization analysis between two GWAS datasets using approximate Bayes factors.
    Corresponds to the :meth:`Geno.colocalize` method.

    The input DataFrames are preprocessed in place (column checks, EAF renamed
    to EAF_1 / EAF_2); pass copies if the originals must stay untouched.
    Multi-allelic SNPs are currently not supported.

    Args:
        data1: DataFrame containing GWAS results for trait 1
        data2: DataFrame containing GWAS results for trait 2
        trait1_type: Type of trait 1 ("quant" for quantitative traits or "cc" for case-control traits)
        trait2_type: Type of trait 2 ("quant" for quantitative traits or "cc" for case-control traits)
        sdY1: Standard deviation of trait 1 (estimated from EAF and n1, or assumed to be 1, if not provided)
        sdY2: Standard deviation of trait 2 (estimated from EAF and n2, or assumed to be 1, if not provided)
        n1: Sample size for trait 1 (used to estimate sdY1 if not provided)
        n2: Sample size for trait 2 (used to estimate sdY2 if not provided)
        p1: Prior probability SNP associated with trait 1
        p2: Prior probability SNP associated with trait 2
        p12: Prior probability SNP associated with both traits
        merge_on_snp: If True, merge the datasets on SNP column. If False, first attempt to merge on CHR and POS columns.

    Returns:
        dict: 'nsnps' (number of variants used) and the posterior probabilities
        'PP.H0.abf' ... 'PP.H4.abf'.

    Raises:
        ValueError: If the datasets share neither CHR/POS nor SNP columns, or
            if no overlapping variants remain after merging and cleaning.
    """
    # Ensure that the BETA columns are preprocessed
    check_beta_column(data1, 'BETA', 'Fill')
    check_beta_column(data2, 'BETA', 'Fill')

    # Adjust EAF column names before merging in case one of the datasets does not have it
    if 'EAF' in data1.columns:
        data1.rename(columns={'EAF': 'EAF_1'}, inplace=True)
    if 'EAF' in data2.columns:
        data2.rename(columns={'EAF': 'EAF_2'}, inplace=True)

    position_cols = ['CHR', 'POS']
    has_positions = (all(col in data1.columns for col in position_cols)
                     and all(col in data2.columns for col in position_cols))

    # First determine if we can merge on position, otherwise try SNP
    if has_positions and not merge_on_snp:
        print("Merging datasets using genomic positions (CHR, POS)")

        # Ensure that the CHR and POS columns are preprocessed
        check_int_column(data1, "CHR")
        check_int_column(data2, "CHR")
        check_int_column(data1, "POS")
        check_int_column(data2, "POS")

        # Left merge: unmatched trait-1 rows get NaN and are dropped further down
        merged_data = pd.merge(
            data1,
            data2,
            on=position_cols,
            how='left',
            suffixes=('_1', '_2')
        )

    elif 'SNP' in data1.columns and 'SNP' in data2.columns:
        if merge_on_snp:
            # The SNP merge was requested explicitly; do not claim positions are missing
            print("Merging datasets using SNP IDs.")
        else:
            print("Position columns (CHR, POS) not present in both datasets. Merging datasets using SNP IDs.")

        # Ensure that the SNP column is preprocessed
        check_snp_column(data1)
        check_snp_column(data2)

        # Merge using SNP
        merged_data = pd.merge(
            data1,
            data2,
            on='SNP',
            suffixes=('_1', '_2')
        )

    else:
        raise ValueError("At least CHR/POS or SNP columns must be present in both datasets for colocalization analysis")

    # After merging, check if we can align alleles
    allele_cols = ['EA_1', 'NEA_1', 'EA_2', 'NEA_2']
    if all(col in merged_data.columns for col in allele_cols):
        print("Aligning effect alleles between datasets")

        # Preprocess the allele columns of the merged frame itself. Running the
        # checks on data1/data2 at this point would not touch the EA_1/EA_2
        # columns that are compared below.
        # NOTE(review): relies on check_allele_column modifying the frame in
        # place, as the original call sites do - confirm.
        for col in allele_cols:
            check_allele_column(merged_data, col, keep_indel=False)

        # Adjust BETA from trait 2 to correspond to the same effect allele as trait 1;
        # variants whose alleles match in neither orientation become NaN and are dropped
        same_allele = merged_data["EA_1"] == merged_data["EA_2"]
        flipped_allele = merged_data["EA_1"] == merged_data["NEA_2"]
        merged_data["BETA_2"] = np.select(
            [same_allele, flipped_allele],
            [merged_data["BETA_2"], -merged_data["BETA_2"]],
            default=np.nan,
        )
    else:
        print("Allele columns (EA, NEA) not present in both datasets. "
              "This might lead to incorrect results if the effect estimates (BETA) were not obtained with the same reference allele in both datasets.")

    # Clean up columns
    merged_data.drop(columns=["EA_2", "NEA_2", "SNP_2", "CHR_2", "POS_2"], inplace=True, errors='ignore')
    merged_data.rename(columns={"SNP_1": "SNP", "CHR_1": "CHR", "POS_1": "POS"}, inplace=True, errors='ignore')

    # Drop any rows with duplicate values
    if "SNP" in merged_data.columns:
        merged_data.drop_duplicates(subset=['SNP'], keep='first', inplace=True)
    if "CHR" in merged_data.columns and "POS" in merged_data.columns:
        merged_data.drop_duplicates(subset=["CHR", "POS"], keep='first', inplace=True)

    # Drop any rows with missing values
    merged_data = merged_data.dropna()
    if merged_data.empty:
        raise ValueError("No overlapping variants found between the datasets")

    print(f"Using {len(merged_data)} overlapping variants for colocalization analysis")

    # Estimate sdY if not provided for quantitative traits
    if trait1_type == "quant" and sdY1 is None:
        if 'EAF_1' not in merged_data.columns or n1 is None:
            print("Neither sdY1 nor EAF and n1 are provided for trait 1. Assuming sdY1 = 1.")
            sdY1 = 1
        else:
            sdY1 = sdY_est(merged_data['SE_1']**2, merged_data['EAF_1'], n1)
            print(f"Using EAF and n1 to estimate sdY1: {sdY1:.2f}")

    if trait2_type == "quant" and sdY2 is None:
        if 'EAF_2' not in merged_data.columns or n2 is None:
            print("Neither sdY2 nor EAF and n2 are provided for trait 2. Assuming sdY2 = 1.")
            sdY2 = 1
        else:
            sdY2 = sdY_est(merged_data['SE_2']**2, merged_data['EAF_2'], n2)
            print(f"Using EAF and n2 to estimate sdY2: {sdY2:.2f}")

    # Calculate Bayes factors for each dataset
    lABF_1 = approx_bf_estimates(merged_data['BETA_1'], merged_data['SE_1']**2,
                                 trait_type=trait1_type, sdY=sdY1)
    lABF_2 = approx_bf_estimates(merged_data['BETA_2'], merged_data['SE_2']**2,
                                 trait_type=trait2_type, sdY=sdY2)

    # Adjust priors based on number of SNPs so that n_snps * prior stays below 1
    n_snps = len(merged_data)
    if n_snps * p1 >= 1:
        p1 = 1 / (n_snps + 1)
    if n_snps * p2 >= 1:
        p2 = 1 / (n_snps + 1)
    if n_snps * p12 >= 1:
        p12 = 1 / (n_snps + 1)

    # Calculate posterior probabilities
    pp = combine_abf(lABF_1, lABF_2, p1, p2, p12)

    # NOTE: only the summary is returned. The per-SNP table (lABF_1, lABF_2,
    # SNP.PP.H4) that used to be computed here was never returned, so that
    # dead computation has been removed.
    return {
        'nsnps': n_snps,
        **pp
    }
172
+
173
def approx_bf_estimates(beta, varbeta, trait_type="quant", sdY=1, effect_priors=None):
    """
    Calculate approximate Bayes factors using regression estimates.

    Args:
        beta: effect size estimate
        varbeta: variance of the effect size estimate
        trait_type: either "quant" for quantitative trait or "cc" for case-control.
            Any value other than "quant" is handled as case-control.
        sdY: standard deviation of the trait (for quantitative traits)
        effect_priors: dictionary with prior effect sizes for quantitative and
            case-control traits (keys 'quant' and 'cc'). Defaults to
            {'quant': 0.15, 'cc': 0.2} when None.

    Returns:
        array: log approximate Bayes factors
    """
    # None sentinel instead of a dict default, so no dict object is shared between calls
    if effect_priors is None:
        effect_priors = {'quant': 0.15, 'cc': 0.2}

    z = beta / np.sqrt(varbeta)

    # Set prior standard deviation based on trait type
    if trait_type == "quant":
        sd_prior = effect_priors['quant'] * sdY
    else:  # case-control
        sd_prior = effect_priors['cc']

    # r is the shrinkage factor: prior variance over (prior + sampling variance)
    r = sd_prior**2 / (sd_prior**2 + varbeta)
    lABF = 0.5 * (np.log(1 - r) + (r * z**2))
    return lABF
198
+
199
def logsum(x):
    """
    Calculate log of sum of exponentials, log(sum(exp(x))), in a numerically stable way.

    Args:
        x: array-like of log-scale values (must not be empty; an empty input
            raises ValueError from np.max).

    Returns:
        float: log(sum(exp(x))). If the maximum of x is not finite, that
        maximum is returned directly (-inf when every term is -inf).
    """
    x = np.asarray(x, dtype=float)
    my_max = np.max(x)
    # Subtracting a non-finite maximum would give inf - inf = NaN
    if not np.isfinite(my_max):
        return my_max
    # Shift by the maximum so the largest exponent is exp(0) = 1
    return my_max + np.log(np.sum(np.exp(x - my_max)))
203
+
204
def logdiff(x, y):
    """
    Calculate log of difference of exponentials, log(exp(x) - exp(y)).

    Args:
        x: log-scale value (scalar or array); expected to be >= y.
        y: log-scale value (scalar or array).

    Returns:
        log(exp(x) - exp(y)). Equal inputs give -inf (log of zero) without a
        divide-by-zero warning; x < y gives NaN, as the difference is negative.
    """
    # np.maximum (rather than the builtin max) also works element-wise on arrays
    my_max = np.maximum(x, y)
    # x == y is a legitimate case (e.g. a single-SNP region): log(0) = -inf
    with np.errstate(divide="ignore"):
        return my_max + np.log(np.exp(x - my_max) - np.exp(y - my_max))
208
+
209
def combine_abf(l1, l2, p1, p2, p12):
    """
    Calculate posterior probabilities for the five colocalization hypotheses.

    Args:
        l1: per-SNP log approximate Bayes factors for trait 1
        l2: per-SNP log approximate Bayes factors for trait 2
        p1: prior probability a SNP is associated with trait 1
        p2: prior probability a SNP is associated with trait 2
        p12: prior probability a SNP is associated with both traits

    Returns:
        dict: posterior probabilities under the keys 'PP.H0.abf' ... 'PP.H4.abf'.
    """
    joint = l1 + l2

    # Each log-sum is needed in more than one hypothesis term
    total_1 = logsum(l1)
    total_2 = logsum(l2)
    total_joint = logsum(joint)

    log_abfs = np.array([
        0,                                                               # H0
        np.log(p1) + total_1,                                            # H1
        np.log(p2) + total_2,                                            # H2
        np.log(p1) + np.log(p2) + logdiff(total_1 + total_2, total_joint),  # H3
        np.log(p12) + total_joint,                                       # H4
    ])

    # Normalise on the log scale, then exponentiate
    posteriors = np.exp(log_abfs - logsum(log_abfs))

    labels = ['PP.H0.abf', 'PP.H1.abf', 'PP.H2.abf', 'PP.H3.abf', 'PP.H4.abf']
    return {label: posteriors[idx] for idx, label in enumerate(labels)}
230
+
231
def sdY_est(vbeta, maf, n):
    """
    Estimate trait standard deviation given vectors of variance of coefficients, MAF and sample size.

    Args:
        vbeta: vector of variance of coefficients
        maf: vector of allele frequencies (same length as vbeta). Only
            maf * (1 - maf) is used, so effect allele frequencies work as well.
        n: sample size

    Returns:
        float: estimated standard deviation of Y

    Raises:
        ValueError: If the estimate is negative or not finite (e.g. a zero
            variance or an empty input).
    """
    vbeta = np.asarray(vbeta, dtype=float)
    maf = np.asarray(maf, dtype=float)

    # Degenerate inputs are reported through the ValueError below rather than
    # through numpy runtime warnings
    with np.errstate(divide="ignore", invalid="ignore"):
        oneover = 1 / vbeta
        nvx = 2 * n * maf * (1 - maf)
        # Fit linear regression through origin (nvx ~ oneover)
        coef = np.sum(nvx * oneover) / np.sum(oneover**2)

    # NaN compares False against 0, so it needs its own check; otherwise a NaN
    # sdY would silently propagate into every posterior probability
    if not np.isfinite(coef):
        raise ValueError("Estimated sdY is not finite - check for zero or missing variances and allele frequencies. A reasonable estimate of sdY is required to continue.")
    if coef < 0:
        raise ValueError("Estimated sdY is negative - this can happen with small datasets, or those with errors. A reasonable estimate of sdY is required to continue.")
    return np.sqrt(coef)
@@ -5,8 +5,9 @@ BUILDS = ["37", "38"]
5
5
  POPULATIONS = ["EUR", "AFR", "EAS", "AMR", "SAS"]
6
6
  REF_PANELS = [f"{pop}_{build}" for pop in POPULATIONS for build in BUILDS]
7
7
  REF_PANEL_COLUMNS = ["CHR", "SNP", "POS", "A1", "A2"]
8
- REF_PANELS_URL = "https://storage.googleapis.com/genal_files/{panel}.tar.gz"
9
- REF_PARQUET_URL = "https://storage.googleapis.com/genal_files/reference_variants_{build}.parquet"
8
+ BUCKET_URL = "https://storage.googleapis.com/genal_files/"
9
+ REF_PANELS_URL = BUCKET_URL + "{panel}.tar.gz"
10
+ REF_PARQUET_URL = BUCKET_URL + "reference_variants_{build}.parquet"
10
11
  CONFIG_DIR = os.path.expanduser("~/.genal/")
11
12
  CHECKS_DICT = {
12
13
  "CHR": False,