genal-python 0.3__tar.gz → 0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {genal_python-0.3 → genal_python-0.4}/PKG-INFO +43 -41
  2. {genal_python-0.3 → genal_python-0.4}/README.md +42 -40
  3. {genal_python-0.3 → genal_python-0.4}/genal/Geno.py +103 -0
  4. {genal_python-0.3 → genal_python-0.4}/genal/MR.py +15 -15
  5. {genal_python-0.3 → genal_python-0.4}/genal/MR_tools.py +35 -47
  6. {genal_python-0.3 → genal_python-0.4}/genal/MRpresso.py +1 -0
  7. {genal_python-0.3 → genal_python-0.4}/genal/__init__.py +1 -1
  8. {genal_python-0.3 → genal_python-0.4}/pyproject.toml +1 -1
  9. {genal_python-0.3 → genal_python-0.4}/requirements.txt +5 -5
  10. {genal_python-0.3 → genal_python-0.4}/.gitignore +0 -0
  11. {genal_python-0.3 → genal_python-0.4}/LICENSE +0 -0
  12. {genal_python-0.3 → genal_python-0.4}/docs/Images/MR_plot_SBP_AS.png +0 -0
  13. {genal_python-0.3 → genal_python-0.4}/docs/Makefile +0 -0
  14. {genal_python-0.3 → genal_python-0.4}/docs/_build/doctrees/environment.pickle +0 -0
  15. {genal_python-0.3 → genal_python-0.4}/docs/_build/doctrees/index.doctree +0 -0
  16. {genal_python-0.3 → genal_python-0.4}/docs/_build/doctrees/source/genal.doctree +0 -0
  17. {genal_python-0.3 → genal_python-0.4}/docs/_build/doctrees/source/modules.doctree +0 -0
  18. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/.buildinfo +0 -0
  19. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_sources/index.rst.txt +0 -0
  20. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_sources/source/genal.rst.txt +0 -0
  21. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_sources/source/modules.rst.txt +0 -0
  22. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/_sphinx_javascript_frameworks_compat.js +0 -0
  23. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/basic.css +0 -0
  24. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/badge_only.css +0 -0
  25. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff +0 -0
  26. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 +0 -0
  27. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff +0 -0
  28. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 +0 -0
  29. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/fontawesome-webfont.eot +0 -0
  30. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/fontawesome-webfont.svg +0 -0
  31. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/fontawesome-webfont.ttf +0 -0
  32. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/fontawesome-webfont.woff +0 -0
  33. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/fontawesome-webfont.woff2 +0 -0
  34. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-bold-italic.woff +0 -0
  35. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-bold-italic.woff2 +0 -0
  36. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-bold.woff +0 -0
  37. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-bold.woff2 +0 -0
  38. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-normal-italic.woff +0 -0
  39. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-normal-italic.woff2 +0 -0
  40. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-normal.woff +0 -0
  41. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/fonts/lato-normal.woff2 +0 -0
  42. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/css/theme.css +0 -0
  43. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/doctools.js +0 -0
  44. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/documentation_options.js +0 -0
  45. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/file.png +0 -0
  46. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/jquery.js +0 -0
  47. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/js/badge_only.js +0 -0
  48. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/js/html5shiv-printshiv.min.js +0 -0
  49. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/js/html5shiv.min.js +0 -0
  50. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/js/theme.js +0 -0
  51. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/language_data.js +0 -0
  52. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/minus.png +0 -0
  53. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/plus.png +0 -0
  54. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/pygments.css +0 -0
  55. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/searchtools.js +0 -0
  56. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/_static/sphinx_highlight.js +0 -0
  57. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/genindex.html +0 -0
  58. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/index.html +0 -0
  59. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/objects.inv +0 -0
  60. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/py-modindex.html +0 -0
  61. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/search.html +0 -0
  62. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/searchindex.js +0 -0
  63. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/source/genal.html +0 -0
  64. {genal_python-0.3 → genal_python-0.4}/docs/_build/html/source/modules.html +0 -0
  65. {genal_python-0.3 → genal_python-0.4}/docs/make.bat +0 -0
  66. {genal_python-0.3 → genal_python-0.4}/docs/requirements.txt +0 -0
  67. {genal_python-0.3 → genal_python-0.4}/docs/source/api.rst +0 -0
  68. {genal_python-0.3 → genal_python-0.4}/docs/source/conf.py +0 -0
  69. {genal_python-0.3 → genal_python-0.4}/docs/source/genal.rst +0 -0
  70. {genal_python-0.3 → genal_python-0.4}/docs/source/index.rst +0 -0
  71. {genal_python-0.3 → genal_python-0.4}/docs/source/introduction.rst +0 -0
  72. {genal_python-0.3 → genal_python-0.4}/docs/source/modules.rst +0 -0
  73. {genal_python-0.3 → genal_python-0.4}/genal/association.py +0 -0
  74. {genal_python-0.3 → genal_python-0.4}/genal/clump.py +0 -0
  75. {genal_python-0.3 → genal_python-0.4}/genal/constants.py +0 -0
  76. {genal_python-0.3 → genal_python-0.4}/genal/extract_prs.py +0 -0
  77. {genal_python-0.3 → genal_python-0.4}/genal/geno_tools.py +0 -0
  78. {genal_python-0.3 → genal_python-0.4}/genal/lift.py +0 -0
  79. {genal_python-0.3 → genal_python-0.4}/genal/proxy.py +0 -0
  80. {genal_python-0.3 → genal_python-0.4}/genal/tools.py +0 -0
  81. {genal_python-0.3 → genal_python-0.4}/gitignore +0 -0
  82. {genal_python-0.3 → genal_python-0.4}/readthedocs.yaml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: genal-python
3
- Version: 0.3
3
+ Version: 0.4
4
4
  Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
5
5
  Author-email: Cyprien Rivier <riviercyprien@gmail.com>
6
6
  Requires-Python: >=3.7
@@ -28,8 +28,8 @@ Project-URL: Home, https://github.com/CypRiv/genal
28
28
 
29
29
  # Table of contents
30
30
  1. [Introduction](#introduction)
31
- 2. [Requirements for the GENAL module](#paragraph1)
32
- 3. [Installation and how to use GENAL](#paragraph2)
31
+ 2. [Requirements for the genal module](#paragraph1)
32
+ 3. [Installation and how to use genal](#paragraph2)
33
33
  1. [Installation](#paragraph2.1)
34
34
  4. [Tutorial and presentation of the main tools](#paragraph3)
35
35
  1. [Data loading](#paragraph3.1)
@@ -50,11 +50,11 @@ The module prioritizes user-friendliness and intuitive operation, aiming to redu
50
50
 
51
51
  Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools.
52
52
 
53
- ## Requirements for the GENAL module <a name="paragraph1"></a>
53
+ ## Requirements for the genal module <a name="paragraph1"></a>
54
54
  ***Python 3.9 or later***. https://www.python.org/ <br>
55
55
 
56
56
 
57
- ## Installation and How to use the GENAL module <a name="paragraph2"></a>
57
+ ## Installation and How to use the genal module <a name="paragraph2"></a>
58
58
 
59
59
  ### Installation <a name="paragraph2.1"></a>
60
60
 
@@ -113,7 +113,7 @@ sbp_gwas.head(5)
113
113
  | 10:100003304:SNP | a | g | 0.9609| 0.0245 | 0.0838 | 0.769800| 737054 | 663809 |
114
114
  | 10:100003785:SNP | t | c | 0.6406| -0.0680| 0.0313 | 0.029870| 738169 | 735681 |
115
115
 
116
- We can now load this data into a `genal.Geno` object. The `genal.Geno` class is the central piece of the package. It is designed to store Single Nucleotide Polymorphisms (SNP) data and make it easy to preprocess and clean.
116
+ We can now load this data into a `genal.Geno` instance. The `genal.Geno` class is the central piece of the package. It is designed to store Single Nucleotide Polymorphisms (SNP) data and make it easy to preprocess and clean.
117
117
 
118
118
  The `genal.Geno` takes as input a pandas dataframe where each row corresponds to a SNP, with columns describing the position and possibly the effect of the SNP for the given trait (SBP in our case). To indicate the names of the columns, the following arguments can be passed:
119
119
  - **CHR**: Column name for chromosome. Defaults to "CHR".
@@ -140,7 +140,7 @@ sbp_gwas.head(5)
140
140
  | 10:100003304:SNP | a | g | 0.9609| 0.0245 | 0.0838 | 0.769800 | 737054 | 663809 | 10 | 100003304 | SNP |
141
141
  | 10:100003785:SNP | t | c | 0.6406| -0.0680| 0.0313 | 0.029870 | 738169 | 735681 | 10 | 100003785 | SNP |
142
142
 
143
- And it can now be loaded into a `genal.Geno` object:
143
+ And it can now be loaded into a `genal.Geno` instance:
144
144
 
145
145
  ```python
146
146
  import genal
@@ -155,7 +155,7 @@ The last argument (`keep_columns = False`) indicates that we do not wish to keep
155
155
 
156
156
  ### Data preprocessing <a name="paragraph3.2"></a>
157
157
 
158
- Now that we have loaded the data into a `genal.Geno` object, we can begin cleaning and formatting it. Methods such as Polygenic Risk Scoring or Mendelian Randomization require the SNP data to be in a specific format. Also, raw summary statistics can sometimes contain missing or invalid values that need to be handled. Additionally, some columns may be missing from the data (such as the SNP rsid column, or the non-effect allele column) and these columns can be created based on existing ones and a reference panel.
158
+ Now that we have loaded the data into a `genal.Geno` instance, we can begin cleaning and formatting it. Methods such as Polygenic Risk Scoring or Mendelian Randomization require the SNP data to be in a specific format. Also, raw summary statistics can sometimes contain missing or invalid values that need to be handled. Additionally, some columns may be missing from the data (such as the SNP rsid column, or the non-effect allele column) and these columns can be created based on existing ones and a reference panel.
159
159
 
160
160
  Genal can run all the basic cleaning and preprocessing steps in one command:
161
161
 
@@ -188,7 +188,7 @@ In our case, the SNP column (for SNP identifier - rsid) was missing from our dat
188
188
  The SNP column (rsID) has been created. 197511(2.787%) SNPs were not found in the reference data and their ID set to CHR:POS:EA.
189
189
  The BETA column looks like Beta estimates. Use effect_column='OR' if it is a column of Odds Ratios.
190
190
 
191
- You can always check the data of a `genal.Geno` object by accessing the 'data' attribute:
191
+ You can always check the data of a `genal.Geno` instance by accessing the 'data' attribute:
192
192
 
193
193
  ```python
194
194
  SBP_Geno.data
@@ -204,7 +204,7 @@ SBP_Geno.data
204
204
  | 7088120 | A | G | 0.9028| -0.0184| 0.0517 | 0.722300 | 9 | 99999468 | rs10981301 |
205
205
 
206
206
  And we see that the SNP column with the rsids has been added based on the reference data.
207
- You do not need to obtain the 1000 genome reference panel yourself, Genal will download it the first time you use it. By default, the reference panel used is the european (eur) one. You can specify another valid reference panel (afr, eas, sas, amr) with the reference_panel argument:
207
+ You do not need to obtain the 1000 Genomes reference panel yourself; genal will download it the first time you use it. By default, the reference panel used is the European (eur) one. You can specify another valid reference panel (afr, eas, sas, amr) with the reference_panel argument:
208
208
 
209
209
  ```python
210
210
  SBP_Geno.preprocess_data(preprocessing = 'Fill_delete', reference_panel = "afr")
@@ -216,7 +216,7 @@ You can also use a custom reference panel by specifying to the reference_panel a
216
216
 
217
217
  Clumping is the step at which we select the SNPs that will be used as our genetic instruments in future Polygenic Risk Scores and Mendelian Randomization analyses. The process involves identifying the SNPs that are strongly associated with our trait of interest (systolic blood pressure in this tutorial) and are independent from each other. This second step ensures that selected SNPs are not highly correlated (i.e., they are not in close linkage disequilibrium). For this step, we again need to use a reference panel.
218
218
 
219
- The SNP-data loaded in a `genal.Geno` object can be clumped using the `genal.Geno.clump` method. It will return another `genal.Geno` object containing only the clumped data:
219
+ The SNP-data loaded in a `genal.Geno` instance can be clumped using the `genal.Geno.clump` method. It will return another `genal.Geno` instance containing only the clumped data:
220
220
 
221
221
 
222
222
  ```python
@@ -243,7 +243,7 @@ Computing a Polygenic Risk Score (PRS) can be done in one line with the `genal.G
243
243
  SBP_clumped.prs(name = "SBP_prs", path = "path/to/genetic/files")
244
244
  ```
245
245
 
246
- The genetic files of the target population can be either one triple of bed/bim/fam files containing information for all SNPs, or they can be divided by chromosome (one bed/bim/fam triple for chr 1, another for chr 2, etc...). In the latter case, provide the path by replacing the chromosome number by '$' and Genal will extract the necessary SNPs from each chromosome and merge them before running the PRS. For instance, if the genetic files are named `Pop_chr1.bed`, `Pop_chr1.bim`, `Pop_chr1.fam`, `Pop_chr2.bed`, ..., you can use:
246
+ The genetic files of the target population can be either one triple of bed/bim/fam files containing information for all SNPs, or they can be divided by chromosome (one bed/bim/fam triple for chr 1, another for chr 2, etc...). In the latter case, provide the path by replacing the chromosome number by '$' and genal will extract the necessary SNPs from each chromosome and merge them before running the PRS. For instance, if the genetic files are named `Pop_chr1.bed`, `Pop_chr1.bim`, `Pop_chr1.fam`, `Pop_chr2.bed`, ..., you can use:
247
247
 
248
248
  ```python
249
249
  SBP_clumped.prs(name = "SBP_prs", path = "Pop_chr$")
@@ -339,12 +339,12 @@ You can customize how the proxies are chosen with the following arguments:
339
339
 
340
340
  > **Note:**
341
341
  >
342
- > You can call the `genal.Geno.prs` method on any `Geno` object (containing at least the EA, BETA, and either SNP or CHR/POS columns). The data does not need to be clumped, and there is no limit to the number of instruments used to compute the scores.
342
+ > You can call the `genal.Geno.prs` method on any `Geno` instance (containing at least the EA, BETA, and either SNP or CHR/POS columns). The data does not need to be clumped, and there is no limit to the number of instruments used to compute the scores.
343
343
 
344
344
 
345
345
  ### Mendelian Randomization <a name="paragraph3.5"></a>
346
346
 
347
- To run MR, we need to load both our exposure and outcome SNP-level data in `genal.Geno` objects. In our case, the genetic instruments of the MR are the SNPs associated with blood pressure at genome-wide significant levels resulting from the clumping of the blood pressure GWAS. They are stored in our `SBP_clumped` `genal.Geno` object which also include their association with the exposure trait (instrument-SBP estimates in the BETA column).
347
+ To run MR, we need to load both our exposure and outcome SNP-level data in `genal.Geno` instances. In our case, the genetic instruments of the MR are the SNPs associated with blood pressure at genome-wide significant levels resulting from the clumping of the blood pressure GWAS. They are stored in our `SBP_clumped` `genal.Geno` instance, which also includes their association with the exposure trait (instrument-SBP estimates in the BETA column).
348
348
 
349
349
  To get their association with the outcome trait (instrument-stroke estimates), we are going to use SNP-level data from a large GWAS of stroke performed by the GIGASTROKE consortium ([https://www.nature.com/articles/s41586-022-05165-3](https://www.nature.com/articles/s41586-022-05165-3)):
350
350
 
@@ -362,7 +362,7 @@ We inspect it to determine the column names:
362
362
  | 3 | 62707519 | 0.0536 | 0.0152 | 0.0152 | 0.3177 | 1.015316 | 0.985514 | 1.046019 | T | C |
363
363
  | 2 | 80464120 | 0.9789 | 0.0057 | 0.0254 | 0.8223 | 1.005716 | 0.956874 | 1.057052 | T | G |
364
364
 
365
- We load it in a `genal.Geno` object:
365
+ We load it in a `genal.Geno` instance:
366
366
 
367
367
  ```python
368
368
  Stroke_Geno = genal.Geno(stroke_gwas, CHR = "chromosome", POS = "base_pair_location", EA = "effect_allele", NEA = "other_allele", BETA = "beta", SE = "standard_error", P = "p_value", EAF = "effect_allele_frequency", keep_columns = False)
@@ -382,7 +382,7 @@ SBP_clumped.query_outcome(Stroke_Geno, proxy = False)
382
382
 
383
383
  Genal will print how many SNPs were successfully found and extracted from the outcome data:
384
384
 
385
- Outcome data successfully loaded from 'b352e412' geno object.
385
+ Outcome data successfully loaded from 'b352e412' geno instance.
386
386
  Identifying the exposure SNPs present in the outcome data...
387
387
  1541 SNPs out of 1545 are present in the outcome data.
388
388
  (Exposure data, Outcome data, Outcome name) stored in the .MR_data attribute.
@@ -393,9 +393,9 @@ Here as well you have the option to use proxies for the instruments that are not
393
393
  SBP_clumped.query_outcome(Stroke_geno, proxy = True, reference_panel = "eur", kb = 5000, r2 = 0.6, window_snps = 5000)
394
394
  ```
395
395
 
396
- And Genal will print the number of missing instruments which have been proxied:
396
+ And genal will print the number of missing instruments which have been proxied:
397
397
 
398
- Outcome data successfully loaded from 'b352e412' geno object.
398
+ Outcome data successfully loaded from 'b352e412' geno instance.
399
399
  Identifying the exposure SNPs present in the outcome data...
400
400
  1541 SNPs out of 1545 are present in the outcome data.
401
401
  Searching proxies for 4 SNPs...
@@ -403,26 +403,27 @@ And Genal will print the number of missing instruments which have been proxied:
403
403
  Found proxies for 4 SNPs.
404
404
  (Exposure data, Outcome data, Outcome name) stored in the .MR_data attribute.
405
405
 
406
- After extracting the instruments from the outcome data, the SBP_clumped `genal.Geno` object contains an 'MR_data' attribute containing the instruments-exposure and instruments-outcome associations necessary to run MR. Running MR is now as simple as calling the `genal.Geno.MR` method of the SBP_clumped `genal.Geno` object:
406
+ After extracting the instruments from the outcome data, the SBP_clumped `genal.Geno` instance contains an 'MR_data' attribute containing the instruments-exposure and instruments-outcome associations necessary to run MR. Running MR is now as simple as calling the `genal.Geno.MR` method of the SBP_clumped `genal.Geno` instance:
407
407
 
408
408
  ```python
409
409
  SBP_clumped.MR(action = 3, exposure_name = "SBP", outcome_name = "Stroke_eur")
410
410
  ```
411
411
 
412
412
  The `genal.Geno.MR` method returns a dataframe containing the estimates and p-values for different MR methods:
413
- | exposure | outcome | method | nSNP | b | se | pval |
414
- |----------|------------|--------------------------------------------|------|----------|----------|----------|
415
- | SBP | Stroke_eur | Inverse-Variance Weighted | 1312 | 0.023394 | 0.001132 | <e-100 |
416
- | SBP | Stroke_eur | Inverse Variance weighted (Fixed effects) | 1312 | 0.023394 | 0.000807 | <e-100 |
417
- | SBP | Stroke_eur | Unweighted regression | 1312 | 0.021764 | 0.078648 | 0.781986 |
418
- | SBP | Stroke_eur | Weighted Median | 1312 | 0.022891 | 0.001423 | <e-100 |
419
- | SBP | Stroke_eur | Penalised weighted median | 1312 | 0.021525 | 0.001432 | <e-100 |
420
- | SBP | Stroke_eur | Simple median | 1312 | 0.021480 | 0.001364 | <e-100 |
421
- | SBP | Stroke_eur | Sign concordance test | 1312 | 0.373476 | NaN | 0.0 |
422
- | SBP | Stroke_eur | MR Egger | 1312 | 0.029312 | 0.003063 | <e-100 |
423
- | SBP | Stroke_eur | Egger Intercept | 1312 |-0.001799 | 0.000865 | 0.037777 |
424
- | SBP | Stroke_eur | MR Egger bootstrap | 1312 | 0.030342 | 0.002093 | <e-100 |
425
- | SBP | Stroke_eur | Egger Intercept bootstrap | 1312 |-0.002758 | 0.000740 | 0.0015 |
413
+ | exposure | outcome | method | nSNP | b | se | pval |
414
+ |----------|-------------|-----------------------------------------|------|----------|----------|---------------|
415
+ | SBP | Stroke_eur | Inverse-Variance Weighted | 1314 | 0.023376 | 0.001131 | 7.238794e-95 |
416
+ | SBP | Stroke_eur | Inverse Variance Weighted (Fixed Effects)| 1314 | 0.023376 | 0.000806 | 8.391230e-185 |
417
+ | SBP | Stroke_eur | Unweighted Regression | 1314 | 0.021736 | 0.078596 | 7.821255e-01 |
418
+ | SBP | Stroke_eur | Weighted Median | 1314 | 0.022872 | 0.001437 | 4.984142e-57 |
419
+ | SBP | Stroke_eur | Penalised Weighted Median | 1314 | 0.021472 | 0.001418 | 8.950351e-52 |
420
+ | SBP | Stroke_eur | Simple Median | 1314 | 0.021447 | 0.001374 | 6.521750e-55 |
421
+ | SBP | Stroke_eur | Sign concordance test | 1312 | 0.373476 | NaN | 1.664938e-42 |
422
+ | SBP | Stroke_eur | MR Egger | 1314 | 0.029292 | 0.003060 | 5.009624e-21 |
423
+ | SBP | Stroke_eur | Egger Intercept | 1314 | -0.001798| 0.000864 | 3.768249e-02 |
424
+ | SBP | Stroke_eur | MR Egger bootstrap | 1314 | 0.030269 | 0.002076 | 0.000000e+00 |
425
+ | SBP | Stroke_eur | Egger Intercept bootstrap | 1314 | -0.002794| 0.000699 | 0.000000e+00 |
426
+
426
427
 
427
428
  You can specify several arguments. We refer to the API for a full list, but the most important one is the 'action' argument. It determines how palindromic SNPs are treated during the exposure-outcome harmonization step. Palindromic SNPs are SNPs where the nucleotide change reads the same forward and backward on complementary strands of DNA (for instance EA = 'A' and NEA = 'T').
428
429
 
@@ -430,7 +431,7 @@ You can specify several arguments. We refer to the API for a full list, but the
430
431
  - **action = 2**: Uses effect allele frequencies to attempt to flip them (conservative, default)
431
432
  - **action = 3**: Removes all palindromic SNPs (very conservative)
432
433
 
433
- If you choose the option 2 or 3 (recommended), Genal will print the list of palindromic SNPs that have been removed from the analysis.
434
+ If you choose the option 2 or 3 (recommended), genal will print the list of palindromic SNPs that have been removed from the analysis.
434
435
 
435
436
  By default, all MR methods (inverse-variance weighted, weighted median, MR-Egger, etc.) are going to be run. But if you do not wish to run all of them, you can specify a 'methods' argument. More details in the `genal.Geno.MR` API.
436
437
 
@@ -443,7 +444,7 @@ SBP_clumped.MR_plot(filename="MR_plot_SBP_AS")
443
444
  ```
444
445
 
445
446
  ![MR plot](docs/Images/MR_plot_SBP_AS.png)
446
- You can select which MR methods you wish to plot with the 'methods' argument. Note that for an MR method to be plotted, they must be included in the latest `genal.Geno.MR` call of this `genal.Geno` object.
447
+ You can select which MR methods you wish to plot with the 'methods' argument. Note that for an MR method to be plotted, it must have been included in the latest `genal.Geno.MR` call of this `genal.Geno` instance.
447
448
 
448
449
  If you wish to include the heterogeneity values (Cochran's Q) in the results, you can use the heterogeneity argument in the `genal.Geno.MR` call. Here, the heterogeneity for the inverse-variance weighted method:
449
450
 
@@ -452,9 +453,10 @@ SBP_clumped.MR(action = 3, methods = ["IVW"], exposure_name = "SBP", outcome_nam
452
453
  ```
453
454
 
454
455
  And that will give:
455
- | exposure | outcome | method | nSNP | b | se | pval | Q | Q_df | Q_pval |
456
- |----------|------------|----------------------------|------|----------|----------|--------|--------------|------|---------|
457
- | SBP | Stroke_eur | Inverse-Variance Weighted | 1312 | 0.023394 | 0.001132 | <e-100 | 2583.740268 | 1311 | <e-100 |
456
+ | exposure | outcome | method | nSNP | b | se | pval | Q | Q_df | Q_pval |
457
+ |----------|------------|---------------------------|------|----------|----------|---------------|-------------|------|--------------|
458
+ | SBP | Stroke_eur | Inverse-Variance Weighted | 1314 | 0.023376 | 0.001131 | 7.238794e-95 | 2584.415624 | 1313 | 1.568683e-85 |
459
+
458
460
 
459
461
 
460
462
  As expected, many MR methods indicate that SBP is strongly associated with stroke, but there are some signs of horizontal pleiotropy (instruments influencing the outcome through a different pathway than the one used as exposure) given the significant MR-Egger intercept p-value.
@@ -488,7 +490,7 @@ df_pheno = pd.read_csv("path/to/trait/data")
488
490
  >
489
491
  > One important detail is to make sure that the individual IDs are identical between the phenotypic data and the genetic data for the target population.
490
492
 
491
- Then, it is advised to make a copy of the `genal.Geno` object containing our instruments as we are going to update their coefficients and to avoid any confusion:
493
+ Then, it is advised to make a copy of the `genal.Geno` instance containing our instruments as we are going to update their coefficients and to avoid any confusion:
492
494
 
493
495
  ```python
494
496
  SBP_adjusted = SBP_clumped.copy()
@@ -500,7 +502,7 @@ We can then call the `genal.Geno.set_phenotype` method, specifying which column
500
502
  SBP_adjusted.set_phenotype(df_pheno, PHENO = "htn", IID = "IID")
501
503
  ```
502
504
 
503
- At this point, Genal will identify if the phenotype is binary or quantitative (to determine the regression model. If the phenotype is binary, it will assume that the most frequent value is coding for control (and the other value for case), this can be changed with 'alternate_control=True':
505
+ At this point, genal will identify if the phenotype is binary or quantitative (to determine the regression model). If the phenotype is binary, it will assume that the most frequent value is coding for control (and the other value for case); this can be changed with 'alternate_control=True':
504
506
 
505
507
  Detected a binary phenotype in the 'PHENO' column. Specify 'PHENO_type="quant"' if this is incorrect.
506
508
  Identified 0 as the control code in 'PHENO'. Set 'alternate_control=True' to inverse this interpretation.
@@ -549,13 +551,13 @@ The SBP_adjusted.data attribute has been updated in the BETA, SE, and P columns
549
551
  ### Lifting <a name="paragraph3.7"></a>
550
552
 
551
553
  It is sometimes necessary to lift the SNP data to a different build. For instance, if the genetic data of our target population is in build 38 (hg38), but the GWAS summary statistics are in build 37 (hg19).
552
- This can easily be done in Genal using the `genal.Geno.lift` method:
554
+ This can easily be done in genal using the `genal.Geno.lift` method:
553
555
 
554
556
  ```python
555
557
  SBP_clumped.lift(start = "hg19", end = "hg38", replace = False)
556
558
  ```
557
559
 
558
- This outputs a table with the lifted SBP instruments (stored in the `SBP_clumped` object) from build 37 (hg19) to build 38 (hg38). We specified `replace = False` to not modify the `SBP_clumped.data` attribute, but we may want to modify it (before running a PRS in a population stored in build 38 for instance). Genal will download the appropriate chain files required for the lift, and it will be done in pure python by default. However, if you plan to lift large datasets of SNPs (the whole summary statistics for instance), it may be useful to install the LiftOver executable that will run faster than the pure python version. It can be downloaded here: [https://genome-store.ucsc.edu/](https://genome-store.ucsc.edu/) You will need to create an account, scroll down to "LiftOver program", add it to your cart, and declare that you are a non-profit user.
560
+ This outputs a table with the lifted SBP instruments (stored in the `SBP_clumped` instance) from build 37 (hg19) to build 38 (hg38). We specified `replace = False` to not modify the `SBP_clumped.data` attribute, but we may want to modify it (before running a PRS in a population stored in build 38 for instance). Genal will download the appropriate chain files required for the lift, and it will be done in pure python by default. However, if you plan to lift large datasets of SNPs (the whole summary statistics for instance), it may be useful to install the LiftOver executable that will run faster than the pure python version. It can be downloaded here: [https://genome-store.ucsc.edu/](https://genome-store.ucsc.edu/) You will need to create an account, scroll down to "LiftOver program", add it to your cart, and declare that you are a non-profit user.
559
561
 
560
562
  You can specify the path of the LiftOver executable to the `liftover_path` argument:
561
563
 
@@ -6,8 +6,8 @@
6
6
 
7
7
  # Table of contents
8
8
  1. [Introduction](#introduction)
9
- 2. [Requirements for the GENAL module](#paragraph1)
10
- 3. [Installation and how to use GENAL](#paragraph2)
9
+ 2. [Requirements for the genal module](#paragraph1)
10
+ 3. [Installation and how to use genal](#paragraph2)
11
11
  1. [Installation](#paragraph2.1)
12
12
  4. [Tutorial and presentation of the main tools](#paragraph3)
13
13
  1. [Data loading](#paragraph3.1)
@@ -28,11 +28,11 @@ The module prioritizes user-friendliness and intuitive operation, aiming to redu
28
28
 
29
29
  Genal draws on concepts from well-established R packages such as TwoSampleMR, MR-Presso, MendelianRandomization, and gwasvcf, adapting their proven methodologies to the Python environment. This approach ensures that users have access to tried and tested techniques with the versatility of Python's data science tools.
30
30
 
31
- ## Requirements for the GENAL module <a name="paragraph1"></a>
31
+ ## Requirements for the genal module <a name="paragraph1"></a>
32
32
  ***Python 3.9 or later***. https://www.python.org/ <br>
33
33
 
34
34
 
35
- ## Installation and How to use the GENAL module <a name="paragraph2"></a>
35
+ ## Installation and How to use the genal module <a name="paragraph2"></a>
36
36
 
37
37
  ### Installation <a name="paragraph2.1"></a>
38
38
 
@@ -91,7 +91,7 @@ sbp_gwas.head(5)
91
91
  | 10:100003304:SNP | a | g | 0.9609| 0.0245 | 0.0838 | 0.769800| 737054 | 663809 |
92
92
  | 10:100003785:SNP | t | c | 0.6406| -0.0680| 0.0313 | 0.029870| 738169 | 735681 |
93
93
 
94
- We can now load this data into a `genal.Geno` object. The `genal.Geno` class is the central piece of the package. It is designed to store Single Nucleotide Polymorphisms (SNP) data and make it easy to preprocess and clean.
94
+ We can now load this data into a `genal.Geno` instance. The `genal.Geno` class is the central piece of the package. It is designed to store Single Nucleotide Polymorphisms (SNP) data and make it easy to preprocess and clean.
95
95
 
96
96
  The `genal.Geno` class takes as input a pandas dataframe where each row corresponds to a SNP, with columns describing the position and possibly the effect of the SNP for the given trait (SBP in our case). To indicate the names of the columns, the following arguments can be passed:
97
97
  - **CHR**: Column name for chromosome. Defaults to "CHR".
@@ -118,7 +118,7 @@ sbp_gwas.head(5)
118
118
  | 10:100003304:SNP | a | g | 0.9609| 0.0245 | 0.0838 | 0.769800 | 737054 | 663809 | 10 | 100003304 | SNP |
119
119
  | 10:100003785:SNP | t | c | 0.6406| -0.0680| 0.0313 | 0.029870 | 738169 | 735681 | 10 | 100003785 | SNP |
120
120
 
121
- And it can now be loaded into a `genal.Geno` object:
121
+ And it can now be loaded into a `genal.Geno` instance:
122
122
 
123
123
  ```python
124
124
  import genal
@@ -133,7 +133,7 @@ The last argument (`keep_columns = False`) indicates that we do not wish to keep
133
133
 
134
134
  ### Data preprocessing <a name="paragraph3.2"></a>
135
135
 
136
- Now that we have loaded the data into a `genal.Geno` object, we can begin cleaning and formatting it. Methods such as Polygenic Risk Scoring or Mendelian Randomization require the SNP data to be in a specific format. Also, raw summary statistics can sometimes contain missing or invalid values that need to be handled. Additionally, some columns may be missing from the data (such as the SNP rsid column, or the non-effect allele column) and these columns can be created based on existing ones and a reference panel.
136
+ Now that we have loaded the data into a `genal.Geno` instance, we can begin cleaning and formatting it. Methods such as Polygenic Risk Scoring or Mendelian Randomization require the SNP data to be in a specific format. Also, raw summary statistics can sometimes contain missing or invalid values that need to be handled. Additionally, some columns may be missing from the data (such as the SNP rsid column, or the non-effect allele column) and these columns can be created based on existing ones and a reference panel.
137
137
 
138
138
  Genal can run all the basic cleaning and preprocessing steps in one command:
139
139
 
@@ -166,7 +166,7 @@ In our case, the SNP column (for SNP identifier - rsid) was missing from our dat
166
166
  The SNP column (rsID) has been created. 197511(2.787%) SNPs were not found in the reference data and their ID set to CHR:POS:EA.
167
167
  The BETA column looks like Beta estimates. Use effect_column='OR' if it is a column of Odds Ratios.
168
168
 
169
- You can always check the data of a `genal.Geno` object by accessing the 'data' attribute:
169
+ You can always check the data of a `genal.Geno` instance by accessing the 'data' attribute:
170
170
 
171
171
  ```python
172
172
  SBP_Geno.data
@@ -182,7 +182,7 @@ SBP_Geno.data
182
182
  | 7088120 | A | G | 0.9028| -0.0184| 0.0517 | 0.722300 | 9 | 99999468 | rs10981301 |
183
183
 
184
184
  And we see that the SNP column with the rsids has been added based on the reference data.
185
- You do not need to obtain the 1000 genome reference panel yourself, Genal will download it the first time you use it. By default, the reference panel used is the european (eur) one. You can specify another valid reference panel (afr, eas, sas, amr) with the reference_panel argument:
185
+ You do not need to obtain the 1000 genome reference panel yourself, genal will download it the first time you use it. By default, the reference panel used is the european (eur) one. You can specify another valid reference panel (afr, eas, sas, amr) with the reference_panel argument:
186
186
 
187
187
  ```python
188
188
  SBP_Geno.preprocess_data(preprocessing = 'Fill_delete', reference_panel = "afr")
@@ -194,7 +194,7 @@ You can also use a custom reference panel by specifying to the reference_panel a
194
194
 
195
195
  Clumping is the step at which we select the SNPs that will be used as our genetic instruments in future Polygenic Risk Scores and Mendelian Randomization analyses. The process involves identifying the SNPs that are strongly associated with our trait of interest (systolic blood pressure in this tutorial) and are independent from each other. This second step ensures that selected SNPs are not highly correlated (i.e., they are not in close linkage disequilibrium). For this step, we again need to use a reference panel.
196
196
 
197
- The SNP-data loaded in a `genal.Geno` object can be clumped using the `genal.Geno.clump` method. It will return another `genal.Geno` object containing only the clumped data:
197
+ The SNP-data loaded in a `genal.Geno` instance can be clumped using the `genal.Geno.clump` method. It will return another `genal.Geno` instance containing only the clumped data:
198
198
 
199
199
 
200
200
  ```python
@@ -221,7 +221,7 @@ Computing a Polygenic Risk Score (PRS) can be done in one line with the `genal.G
221
221
  SBP_clumped.prs(name = "SBP_prs", path = "path/to/genetic/files")
222
222
  ```
223
223
 
224
- The genetic files of the target population can be either one triple of bed/bim/fam files containing information for all SNPs, or they can be divided by chromosome (one bed/bim/fam triple for chr 1, another for chr 2, etc...). In the latter case, provide the path by replacing the chromosome number by '$' and Genal will extract the necessary SNPs from each chromosome and merge them before running the PRS. For instance, if the genetic files are named `Pop_chr1.bed`, `Pop_chr1.bim`, `Pop_chr1.fam`, `Pop_chr2.bed`, ..., you can use:
224
+ The genetic files of the target population can be either one triple of bed/bim/fam files containing information for all SNPs, or they can be divided by chromosome (one bed/bim/fam triple for chr 1, another for chr 2, etc...). In the latter case, provide the path by replacing the chromosome number by '$' and genal will extract the necessary SNPs from each chromosome and merge them before running the PRS. For instance, if the genetic files are named `Pop_chr1.bed`, `Pop_chr1.bim`, `Pop_chr1.fam`, `Pop_chr2.bed`, ..., you can use:
225
225
 
226
226
  ```python
227
227
  SBP_clumped.prs(name = "SBP_prs", path = "Pop_chr$")
@@ -317,12 +317,12 @@ You can customize how the proxies are chosen with the following arguments:
317
317
 
318
318
  > **Note:**
319
319
  >
320
- > You can call the `genal.Geno.prs` method on any `Geno` object (containing at least the EA, BETA, and either SNP or CHR/POS columns). The data does not need to be clumped, and there is no limit to the number of instruments used to compute the scores.
320
+ > You can call the `genal.Geno.prs` method on any `Geno` instance (containing at least the EA, BETA, and either SNP or CHR/POS columns). The data does not need to be clumped, and there is no limit to the number of instruments used to compute the scores.
321
321
 
322
322
 
323
323
  ### Mendelian Randomization <a name="paragraph3.5"></a>
324
324
 
325
- To run MR, we need to load both our exposure and outcome SNP-level data in `genal.Geno` objects. In our case, the genetic instruments of the MR are the SNPs associated with blood pressure at genome-wide significant levels resulting from the clumping of the blood pressure GWAS. They are stored in our `SBP_clumped` `genal.Geno` object which also include their association with the exposure trait (instrument-SBP estimates in the BETA column).
325
+ To run MR, we need to load both our exposure and outcome SNP-level data in `genal.Geno` instances. In our case, the genetic instruments of the MR are the SNPs associated with blood pressure at genome-wide significant levels resulting from the clumping of the blood pressure GWAS. They are stored in our `SBP_clumped` `genal.Geno` instance, which also includes their association with the exposure trait (instrument-SBP estimates in the BETA column).
326
326
 
327
327
  To get their association with the outcome trait (instrument-stroke estimates), we are going to use SNP-level data from a large GWAS of stroke performed by the GIGASTROKE consortium ([https://www.nature.com/articles/s41586-022-05165-3](https://www.nature.com/articles/s41586-022-05165-3)):
328
328
 
@@ -340,7 +340,7 @@ We inspect it to determine the column names:
340
340
  | 3 | 62707519 | 0.0536 | 0.0152 | 0.0152 | 0.3177 | 1.015316 | 0.985514 | 1.046019 | T | C |
341
341
  | 2 | 80464120 | 0.9789 | 0.0057 | 0.0254 | 0.8223 | 1.005716 | 0.956874 | 1.057052 | T | G |
342
342
 
343
- We load it in a `genal.Geno` object:
343
+ We load it in a `genal.Geno` instance:
344
344
 
345
345
  ```python
346
346
  Stroke_Geno = genal.Geno(stroke_gwas, CHR = "chromosome", POS = "base_pair_location", EA = "effect_allele", NEA = "other_allele", BETA = "beta", SE = "standard_error", P = "p_value", EAF = "effect_allele_frequency", keep_columns = False)
@@ -360,7 +360,7 @@ SBP_clumped.query_outcome(Stroke_Geno, proxy = False)
360
360
 
361
361
  Genal will print how many SNPs were successfully found and extracted from the outcome data:
362
362
 
363
- Outcome data successfully loaded from 'b352e412' geno object.
363
+ Outcome data successfully loaded from 'b352e412' geno instance.
364
364
  Identifying the exposure SNPs present in the outcome data...
365
365
  1541 SNPs out of 1545 are present in the outcome data.
366
366
  (Exposure data, Outcome data, Outcome name) stored in the .MR_data attribute.
@@ -371,9 +371,9 @@ Here as well you have the option to use proxies for the instruments that are not
371
371
  SBP_clumped.query_outcome(Stroke_geno, proxy = True, reference_panel = "eur", kb = 5000, r2 = 0.6, window_snps = 5000)
372
372
  ```
373
373
 
374
- And Genal will print the number of missing instruments which have been proxied:
374
+ And genal will print the number of missing instruments which have been proxied:
375
375
 
376
- Outcome data successfully loaded from 'b352e412' geno object.
376
+ Outcome data successfully loaded from 'b352e412' geno instance.
377
377
  Identifying the exposure SNPs present in the outcome data...
378
378
  1541 SNPs out of 1545 are present in the outcome data.
379
379
  Searching proxies for 4 SNPs...
@@ -381,26 +381,27 @@ And Genal will print the number of missing instruments which have been proxied:
381
381
  Found proxies for 4 SNPs.
382
382
  (Exposure data, Outcome data, Outcome name) stored in the .MR_data attribute.
383
383
 
384
- After extracting the instruments from the outcome data, the SBP_clumped `genal.Geno` object contains an 'MR_data' attribute containing the instruments-exposure and instruments-outcome associations necessary to run MR. Running MR is now as simple as calling the `genal.Geno.MR` method of the SBP_clumped `genal.Geno` object:
384
+ After extracting the instruments from the outcome data, the SBP_clumped `genal.Geno` instance contains an 'MR_data' attribute containing the instruments-exposure and instruments-outcome associations necessary to run MR. Running MR is now as simple as calling the `genal.Geno.MR` method of the SBP_clumped `genal.Geno` instance:
385
385
 
386
386
  ```python
387
387
  SBP_clumped.MR(action = 3, exposure_name = "SBP", outcome_name = "Stroke_eur")
388
388
  ```
389
389
 
390
390
  The `genal.Geno.MR` method returns a dataframe containing the estimates and p-values for different MR methods:
391
- | exposure | outcome | method | nSNP | b | se | pval |
392
- |----------|------------|--------------------------------------------|------|----------|----------|----------|
393
- | SBP | Stroke_eur | Inverse-Variance Weighted | 1312 | 0.023394 | 0.001132 | <e-100 |
394
- | SBP | Stroke_eur | Inverse Variance weighted (Fixed effects) | 1312 | 0.023394 | 0.000807 | <e-100 |
395
- | SBP | Stroke_eur | Unweighted regression | 1312 | 0.021764 | 0.078648 | 0.781986 |
396
- | SBP | Stroke_eur | Weighted Median | 1312 | 0.022891 | 0.001423 | <e-100 |
397
- | SBP | Stroke_eur | Penalised weighted median | 1312 | 0.021525 | 0.001432 | <e-100 |
398
- | SBP | Stroke_eur | Simple median | 1312 | 0.021480 | 0.001364 | <e-100 |
399
- | SBP | Stroke_eur | Sign concordance test | 1312 | 0.373476 | NaN | 0.0 |
400
- | SBP | Stroke_eur | MR Egger | 1312 | 0.029312 | 0.003063 | <e-100 |
401
- | SBP | Stroke_eur | Egger Intercept | 1312 |-0.001799 | 0.000865 | 0.037777 |
402
- | SBP | Stroke_eur | MR Egger bootstrap | 1312 | 0.030342 | 0.002093 | <e-100 |
403
- | SBP | Stroke_eur | Egger Intercept bootstrap | 1312 |-0.002758 | 0.000740 | 0.0015 |
391
+ | exposure | outcome | method | nSNP | b | se | pval |
392
+ |----------|-------------|-----------------------------------------|------|----------|----------|---------------|
393
+ | SBP | Stroke_eur | Inverse-Variance Weighted | 1314 | 0.023376 | 0.001131 | 7.238794e-95 |
394
+ | SBP | Stroke_eur | Inverse Variance Weighted (Fixed Effects)| 1314 | 0.023376 | 0.000806 | 8.391230e-185 |
395
+ | SBP | Stroke_eur | Unweighted Regression | 1314 | 0.021736 | 0.078596 | 7.821255e-01 |
396
+ | SBP | Stroke_eur | Weighted Median | 1314 | 0.022872 | 0.001437 | 4.984142e-57 |
397
+ | SBP | Stroke_eur | Penalised Weighted Median | 1314 | 0.021472 | 0.001418 | 8.950351e-52 |
398
+ | SBP | Stroke_eur | Simple Median | 1314 | 0.021447 | 0.001374 | 6.521750e-55 |
399
+ | SBP | Stroke_eur | Sign concordance test | 1312 | 0.373476 | NaN | 1.664938e-42 |
400
+ | SBP | Stroke_eur | MR Egger | 1314 | 0.029292 | 0.003060 | 5.009624e-21 |
401
+ | SBP | Stroke_eur | Egger Intercept | 1314 | -0.001798| 0.000864 | 3.768249e-02 |
402
+ | SBP | Stroke_eur | MR Egger bootstrap | 1314 | 0.030269 | 0.002076 | 0.000000e+00 |
403
+ | SBP | Stroke_eur | Egger Intercept bootstrap | 1314 | -0.002794| 0.000699 | 0.000000e+00 |
404
+
404
405
 
405
406
  You can specify several arguments. We refer to the API for a full list, but the most important one is the 'action' argument. It determines how palindromic SNPs are treated during the exposure-outcome harmonization step. Palindromic SNPs are SNPs where the nucleotide change reads the same forward and backward on complementary strands of DNA (for instance EA = 'A' and NEA = 'T').
406
407
 
@@ -408,7 +409,7 @@ You can specify several arguments. We refer to the API for a full list, but the
408
409
  - **action = 2**: Uses effect allele frequencies to attempt to flip them (conservative, default)
409
410
  - **action = 3**: Removes all palindromic SNPs (very conservative)
410
411
 
411
- If you choose the option 2 or 3 (recommended), Genal will print the list of palindromic SNPs that have been removed from the analysis.
412
+ If you choose the option 2 or 3 (recommended), genal will print the list of palindromic SNPs that have been removed from the analysis.
412
413
 
413
414
  By default, all MR methods (inverse-variance weighted, weighted median, MR-Egger, etc.) are going to be run. But if you do not wish to run all of them, you can specify a 'methods' argument. More details in the `genal.Geno.MR` API.
414
415
 
@@ -421,7 +422,7 @@ SBP_clumped.MR_plot(filename="MR_plot_SBP_AS")
421
422
  ```
422
423
 
423
424
  ![MR plot](docs/Images/MR_plot_SBP_AS.png)
424
- You can select which MR methods you wish to plot with the 'methods' argument. Note that for an MR method to be plotted, they must be included in the latest `genal.Geno.MR` call of this `genal.Geno` object.
425
+ You can select which MR methods you wish to plot with the 'methods' argument. Note that for an MR method to be plotted, it must have been included in the latest `genal.Geno.MR` call of this `genal.Geno` instance.
425
426
 
426
427
  If you wish to include the heterogeneity values (Cochran's Q) in the results, you can use the heterogeneity argument in the `genal.Geno.MR` call. Here, the heterogeneity for the inverse-variance weighted method:
427
428
 
@@ -430,9 +431,10 @@ SBP_clumped.MR(action = 3, methods = ["IVW"], exposure_name = "SBP", outcome_nam
430
431
  ```
431
432
 
432
433
  And that will give:
433
- | exposure | outcome | method | nSNP | b | se | pval | Q | Q_df | Q_pval |
434
- |----------|------------|----------------------------|------|----------|----------|--------|--------------|------|---------|
435
- | SBP | Stroke_eur | Inverse-Variance Weighted | 1312 | 0.023394 | 0.001132 | <e-100 | 2583.740268 | 1311 | <e-100 |
434
+ | exposure | outcome | method | nSNP | b | se | pval | Q | Q_df | Q_pval |
435
+ |----------|------------|---------------------------|------|----------|----------|---------------|-------------|------|--------------|
436
+ | SBP | Stroke_eur | Inverse-Variance Weighted | 1314 | 0.023376 | 0.001131 | 7.238794e-95 | 2584.415624 | 1313 | 1.568683e-85 |
437
+
436
438
 
437
439
 
438
440
  As expected, many MR methods indicate that SBP is strongly associated with stroke, but there are some signs of horizontal pleiotropy (instruments influencing the outcome through a different pathway than the one used as exposure) given the significant MR-Egger intercept p-value.
@@ -466,7 +468,7 @@ df_pheno = pd.read_csv("path/to/trait/data")
466
468
  >
467
469
  > One important detail is to make sure that the individual IDs are identical between the phenotypic data and the genetic data for the target population.
468
470
 
469
- Then, it is advised to make a copy of the `genal.Geno` object containing our instruments as we are going to update their coefficients and to avoid any confusion:
471
+ Then, it is advised to make a copy of the `genal.Geno` instance containing our instruments as we are going to update their coefficients and to avoid any confusion:
470
472
 
471
473
  ```python
472
474
  SBP_adjusted = SBP_clumped.copy()
@@ -478,7 +480,7 @@ We can then call the `genal.Geno.set_phenotype` method, specifying which column
478
480
  SBP_adjusted.set_phenotype(df_pheno, PHENO = "htn", IID = "IID")
479
481
  ```
480
482
 
481
- At this point, Genal will identify if the phenotype is binary or quantitative (to determine the regression model. If the phenotype is binary, it will assume that the most frequent value is coding for control (and the other value for case), this can be changed with 'alternate_control=True':
483
+ At this point, genal will identify if the phenotype is binary or quantitative (to determine the regression model). If the phenotype is binary, it will assume that the most frequent value is coding for control (and the other value for case); this can be changed with 'alternate_control=True':
482
484
 
483
485
  Detected a binary phenotype in the 'PHENO' column. Specify 'PHENO_type="quant"' if this is incorrect.
484
486
  Identified 0 as the control code in 'PHENO'. Set 'alternate_control=True' to inverse this interpretation.
@@ -527,13 +529,13 @@ The SBP_adjusted.data attribute has been updated in the BETA, SE, and P columns
527
529
  ### Lifting <a name="paragraph3.7"></a>
528
530
 
529
531
  It is sometimes necessary to lift the SNP data to a different build. For instance, if the genetic data of our target population is in build 38 (hg38), but the GWAS summary statistics are in build 37 (hg19).
530
- This can easily be done in Genal using the `genal.Geno.lift` method:
532
+ This can easily be done in genal using the `genal.Geno.lift` method:
531
533
 
532
534
  ```python
533
535
  SBP_clumped.lift(start = "hg19", end = "hg38", replace = False)
534
536
  ```
535
537
 
536
- This outputs a table with the lifted SBP instruments (stored in the `SBP_clumped` object) from build 37 (hg19) to build 38 (hg38). We specified `replace = False` to not modify the `SBP_clumped.data` attribute, but we may want to modify it (before running a PRS in a population stored in build 38 for instance). Genal will download the appropriate chain files required for the lift, and it will be done in pure python by default. However, if you plan to lift large datasets of SNPs (the whole summary statistics for instance), it may be useful to install the LiftOver executable that will run faster than the pure python version. It can be downloaded here: [https://genome-store.ucsc.edu/](https://genome-store.ucsc.edu/) You will need to create an account, scroll down to "LiftOver program", add it to your cart, and declare that you are a non-profit user.
538
+ This outputs a table with the lifted SBP instruments (stored in the `SBP_clumped` instance) from build 37 (hg19) to build 38 (hg38). We specified `replace = False` to not modify the `SBP_clumped.data` attribute, but we may want to modify it (before running a PRS in a population stored in build 38 for instance). Genal will download the appropriate chain files required for the lift, and it will be done in pure Python by default. However, if you plan to lift large datasets of SNPs (the whole summary statistics for instance), it may be useful to install the LiftOver executable that will run faster than the pure Python version. It can be downloaded here: [https://genome-store.ucsc.edu/](https://genome-store.ucsc.edu/). You will need to create an account, scroll down to "LiftOver program", add it to your cart, and declare that you are a non-profit user.
537
539
 
538
540
  You can specify the path of the LiftOver executable to the `liftover_path` argument:
539
541
 
@@ -931,7 +931,110 @@ class Geno:
931
931
  plot.save(f"{filename}.png", dpi=500, width=10, height=6, verbose=False)
932
932
 
933
933
  return plot
934
+
935
+ def MR_forest(
936
+ self,
937
+ methods=[
938
+ "IVW",
939
+ "WM",
940
+ "Simple-median",
941
+ "Egger",
942
+ ],
943
+ exposure_name=None,
944
+ outcome_name=None,
945
+ filename=None
946
+ ):
947
+ """
948
+ Creates and returns a scatter plot of individual SNP effects with lines representing different Mendelian Randomization (MR) methods. Each MR method specified in the 'methods' argument is represented as a line in the plot.
949
+
950
+ Args:
951
+ methods (list of str, optional): A list of MR methods to be included in the plot. Default methods are "IVW", "WM", "Simple-median", and "Egger".
952
+ exposure_name (str, optional): A custom label for the exposure effect axis. If None, uses the label provided in the MR function call or a default label.
953
+ outcome_name (str, optional): A custom label for the outcome effect axis. If None, uses the label provided in the MR function call or a default label.
954
+ filename (str, optional): The filename where the plot will be saved. If None, the plot is not saved.
955
+
956
+ Returns:
957
+ plotnine.ggplot.ggplot: A plotnine ggplot object representing the scatter plot of individual SNP effects with MR method lines.
958
+
959
+ Raises:
960
+ ValueError: If MR analysis has not been performed prior to calling this function.
961
+
962
+ Note:
963
+ This function requires prior execution of the `MR` method to compute MR results. Make sure the MR analysis is performed on the data before calling `MR_plot`.
964
+ """
965
+ if not hasattr(self, "MR_results"):
966
+ raise ValueError("You need to run an MR analysis with the MR method before calling the MR_plot function.")
967
+
968
+ ## Extract the previously computed MR results
969
+ df_mr = self.MR_results[1]
970
+ res = self.MR_results[0]
971
+ exposure_name = self.MR_results[2] if not exposure_name else exposure_name
972
+ exposure_name = "Effect on the exposure" if not exposure_name else f"Effect on {exposure_name}"
973
+ outcome_name = self.MR_results[3] if not outcome_name else outcome_name
974
+ outcome_name = "Effect on the outcome" if not outcome_name else f"Effect on {outcome_name}"
934
975
 
976
+ ## Switch all exposure betas to >= 0
977
+ df_mr['BETA_e'], df_mr['BETA_o'] = np.where(df_mr['BETA_e'] < 0, (-df_mr['BETA_e'], -df_mr['BETA_o']), (df_mr['BETA_e'], df_mr['BETA_o']))
978
+
979
+ ## Create the scatter plot with error bars
980
+ plot = (
981
+ ggplot(df_mr, aes('BETA_e', 'BETA_o'))
982
+
983
+ + geom_errorbarh(aes(xmin='BETA_e-SE_e', xmax='BETA_e+SE_e'), height=0, color="gray", size=0.1)
984
+ + geom_errorbar(aes(ymin='BETA_o-SE_o', ymax='BETA_o+SE_o'), width=0, color="gray", size=0.1)
985
+ + geom_point(color='black', size=0.2)
986
+ + geom_abline(slope=0, intercept=0, color='black')
987
+ + labs(x=exposure_name, y=outcome_name)
988
+ + theme(
989
+ axis_title=element_text(size=12),
990
+ axis_text=element_text(size=10),
991
+ figure_size=(10,6)
992
+ )
993
+ + expand_limits(x=0)
994
+ )
995
+
996
+ ## Add the lines corresponding to the specified MR methods (if present in the computation)
997
+ lines = []
998
+ for method in methods:
999
+ if method not in MR_METHODS_NAMES.keys():
1000
+ warnings.warn(f"{method} is not an appropriate MR method. MR methods can be IVW, WM, Egger... Please refer to the documentation for more.")
1001
+ continue
1002
+ ## If not an Egger method: simply need to get the slope
1003
+ if not method.startswith("Egger"):
1004
+ method_name = MR_METHODS_NAMES[method]
1005
+ res_row = res[res.method == method_name]
1006
+ if res_row.shape[0] == 0:
1007
+ warnings.warn(f"The {method_name} ({method}) method was not included in the MR method call and will be excluded from the plot.")
1008
+ elif res_row.shape[0] == 1:
1009
+ lines.append({
1010
+ 'slope': res_row["b"].values[0],
1011
+ 'intercept': 0,
1012
+ 'MR Methods': method_name # Use method_name as the color label
1013
+ })
1014
+ ## For Egger methods: need to get the slope and the intercept
1015
+ else:
1016
+ method_name = MR_METHODS_NAMES[method][0]
1017
+ method_name_intercept = MR_METHODS_NAMES[method][1]
1018
+ res_row = res[res.method == method_name]
1019
+ res_row_intercept = res[res.method == method_name_intercept]
1020
+ if res_row.shape[0] == 0:
1021
+ warnings.warn(f"The {method_name} ({method}) method was not included in the MR method call and will be excluded from the plot.")
1022
+ elif res_row.shape[0] == 1 and res_row_intercept.shape[0] == 1:
1023
+ lines.append({
1024
+ 'slope': res_row["b"].values[0],
1025
+ 'intercept': res_row_intercept["b"].values[0],
1026
+ 'MR Methods': method_name # Use method_name as the color label
1027
+ })
1028
+ line_data = pd.DataFrame(lines)
1029
+ plot += geom_abline(aes(slope='slope', intercept='intercept', color='MR Methods'), data=line_data)
1030
+
1031
+ ## Save plot if filename is specified
1032
+ if filename:
1033
+ plot.save(f"{filename}.png", dpi=500, width=10, height=6, verbose=False)
1034
+
1035
+ return plot
1036
+
1037
+
935
1038
 
936
1039
  def MRpresso(
937
1040
  self,
@@ -3,7 +3,7 @@ import pandas as pd
3
3
  import statsmodels.api as sm
4
4
  import statsmodels.formula.api as smf
5
5
  from scipy import stats
6
- from scipy.stats import norm, chi2, binomtest
6
+ from scipy.stats import norm, chi2, binomtest, t
7
7
  from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
8
8
  from sklearn.linear_model import LinearRegression
9
9
  from tqdm import tqdm
@@ -79,15 +79,15 @@ def mr_egger_regression(BETA_e, SE_e, BETA_o, SE_o):
79
79
  if len(mod.params) > 1:
80
80
  b = mod.params.iloc[1]
81
81
  se = mod.bse.iloc[1] / min(1, np.sqrt(mod.mse_resid))
82
- pval = 2 * (1 - stats.t.cdf(np.abs(b / se), df=l - 2))
82
+ pval = 2 * t.sf(abs(b / se), l - 2)
83
83
 
84
84
  b_i = mod.params.iloc[0]
85
85
  se_i = mod.bse.iloc[0] / min(1, np.sqrt(mod.mse_resid))
86
- pval_i = 2 * (1 - stats.t.cdf(np.abs(b_i / se_i), df=l - 2))
86
+ pval_i = 2 * t.sf(abs(b_i / se_i), l - 2)
87
87
 
88
88
  Q = mod.mse_resid * (l - 2)
89
89
  Q_df = l - 2
90
- Q_pval = 1 - chi2.cdf(Q, Q_df)
90
+ Q_pval = chi2.sf(Q, Q_df)
91
91
 
92
92
  return [
93
93
  {
@@ -147,7 +147,7 @@ def linreg(x, y, w=None):
147
147
  se = np.sqrt(
148
148
  sum(w * residuals**2) / (np.sum(~np.isnan(yhat)) - 2) / np.sum(w * x**2)
149
149
  )
150
- pval = 2 * (1 - norm.cdf(abs(bhat / se)))
150
+ pval = 2 * norm.sf(abs(bhat / se))
151
151
 
152
152
  return {"ahat": ahat, "bhat": bhat, "se": se, "pval": pval}
153
153
 
@@ -297,7 +297,7 @@ def mr_weighted_median(BETA_e, SE_e, BETA_o, SE_o, nboot):
297
297
 
298
298
  b = weighted_median(b_iv, 1 / VBj)
299
299
  se = weighted_median_bootstrap(BETA_e, SE_e, BETA_o, SE_o, 1 / VBj, nboot)
300
- pval = 2 * (1 - norm.cdf(abs(b / se)))
300
+ pval = 2 * norm.sf(abs(b / se))
301
301
  return [{"method": MR_METHODS_NAMES["WM"], "nSNP": l, "b": b, "se": se, "pval": pval}]
302
302
 
303
303
 
@@ -344,7 +344,7 @@ def mr_pen_wm(BETA_e, SE_e, BETA_o, SE_o, nboot, penk):
344
344
 
345
345
  b = weighted_median(betaIV, pen_weights)
346
346
  se = weighted_median_bootstrap(BETA_e, SE_e, BETA_o, SE_o, pen_weights, nboot)
347
- pval = 2 * (1 - norm.cdf(abs(b / se)))
347
+ pval = 2 * norm.sf(abs(b / se))
348
348
 
349
349
  return [
350
350
  {
@@ -395,7 +395,7 @@ def mr_simple_median(BETA_e, SE_e, BETA_o, SE_o, nboot):
395
395
  weights = np.repeat(1 / len(BETA_e), len(BETA_e))
396
396
  b = weighted_median(b_iv, weights)
397
397
  se = weighted_median_bootstrap(BETA_e, SE_e, BETA_o, SE_o, weights, nboot)
398
- pval = 2 * (1 - norm.cdf(abs(b / se)))
398
+ pval = 2 * norm.sf(abs(b / se))
399
399
  return [{"method": MR_METHODS_NAMES["Simple-median"], "b": b, "se": se, "pval": pval, "nSNP": l}]
400
400
 
401
401
 
@@ -457,11 +457,11 @@ def mr_ivw(BETA_e, SE_e, BETA_o, SE_o):
457
457
  # Extract coefficients
458
458
  b = model.params.iloc[0]
459
459
  se = model.bse.iloc[0] / min(1, np.sqrt(model.mse_resid))
460
- pval = 2 * (1 - norm.cdf(abs(b / se)))
460
+ pval = 2 * norm.sf(abs(b / se))
461
461
 
462
462
  Q_df = l - 1
463
463
  Q = model.scale * Q_df
464
- Q_pval = 1 - chi2.cdf(Q, Q_df)
464
+ Q_pval = chi2.sf(Q, Q_df)
465
465
 
466
466
  return [
467
467
  {
@@ -521,7 +521,7 @@ def mr_ivw_re(BETA_e, SE_e, BETA_o, SE_o):
521
521
  # Extract coefficients
522
522
  b = model.params[0]
523
523
  se = model.bse[0]
524
- pval = 2 * (1 - norm.cdf(abs(b / se)))
524
+ pval = 2 * norm.sf(abs(b / se))
525
525
  Q_df = l - 1
526
526
  Q = model.scale * Q_df
527
527
  Q_pval = chi2.sf(Q, Q_df)
@@ -594,7 +594,7 @@ def mr_ivw_fe(BETA_e, SE_e, BETA_o, SE_o):
594
594
  # Extract coefficients
595
595
  b = model.params.iloc[0]
596
596
  se = model.bse.iloc[0] / model.mse_resid**0.5
597
- pval = 2 * norm.sf(np.abs(b / se))
597
+ pval = 2 * norm.sf(abs(b / se))
598
598
  Q_df = l - 1
599
599
  Q = model.scale * Q_df
600
600
  Q_pval = chi2.sf(Q, Q_df)
@@ -667,9 +667,9 @@ def mr_uwr(BETA_e, SE_e, BETA_o, SE_o):
667
667
  "se": se,
668
668
  "pval": pval,
669
669
  "nSNP": l,
670
- "Q": Q,
671
- "Q_df": Q_df,
672
- "Q_pval": Q_pval,
670
+ "Q": np.nan,
671
+ "Q_df": np.nan,
672
+ "Q_pval": np.nan,
673
673
  }
674
674
  ]
675
675
 
@@ -61,27 +61,7 @@ def mrpresso_func(
61
61
  df_exposure, df_outcome, action=action, eaf_threshold=eaf_threshold
62
62
  )
63
63
 
64
- # Delete NAs, infinite or null values and print the SNP names and if the invalid value came from exposure or outcome data.
65
- df_mr = df_mr[["SNP", "BETA_e", "SE_e", "BETA_o", "SE_o"]]
66
- df_mr.replace([np.inf, -np.inf], np.nan, inplace=True)
67
- df_mr.replace(0, np.nan, inplace=True)
68
- mask_exposure = df_mr[["BETA_e", "SE_e"]].isna().any(axis=1)
69
- mask_outcome = df_mr[["BETA_o", "SE_o"]].isna().any(axis=1)
70
- rows_to_delete_exposure = df_mr[mask_exposure]
71
- rows_to_delete_outcome = df_mr[mask_outcome]
72
- n_deleted_exposure = len(rows_to_delete_exposure)
73
- n_deleted_outcome = len(rows_to_delete_outcome)
74
- if n_deleted_exposure > 0:
75
- print(
76
- f"Deleting {n_deleted_exposure} SNPs with NA, infinite, or null values in BETA/SE columns (exposure data): {rows_to_delete_exposure['SNP'].tolist()}"
77
- )
78
- if n_deleted_outcome > 0:
79
- print(
80
- f"Deleting {n_deleted_outcome} SNPs with NA, infinite, or null values in BETA/SE columns (outcome data): {rows_to_delete_outcome['SNP'].tolist()}"
81
- )
82
-
83
- df_mr = df_mr[["BETA_e", "SE_e", "BETA_o", "SE_o"]]
84
- df_mr = df_mr.dropna().reset_index(drop=True)
64
+ df_mr = df_mr_formatting(df_mr)
85
65
 
86
66
  # Call and return the results of MR-PRESSO
87
67
  return mr_presso(
@@ -163,27 +143,7 @@ def MR_func(
163
143
  df_exposure, df_outcome, action=action, eaf_threshold=eaf_threshold
164
144
  )
165
145
 
166
- # Delete NAs, infinite or null values and print the SNP names and if the invalid value came from exposure or outcome data.
167
- df_mr = df_mr[["SNP", "BETA_e", "SE_e", "BETA_o", "SE_o"]]
168
- df_mr.replace([np.inf, -np.inf], np.nan, inplace=True)
169
- df_mr.replace(0, np.nan, inplace=True)
170
- mask_exposure = df_mr[["BETA_e", "SE_e"]].isna().any(axis=1)
171
- mask_outcome = df_mr[["BETA_o", "SE_o"]].isna().any(axis=1)
172
- rows_to_delete_exposure = df_mr[mask_exposure]
173
- rows_to_delete_outcome = df_mr[mask_outcome]
174
- n_deleted_exposure = len(rows_to_delete_exposure)
175
- n_deleted_outcome = len(rows_to_delete_outcome)
176
- if n_deleted_exposure > 0:
177
- print(
178
- f"Deleting {n_deleted_exposure} SNPs with NA, infinite, or null values in BETA/SE columns (exposure data): {rows_to_delete_exposure['SNP'].tolist()}"
179
- )
180
- if n_deleted_outcome > 0:
181
- print(
182
- f"Deleting {n_deleted_outcome} SNPs with NA, infinite, or null values in BETA/SE columns (outcome data): {rows_to_delete_outcome['SNP'].tolist()}"
183
- )
184
-
185
- df_mr = df_mr[["BETA_e", "SE_e", "BETA_o", "SE_o"]]
186
- df_mr = df_mr.dropna().reset_index(drop=True)
146
+ df_mr = df_mr_formatting(df_mr)
187
147
 
188
148
  # Prepare values for MR methods
189
149
  BETA_e, BETA_o, SE_e, SE_o = (
@@ -191,7 +151,7 @@ def MR_func(
191
151
  df_mr["BETA_o"],
192
152
  df_mr["SE_e"],
193
153
  df_mr["SE_o"],
194
- )
154
+ )
195
155
 
196
156
  print(
197
157
  f"Running Mendelian Randomization with {name_exposure} as exposure and {name_outcome} as outcome."
@@ -223,9 +183,9 @@ def MR_func(
223
183
  res = pd.DataFrame(results)
224
184
  res["exposure"], res["outcome"] = name_exposure, name_outcome
225
185
 
226
- res.loc[res['pval'].astype(float) < 1e-100, 'pval'] = 0
227
- res["pval"] = res["pval"].replace(0, '<e-100')
228
- res["Q_pval"] = res["Q_pval"].replace(0, '<e-100')
186
+ #res.loc[res['pval'].astype(float) < 1e-100, 'pval'] = 0
187
+ #res["pval"] = res["pval"].replace(0, '<e-100')
188
+ #res["Q_pval"] = res["Q_pval"].replace(0, '<e-100')
229
189
 
230
190
 
231
191
  if not heterogeneity:
@@ -249,6 +209,34 @@ def MR_func(
249
209
 
250
210
  return res, df_mr
251
211
 
212
+ def df_mr_formatting(df_mr):
213
+ """
214
+ Function to delete invalid values from the MR dataframe (after the harmonization step)
215
+ """
216
+ # Delete NAs or infinite values (or null values in SE columns, null values in BETA are accepted) and print the SNP names and if the invalid value came from exposure or outcome data.
217
+ df_mr = df_mr[["SNP", "BETA_e", "SE_e", "BETA_o", "SE_o"]].copy()
218
+ df_mr.replace([np.inf, -np.inf], np.nan, inplace=True)
219
+ df_mr.loc[:, ["SE_e", "SE_o"]] = df_mr.loc[:, ["SE_e", "SE_o"]].replace(0, np.nan)
220
+ mask_exposure = df_mr[["BETA_e", "SE_e"]].isna().any(axis=1)
221
+ mask_outcome = df_mr[["BETA_o", "SE_o"]].isna().any(axis=1)
222
+ rows_to_delete_exposure = df_mr[mask_exposure]
223
+ rows_to_delete_outcome = df_mr[mask_outcome]
224
+ n_deleted_exposure = len(rows_to_delete_exposure)
225
+ n_deleted_outcome = len(rows_to_delete_outcome)
226
+ if n_deleted_exposure > 0:
227
+ print(
228
+ f"Deleting {n_deleted_exposure} SNPs with NA or infinite values in BETA/SE columns, or null values in SE column (exposure data): {rows_to_delete_exposure['SNP'].tolist()}"
229
+ )
230
+ if n_deleted_outcome > 0:
231
+ print(
232
+ f"Deleting {n_deleted_outcome} SNPs with NA or infinite values in BETA/SE columns, or null values in SE column (outcome data): {rows_to_delete_outcome['SNP'].tolist()}"
233
+ )
234
+
235
+ df_mr = df_mr[["BETA_e", "SE_e", "BETA_o", "SE_o"]]
236
+ df_mr = df_mr.dropna().reset_index(drop=True)
237
+
238
+ return df_mr
239
+
252
240
 
253
241
  def query_outcome_func(
254
242
  data, outcome, name, proxy, reference_panel, kb, r2, window_snps, cpus
@@ -331,7 +319,7 @@ def load_outcome_from_geno_object(outcome):
331
319
  """Load outcome data from a Geno object."""
332
320
  df_outcome = outcome.data
333
321
  name = outcome.name
334
- print(f"Outcome data successfully loaded from '{name}' Geno object.")
322
+ print(f"Outcome data successfully loaded from '{name}' Geno instance.")
335
323
  return df_outcome, name
336
324
 
337
325
 
@@ -11,6 +11,7 @@ from numpy.random import default_rng
11
11
  from functools import partial
12
12
 
13
13
  ##todo: implement the multivariable option, for the moment we assume only 1 BETA_e column
14
+ # Also: check if we can replace the LinearRegression of sklearn with one from statsmodels to avoid using sklearn just for that
14
15
 
15
16
 
16
17
  # MR-PRESSO main function
@@ -3,7 +3,7 @@ import json
3
3
  from .tools import default_config, write_config, set_plink, delete_tmp
4
4
  from .geno_tools import Combine_Geno
5
5
 
6
- __version__ = "0.3"
6
+ __version__ = "0.4"
7
7
 
8
8
  config_dir = os.path.expanduser(
9
9
  "~/.genal/"
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
4
4
 
5
5
  [project]
6
6
  name = "genal-python" # Updated name for PyPI
7
- version = "0.3"
7
+ version = "0.4"
8
8
  authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
9
9
  description = "A python toolkit for polygenic risk scoring and mendelian randomization."
10
10
  readme = "README.md"
@@ -1,12 +1,12 @@
1
- numpy==1.26.3
2
- pandas==2.0.3
1
+ numpy>=1.24.4, <2.0
2
+ pandas>=2.0.3
3
3
  plotnine==0.12.3
4
4
  psutil==5.9.1
5
5
  pyliftover==0.4
6
- scikit_learn==1.3.0
7
- scipy==1.11.4
6
+ scikit_learn>=1.3.0
7
+ scipy>=1.11.4
8
8
  setuptools==62.3.3
9
9
  sphinx_rtd_theme==1.3.0
10
- statsmodels==0.14.0
10
+ statsmodels>=0.14.0
11
11
  tqdm==4.66.1
12
12
  wget==3.2
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes