genal-python 0.0.dev0__py3-none-any.whl → 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1140 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import warnings
4
+ import os, subprocess
5
+ import copy
6
+ import psutil
7
+ import uuid
8
+ from functools import partial
9
+ from concurrent.futures import ProcessPoolExecutor
10
+ from plotnine import ggplot, aes, geom_point, geom_errorbarh, geom_errorbar, theme, element_text, geom_abline, labs
11
+
12
+
13
+ from .proxy import find_proxies, apply_proxies, query_outcome_proxy
14
+ from .MR_tools import query_outcome_func, harmonize_MR, MR_func, mrpresso_func
15
+ from .clump import clump_data
16
+ from .lift import lift_data
17
+ from .tools import create_tmp, get_plink19_path, load_reference_panel, setup_genetic_path
18
+ from .geno_tools import (
19
+ save_data,
20
+ check_arguments,
21
+ adjust_column_names,
22
+ check_int_column,
23
+ fill_snpids_func,
24
+ fill_coordinates_func,
25
+ fill_nea,
26
+ fill_ea_nea,
27
+ check_beta_column,
28
+ check_p_column,
29
+ fill_se_p,
30
+ check_allele_column,
31
+ check_snp_column,
32
+ remove_na
33
+ )
34
+ from .association import set_phenotype_func, association_test_func
35
+ from .extract_prs import extract_snps_func, prs_func
36
+ from .constants import STANDARD_COLUMNS, REF_PANEL_COLUMNS, CHECKS_DICT, MR_METHODS_NAMES
37
+
38
+ # Do all the MR steps (query_outcome, harmonize etc) based on CHR/POS and not SNPs
39
+ # Add proxying function (input is df + searchspace (list of SNP or path to .bim, can be separated by chromosomes) and returns proxied df)
40
+ # Get proxies (simply return a list of proxies)
41
+ # Multi-MR with python MR
42
+ # Warning that users might not have shell (for the .ram attribute)
43
+ # Phenoscanner
44
+
45
+
46
+
47
+ class Geno:
48
+ """
49
+ A class to handle GWAS-derived data, including SNP rsID, genome position,
50
+ SNP-trait effects, and effect allele frequencies.
51
+
52
+ Attributes:
53
+ data (pd.DataFrame): Main DataFrame containing SNP data.
54
+ phenotype (pd.DataFrame, str): Tuple with a DataFrame of individual-level phenotype
55
+ data and a string representing the phenotype trait column. Initialized after
56
+ running the 'set_phenotype' method.
57
+ MR_data (pd.DataFrame, pd.DataFrame, str): Tuple containing DataFrames for associations
58
+ with exposure and outcome, and a string for the outcome name. Initialized after
59
+ running the 'query_outcome' method.
60
+ MR_results (pd.DataFrame, pd.DataFrame, str, str): Contains an MR results dataframe, a dataframe of harmonized SNPs, an exposure name, an outcome name. Assigned after calling the MR method and used for plotting with the MR_plot method.
61
+ ram (int): Available memory.
62
+ cpus (int): Number of available CPUs.
63
+ checks (dict): Dictionary of checks performed on the main DataFrame.
64
+ name (str): ID of the object (for internal reference and debugging purposes).
65
+ reference_panel (pd.DataFrame): Reference population SNP data used for SNP info
66
+ adjustments. Initialized when first needed.
67
+
68
+ Methods:
69
+ preprocess_data():
70
+ Clean and preprocess dataframe of SNP data.
71
+
72
+ clump():
73
+ Clumps the main data and returns a new Geno object containing the clumped data.
74
+
75
+ prs():
76
+ Computes Polygenic Risk Score on genomic data.
77
+
78
+ set_phenotype():
79
+ Assigns a DataFrame with individual-level data and a phenotype trait to
80
+ the phenotype attribute.
81
+
82
+ association_test():
83
+ Computes SNP-trait effect estimates, standard errors, and p-values.
84
+
85
+ query_outcome():
86
+ Extracts SNPs from outcome data with proxying and initializes MR_data.
87
+
88
+ MR():
89
+ Performs Mendelian Randomization between SNP-exposure and SNP-outcome data.
90
+
91
+ MRpresso():
92
+ Executes the MR-PRESSO algorithm for horizontal pleiotropy correction between
93
+ SNP-exposure and SNP-outcome data.
94
+
95
+ lift():
96
+ Lifts SNP data from one genomic build to another.
97
+ """
98
+
99
def __init__(
    self,
    df,
    CHR="CHR",
    POS="POS",
    SNP="SNP",
    EA="EA",
    NEA="NEA",
    BETA="BETA",
    SE="SE",
    P="P",
    EAF="EAF",
    keep_columns=True,
):
    """
    Initializes the Geno object used to store and transform Single Nucleotide Polymorphisms (SNP) data.

    Args:
        df (pd.DataFrame): DataFrame where each row represents a SNP.
        CHR (str, optional): Column name for chromosome. Defaults to "CHR".
        POS (str, optional): Column name for genomic position. Defaults to "POS".
        SNP (str, optional): Column name for SNP identifier. Defaults to "SNP".
        EA (str, optional): Column name for effect allele. Defaults to "EA".
        NEA (str, optional): Column name for non-effect allele. Defaults to "NEA".
        BETA (str, optional): Column name for effect estimate. Defaults to "BETA".
        SE (str, optional): Column name for effect standard error. Defaults to "SE".
        P (str, optional): Column name for p-value. Defaults to "P".
        EAF (str, optional): Column name for effect allele frequency. Defaults to "EAF".
        keep_columns (bool, optional): Determines if non-main columns should be kept. Defaults to True.

    Attributes:
        data (pd.DataFrame): Copy of df with standardized column names.
        name (str): Randomly generated 8-character ID for the Geno object.
        checks (dict): Copy of CHECKS_DICT tracking which column checks were run.
        cpus (int): Number of CPUs to be used.
        ram (int): Amount of RAM to be used in MBs.

    Raises:
        TypeError: If df is not a pandas DataFrame.
    """

    # Validate df type
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df needs to be a pandas dataframe.")

    # Work on a copy so the caller's dataframe is never modified, then
    # standardize column names (and drop extra columns if keep_columns is False).
    data = adjust_column_names(
        df.copy(), CHR, POS, SNP, EA, NEA, BETA, SE, P, EAF, keep_columns
    )

    # Set object attributes
    self.data = data
    self.name = str(uuid.uuid4())[:8]

    # Dictionary keeping track of the checks performed on the data
    self.checks = CHECKS_DICT.copy()

    # Set the maximal amount of ram/cpu to be used by the methods.
    # os.cpu_count() can return None when the count is undetermined; fall
    # back to a single CPU so int() and the division below cannot fail.
    detected_cpus = os.cpu_count() or 1
    self.cpus = int(os.environ.get("SLURM_CPUS_PER_TASK", default=detected_cpus))
    non_hpc_ram_per_cpu = psutil.virtual_memory().available / (
        1024**2 * self.cpus
    )
    ram_per_cpu = int(
        os.environ.get("SLURM_MEM_PER_CPU", default=non_hpc_ram_per_cpu)
    )
    # Only 80% of the detected memory is made available to the methods.
    self.ram = int(ram_per_cpu * self.cpus * 0.8)

    create_tmp()  # Make sure temporary folder exists
166
+
167
def preprocess_data(
    self,
    preprocessing='Fill',
    reference_panel="eur",
    effect_column=None,
    keep_multi=None,
    keep_dups=None,
    fill_snpids=None,
    fill_coordinates=None,
):
    """
    Clean and preprocess the main dataframe of SNP-level data held in `.data`.

    Args:
        preprocessing (str, optional): Level of preprocessing to apply:
            - "None": The dataframe is not modified.
            - "Fill": Missing columns are added based on reference data and invalid values set to NaN, but no rows are deleted.
            - "Fill_delete": Missing columns are added, and rows with missing, duplicated, or invalid values are deleted.
            Defaults to 'Fill'.
        reference_panel (str or pd.DataFrame, optional): Reference panel used to fill columns. Either an ancestry
            label ("eur", "afr", "eas", "sas", "amr"), a DataFrame with ["CHR","SNP","POS","A1","A2"] columns,
            or a path to a .bim file. Defaults to "eur".
        effect_column (str, optional): Type of the effect column ("BETA" or "OR"). If None, the method tries
            to determine it. Defaults to None.
        keep_multi (bool, optional): Whether multiallelic SNPs are kept. None defers to `preprocessing`.
        keep_dups (bool, optional): Whether rows with duplicate SNP IDs are kept. None defers to `preprocessing`.
        fill_snpids (bool, optional): Whether the SNP column is created/replaced from CHR/POS and the reference
            panel. None defers to `preprocessing`.
        fill_coordinates (bool, optional): Whether CHR/POS are created/replaced from the SNP column and the
            reference panel. None defers to `preprocessing`.

    Returns:
        None: The processed dataframe is assigned back to `.data` and `.checks` is updated.
    """

    frame = self.data

    # Validate the arguments and resolve the flags left to None.
    keep_multi, keep_dups, fill_snpids, fill_coordinates = check_arguments(
        preprocessing,
        reference_panel,
        effect_column,
        fill_snpids,
        fill_coordinates,
        keep_multi,
        keep_dups,
    )

    # Most steps below only run for the two "filling" preprocessing levels.
    filling = preprocessing in ['Fill', 'Fill_delete']

    # CHR and POS are validated as integer columns.
    if filling:
        for coord in ["CHR", "POS"]:
            if coord in frame.columns:
                check_int_column(frame, coord)
                self.checks[coord] = True

    # SNP ids: filled when explicitly requested, or when they are absent but
    # CHR/POS are available. An explicit False always disables the step.
    snp_can_be_derived = (
        "CHR" in frame.columns
        and "POS" in frame.columns
        and "SNP" not in frame.columns
    )
    if fill_snpids is not False and (snp_can_be_derived or fill_snpids):
        frame = fill_snpids_func(frame, self.get_reference_panel(reference_panel))

    # Coordinates: same logic, deriving CHR/POS from the SNP column.
    coords_can_be_derived = (
        "CHR" not in frame.columns or "POS" not in frame.columns
    ) and "SNP" in frame.columns
    if fill_coordinates is not False and (coords_can_be_derived or fill_coordinates):
        frame = fill_coordinates_func(
            frame, self.get_reference_panel(reference_panel)
        )

    # NEA alone is missing: fill it from the reference panel.
    if (
        filling
        and "CHR" in frame.columns
        and "POS" in frame.columns
        and "NEA" not in frame.columns
        and "EA" in frame.columns
    ):
        frame = fill_nea(frame, self.get_reference_panel(reference_panel))

    # Both EA and NEA are missing: fill them from the reference panel.
    if (
        filling
        and "CHR" in frame.columns
        and "POS" in frame.columns
        and "NEA" not in frame.columns
        and "EA" not in frame.columns
    ):
        frame = fill_ea_nea(frame, self.get_reference_panel(reference_panel))

    # Effect column (runs for every preprocessing level; the level is passed along).
    if "BETA" in frame.columns:
        check_beta_column(frame, effect_column, preprocessing)
        self.checks["BETA"] = True

    # P-value column.
    if filling and "P" in frame.columns:
        check_p_column(frame)
        self.checks["P"] = True

    # Derive a missing SE or P column if possible.
    if filling:
        fill_se_p(frame)

    # Allele columns: checked when filling, or when multiallelic SNPs must go.
    allele_check_needed = filling or (not keep_multi)
    for allele in ["EA", "NEA"]:
        if allele in frame.columns and allele_check_needed:
            check_allele_column(frame, allele, keep_multi)
            self.checks[allele] = True

    # Duplicate SNP ids.
    if not keep_dups and "SNP" in frame.columns:
        check_snp_column(frame)
        self.checks["SNP"] = True

    # Tell the user about any standard column still missing.
    for expected in STANDARD_COLUMNS:
        if expected not in frame.columns:
            print(
                f"Warning: the data doesn't include a {expected} column. This may become an issue later on."
            )

    # Rows with missing values are only removed at the strictest level.
    if preprocessing == 'Fill_delete':
        remove_na(frame)
        self.checks["NA_removal"] = True

    self.data = frame
296
+
297
def get_reference_panel(self, reference_panel="eur"):
    """
    Return the reference panel of the Geno object, loading it on first use.

    Once a panel has been stored in the `reference_panel` attribute, it is
    returned as is and the `reference_panel` argument is ignored.

    Args:
        reference_panel (str or pd.DataFrame, optional): Either a string handed to
            `load_reference_panel` (default "eur"; other documented options are "afr",
            "amr", "eas", "sas" or a path to a .bim file), or a DataFrame containing
            every column listed in REF_PANEL_COLUMNS.

    Returns:
        pd.DataFrame: The reference panel DataFrame for the Geno object.

    Raises:
        ValueError: If the provided DataFrame doesn't have the necessary columns.
    """

    # Already cached on the instance: nothing to do.
    if hasattr(self, "reference_panel"):
        return self.reference_panel

    # Anything that is not a DataFrame is resolved by the loader.
    if not isinstance(reference_panel, pd.DataFrame):
        self.reference_panel = load_reference_panel(reference_panel)
        return self.reference_panel

    # User-supplied DataFrame: make sure every required column is there.
    for required in REF_PANEL_COLUMNS:
        if required not in reference_panel.columns:
            raise ValueError(
                f"The {required} column is not present in the reference_panel provided and is necessary."
            )

    print(
        "Using the provided reference_panel dataframe as the reference panel."
    )
    # Store a copy so later changes to the caller's dataframe don't leak in.
    self.reference_panel = reference_panel.copy()
    return self.reference_panel
336
+
337
def clump(self, kb=250, r2=0.1, p1=5e-8, p2=0.01, reference_panel="eur"):
    """
    Clump the data based on linkage disequilibrium and return another Geno object with the clumped data.
    The clumping process is executed using plink.

    Args:
        kb (int, optional): Clumping window in kilobases. Default is 250.
        r2 (float, optional): Linkage disequilibrium threshold, values between 0 and 1. Default is 0.1.
        p1 (float, optional): P-value threshold during clumping. SNPs with a P-value higher than this value are excluded. Default is 5e-8.
        p2 (float, optional): P-value threshold post-clumping to further filter the clumped SNPs. If p2 < p1, it won't be considered. Default is 0.01.
        reference_panel (str, optional): The reference population for linkage disequilibrium values. Accepts values "eur", "sas", "afr", "eas", "amr". Alternatively, a path leading to a specific bed/bim/fam reference panel can be provided. Default is "eur".

    Returns:
        genal.Geno: A new Geno object based on the clumped data, or None if
        `clump_data` returned None.

    Raises:
        ValueError: If the SNP or P column is missing from the data.

    Notes:
        Rows with missing SNP or P values are removed from `.data` in place.
    """

    # Ensure required columns exist in the data
    for column in ["SNP", "P"]:
        if column not in self.data.columns:
            raise ValueError(f"The column {column} is not found in the data")

    create_tmp()  # Make sure temporary folder exists

    # Validate the SNP and P columns unless already done. `.get` is used so
    # that a key that is absent and a key that is present but falsy both
    # count as "check not performed yet".
    if not self.checks.get("SNP"):
        check_snp_column(self.data)
        self.checks["SNP"] = True

    if not self.checks.get("P"):
        check_p_column(self.data)
        self.checks["P"] = True

    # Drop rows that cannot be clumped and report how many were lost.
    initial_rows = self.data.shape[0]
    self.data.dropna(subset=["SNP", "P"], inplace=True)
    deleted_rows = initial_rows - self.data.shape[0]
    if deleted_rows > 0:
        print(
            f"{deleted_rows} ({deleted_rows/initial_rows*100:.3f}%) rows with NA values in columns SNP or P have been deleted."
        )

    # Clump the data using the specified parameters
    clumped_data = clump_data(
        self.data,
        reference_panel,
        kb,
        r2,
        p1,
        p2,
        self.name,
        self.ram,
    )

    if clumped_data is None:
        return None

    # Wrap the result in a new Geno object that inherits the checks already
    # performed and the phenotype (if one was set).
    Clumped = Geno(clumped_data, keep_columns=True)
    Clumped.checks = self.checks.copy()
    if hasattr(self, "phenotype"):
        Clumped.phenotype = self.phenotype
    return Clumped
400
+
401
def update_snpids(self, path=None, replace=False):
    """
    Update or create the column of SNP name based on genetic data and genomic position.

    Args:
        path (str, optional): Path to a bed/bim/fam set of genetic files.
            If files are split by chromosomes, replace the chromosome number with '$'.
            For instance: path = "ukb_chr$_file". When None, the path is resolved
            by `setup_genetic_path`.
        replace (bool, optional): If True, the `.data` attribute is replaced by the
            updated dataframe. Defaults to False.

    Returns:
        pd.DataFrame: A copy of the data with the SNP column updated (or created)
        from the SNP names found in the .bim file(s) at matching CHR/POS.

    Raises:
        ValueError: If the CHR or POS column is missing from the data.

    Notes:
        This can be used before extracting SNPs from the genetic data if there is possibility of a mismatch between the SNP name contained in the Geno dataframe (SNP-level data) and the SNP name used in the genetic data (individual-level data). Notably, this can avoid losing SNPs due to ID mismatch during polygenic risk scoring or single-SNP association testing.
    """

    # Check mandatory columns
    for column in ["CHR", "POS"]:
        if column not in self.data.columns:
            raise ValueError(
                f"The column {column} is not found in the data and is mandatory to update snpIDs!"
            )

    data = self.data.copy()  # We don't want to modify the data attribute
    path = setup_genetic_path(path)  # Verify the path

    if "$" not in path:
        # Single set of files: merge data with the bim dataframe. The '_new'
        # suffix is only applied when data already has a SNP column.
        bim = pd.read_csv(
            path + ".bim", sep="\t", names=["CHR", "SNP", "F", "POS", "A1", "A2"]
        )
        data = data.merge(
            bim[["CHR", "POS", "SNP"]], on=["CHR", "POS"], how="left", suffixes=('', '_new')
        )
    else:
        # Files split by chromosome: merge each chromosome subset in parallel.
        per_chromosome = [subset for _, subset in data.groupby('CHR')]
        partial_merge_command_parallel = partial(
            merge_command_parallel, path=path
        )  # Wrapper function
        with ProcessPoolExecutor() as executor:
            results = list(executor.map(partial_merge_command_parallel, per_chromosome))
        # pd.concat raises on an empty list, so keep data untouched in that case.
        if results:
            data = pd.concat(results, ignore_index=True, axis=0)

    # Update the SNP column. 'SNP_new' only exists when the data already had a
    # SNP column; otherwise the merge created 'SNP' directly and there is
    # nothing left to reconcile.
    if "SNP_new" in data.columns:
        if "SNP" in data.columns:
            # Keep the original name where the position is absent from the genetic data.
            data['SNP'] = data['SNP_new'].fillna(data['SNP'])
        else:
            data['SNP'] = data['SNP_new']
        data.drop(columns=['SNP_new'], inplace=True)

    if replace:
        self.data = data  # Update attribute if replace argument
    return data
453
+
454
def extract_snps(self, path=None):
    """
    Extract the SNPs listed in the SNP column of `.data` from the genetic data provided.

    Args:
        path (str, optional): Path to a bed/bim/fam set of genetic files.
            If files are split by chromosomes, replace the chromosome number with '$'.
            For instance: path = "ukb_chr$_file". Default is None.

    Returns:
        None: The extraction is delegated to `extract_snps_func`; its return value
        is discarded. Per the original documentation, the output is a bed/bim/fam
        triple in the tmp_GENAL folder named "{name}_extract_allchr".

    Notes:
        Per the original documentation, the provided path is saved to the config
        file, so it does not need to be specified again on later calls.
    """

    create_tmp()  # The temporary folder must exist before extraction
    extract_snps_func(self.data["SNP"], self.name, path)
481
+
482
def prs(
    self,
    name=None,
    weighted=True,
    path=None,
    proxy=False,
    reference_panel="eur",
    kb=5000,
    r2=0.6,
    window_snps=5000,
):
    """
    Compute a Polygenic Risk Score (PRS) and save it as a CSV file.

    SNPs are matched to the genetic data by genomic position when the CHR and
    POS columns are present, and by SNP name otherwise.

    Args:
        name (str, optional): Name or path of the saved PRS file (any extension is
            replaced by ".csv"). Defaults to the ID of the Geno object.
        weighted (bool, optional): If True, performs a PRS weighted by the BETA column estimates.
            If False, performs an unweighted PRS. Default is True.
        path (str, optional): Path to a bed/bim/fam set of genetic files for PRS calculation.
            If files are split by chromosomes, replace the chromosome number
            with '$'. For instance: path = "ukb_chr$_file".
            If not provided, it will use the genetic path most recently used
            (if any). Default is None.
        proxy (bool, optional): If True, proxies are searched for the SNPs absent
            from the genetic data. Default is False.
        reference_panel (str, optional): The reference population used to derive linkage
            disequilibrium values and find proxies (only if proxy=True). Acceptable values
            include "EUR", "SAS", "AFR", "EAS", "AMR" or a path to a specific bed/bim/fam panel.
            Default is "eur".
        kb (int, optional): Width of the genomic window to look for proxies. Default is 5000.
        r2 (float, optional): Minimum linkage disequilibrium value with the main SNP
            for a proxy to be included. Default is 0.6.
        window_snps (int, optional): Compute the LD value for SNPs that are not
            more than x SNPs away from the main SNP. Default is 5000.

    Returns:
        None: The computed PRS is written to a CSV file.

    Raises:
        ValueError: If the EA or BETA column is missing, or if neither the SNP
            column nor the CHR/POS columns are present.
    """

    path = setup_genetic_path(path)  # Check path
    create_tmp()  # Make sure temporary folder exists

    # Check for mandatory columns in data
    for col in ["EA", "BETA"]:
        if col not in self.data.columns:
            raise ValueError(f"The column {col} is not found in the data!")

    # Based on column presents, run the PRS with SNP names or genomic positions (with preference for positions)
    if "CHR" in self.data.columns and "POS" in self.data.columns:
        print("CHR/POS columns present: SNPs searched based on genomic positions.")
        data_prs = self.update_snpids(path=path)
    elif "SNP" in self.data.columns:
        print("CHR/POS columns absent: SNPs searched based on SNP name.")
        data_prs = self.data.copy()
    else:
        raise ValueError("Either the SNP or the CHR/POS columns need to be present to run a PRS.")

    # Run the column checks that have not been performed yet. `.get` treats a
    # key that is absent and a key that is present but falsy the same way.
    if not self.checks.get("SNP"):
        check_snp_column(data_prs)
    if not self.checks.get("EA"):
        check_allele_column(data_prs, "EA", keep_multi=False)
    if not self.checks.get("BETA"):
        check_beta_column(data_prs, effect_column=None, preprocessing='Fill_delete')

    # Drop rows unusable for scoring. Only columns guaranteed to exist at this
    # point are used: P is not required for a PRS and may be absent.
    initial_rows = data_prs.shape[0]
    data_prs.dropna(subset=["SNP", "EA", "BETA"], inplace=True)
    deleted_rows = initial_rows - data_prs.shape[0]
    if deleted_rows > 0:
        print(
            f"{deleted_rows} ({deleted_rows/initial_rows*100:.3f}%) rows with NA values in columns SNP, EA, or BETA have been deleted."
        )

    # If proxy option
    if proxy:
        print("Identifying the SNPs present in the genetic data...")
        bim_columns = ["CHR", "SNP", "F", "POS", "A1", "A2"]
        # Obtain the list of SNPs present in the genetic data
        if path.count("$") == 1:  # If split: merge all SNP columns of the .bim files
            genetic_snp_list = []
            for i in range(1, 23):
                path_i = path.replace("$", str(i)) + ".bim"
                if os.path.exists(path_i):
                    bim_i = pd.read_csv(path_i, sep="\t", names=bim_columns)
                    genetic_snp_list.extend(bim_i.SNP.tolist())
        else:  # If not split
            # The .bim extension is appended to the file prefix;
            # os.path.join would wrongly point to a ".bim" file inside `path`.
            bim = pd.read_csv(path + ".bim", sep="\t", names=bim_columns)
            genetic_snp_list = bim.SNP.tolist()

        # Identify the SNPs already present in the genetic data
        genetic_snps = set(genetic_snp_list)
        exposure_snps = set(data_prs.SNP.values)
        snps_present = exposure_snps & genetic_snps
        print(
            f"{len(snps_present)} SNPs out of {len(exposure_snps)} are present in the genetic data."
        )

        # Search proxies for absent SNPs
        snps_absent = exposure_snps - snps_present
        if snps_absent:
            print(f"Searching proxies for {len(snps_absent)} SNPs...")
            ld = find_proxies(
                snps_absent,
                reference_panel=reference_panel,
                kb=kb,
                r2=r2,
                window_snps=window_snps,
                threads=self.cpus,
            )
            data_prs = apply_proxies(data_prs, ld, searchspace=genetic_snps)
            check_snp_column(data_prs)

    # Compute PRS
    prs_data = prs_func(data_prs, weighted, path, ram=self.ram, name=self.name)

    # Save the computed PRS data as a CSV file
    name = name if name else self.name
    prs_filename = os.path.splitext(name)[0] + ".csv"
    prs_data.to_csv(prs_filename, index=False, header=True)
    print(f"PRS data saved to {prs_filename}")
610
+
611
def set_phenotype(
    self, data, IID=None, PHENO=None, PHENO_type=None, alternate_control=False
):
    """
    Assign a phenotype dataframe to the .phenotype attribute.

    The .phenotype attribute is required by the association_test method to
    run single-SNP association tests.

    Args:
        data (pd.DataFrame): DataFrame containing individual-level row data with at least
            an individual IDs column and one phenotype column.
        IID (str, optional): Name of the individual IDs column in 'data'. These IDs should
            correspond to the genetic IDs in the FAM file that will be used for association testing.
        PHENO (str, optional): Name of the phenotype column in 'data' which will be used
            as the dependent variable for association tests.
        PHENO_type (str, optional): "quant" for a quantitative phenotype or "binary" for a
            binary one. If None, the type is inferred. Default is None.
        alternate_control (bool, optional): By default, the controls of a binary trait are
            assumed to have the most frequent value. Set to True if this is not the case.
            Default is False.

    Returns:
        None: Sets the .phenotype attribute to a (dataframe, phenotype type) tuple.
    """

    # Note the argument order expected by the helper: PHENO and PHENO_type come before IID.
    pheno_frame, pheno_kind = set_phenotype_func(
        data, PHENO, PHENO_type, IID, alternate_control
    )
    self.phenotype = (pheno_frame, pheno_kind)
645
+
646
def association_test(self, path=None, covar=None, standardize=True):
    """
    Conduct single-SNP association tests against a phenotype.

    This method requires the phenotype to be set using the set_phenotype() function.

    Args:
        path (str, optional): Path to a bed/bim/fam set of genetic files.
            If files are split by chromosomes, replace the chromosome number with '$'.
            For instance: path = "ukb_chr$_file". Default is None.
        covar (list, optional): List of columns in the phenotype dataframe to be used
            as covariates in the association tests. Default is None, which is treated
            as an empty list.
        standardize (bool, optional): If True, it will standardize a quantitative phenotype
            before performing association tests. This is typically done
            to make results more interpretable. Default is True.

    Returns:
        None: Replaces the data attribute with the dataframe returned by the
        association tests (updated BETA, SE, and P columns).

    Raises:
        ValueError: If no phenotype has been set, or if neither the SNP column
            nor the CHR/POS columns are present.
    """

    # A fresh list per call avoids sharing one mutable default between calls.
    if covar is None:
        covar = []

    # Ensure that the phenotype has been set using set_phenotype
    if not hasattr(self, "phenotype"):
        raise ValueError(
            "You first need to set a phenotype using .set_phenotype(data, PHENO, PHENO_type, IID)!"
        )

    create_tmp()  # Make sure temporary folder exists

    # Based on column presents, extract the SNP based names or genomic positions (with preference for positions)
    if "CHR" in self.data.columns and "POS" in self.data.columns:
        print("CHR/POS columns present: SNPs searched based on genomic positions.")
        data = self.update_snpids(path=path)
    elif "SNP" in self.data.columns:
        print("CHR/POS columns absent: SNPs searched based on SNP name.")
        data = self.data
    else:
        raise ValueError("Either the SNP or the CHR/POS columns need to be present to identify SNPs in genetic data.")

    # Extract SNPs using the provided path and SNP list
    extract_snps_func(data["SNP"], self.name, path)

    # Perform the association test and store the result
    phenotype_data, phenotype_type = self.phenotype[0], self.phenotype[1]
    self.data = association_test_func(
        data,
        covar,
        standardize,
        self.name,
        phenotype_data,
        phenotype_type,
    )

    print("The BETA, SE, P columns of the .data attribute have been updated.")
706
+
707
def query_outcome(
    self,
    outcome,
    name=None,
    proxy=True,
    reference_panel="eur",
    kb=5000,
    r2=0.6,
    window_snps=5000,
):
    """
    Prepare the dataframes required for Mendelian Randomization (MR), using the SNP information in `data` as exposure.

    The outcome data is queried, with or without proxying, and the result is
    stored in the `MR_data` attribute as a list
    [exposure_data, outcome_data, outcome_name].

    Args:
        outcome: Can be a Geno object (from a GWAS) or a filepath of types: .h5 or .hdf5
            (created with the :meth:`Geno.save` method).
        name (str, optional): Name for the outcome data. Defaults to None.
        proxy (bool, optional): If true, proxies are searched. Default is True.
        reference_panel (str, optional): The reference population to get linkage
            disequilibrium values and find proxies (only if proxy=True). Acceptable values
            include "EUR", "SAS", "AFR", "EAS", "AMR" or a path to a specific bed/bim/fam panel.
            Default is "eur".
        kb (int, optional): Width of the genomic window to look for proxies. Default is 5000.
        r2 (float, optional): Minimum linkage disequilibrium value with the main SNP
            for a proxy to be included. Default is 0.6.
        window_snps (int, optional): Compute the LD value for SNPs that are not
            more than x SNPs away from the main SNP. Default is 5000.

    Returns:
        None: Sets the `MR_data` attribute for the instance.
    """

    # All the work is delegated; self.cpus is passed as the last argument.
    exposure_frame, outcome_frame, outcome_label = query_outcome_func(
        self.data,
        outcome,
        name,
        proxy,
        reference_panel,
        kb,
        r2,
        window_snps,
        self.cpus,
    )

    self.MR_data = [exposure_frame, outcome_frame, outcome_label]
756
+
757
def MR(
    self,
    methods=None,
    action=2,
    eaf_threshold=0.42,
    heterogeneity=False,
    nboot=10000,
    penk=20,
    exposure_name=None,
    outcome_name=None,
):
    """
    Run Mendelian Randomization (MR) on the data prepared by `query_outcome`.

    The exposure/outcome tables are read from the `MR_data` attribute and passed
    to `MR_func`. The results are returned and also cached in the `MR_results`
    attribute as (res, df_mr, exposure_name, outcome_name) for later use
    (e.g. by `MR_plot`).

    Args:
        methods (list, optional): List of MR methods to run. Possible options include:
            "IVW": inverse variance-weighted with random effects and under-dispersion correction
            "IVW-FE": inverse variance-weighted with fixed effects
            "IVW-RE": inverse variance-weighted with random effects and without under-dispersion correction
            "UWR": unweighted regression
            "WM": weighted median (bootstrapped standard errors)
            "WM-pen": penalised weighted median (bootstrapped standard errors)
            "Simple-median": simple median (bootstrapped standard errors)
            "Sign": sign concordance test
            "Egger": egger regression
            "Egger-boot": egger regression with bootstrapped standard errors
            If None (default), uses
            ["IVW","IVW-FE","UWR","WM","WM-pen","Simple-median","Sign","Egger","Egger-boot"].
        action (int, optional): How to treat palindromes during harmonizing between
            exposure and outcome data. Accepts:
            1: Doesn't flip them (Assumes all alleles are on the forward strand)
            2: Uses allele frequencies to attempt to flip (conservative, default)
            3: Removes all palindromic SNPs (very conservative)
        eaf_threshold (float, optional): Max effect allele frequency accepted when
            flipping palindromic SNPs (relevant if action=2). Default is 0.42.
        heterogeneity (bool, optional): If True, includes heterogeneity tests in the
            results (Cochran's Q test). Default is False.
        nboot (int, optional): Number of bootstrap replications for methods with
            bootstrapping. Default is 10000.
        penk (int, optional): Penalty value for the WM-pen method. Default is 20.
        exposure_name (str, optional): Name of the exposure data (only for display
            purposes). Falls back to the instance's `name` when not given.
        outcome_name (str, optional): Name of the outcome data (only for display
            purposes). When given, it replaces the name stored in `MR_data`.

    Returns:
        pd.DataFrame: A table with MR results.

    Raises:
        ValueError: If `query_outcome` has not been called beforehand.
    """
    # Ensure that query_outcome has been previously called
    if not hasattr(self, "MR_data"):
        raise ValueError("You must first call query_outcome() before running MR.")

    # Build the default list per call: a list literal in the signature would be
    # a single object shared (and potentially mutated) across all calls.
    if methods is None:
        methods = [
            "IVW",
            "IVW-FE",
            "UWR",
            "WM",
            "WM-pen",
            "Simple-median",
            "Sign",
            "Egger",
            "Egger-boot",
        ]

    if outcome_name:
        self.MR_data[2] = outcome_name
    exp_name = exposure_name if exposure_name else self.name

    res, df_mr = MR_func(
        self.MR_data,
        methods,
        action,
        heterogeneity,
        eaf_threshold,
        nboot,
        penk,
        exp_name,
        self.cpus,
    )

    # The names are cached exactly as passed by the caller (possibly None);
    # MR_plot substitutes generic axis labels when they are missing.
    self.MR_results = (res, df_mr, exposure_name, outcome_name)
    return res
832
+
833
def MR_plot(
    self,
    methods=None,
    exposure_name=None,
    outcome_name=None,
    filename=None
):
    """
    Creates and returns a scatter plot of individual SNP effects with lines representing different Mendelian Randomization (MR) methods. Each MR method specified in the 'methods' argument is represented as a line in the plot.

    Args:
        methods (list of str, optional): A list of MR methods to be included in the plot. If None (default), uses "IVW", "WM", "Simple-median", and "Egger".
        exposure_name (str, optional): A custom label for the exposure effect axis. If None, uses the label provided in the MR function call or a default label.
        outcome_name (str, optional): A custom label for the outcome effect axis. If None, uses the label provided in the MR function call or a default label.
        filename (str, optional): The filename (without extension; ".png" is appended) where the plot will be saved. If None, the plot is not saved.

    Returns:
        plotnine.ggplot.ggplot: A plotnine ggplot object representing the scatter plot of individual SNP effects with MR method lines.

    Raises:
        ValueError: If MR analysis has not been performed prior to calling this function.

    Note:
        This function requires prior execution of the `MR` method to compute MR results. Make sure the MR analysis is performed on the data before calling `MR_plot`.
    """
    if not hasattr(self, "MR_results"):
        raise ValueError("You need to run an MR analysis with the MR method before calling the MR_plot function.")

    # Build the default list per call instead of sharing one mutable default.
    if methods is None:
        methods = ["IVW", "WM", "Simple-median", "Egger"]

    ## Extract the previously computed MR results
    res = self.MR_results[0]
    # Work on a copy: the sign flip below must not alter the cached MR results.
    df_mr = self.MR_results[1].copy()
    exposure_name = self.MR_results[2] if not exposure_name else exposure_name
    exposure_name = "Effect on the exposure" if not exposure_name else f"Effect on {exposure_name}"
    outcome_name = self.MR_results[3] if not outcome_name else outcome_name
    outcome_name = "Effect on the outcome" if not outcome_name else f"Effect on {outcome_name}"

    ## Switch all exposure betas to >= 0 (outcome betas are flipped alongside)
    sign = np.where(df_mr["BETA_e"] < 0, -1, 1)
    df_mr["BETA_e"] = df_mr["BETA_e"] * sign
    df_mr["BETA_o"] = df_mr["BETA_o"] * sign

    ## Create the scatter plot with error bars
    plot = (
        ggplot(df_mr, aes('BETA_e', 'BETA_o'))

        + geom_errorbarh(aes(xmin='BETA_e-SE_e', xmax='BETA_e+SE_e'), height=0, color="gray", size=0.1)
        + geom_errorbar(aes(ymin='BETA_o-SE_o', ymax='BETA_o+SE_o'), width=0, color="gray", size=0.1)
        + geom_point(color='black', size=0.2)
        + geom_abline(slope=0, intercept=0, color='black')
        + labs(x=exposure_name, y=outcome_name)
        + theme(
            axis_title=element_text(size=12),
            axis_text=element_text(size=10),
            figure_size=(10,6)
        )
    )

    ## Add the lines corresponding to the specified MR methods (if present in the computation)
    lines = []
    for method in methods:
        if method not in MR_METHODS_NAMES:
            warnings.warn(f"{method} is not an appropriate MR method. MR methods can be IVW, WM, Egger... Please refer to the documentation for more.")
            continue

        ## Egger methods map to a (slope name, intercept name) pair; the others to a single name
        if method.startswith("Egger"):
            method_name = MR_METHODS_NAMES[method][0]
            method_name_intercept = MR_METHODS_NAMES[method][1]
        else:
            method_name = MR_METHODS_NAMES[method]
            method_name_intercept = None

        res_row = res[res.method == method_name]
        if res_row.shape[0] == 0:
            warnings.warn(f"The {method_name} ({method}) method was not included in the MR method call and will be excluded from the plot.")
            continue
        if res_row.shape[0] != 1:
            # Ambiguous result rows: no line is drawn for this method.
            continue

        if method_name_intercept is None:
            intercept = 0
        else:
            res_row_intercept = res[res.method == method_name_intercept]
            if res_row_intercept.shape[0] != 1:
                continue
            intercept = res_row_intercept["b"].values[0]

        lines.append({
            'slope': res_row["b"].values[0],
            'intercept': intercept,
            'MR Methods': method_name  # Use method_name as the color label
        })

    # Only add the layer when there is something to draw: an empty DataFrame has
    # none of the columns referenced by the aesthetics mapping.
    if lines:
        line_data = pd.DataFrame(lines)
        plot += geom_abline(aes(slope='slope', intercept='intercept', color='MR Methods'), data=line_data)

    ## Save plot if filename is specified
    if filename:
        plot.save(f"{filename}.png", dpi=500, width=10, height=6, verbose=False)

    return plot
933
+
934
+
935
def MRpresso(
    self,
    action=2,
    eaf_threshold=0.42,
    n_iterations=10000,
    outlier_test=True,
    distortion_test=True,
    significance_p=0.05,
    cpus=-1,
):
    """
    Run the MR-PRESSO algorithm (detection and correction of horizontal pleiotropy)
    on the data prepared by `query_outcome`.

    Args:
        action (int, optional): Treatment for palindromes during harmonizing between
            exposure and outcome data. Options:
            - 1: Don't flip (assume all alleles are on the forward strand)
            - 2: Use allele frequencies to flip (default)
            - 3: Remove all palindromic SNPs
        eaf_threshold (float, optional): Max effect allele frequency when flipping
            palindromic SNPs (relevant if action=2). Default is 0.42.
        n_iterations (int, optional): Number of random data generation steps for
            improved result stability. Default is 10000.
        outlier_test (bool, optional): Identify outlier SNPs responsible for horizontal
            pleiotropy if global test p_value < significance_p. Default is True.
        distortion_test (bool, optional): Test significant distortion in causal estimates
            before and after outlier removal if global test p_value < significance_p.
            Default is True.
        significance_p (float, optional): Statistical significance threshold for
            horizontal pleiotropy detection (both global test and outlier identification).
            Default is 0.05.
        cpus (int, optional): Number of cpu cores used for the parallel random data
            generation. The default -1 means "use the instance's `cpus` attribute".

    Returns:
        The value returned by `mrpresso_func`. As originally documented, a list with:
            - mod_table: DataFrame containing the original (before outlier removal)
                and outlier-corrected (after outlier removal) inverse variance-weighted MR results.
            - GlobalTest: p-value of the global MR-PRESSO test indicating the presence of horizontal pleiotropy.
            - OutlierTest: DataFrame assigning a p-value to each SNP representing the likelihood of this
                SNP being responsible for the global pleiotropy. Set to NaN if global test p_value > significance_p.
            - DistortionTest: p-value for the distortion test.

    Raises:
        ValueError: If `query_outcome` has not been called beforehand.
    """
    if not hasattr(self, "MR_data"):
        raise ValueError("You must first call query_outcome() before running MR.")

    # -1 is the sentinel for "inherit the instance-level cpu setting"
    n_cpus = cpus if cpus != -1 else self.cpus

    presso_args = (
        self.MR_data,
        action,
        eaf_threshold,
        n_iterations,
        outlier_test,
        distortion_test,
        significance_p,
        n_cpus,
    )
    return mrpresso_func(*presso_args)
992
+
993
def lift(
    self,
    start="hg19",
    end="hg38",
    replace=False,
    extraction_file=False,
    chain_file=None,
    name=None,
    liftover_path=None,
):
    """
    Lift the genomic coordinates of the data from one genetic build to another.

    Args:
        start (str, optional): Current build of the data. Default is "hg19".
        end (str, optional): Target build for the liftover. Default is "hg38".
        replace (bool, optional): If True, the lift operates on the `data` attribute
            itself; otherwise on a copy of it. Default is False.
        extraction_file (bool, optional): If True, prints a CHR POS SNP space-delimited
            file. Default is False.
        chain_file (str, optional): Path to a local chain file for the lift.
            If provided, `start` and `end` arguments are not considered. Default is None.
        name (str, optional): Filename or filepath (without extension) to save the
            lifted dataframe. If not provided, the data is not saved.
        liftover_path (str, optional): Path to the UCSC liftover executable. If not
            provided, the lift will be done in python (slower for large amount of SNPs).

    Returns:
        pd.DataFrame: Data after being lifted (the value returned by `lift_data`).

    Raises:
        ValueError: If the CHR or POS column is missing from the data.
    """
    coordinate_columns = ("CHR", "POS")

    # Both coordinate columns are mandatory; report the first one missing
    absent = [col for col in coordinate_columns if col not in self.data.columns]
    if absent:
        raise ValueError(f"The column {absent[0]} is not found in the data!")

    create_tmp()  # Create tmp folder if does not exist

    # Work directly on the attribute when replacing, on a copy otherwise
    data = self.data if replace else self.data.copy()

    # Preprocess CHR and POS unless the instance already recorded them as checked
    for col in coordinate_columns:
        if not self.checks[col]:
            check_int_column(data, col)

    # The checks only describe self.data, so record them only when it was the
    # object that got preprocessed
    if replace:
        for col in coordinate_columns:
            self.checks[col] = True

    print(
        f"Lifting the data{' inplace' if replace else ''}. "
        f"The .data attribute will {'' if replace else 'not '}be modified. "
        f"Use replace={'False' if replace else 'True'} to {'leave it as is' if replace else 'lift inplace'}."
    )

    # Warnings emitted during the lift are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        lifted = lift_data(
            data,
            start=start,
            end=end,
            extraction_file=extraction_file,
            chain_file=chain_file,
            name=name,
            liftover_path=liftover_path,
            object_id=self.name,
        )

    return lifted
1065
+
1066
def standardize(self):
    """
    Standardize the Betas and adjust the SE column accordingly.

    BETA is centred on its mean and divided by its standard deviation (as
    computed by `np.std`). SE is then recomputed from the standardized BETA and
    the P column as |BETA / norm.ppf(P / 2)|. The `data` attribute is modified
    in place.

    Raises:
        ValueError: If one of the BETA, SE or P columns is not found in the data.
    """
    # Imported locally: no scipy.stats import is visible among the imports at
    # the top of this file, yet the SE computation needs the normal quantile
    # function.
    from scipy import stats as st

    # P is required as well: the SE column is recomputed from it below.
    required_columns = ["BETA", "SE", "P"]
    for column in required_columns:
        if column not in self.data.columns:
            raise ValueError(f"The column {column} is not found in the data!")

    beta = self.data["BETA"]
    self.data["BETA"] = (beta - np.mean(beta)) / np.std(beta)

    # Uses the freshly standardized BETA together with the two-sided P-value
    self.data["SE"] = np.abs(self.data["BETA"] / st.norm.ppf(self.data["P"] / 2))
    print(
        "The Beta column has been standardized and the SE column has been adjusted."
    )
1085
+
1086
def sort_group(self, method="lowest_p"):
    """
    Handle duplicate SNPs. Useful if the instance combines different Genos.

    Args:
        method (str, optional): How to handle duplicates. Default is "lowest_p",
            which keeps, for each SNP, the row with the lowest P-value. Any other
            value leaves the data untouched.

    Returns:
        None: The `data` attribute is replaced by the de-duplicated table, sorted
        by SNP, with SNP as the first column and a fresh integer index.
    """
    if method == "lowest_p":
        ranked = self.data.sort_values(by=["P"])

        # Keep whole rows. GroupBy.first() would instead pick the first non-null
        # value column by column, which can stitch together values coming from
        # different duplicate rows. Rows without a SNP id are dropped, as a
        # groupby on SNP would have done.
        best = ranked.dropna(subset=["SNP"]).drop_duplicates(subset=["SNP"], keep="first")

        # Reproduce the layout of the previous groupby-based output:
        # rows ordered by SNP, SNP as leading column, index reset.
        best = best.sort_values(by=["SNP"])
        column_order = ["SNP"] + [col for col in best.columns if col != "SNP"]
        self.data = best[column_order].reset_index(drop=True)
    return
1101
+
1102
def copy(self):
    """
    Return an independent duplicate of this Geno instance.

    Returns:
        Geno: A deep copy of the instance; nested objects are copied as well,
        so modifying the duplicate does not affect the original.
    """
    from copy import deepcopy

    duplicate = deepcopy(self)
    return duplicate
1110
+
1111
def save(self, path="", fmt="h5", sep="\t", header=True):
    """
    Write the `data` attribute to a file through `save_data`.

    The instance's `name` attribute is passed on as the file name.

    Args:
        path (str, optional): Folder path to save the file. Defaults to the current directory.
        fmt (str, optional): File format. Options: .h5 (default), .csv, .txt. Future: .vcf, .vcf.gz.
        sep (str, optional): Delimiter for .csv and .txt formats. Default is tab.
        header (bool, optional): Save column names for .csv and .txt formats. Default is True.

    Returns:
        None
    """
    # Everything except the table itself is forwarded by keyword
    options = {
        "name": self.name,
        "path": path,
        "fmt": fmt,
        "sep": sep,
        "header": header,
    }
    save_data(self.data, **options)
    return
1126
+
1127
def merge_command_parallel(df_subset, path):
    """
    Helper function of the update_snpids method to update SNP in parallel when genetic data is split by chromosome.

    Args:
        df_subset (pd.DataFrame): Rows to annotate, with CHR and POS columns. The
            chromosome is taken from the first row (presumably all rows share it,
            as the subset is meant to be per-chromosome -- verify against the caller).
        path (str): Path template of the genetic files without extension, in which
            "$" stands for the chromosome number.

    Returns:
        pd.DataFrame or None: `df_subset` left-merged on CHR/POS with the SNP
        column of the matching .bim file (named "SNP_new" if `df_subset` already
        has a SNP column), or None if that .bim file does not exist.
    """
    # Read the value through the CHR column rather than through the first row:
    # a row taken from an all-numeric frame is upcast to float, which would
    # turn chromosome 1 into "1.0" and make the path lookup fail silently.
    chr_number = df_subset["CHR"].iloc[0]
    bim_path = path.replace("$", str(chr_number)) + ".bim"
    if not os.path.exists(bim_path):
        return

    bim = pd.read_csv(
        bim_path, sep="\t", names=["CHR", "SNP", "F", "POS", "A1", "A2"]
    )
    # A position must map to a single SNP id, otherwise the merge would
    # duplicate rows of df_subset
    bim.drop_duplicates(subset=["CHR", "POS"], keep='first', inplace=True)

    return df_subset.merge(
        bim[["CHR", "POS", "SNP"]], on=["CHR", "POS"], how="left", suffixes=('', '_new')
    )