gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +26 -147
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +291 -163
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +43 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +793 -682
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +2 -2
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +24 -19
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +109 -72
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +3 -1
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/hm_rsid_to_chrpos.py
CHANGED
|
@@ -135,7 +135,7 @@ def rsID2chrpos(path,
|
|
|
135
135
|
print("Writing to "+path+".rsid")
|
|
136
136
|
sumstats = sumstats.loc[sumstats["#POS"]!=0]
|
|
137
137
|
|
|
138
|
-
sumstats
|
|
138
|
+
sumstats[to_round] = sumstats[to_round].round(4)
|
|
139
139
|
sumstats.sort_values(["#CHROM","#POS"]).to_csv(path+".rsid","\t",na_rep='\.')
|
|
140
140
|
|
|
141
141
|
return sumstats
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -67,7 +67,7 @@ def preformat(sumstats,
|
|
|
67
67
|
#######################################################################################################################################################
|
|
68
68
|
if fmt is not None:
|
|
69
69
|
# loading format parameters
|
|
70
|
-
|
|
70
|
+
log.write("Start to load format from formatbook....",verbose=verbose)
|
|
71
71
|
|
|
72
72
|
# load format data
|
|
73
73
|
meta_data,rename_dictionary = get_format_dict(fmt)
|
|
@@ -97,7 +97,7 @@ def preformat(sumstats,
|
|
|
97
97
|
inpath = sumstats
|
|
98
98
|
###load sumstats by each chromosome #################################################
|
|
99
99
|
if "@" in inpath:
|
|
100
|
-
|
|
100
|
+
log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
|
|
101
101
|
inpath_chr_list=[]
|
|
102
102
|
inpath_chr_num_list=[]
|
|
103
103
|
for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
|
|
@@ -105,9 +105,10 @@ def preformat(sumstats,
|
|
|
105
105
|
if isfile_casesensitive(inpath_chr):
|
|
106
106
|
inpath_chr_num_list.append(str(chromosome))
|
|
107
107
|
inpath_chr_list.append(inpath_chr)
|
|
108
|
-
|
|
108
|
+
log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
109
109
|
readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
|
|
110
110
|
row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
|
|
111
|
+
# columns in the sumstats
|
|
111
112
|
raw_cols = row_one.columns
|
|
112
113
|
else:
|
|
113
114
|
##### loading data from tabular file#################################################
|
|
@@ -173,6 +174,7 @@ def preformat(sumstats,
|
|
|
173
174
|
usecols.append(eaf)
|
|
174
175
|
rename_dictionary[eaf]= "EAF"
|
|
175
176
|
elif neaf:
|
|
177
|
+
# neaf will be converted to eaf
|
|
176
178
|
usecols.append(neaf)
|
|
177
179
|
rename_dictionary[neaf]= "EAF"
|
|
178
180
|
if maf:
|
|
@@ -281,10 +283,10 @@ def preformat(sumstats,
|
|
|
281
283
|
## loading data from path
|
|
282
284
|
inpath = sumstats
|
|
283
285
|
if "@" in inpath:
|
|
284
|
-
|
|
286
|
+
log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
|
|
285
287
|
sumstats_chr_list=[]
|
|
286
288
|
for i in inpath_chr_list:
|
|
287
|
-
|
|
289
|
+
log.write(" -Loading:" + i)
|
|
288
290
|
skip_rows = get_skip_rows(i)
|
|
289
291
|
readargs["skiprows"] = skip_rows
|
|
290
292
|
sumstats_chr = pd.read_table(i,
|
|
@@ -292,14 +294,14 @@ def preformat(sumstats,
|
|
|
292
294
|
dtype=dtype_dictionary,
|
|
293
295
|
**readargs)
|
|
294
296
|
sumstats_chr_list.append(sumstats_chr)
|
|
295
|
-
|
|
297
|
+
log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
296
298
|
sumstats = pd.concat(sumstats_chr_list, axis=0, ignore_index=True)
|
|
297
299
|
del(sumstats_chr_list)
|
|
298
300
|
gc.collect()
|
|
299
301
|
else:
|
|
300
302
|
skip_rows = get_skip_rows(inpath)
|
|
301
303
|
readargs["skiprows"] = skip_rows
|
|
302
|
-
|
|
304
|
+
log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
|
|
303
305
|
sumstats = pd.read_table(inpath,
|
|
304
306
|
usecols=set(usecols),
|
|
305
307
|
dtype=dtype_dictionary,
|
|
@@ -307,8 +309,8 @@ def preformat(sumstats,
|
|
|
307
309
|
|
|
308
310
|
elif type(sumstats) is pd.DataFrame:
|
|
309
311
|
## loading data from dataframe
|
|
310
|
-
|
|
311
|
-
sumstats = sumstats
|
|
312
|
+
log.write("Start to initialize gl.Sumstats from pandas DataFrame ...",verbose=verbose)
|
|
313
|
+
sumstats = sumstats[usecols].copy()
|
|
312
314
|
for key,value in dtype_dictionary.items():
|
|
313
315
|
if key in usecols:
|
|
314
316
|
sumstats[key] = sumstats[key].astype(value)
|
|
@@ -324,9 +326,9 @@ def preformat(sumstats,
|
|
|
324
326
|
converted_columns = list(map(lambda x: rename_dictionary[x], set(usecols)))
|
|
325
327
|
|
|
326
328
|
## renaming log
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
329
|
+
log.write(" -Reading columns :", ",".join(set(usecols)),verbose=verbose)
|
|
330
|
+
log.write(" -Renaming columns to :", ",".join(converted_columns),verbose=verbose)
|
|
331
|
+
log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns),verbose=verbose)
|
|
330
332
|
|
|
331
333
|
## renaming #####################################################################################
|
|
332
334
|
sumstats = sumstats.rename(columns=rename_dictionary)
|
|
@@ -356,7 +358,7 @@ def preformat(sumstats,
|
|
|
356
358
|
gc.collect()
|
|
357
359
|
check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
|
|
358
360
|
|
|
359
|
-
|
|
361
|
+
log.write("Finished loading data successfully!",verbose=verbose)
|
|
360
362
|
return sumstats
|
|
361
363
|
|
|
362
364
|
|
|
@@ -396,17 +398,21 @@ def get_skip_rows(inpath):
|
|
|
396
398
|
return 0
|
|
397
399
|
|
|
398
400
|
def parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log,verbose=True):
|
|
399
|
-
|
|
400
|
-
|
|
401
|
+
log.write(" -Parsing based on FORMAT: ", format_cols,verbose=verbose)
|
|
402
|
+
log.write(" -Parsing vcf study : ", study,verbose=verbose)
|
|
401
403
|
sumstats[format_cols] = sumstats[study].str.split(":",expand=True).values
|
|
402
404
|
sumstats = sumstats.drop(["FORMAT",study],axis=1)
|
|
403
|
-
sumstats = sumstats
|
|
405
|
+
sumstats = sumstats[ vcf_usecols]
|
|
404
406
|
gc.collect()
|
|
405
407
|
return sumstats
|
|
406
408
|
|
|
407
|
-
def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False):
|
|
408
|
-
|
|
409
|
+
def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False, skip_meta_records=None):
|
|
410
|
+
log.write(" -"+fmt+" format meta info:",verbose=verbose)
|
|
411
|
+
if skip_meta_records is None:
|
|
412
|
+
skip_meta_records =[]
|
|
409
413
|
for key,value in meta_data.items():
|
|
414
|
+
if key in skip_meta_records:
|
|
415
|
+
continue
|
|
410
416
|
if value is None:
|
|
411
417
|
continue
|
|
412
418
|
if type(value) is str:
|
|
@@ -431,32 +437,32 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
|
|
|
431
437
|
if fmt!="gwaslab":
|
|
432
438
|
if output == False:
|
|
433
439
|
if fmt!="auto":
|
|
434
|
-
if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:")
|
|
435
|
-
if verbose:log.write(" - "+fmt+" keys:",",".join(keys))
|
|
436
|
-
if verbose:log.write(" - gwaslab values:",",".join(values))
|
|
440
|
+
if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
|
|
441
|
+
if verbose:log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
|
|
442
|
+
if verbose:log.write(" - gwaslab values:",",".join(values),verbose=verbose)
|
|
437
443
|
else:
|
|
438
|
-
if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...")
|
|
439
|
-
if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json")
|
|
444
|
+
if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
|
|
445
|
+
if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
|
|
440
446
|
else:
|
|
441
|
-
if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",)
|
|
447
|
+
if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
|
|
442
448
|
keys=[]
|
|
443
449
|
values=[]
|
|
444
450
|
for key,value in rename_dictionary.items():
|
|
445
451
|
keys.append(key)
|
|
446
452
|
values.append(value)
|
|
447
|
-
if verbose:log.write(" - gwaslab keys:", ','.join(keys))
|
|
448
|
-
if verbose:log.write(" - "+fmt+" values:" , ','.join(values))
|
|
453
|
+
if verbose:log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
|
|
454
|
+
if verbose:log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)
|
|
449
455
|
|
|
450
456
|
def process_neaf(sumstats,log,verbose):
|
|
451
|
-
|
|
457
|
+
log.write(" -NEAF is specified...",verbose=verbose)
|
|
452
458
|
pre_number=len(sumstats)
|
|
453
|
-
|
|
454
|
-
sumstats
|
|
459
|
+
log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
|
|
460
|
+
sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
|
|
455
461
|
sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
|
|
456
|
-
sumstats
|
|
457
|
-
|
|
462
|
+
sumstats["EAF"] = 1- sumstats["EAF"]
|
|
463
|
+
log.write(" -Converted NEAF to EAF.",verbose=verbose)
|
|
458
464
|
after_number=len(sumstats)
|
|
459
|
-
|
|
465
|
+
log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
|
|
460
466
|
return sumstats
|
|
461
467
|
|
|
462
468
|
def process_allele(sumstats,log,verbose):
|
|
@@ -466,17 +472,17 @@ def process_allele(sumstats,log,verbose):
|
|
|
466
472
|
if "REF" in sumstats.columns and "ALT" in sumstats.columns:
|
|
467
473
|
|
|
468
474
|
if "NEA" not in sumstats.columns:
|
|
469
|
-
|
|
475
|
+
log.write(" NEA not available: assigning REF to NEA...",verbose=verbose)
|
|
470
476
|
sumstats["NEA"]=sumstats["REF"]
|
|
471
477
|
|
|
472
|
-
|
|
478
|
+
log.write(" -EA,REF and ALT columns are available: assigning NEA...",verbose=verbose)
|
|
473
479
|
ea_alt = sumstats["EA"]==sumstats["ALT"]
|
|
474
480
|
|
|
475
|
-
|
|
481
|
+
log.write(" -For variants with EA == ALT : assigning REF to NEA ...",verbose=verbose)
|
|
476
482
|
sumstats.loc[ea_alt,"NEA"] = sumstats.loc[ea_alt,"REF"]
|
|
477
483
|
|
|
478
484
|
ea_not_alt = sumstats["EA"]!=sumstats["ALT"]
|
|
479
|
-
|
|
485
|
+
log.write(" -For variants with EA != ALT : assigning ALT to NEA ...",verbose=verbose)
|
|
480
486
|
sumstats.loc[ea_not_alt,"NEA"] = sumstats.loc[ea_not_alt,"ALT"]
|
|
481
487
|
|
|
482
488
|
#sumstats = sumstats.drop(labels=["REF","ALT"],axis=1)
|
|
@@ -489,7 +495,7 @@ def process_allele(sumstats,log,verbose):
|
|
|
489
495
|
return sumstats
|
|
490
496
|
|
|
491
497
|
def process_status(sumstats,build,log,verbose):
|
|
492
|
-
|
|
498
|
+
log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
|
|
493
499
|
#sumstats["STATUS"] = int(build)*(10**5) +99999
|
|
494
500
|
build = _process_build(build,log,verbose)
|
|
495
501
|
sumstats["STATUS"] = build +"99999"
|