gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. More details are available via the link on the original diff page.
- gwaslab/__init__.py +1 -1
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +80 -178
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +312 -159
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +46 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +15 -1
- gwaslab/qc_fix_sumstats.py +956 -719
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +44 -5
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +26 -21
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +214 -98
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +16 -9
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
- gwaslab-3.4.38.dist-info/RECORD +72 -0
- gwaslab-3.4.36.dist-info/RECORD +0 -72
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/hm_rsid_to_chrpos.py
CHANGED
|
@@ -135,7 +135,7 @@ def rsID2chrpos(path,
|
|
|
135
135
|
print("Writing to "+path+".rsid")
|
|
136
136
|
sumstats = sumstats.loc[sumstats["#POS"]!=0]
|
|
137
137
|
|
|
138
|
-
sumstats
|
|
138
|
+
sumstats[to_round] = sumstats[to_round].round(4)
|
|
139
139
|
sumstats.sort_values(["#CHROM","#POS"]).to_csv(path+".rsid","\t",na_rep='\.')
|
|
140
140
|
|
|
141
141
|
return sumstats
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -8,6 +8,7 @@ from gwaslab.bd_common_data import get_format_dict
|
|
|
8
8
|
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
9
9
|
from gwaslab.qc_fix_sumstats import _process_build
|
|
10
10
|
from gwaslab.qc_check_datatype import check_datatype
|
|
11
|
+
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
11
12
|
|
|
12
13
|
#20221030
|
|
13
14
|
def preformat(sumstats,
|
|
@@ -66,7 +67,7 @@ def preformat(sumstats,
|
|
|
66
67
|
#######################################################################################################################################################
|
|
67
68
|
if fmt is not None:
|
|
68
69
|
# loading format parameters
|
|
69
|
-
|
|
70
|
+
log.write("Start to load format from formatbook....",verbose=verbose)
|
|
70
71
|
|
|
71
72
|
# load format data
|
|
72
73
|
meta_data,rename_dictionary = get_format_dict(fmt)
|
|
@@ -96,7 +97,7 @@ def preformat(sumstats,
|
|
|
96
97
|
inpath = sumstats
|
|
97
98
|
###load sumstats by each chromosome #################################################
|
|
98
99
|
if "@" in inpath:
|
|
99
|
-
|
|
100
|
+
log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
|
|
100
101
|
inpath_chr_list=[]
|
|
101
102
|
inpath_chr_num_list=[]
|
|
102
103
|
for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
|
|
@@ -104,9 +105,10 @@ def preformat(sumstats,
|
|
|
104
105
|
if isfile_casesensitive(inpath_chr):
|
|
105
106
|
inpath_chr_num_list.append(str(chromosome))
|
|
106
107
|
inpath_chr_list.append(inpath_chr)
|
|
107
|
-
|
|
108
|
+
log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
108
109
|
readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
|
|
109
110
|
row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
|
|
111
|
+
# columns in the sumstats
|
|
110
112
|
raw_cols = row_one.columns
|
|
111
113
|
else:
|
|
112
114
|
##### loading data from tabular file#################################################
|
|
@@ -172,6 +174,7 @@ def preformat(sumstats,
|
|
|
172
174
|
usecols.append(eaf)
|
|
173
175
|
rename_dictionary[eaf]= "EAF"
|
|
174
176
|
elif neaf:
|
|
177
|
+
# neaf will be converted to eaf
|
|
175
178
|
usecols.append(neaf)
|
|
176
179
|
rename_dictionary[neaf]= "EAF"
|
|
177
180
|
if maf:
|
|
@@ -280,10 +283,10 @@ def preformat(sumstats,
|
|
|
280
283
|
## loading data from path
|
|
281
284
|
inpath = sumstats
|
|
282
285
|
if "@" in inpath:
|
|
283
|
-
|
|
286
|
+
log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
|
|
284
287
|
sumstats_chr_list=[]
|
|
285
288
|
for i in inpath_chr_list:
|
|
286
|
-
|
|
289
|
+
log.write(" -Loading:" + i)
|
|
287
290
|
skip_rows = get_skip_rows(i)
|
|
288
291
|
readargs["skiprows"] = skip_rows
|
|
289
292
|
sumstats_chr = pd.read_table(i,
|
|
@@ -291,14 +294,14 @@ def preformat(sumstats,
|
|
|
291
294
|
dtype=dtype_dictionary,
|
|
292
295
|
**readargs)
|
|
293
296
|
sumstats_chr_list.append(sumstats_chr)
|
|
294
|
-
|
|
297
|
+
log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
295
298
|
sumstats = pd.concat(sumstats_chr_list, axis=0, ignore_index=True)
|
|
296
299
|
del(sumstats_chr_list)
|
|
297
300
|
gc.collect()
|
|
298
301
|
else:
|
|
299
302
|
skip_rows = get_skip_rows(inpath)
|
|
300
303
|
readargs["skiprows"] = skip_rows
|
|
301
|
-
|
|
304
|
+
log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
|
|
302
305
|
sumstats = pd.read_table(inpath,
|
|
303
306
|
usecols=set(usecols),
|
|
304
307
|
dtype=dtype_dictionary,
|
|
@@ -306,8 +309,8 @@ def preformat(sumstats,
|
|
|
306
309
|
|
|
307
310
|
elif type(sumstats) is pd.DataFrame:
|
|
308
311
|
## loading data from dataframe
|
|
309
|
-
|
|
310
|
-
sumstats = sumstats
|
|
312
|
+
log.write("Start to initialize gl.Sumstats from pandas DataFrame ...",verbose=verbose)
|
|
313
|
+
sumstats = sumstats[usecols].copy()
|
|
311
314
|
for key,value in dtype_dictionary.items():
|
|
312
315
|
if key in usecols:
|
|
313
316
|
sumstats[key] = sumstats[key].astype(value)
|
|
@@ -323,9 +326,9 @@ def preformat(sumstats,
|
|
|
323
326
|
converted_columns = list(map(lambda x: rename_dictionary[x], set(usecols)))
|
|
324
327
|
|
|
325
328
|
## renaming log
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
+
log.write(" -Reading columns :", ",".join(set(usecols)),verbose=verbose)
|
|
330
|
+
log.write(" -Renaming columns to :", ",".join(converted_columns),verbose=verbose)
|
|
331
|
+
log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns),verbose=verbose)
|
|
329
332
|
|
|
330
333
|
## renaming #####################################################################################
|
|
331
334
|
sumstats = sumstats.rename(columns=rename_dictionary)
|
|
@@ -353,7 +356,9 @@ def preformat(sumstats,
|
|
|
353
356
|
sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
|
|
354
357
|
check_datatype(sumstats,log=log,verbose=verbose)
|
|
355
358
|
gc.collect()
|
|
356
|
-
|
|
359
|
+
check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
|
|
360
|
+
|
|
361
|
+
log.write("Finished loading data successfully!",verbose=verbose)
|
|
357
362
|
return sumstats
|
|
358
363
|
|
|
359
364
|
|
|
@@ -393,17 +398,21 @@ def get_skip_rows(inpath):
|
|
|
393
398
|
return 0
|
|
394
399
|
|
|
395
400
|
def parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log,verbose=True):
|
|
396
|
-
|
|
397
|
-
|
|
401
|
+
log.write(" -Parsing based on FORMAT: ", format_cols,verbose=verbose)
|
|
402
|
+
log.write(" -Parsing vcf study : ", study,verbose=verbose)
|
|
398
403
|
sumstats[format_cols] = sumstats[study].str.split(":",expand=True).values
|
|
399
404
|
sumstats = sumstats.drop(["FORMAT",study],axis=1)
|
|
400
|
-
sumstats = sumstats
|
|
405
|
+
sumstats = sumstats[ vcf_usecols]
|
|
401
406
|
gc.collect()
|
|
402
407
|
return sumstats
|
|
403
408
|
|
|
404
|
-
def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False):
|
|
405
|
-
|
|
409
|
+
def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False, skip_meta_records=None):
|
|
410
|
+
log.write(" -"+fmt+" format meta info:",verbose=verbose)
|
|
411
|
+
if skip_meta_records is None:
|
|
412
|
+
skip_meta_records =[]
|
|
406
413
|
for key,value in meta_data.items():
|
|
414
|
+
if key in skip_meta_records:
|
|
415
|
+
continue
|
|
407
416
|
if value is None:
|
|
408
417
|
continue
|
|
409
418
|
if type(value) is str:
|
|
@@ -428,32 +437,32 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
|
|
|
428
437
|
if fmt!="gwaslab":
|
|
429
438
|
if output == False:
|
|
430
439
|
if fmt!="auto":
|
|
431
|
-
if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:")
|
|
432
|
-
if verbose:log.write(" - "+fmt+" keys:",",".join(keys))
|
|
433
|
-
if verbose:log.write(" - gwaslab values:",",".join(values))
|
|
440
|
+
if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
|
|
441
|
+
if verbose:log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
|
|
442
|
+
if verbose:log.write(" - gwaslab values:",",".join(values),verbose=verbose)
|
|
434
443
|
else:
|
|
435
|
-
if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...")
|
|
436
|
-
if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json")
|
|
444
|
+
if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
|
|
445
|
+
if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
|
|
437
446
|
else:
|
|
438
|
-
if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",)
|
|
447
|
+
if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
|
|
439
448
|
keys=[]
|
|
440
449
|
values=[]
|
|
441
450
|
for key,value in rename_dictionary.items():
|
|
442
451
|
keys.append(key)
|
|
443
452
|
values.append(value)
|
|
444
|
-
if verbose:log.write(" - gwaslab keys:", ','.join(keys))
|
|
445
|
-
if verbose:log.write(" - "+fmt+" values:" , ','.join(values))
|
|
453
|
+
if verbose:log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
|
|
454
|
+
if verbose:log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)
|
|
446
455
|
|
|
447
456
|
def process_neaf(sumstats,log,verbose):
|
|
448
|
-
|
|
457
|
+
log.write(" -NEAF is specified...",verbose=verbose)
|
|
449
458
|
pre_number=len(sumstats)
|
|
450
|
-
|
|
451
|
-
sumstats
|
|
459
|
+
log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
|
|
460
|
+
sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
|
|
452
461
|
sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
|
|
453
|
-
sumstats
|
|
454
|
-
|
|
462
|
+
sumstats["EAF"] = 1- sumstats["EAF"]
|
|
463
|
+
log.write(" -Converted NEAF to EAF.",verbose=verbose)
|
|
455
464
|
after_number=len(sumstats)
|
|
456
|
-
|
|
465
|
+
log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
|
|
457
466
|
return sumstats
|
|
458
467
|
|
|
459
468
|
def process_allele(sumstats,log,verbose):
|
|
@@ -463,17 +472,17 @@ def process_allele(sumstats,log,verbose):
|
|
|
463
472
|
if "REF" in sumstats.columns and "ALT" in sumstats.columns:
|
|
464
473
|
|
|
465
474
|
if "NEA" not in sumstats.columns:
|
|
466
|
-
|
|
475
|
+
log.write(" NEA not available: assigning REF to NEA...",verbose=verbose)
|
|
467
476
|
sumstats["NEA"]=sumstats["REF"]
|
|
468
477
|
|
|
469
|
-
|
|
478
|
+
log.write(" -EA,REF and ALT columns are available: assigning NEA...",verbose=verbose)
|
|
470
479
|
ea_alt = sumstats["EA"]==sumstats["ALT"]
|
|
471
480
|
|
|
472
|
-
|
|
481
|
+
log.write(" -For variants with EA == ALT : assigning REF to NEA ...",verbose=verbose)
|
|
473
482
|
sumstats.loc[ea_alt,"NEA"] = sumstats.loc[ea_alt,"REF"]
|
|
474
483
|
|
|
475
484
|
ea_not_alt = sumstats["EA"]!=sumstats["ALT"]
|
|
476
|
-
|
|
485
|
+
log.write(" -For variants with EA != ALT : assigning ALT to NEA ...",verbose=verbose)
|
|
477
486
|
sumstats.loc[ea_not_alt,"NEA"] = sumstats.loc[ea_not_alt,"ALT"]
|
|
478
487
|
|
|
479
488
|
#sumstats = sumstats.drop(labels=["REF","ALT"],axis=1)
|
|
@@ -486,7 +495,7 @@ def process_allele(sumstats,log,verbose):
|
|
|
486
495
|
return sumstats
|
|
487
496
|
|
|
488
497
|
def process_status(sumstats,build,log,verbose):
|
|
489
|
-
|
|
498
|
+
log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
|
|
490
499
|
#sumstats["STATUS"] = int(build)*(10**5) +99999
|
|
491
500
|
build = _process_build(build,log,verbose)
|
|
492
501
|
sumstats["STATUS"] = build +"99999"
|